{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.984, "eval_steps": 1, "global_step": 124, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": NaN, "learning_rate": 0.0, "loss": 1.1733, "step": 1 }, { "epoch": 0.016, "eval_accuracy": 0.364, "eval_loss": 1.2354755401611328, "eval_runtime": 11.4949, "eval_samples_per_second": 21.749, "eval_steps_per_second": 2.784, "step": 1 }, { "epoch": 0.032, "grad_norm": NaN, "learning_rate": 0.0, "loss": 1.1385, "step": 2 }, { "epoch": 0.032, "eval_accuracy": 0.364, "eval_loss": 1.2354755401611328, "eval_runtime": 11.3512, "eval_samples_per_second": 22.024, "eval_steps_per_second": 2.819, "step": 2 }, { "epoch": 0.048, "grad_norm": NaN, "learning_rate": 0.0, "loss": 1.1504, "step": 3 }, { "epoch": 0.048, "eval_accuracy": 0.364, "eval_loss": 1.2354755401611328, "eval_runtime": 11.554, "eval_samples_per_second": 21.638, "eval_steps_per_second": 2.77, "step": 3 }, { "epoch": 0.064, "grad_norm": NaN, "learning_rate": 0.0, "loss": 1.4467, "step": 4 }, { "epoch": 0.064, "eval_accuracy": 0.364, "eval_loss": 1.2354755401611328, "eval_runtime": 11.2846, "eval_samples_per_second": 22.154, "eval_steps_per_second": 2.836, "step": 4 }, { "epoch": 0.08, "grad_norm": Infinity, "learning_rate": 0.0, "loss": 1.0915, "step": 5 }, { "epoch": 0.08, "eval_accuracy": 0.364, "eval_loss": 1.2354755401611328, "eval_runtime": 11.2925, "eval_samples_per_second": 22.138, "eval_steps_per_second": 2.834, "step": 5 }, { "epoch": 0.096, "grad_norm": 87.5323715209961, "learning_rate": 2.5e-05, "loss": 1.3424, "step": 6 }, { "epoch": 0.096, "eval_accuracy": 0.364, "eval_loss": 1.2354755401611328, "eval_runtime": 11.311, "eval_samples_per_second": 22.102, "eval_steps_per_second": 2.829, "step": 6 }, { "epoch": 0.112, "grad_norm": 50.48843765258789, "learning_rate": 5e-05, "loss": 1.1041, "step": 7 }, { "epoch": 0.112, "eval_accuracy": 0.432, "eval_loss": 1.0213314294815063, "eval_runtime": 11.2832, "eval_samples_per_second": 22.157, "eval_steps_per_second": 2.836, "step": 7 }, { "epoch": 0.128, "grad_norm": 12.972390174865723, "learning_rate": 4.959016393442623e-05, "loss": 0.751, "step": 8 }, { "epoch": 0.128, "eval_accuracy": 0.456, "eval_loss": 0.9333825707435608, "eval_runtime": 11.3015, "eval_samples_per_second": 22.121, "eval_steps_per_second": 2.831, "step": 8 }, { "epoch": 0.144, "grad_norm": 38.46497344970703, "learning_rate": 4.918032786885246e-05, "loss": 0.9293, "step": 9 }, { "epoch": 0.144, "eval_accuracy": 0.536, "eval_loss": 0.9040337800979614, "eval_runtime": 11.3267, "eval_samples_per_second": 22.072, "eval_steps_per_second": 2.825, "step": 9 }, { "epoch": 0.16, "grad_norm": 9.813628196716309, "learning_rate": 4.8770491803278687e-05, "loss": 0.6036, "step": 10 }, { "epoch": 0.16, "eval_accuracy": 0.58, "eval_loss": 1.0835610628128052, "eval_runtime": 11.3275, "eval_samples_per_second": 22.07, "eval_steps_per_second": 2.825, "step": 10 }, { "epoch": 0.176, "grad_norm": 11.096491813659668, "learning_rate": 4.836065573770492e-05, "loss": 0.9184, "step": 11 }, { "epoch": 0.176, "eval_accuracy": 0.596, "eval_loss": 1.2577338218688965, "eval_runtime": 11.2856, "eval_samples_per_second": 22.152, "eval_steps_per_second": 2.835, "step": 11 }, { "epoch": 0.192, "grad_norm": 45.111083984375, "learning_rate": 4.795081967213115e-05, "loss": 0.8972, "step": 12 }, { "epoch": 0.192, "eval_accuracy": 0.612, "eval_loss": 1.280572533607483, "eval_runtime": 11.296, "eval_samples_per_second": 22.132, "eval_steps_per_second": 2.833, "step": 12 }, { "epoch": 0.208, "grad_norm": 54.95970153808594, "learning_rate": 4.754098360655738e-05, "loss": 1.1253, "step": 13 }, { "epoch": 0.208, "eval_accuracy": 0.612, "eval_loss": 1.1115046739578247, "eval_runtime": 11.3216, "eval_samples_per_second": 22.082, "eval_steps_per_second": 2.826, "step": 13 }, { "epoch": 0.224, "grad_norm": 19.976964950561523, "learning_rate": 4.713114754098361e-05, "loss": 0.9591, "step": 14 }, { "epoch": 0.224, "eval_accuracy": 0.584, "eval_loss": 0.9410788416862488, "eval_runtime": 11.3197, "eval_samples_per_second": 22.085, "eval_steps_per_second": 2.827, "step": 14 }, { "epoch": 0.24, "grad_norm": 21.54231834411621, "learning_rate": 4.672131147540984e-05, "loss": 0.4318, "step": 15 }, { "epoch": 0.24, "eval_accuracy": 0.504, "eval_loss": 0.8608736395835876, "eval_runtime": 11.2959, "eval_samples_per_second": 22.132, "eval_steps_per_second": 2.833, "step": 15 }, { "epoch": 0.256, "grad_norm": 31.807493209838867, "learning_rate": 4.631147540983607e-05, "loss": 0.7278, "step": 16 }, { "epoch": 0.256, "eval_accuracy": 0.52, "eval_loss": 0.8945115804672241, "eval_runtime": 11.3237, "eval_samples_per_second": 22.078, "eval_steps_per_second": 2.826, "step": 16 }, { "epoch": 0.272, "grad_norm": 27.370149612426758, "learning_rate": 4.59016393442623e-05, "loss": 0.8711, "step": 17 }, { "epoch": 0.272, "eval_accuracy": 0.524, "eval_loss": 0.8897060751914978, "eval_runtime": 11.3355, "eval_samples_per_second": 22.055, "eval_steps_per_second": 2.823, "step": 17 }, { "epoch": 0.288, "grad_norm": 40.707698822021484, "learning_rate": 4.549180327868853e-05, "loss": 0.9991, "step": 18 }, { "epoch": 0.288, "eval_accuracy": 0.548, "eval_loss": 0.8214626312255859, "eval_runtime": 11.3233, "eval_samples_per_second": 22.078, "eval_steps_per_second": 2.826, "step": 18 }, { "epoch": 0.304, "grad_norm": 13.598702430725098, "learning_rate": 4.508196721311476e-05, "loss": 0.807, "step": 19 }, { "epoch": 0.304, "eval_accuracy": 0.556, "eval_loss": 0.7961764931678772, "eval_runtime": 11.3396, "eval_samples_per_second": 22.047, "eval_steps_per_second": 2.822, "step": 19 }, { "epoch": 0.32, "grad_norm": 18.774343490600586, "learning_rate": 4.467213114754098e-05, "loss": 0.617, "step": 20 }, { "epoch": 0.32, "eval_accuracy": 0.516, "eval_loss": 0.807204008102417, "eval_runtime": 11.3337, "eval_samples_per_second": 22.058, "eval_steps_per_second": 2.823, "step": 20 }, { "epoch": 0.336, "grad_norm": 24.550052642822266, "learning_rate": 4.426229508196721e-05, "loss": 0.6701, "step": 21 }, { "epoch": 0.336, "eval_accuracy": 0.552, "eval_loss": 0.7810255289077759, "eval_runtime": 11.3902, "eval_samples_per_second": 21.949, "eval_steps_per_second": 2.809, "step": 21 }, { "epoch": 0.352, "grad_norm": 28.407976150512695, "learning_rate": 4.3852459016393444e-05, "loss": 0.823, "step": 22 }, { "epoch": 0.352, "eval_accuracy": 0.552, "eval_loss": 0.764638364315033, "eval_runtime": 11.3671, "eval_samples_per_second": 21.993, "eval_steps_per_second": 2.815, "step": 22 }, { "epoch": 0.368, "grad_norm": 31.69023895263672, "learning_rate": 4.3442622950819674e-05, "loss": 0.7332, "step": 23 }, { "epoch": 0.368, "eval_accuracy": 0.6, "eval_loss": 0.7719610333442688, "eval_runtime": 11.3313, "eval_samples_per_second": 22.063, "eval_steps_per_second": 2.824, "step": 23 }, { "epoch": 0.384, "grad_norm": 51.598724365234375, "learning_rate": 4.3032786885245904e-05, "loss": 1.0789, "step": 24 }, { "epoch": 0.384, "eval_accuracy": 0.604, "eval_loss": 0.792045533657074, "eval_runtime": 11.3212, "eval_samples_per_second": 22.082, "eval_steps_per_second": 2.827, "step": 24 }, { "epoch": 0.4, "grad_norm": 8.370189666748047, "learning_rate": 4.262295081967213e-05, "loss": 0.5899, "step": 25 }, { "epoch": 0.4, "eval_accuracy": 0.588, "eval_loss": 0.8152350187301636, "eval_runtime": 11.2787, "eval_samples_per_second": 22.166, "eval_steps_per_second": 2.837, "step": 25 }, { "epoch": 0.416, "grad_norm": 8.866107940673828, "learning_rate": 4.2213114754098365e-05, "loss": 0.6057, "step": 26 }, { "epoch": 0.416, "eval_accuracy": 0.604, "eval_loss": 0.8338910937309265, "eval_runtime": 11.3283, "eval_samples_per_second": 22.069, "eval_steps_per_second": 2.825, "step": 26 }, { "epoch": 0.432, "grad_norm": 32.09278106689453, "learning_rate": 4.1803278688524595e-05, "loss": 0.7418, "step": 27 }, { "epoch": 0.432, "eval_accuracy": 0.616, "eval_loss": 0.8316658735275269, "eval_runtime": 11.3272, "eval_samples_per_second": 22.071, "eval_steps_per_second": 2.825, "step": 27 }, { "epoch": 0.448, "grad_norm": 22.857614517211914, "learning_rate": 4.1393442622950826e-05, "loss": 0.8383, "step": 28 }, { "epoch": 0.448, "eval_accuracy": 0.616, "eval_loss": 0.7918907999992371, "eval_runtime": 11.3212, "eval_samples_per_second": 22.082, "eval_steps_per_second": 2.827, "step": 28 }, { "epoch": 0.464, "grad_norm": 22.1180362701416, "learning_rate": 4.098360655737705e-05, "loss": 0.8923, "step": 29 }, { "epoch": 0.464, "eval_accuracy": 0.62, "eval_loss": 0.7275803685188293, "eval_runtime": 11.3478, "eval_samples_per_second": 22.031, "eval_steps_per_second": 2.82, "step": 29 }, { "epoch": 0.48, "grad_norm": 43.09483337402344, "learning_rate": 4.057377049180328e-05, "loss": 0.9467, "step": 30 }, { "epoch": 0.48, "eval_accuracy": 0.616, "eval_loss": 0.6892617344856262, "eval_runtime": 11.348, "eval_samples_per_second": 22.03, "eval_steps_per_second": 2.82, "step": 30 }, { "epoch": 0.496, "grad_norm": 8.46947956085205, "learning_rate": 4.016393442622951e-05, "loss": 0.7778, "step": 31 }, { "epoch": 0.496, "eval_accuracy": 0.612, "eval_loss": 0.6953690052032471, "eval_runtime": 11.3372, "eval_samples_per_second": 22.051, "eval_steps_per_second": 2.823, "step": 31 }, { "epoch": 0.512, "grad_norm": 33.367454528808594, "learning_rate": 3.975409836065574e-05, "loss": 0.6468, "step": 32 }, { "epoch": 0.512, "eval_accuracy": 0.564, "eval_loss": 0.6975896954536438, "eval_runtime": 11.3103, "eval_samples_per_second": 22.104, "eval_steps_per_second": 2.829, "step": 32 }, { "epoch": 0.528, "grad_norm": 14.80160903930664, "learning_rate": 3.934426229508197e-05, "loss": 0.7333, "step": 33 }, { "epoch": 0.528, "eval_accuracy": 0.58, "eval_loss": 0.6965731978416443, "eval_runtime": 11.3519, "eval_samples_per_second": 22.023, "eval_steps_per_second": 2.819, "step": 33 }, { "epoch": 0.544, "grad_norm": 38.160823822021484, "learning_rate": 3.89344262295082e-05, "loss": 0.6591, "step": 34 }, { "epoch": 0.544, "eval_accuracy": 0.588, "eval_loss": 0.6874374747276306, "eval_runtime": 11.3235, "eval_samples_per_second": 22.078, "eval_steps_per_second": 2.826, "step": 34 }, { "epoch": 0.56, "grad_norm": 33.589561462402344, "learning_rate": 3.8524590163934424e-05, "loss": 0.7186, "step": 35 }, { "epoch": 0.56, "eval_accuracy": 0.608, "eval_loss": 0.6751595139503479, "eval_runtime": 11.3159, "eval_samples_per_second": 22.093, "eval_steps_per_second": 2.828, "step": 35 }, { "epoch": 0.576, "grad_norm": 12.282697677612305, "learning_rate": 3.8114754098360655e-05, "loss": 0.4988, "step": 36 }, { "epoch": 0.576, "eval_accuracy": 0.616, "eval_loss": 0.6890735030174255, "eval_runtime": 11.3573, "eval_samples_per_second": 22.012, "eval_steps_per_second": 2.818, "step": 36 }, { "epoch": 0.592, "grad_norm": 15.36685562133789, "learning_rate": 3.7704918032786885e-05, "loss": 0.8962, "step": 37 }, { "epoch": 0.592, "eval_accuracy": 0.576, "eval_loss": 0.7174173593521118, "eval_runtime": 11.3449, "eval_samples_per_second": 22.036, "eval_steps_per_second": 2.821, "step": 37 }, { "epoch": 0.608, "grad_norm": 37.992069244384766, "learning_rate": 3.729508196721312e-05, "loss": 0.7407, "step": 38 }, { "epoch": 0.608, "eval_accuracy": 0.528, "eval_loss": 0.7468773126602173, "eval_runtime": 11.3013, "eval_samples_per_second": 22.121, "eval_steps_per_second": 2.832, "step": 38 }, { "epoch": 0.624, "grad_norm": 43.514469146728516, "learning_rate": 3.6885245901639346e-05, "loss": 0.7984, "step": 39 }, { "epoch": 0.624, "eval_accuracy": 0.536, "eval_loss": 0.7447397708892822, "eval_runtime": 11.3203, "eval_samples_per_second": 22.084, "eval_steps_per_second": 2.827, "step": 39 }, { "epoch": 0.64, "grad_norm": 43.61343765258789, "learning_rate": 3.6475409836065576e-05, "loss": 0.6023, "step": 40 }, { "epoch": 0.64, "eval_accuracy": 0.564, "eval_loss": 0.7195525765419006, "eval_runtime": 11.3049, "eval_samples_per_second": 22.114, "eval_steps_per_second": 2.831, "step": 40 }, { "epoch": 0.656, "grad_norm": 59.5920295715332, "learning_rate": 3.6065573770491806e-05, "loss": 0.7771, "step": 41 }, { "epoch": 0.656, "eval_accuracy": 0.604, "eval_loss": 0.686376690864563, "eval_runtime": 11.276, "eval_samples_per_second": 22.171, "eval_steps_per_second": 2.838, "step": 41 }, { "epoch": 0.672, "grad_norm": 32.0897216796875, "learning_rate": 3.5655737704918037e-05, "loss": 0.6586, "step": 42 }, { "epoch": 0.672, "eval_accuracy": 0.62, "eval_loss": 0.6858681440353394, "eval_runtime": 11.2967, "eval_samples_per_second": 22.13, "eval_steps_per_second": 2.833, "step": 42 }, { "epoch": 0.688, "grad_norm": 7.754942893981934, "learning_rate": 3.524590163934427e-05, "loss": 0.5622, "step": 43 }, { "epoch": 0.688, "eval_accuracy": 0.616, "eval_loss": 0.7191852331161499, "eval_runtime": 11.3359, "eval_samples_per_second": 22.054, "eval_steps_per_second": 2.823, "step": 43 }, { "epoch": 0.704, "grad_norm": 13.65731143951416, "learning_rate": 3.483606557377049e-05, "loss": 0.6567, "step": 44 }, { "epoch": 0.704, "eval_accuracy": 0.604, "eval_loss": 0.7964421510696411, "eval_runtime": 11.3043, "eval_samples_per_second": 22.116, "eval_steps_per_second": 2.831, "step": 44 }, { "epoch": 0.72, "grad_norm": 13.19625473022461, "learning_rate": 3.442622950819672e-05, "loss": 0.637, "step": 45 }, { "epoch": 0.72, "eval_accuracy": 0.612, "eval_loss": 0.8056248426437378, "eval_runtime": 11.331, "eval_samples_per_second": 22.063, "eval_steps_per_second": 2.824, "step": 45 }, { "epoch": 0.736, "grad_norm": 10.382162094116211, "learning_rate": 3.401639344262295e-05, "loss": 0.5964, "step": 46 }, { "epoch": 0.736, "eval_accuracy": 0.62, "eval_loss": 0.8342519402503967, "eval_runtime": 11.3384, "eval_samples_per_second": 22.049, "eval_steps_per_second": 2.822, "step": 46 }, { "epoch": 0.752, "grad_norm": 31.208406448364258, "learning_rate": 3.360655737704918e-05, "loss": 0.9646, "step": 47 }, { "epoch": 0.752, "eval_accuracy": 0.612, "eval_loss": 0.7941861748695374, "eval_runtime": 11.3217, "eval_samples_per_second": 22.081, "eval_steps_per_second": 2.826, "step": 47 }, { "epoch": 0.768, "grad_norm": 28.09980583190918, "learning_rate": 3.319672131147541e-05, "loss": 0.778, "step": 48 }, { "epoch": 0.768, "eval_accuracy": 0.62, "eval_loss": 0.727009117603302, "eval_runtime": 11.3124, "eval_samples_per_second": 22.1, "eval_steps_per_second": 2.829, "step": 48 }, { "epoch": 0.784, "grad_norm": 26.889928817749023, "learning_rate": 3.2786885245901635e-05, "loss": 0.8173, "step": 49 }, { "epoch": 0.784, "eval_accuracy": 0.62, "eval_loss": 0.6923478841781616, "eval_runtime": 11.3715, "eval_samples_per_second": 21.985, "eval_steps_per_second": 2.814, "step": 49 }, { "epoch": 0.8, "grad_norm": 12.534849166870117, "learning_rate": 3.237704918032787e-05, "loss": 0.6164, "step": 50 }, { "epoch": 0.8, "eval_accuracy": 0.66, "eval_loss": 0.6402404308319092, "eval_runtime": 11.6238, "eval_samples_per_second": 21.508, "eval_steps_per_second": 2.753, "step": 50 }, { "epoch": 0.816, "grad_norm": 14.15957260131836, "learning_rate": 3.19672131147541e-05, "loss": 0.6124, "step": 51 }, { "epoch": 0.816, "eval_accuracy": 0.648, "eval_loss": 0.6378893852233887, "eval_runtime": 11.4856, "eval_samples_per_second": 21.766, "eval_steps_per_second": 2.786, "step": 51 }, { "epoch": 0.832, "grad_norm": 19.737197875976562, "learning_rate": 3.155737704918033e-05, "loss": 0.6773, "step": 52 }, { "epoch": 0.832, "eval_accuracy": 0.584, "eval_loss": 0.6686127781867981, "eval_runtime": 11.4137, "eval_samples_per_second": 21.903, "eval_steps_per_second": 2.804, "step": 52 }, { "epoch": 0.848, "grad_norm": 19.23349952697754, "learning_rate": 3.114754098360656e-05, "loss": 0.6336, "step": 53 }, { "epoch": 0.848, "eval_accuracy": 0.608, "eval_loss": 0.6502300500869751, "eval_runtime": 11.6377, "eval_samples_per_second": 21.482, "eval_steps_per_second": 2.75, "step": 53 }, { "epoch": 0.864, "grad_norm": 40.20008087158203, "learning_rate": 3.073770491803279e-05, "loss": 0.7077, "step": 54 }, { "epoch": 0.864, "eval_accuracy": 0.62, "eval_loss": 0.6335379481315613, "eval_runtime": 11.4293, "eval_samples_per_second": 21.874, "eval_steps_per_second": 2.8, "step": 54 }, { "epoch": 0.88, "grad_norm": 9.706358909606934, "learning_rate": 3.0327868852459017e-05, "loss": 0.4935, "step": 55 }, { "epoch": 0.88, "eval_accuracy": 0.616, "eval_loss": 0.640767514705658, "eval_runtime": 11.367, "eval_samples_per_second": 21.993, "eval_steps_per_second": 2.815, "step": 55 }, { "epoch": 0.896, "grad_norm": 7.35679817199707, "learning_rate": 2.9918032786885248e-05, "loss": 0.4311, "step": 56 }, { "epoch": 0.896, "eval_accuracy": 0.604, "eval_loss": 0.6300995349884033, "eval_runtime": 11.4719, "eval_samples_per_second": 21.792, "eval_steps_per_second": 2.789, "step": 56 }, { "epoch": 0.912, "grad_norm": 9.2598876953125, "learning_rate": 2.9508196721311478e-05, "loss": 0.4558, "step": 57 }, { "epoch": 0.912, "eval_accuracy": 0.664, "eval_loss": 0.630566418170929, "eval_runtime": 11.5188, "eval_samples_per_second": 21.704, "eval_steps_per_second": 2.778, "step": 57 }, { "epoch": 0.928, "grad_norm": 31.75694465637207, "learning_rate": 2.9098360655737705e-05, "loss": 0.6486, "step": 58 }, { "epoch": 0.928, "eval_accuracy": 0.64, "eval_loss": 0.614264726638794, "eval_runtime": 11.6451, "eval_samples_per_second": 21.468, "eval_steps_per_second": 2.748, "step": 58 }, { "epoch": 0.944, "grad_norm": 39.16770553588867, "learning_rate": 2.8688524590163935e-05, "loss": 0.6755, "step": 59 }, { "epoch": 0.944, "eval_accuracy": 0.668, "eval_loss": 0.5880586504936218, "eval_runtime": 11.5144, "eval_samples_per_second": 21.712, "eval_steps_per_second": 2.779, "step": 59 }, { "epoch": 0.96, "grad_norm": 17.8769474029541, "learning_rate": 2.8278688524590162e-05, "loss": 0.6925, "step": 60 }, { "epoch": 0.96, "eval_accuracy": 0.636, "eval_loss": 0.5895799398422241, "eval_runtime": 11.4443, "eval_samples_per_second": 21.845, "eval_steps_per_second": 2.796, "step": 60 }, { "epoch": 0.976, "grad_norm": 16.898263931274414, "learning_rate": 2.7868852459016392e-05, "loss": 0.3927, "step": 61 }, { "epoch": 0.976, "eval_accuracy": 0.672, "eval_loss": 0.6116553544998169, "eval_runtime": 11.4294, "eval_samples_per_second": 21.873, "eval_steps_per_second": 2.8, "step": 61 }, { "epoch": 0.992, "grad_norm": 11.950173377990723, "learning_rate": 2.7459016393442626e-05, "loss": 0.6678, "step": 62 }, { "epoch": 0.992, "eval_accuracy": 0.676, "eval_loss": 0.629517138004303, "eval_runtime": 11.2631, "eval_samples_per_second": 22.196, "eval_steps_per_second": 2.841, "step": 62 }, { "epoch": 1.008, "grad_norm": 14.72547435760498, "learning_rate": 2.7049180327868856e-05, "loss": 0.4718, "step": 63 }, { "epoch": 1.008, "eval_accuracy": 0.676, "eval_loss": 0.6391622424125671, "eval_runtime": 11.4024, "eval_samples_per_second": 21.925, "eval_steps_per_second": 2.806, "step": 63 }, { "epoch": 1.024, "grad_norm": 21.045801162719727, "learning_rate": 2.6639344262295087e-05, "loss": 0.4525, "step": 64 }, { "epoch": 1.024, "eval_accuracy": 0.676, "eval_loss": 0.6506518721580505, "eval_runtime": 11.4065, "eval_samples_per_second": 21.917, "eval_steps_per_second": 2.805, "step": 64 }, { "epoch": 1.04, "grad_norm": NaN, "learning_rate": 2.6639344262295087e-05, "loss": 0.5411, "step": 65 }, { "epoch": 1.04, "eval_accuracy": 0.676, "eval_loss": 0.6506518721580505, "eval_runtime": 11.483, "eval_samples_per_second": 21.771, "eval_steps_per_second": 2.787, "step": 65 }, { "epoch": 1.056, "grad_norm": 35.55463409423828, "learning_rate": 2.6229508196721314e-05, "loss": 0.5345, "step": 66 }, { "epoch": 1.056, "eval_accuracy": 0.68, "eval_loss": 0.64942467212677, "eval_runtime": 11.2992, "eval_samples_per_second": 22.125, "eval_steps_per_second": 2.832, "step": 66 }, { "epoch": 1.072, "grad_norm": 36.09001922607422, "learning_rate": 2.5819672131147544e-05, "loss": 0.5968, "step": 67 }, { "epoch": 1.072, "eval_accuracy": 0.696, "eval_loss": 0.6280552744865417, "eval_runtime": 11.47, "eval_samples_per_second": 21.796, "eval_steps_per_second": 2.79, "step": 67 }, { "epoch": 1.088, "grad_norm": 42.95037078857422, "learning_rate": 2.540983606557377e-05, "loss": 0.7288, "step": 68 }, { "epoch": 1.088, "eval_accuracy": 0.684, "eval_loss": 0.6100922226905823, "eval_runtime": 11.395, "eval_samples_per_second": 21.939, "eval_steps_per_second": 2.808, "step": 68 }, { "epoch": 1.104, "grad_norm": 18.122161865234375, "learning_rate": 2.5e-05, "loss": 0.3666, "step": 69 }, { "epoch": 1.104, "eval_accuracy": 0.708, "eval_loss": 0.5811479687690735, "eval_runtime": 11.467, "eval_samples_per_second": 21.802, "eval_steps_per_second": 2.791, "step": 69 }, { "epoch": 1.12, "grad_norm": 17.941131591796875, "learning_rate": 2.459016393442623e-05, "loss": 0.5333, "step": 70 }, { "epoch": 1.12, "eval_accuracy": 0.684, "eval_loss": 0.5954810976982117, "eval_runtime": 11.4815, "eval_samples_per_second": 21.774, "eval_steps_per_second": 2.787, "step": 70 }, { "epoch": 1.1360000000000001, "grad_norm": 10.752734184265137, "learning_rate": 2.418032786885246e-05, "loss": 0.4274, "step": 71 }, { "epoch": 1.1360000000000001, "eval_accuracy": 0.672, "eval_loss": 0.5998041033744812, "eval_runtime": 11.5172, "eval_samples_per_second": 21.707, "eval_steps_per_second": 2.778, "step": 71 }, { "epoch": 1.152, "grad_norm": 21.44332504272461, "learning_rate": 2.377049180327869e-05, "loss": 0.4109, "step": 72 }, { "epoch": 1.152, "eval_accuracy": 0.668, "eval_loss": 0.6016911864280701, "eval_runtime": 11.4217, "eval_samples_per_second": 21.888, "eval_steps_per_second": 2.802, "step": 72 }, { "epoch": 1.168, "grad_norm": 40.88154220581055, "learning_rate": 2.336065573770492e-05, "loss": 0.576, "step": 73 }, { "epoch": 1.168, "eval_accuracy": 0.672, "eval_loss": 0.6131250262260437, "eval_runtime": 11.4732, "eval_samples_per_second": 21.79, "eval_steps_per_second": 2.789, "step": 73 }, { "epoch": 1.184, "grad_norm": 21.387557983398438, "learning_rate": 2.295081967213115e-05, "loss": 0.598, "step": 74 }, { "epoch": 1.184, "eval_accuracy": 0.688, "eval_loss": 0.5768781900405884, "eval_runtime": 11.3223, "eval_samples_per_second": 22.08, "eval_steps_per_second": 2.826, "step": 74 }, { "epoch": 1.2, "grad_norm": 22.257291793823242, "learning_rate": 2.254098360655738e-05, "loss": 0.4916, "step": 75 }, { "epoch": 1.2, "eval_accuracy": 0.704, "eval_loss": 0.5493154525756836, "eval_runtime": 11.4048, "eval_samples_per_second": 21.921, "eval_steps_per_second": 2.806, "step": 75 }, { "epoch": 1.216, "grad_norm": 8.411641120910645, "learning_rate": 2.2131147540983607e-05, "loss": 0.3723, "step": 76 }, { "epoch": 1.216, "eval_accuracy": 0.716, "eval_loss": 0.5425886511802673, "eval_runtime": 11.4678, "eval_samples_per_second": 21.8, "eval_steps_per_second": 2.79, "step": 76 }, { "epoch": 1.232, "grad_norm": 10.33214282989502, "learning_rate": 2.1721311475409837e-05, "loss": 0.5423, "step": 77 }, { "epoch": 1.232, "eval_accuracy": 0.704, "eval_loss": 0.5367762446403503, "eval_runtime": 11.5849, "eval_samples_per_second": 21.58, "eval_steps_per_second": 2.762, "step": 77 }, { "epoch": 1.248, "grad_norm": 8.413525581359863, "learning_rate": 2.1311475409836064e-05, "loss": 0.5154, "step": 78 }, { "epoch": 1.248, "eval_accuracy": 0.728, "eval_loss": 0.5338938236236572, "eval_runtime": 11.3935, "eval_samples_per_second": 21.942, "eval_steps_per_second": 2.809, "step": 78 }, { "epoch": 1.264, "grad_norm": 29.967487335205078, "learning_rate": 2.0901639344262298e-05, "loss": 0.5072, "step": 79 }, { "epoch": 1.264, "eval_accuracy": 0.716, "eval_loss": 0.5389543175697327, "eval_runtime": 11.5734, "eval_samples_per_second": 21.601, "eval_steps_per_second": 2.765, "step": 79 }, { "epoch": 1.28, "grad_norm": 15.605040550231934, "learning_rate": 2.0491803278688525e-05, "loss": 0.38, "step": 80 }, { "epoch": 1.28, "eval_accuracy": 0.728, "eval_loss": 0.5024056434631348, "eval_runtime": 11.3479, "eval_samples_per_second": 22.031, "eval_steps_per_second": 2.82, "step": 80 }, { "epoch": 1.296, "grad_norm": 35.906517028808594, "learning_rate": 2.0081967213114755e-05, "loss": 0.6005, "step": 81 }, { "epoch": 1.296, "eval_accuracy": 0.744, "eval_loss": 0.49823418259620667, "eval_runtime": 11.4956, "eval_samples_per_second": 21.747, "eval_steps_per_second": 2.784, "step": 81 }, { "epoch": 1.312, "grad_norm": 7.512831211090088, "learning_rate": 1.9672131147540985e-05, "loss": 0.5016, "step": 82 }, { "epoch": 1.312, "eval_accuracy": 0.728, "eval_loss": 0.4885072708129883, "eval_runtime": 11.4788, "eval_samples_per_second": 21.779, "eval_steps_per_second": 2.788, "step": 82 }, { "epoch": 1.328, "grad_norm": 16.890913009643555, "learning_rate": 1.9262295081967212e-05, "loss": 0.4624, "step": 83 }, { "epoch": 1.328, "eval_accuracy": 0.736, "eval_loss": 0.5011359453201294, "eval_runtime": 11.451, "eval_samples_per_second": 21.832, "eval_steps_per_second": 2.795, "step": 83 }, { "epoch": 1.3439999999999999, "grad_norm": 8.635043144226074, "learning_rate": 1.8852459016393442e-05, "loss": 0.3138, "step": 84 }, { "epoch": 1.3439999999999999, "eval_accuracy": 0.748, "eval_loss": 0.5005082488059998, "eval_runtime": 11.5278, "eval_samples_per_second": 21.687, "eval_steps_per_second": 2.776, "step": 84 }, { "epoch": 1.3599999999999999, "grad_norm": 28.464235305786133, "learning_rate": 1.8442622950819673e-05, "loss": 0.3379, "step": 85 }, { "epoch": 1.3599999999999999, "eval_accuracy": 0.784, "eval_loss": 0.49115338921546936, "eval_runtime": 11.4489, "eval_samples_per_second": 21.836, "eval_steps_per_second": 2.795, "step": 85 }, { "epoch": 1.376, "grad_norm": 6.288327217102051, "learning_rate": 1.8032786885245903e-05, "loss": 0.2329, "step": 86 }, { "epoch": 1.376, "eval_accuracy": 0.76, "eval_loss": 0.486227810382843, "eval_runtime": 11.4903, "eval_samples_per_second": 21.757, "eval_steps_per_second": 2.785, "step": 86 }, { "epoch": 1.392, "grad_norm": 9.165458679199219, "learning_rate": 1.7622950819672133e-05, "loss": 0.4698, "step": 87 }, { "epoch": 1.392, "eval_accuracy": 0.764, "eval_loss": 0.49401524662971497, "eval_runtime": 11.6, "eval_samples_per_second": 21.552, "eval_steps_per_second": 2.759, "step": 87 }, { "epoch": 1.408, "grad_norm": 30.95488166809082, "learning_rate": 1.721311475409836e-05, "loss": 0.386, "step": 88 }, { "epoch": 1.408, "eval_accuracy": 0.788, "eval_loss": 0.4993850588798523, "eval_runtime": 11.4082, "eval_samples_per_second": 21.914, "eval_steps_per_second": 2.805, "step": 88 }, { "epoch": 1.424, "grad_norm": 37.84017562866211, "learning_rate": 1.680327868852459e-05, "loss": 0.7592, "step": 89 }, { "epoch": 1.424, "eval_accuracy": 0.768, "eval_loss": 0.4701511263847351, "eval_runtime": 11.4269, "eval_samples_per_second": 21.878, "eval_steps_per_second": 2.8, "step": 89 }, { "epoch": 1.44, "grad_norm": 41.4830436706543, "learning_rate": 1.6393442622950818e-05, "loss": 0.5635, "step": 90 }, { "epoch": 1.44, "eval_accuracy": 0.776, "eval_loss": 0.4597744047641754, "eval_runtime": 11.3414, "eval_samples_per_second": 22.043, "eval_steps_per_second": 2.822, "step": 90 }, { "epoch": 1.456, "grad_norm": 8.639835357666016, "learning_rate": 1.598360655737705e-05, "loss": 0.4412, "step": 91 }, { "epoch": 1.456, "eval_accuracy": 0.76, "eval_loss": 0.44887205958366394, "eval_runtime": 11.3759, "eval_samples_per_second": 21.976, "eval_steps_per_second": 2.813, "step": 91 }, { "epoch": 1.472, "grad_norm": 13.933167457580566, "learning_rate": 1.557377049180328e-05, "loss": 0.4016, "step": 92 }, { "epoch": 1.472, "eval_accuracy": 0.764, "eval_loss": 0.4409584403038025, "eval_runtime": 11.4154, "eval_samples_per_second": 21.9, "eval_steps_per_second": 2.803, "step": 92 }, { "epoch": 1.488, "grad_norm": 25.79916000366211, "learning_rate": 1.5163934426229509e-05, "loss": 0.3462, "step": 93 }, { "epoch": 1.488, "eval_accuracy": 0.76, "eval_loss": 0.4534677267074585, "eval_runtime": 11.4913, "eval_samples_per_second": 21.756, "eval_steps_per_second": 2.785, "step": 93 }, { "epoch": 1.504, "grad_norm": 8.152263641357422, "learning_rate": 1.4754098360655739e-05, "loss": 0.2376, "step": 94 }, { "epoch": 1.504, "eval_accuracy": 0.78, "eval_loss": 0.4606277644634247, "eval_runtime": 11.3366, "eval_samples_per_second": 22.053, "eval_steps_per_second": 2.823, "step": 94 }, { "epoch": 1.52, "grad_norm": 50.013893127441406, "learning_rate": 1.4344262295081968e-05, "loss": 0.8243, "step": 95 }, { "epoch": 1.52, "eval_accuracy": 0.76, "eval_loss": 0.4745258092880249, "eval_runtime": 11.4641, "eval_samples_per_second": 21.807, "eval_steps_per_second": 2.791, "step": 95 }, { "epoch": 1.536, "grad_norm": 7.844508647918701, "learning_rate": 1.3934426229508196e-05, "loss": 0.2607, "step": 96 }, { "epoch": 1.536, "eval_accuracy": 0.768, "eval_loss": 0.46744146943092346, "eval_runtime": 11.4558, "eval_samples_per_second": 21.823, "eval_steps_per_second": 2.793, "step": 96 }, { "epoch": 1.552, "grad_norm": 36.814781188964844, "learning_rate": 1.3524590163934428e-05, "loss": 0.6977, "step": 97 }, { "epoch": 1.552, "eval_accuracy": 0.768, "eval_loss": 0.46450626850128174, "eval_runtime": 11.6232, "eval_samples_per_second": 21.509, "eval_steps_per_second": 2.753, "step": 97 }, { "epoch": 1.568, "grad_norm": 12.111028671264648, "learning_rate": 1.3114754098360657e-05, "loss": 0.9507, "step": 98 }, { "epoch": 1.568, "eval_accuracy": 0.78, "eval_loss": 0.45364972949028015, "eval_runtime": 11.495, "eval_samples_per_second": 21.749, "eval_steps_per_second": 2.784, "step": 98 }, { "epoch": 1.584, "grad_norm": 8.090563774108887, "learning_rate": 1.2704918032786885e-05, "loss": 0.376, "step": 99 }, { "epoch": 1.584, "eval_accuracy": 0.78, "eval_loss": 0.45443812012672424, "eval_runtime": 11.4699, "eval_samples_per_second": 21.796, "eval_steps_per_second": 2.79, "step": 99 }, { "epoch": 1.6, "grad_norm": 27.078815460205078, "learning_rate": 1.2295081967213116e-05, "loss": 0.4708, "step": 100 }, { "epoch": 1.6, "eval_accuracy": 0.792, "eval_loss": 0.43265777826309204, "eval_runtime": 11.434, "eval_samples_per_second": 21.865, "eval_steps_per_second": 2.799, "step": 100 }, { "epoch": 1.616, "grad_norm": 31.743221282958984, "learning_rate": 1.1885245901639344e-05, "loss": 0.4244, "step": 101 }, { "epoch": 1.616, "eval_accuracy": 0.8, "eval_loss": 0.4210461378097534, "eval_runtime": 11.605, "eval_samples_per_second": 21.543, "eval_steps_per_second": 2.757, "step": 101 }, { "epoch": 1.6320000000000001, "grad_norm": 5.887348175048828, "learning_rate": 1.1475409836065575e-05, "loss": 0.2502, "step": 102 }, { "epoch": 1.6320000000000001, "eval_accuracy": 0.816, "eval_loss": 0.40375572443008423, "eval_runtime": 11.4278, "eval_samples_per_second": 21.876, "eval_steps_per_second": 2.8, "step": 102 }, { "epoch": 1.6480000000000001, "grad_norm": 13.274320602416992, "learning_rate": 1.1065573770491803e-05, "loss": 0.8468, "step": 103 }, { "epoch": 1.6480000000000001, "eval_accuracy": 0.816, "eval_loss": 0.39257147908210754, "eval_runtime": 11.4088, "eval_samples_per_second": 21.913, "eval_steps_per_second": 2.805, "step": 103 }, { "epoch": 1.6640000000000001, "grad_norm": 22.40627098083496, "learning_rate": 1.0655737704918032e-05, "loss": 0.5753, "step": 104 }, { "epoch": 1.6640000000000001, "eval_accuracy": 0.816, "eval_loss": 0.39645299315452576, "eval_runtime": 11.469, "eval_samples_per_second": 21.798, "eval_steps_per_second": 2.79, "step": 104 }, { "epoch": 1.6800000000000002, "grad_norm": 7.59094762802124, "learning_rate": 1.0245901639344262e-05, "loss": 0.2518, "step": 105 }, { "epoch": 1.6800000000000002, "eval_accuracy": 0.812, "eval_loss": 0.3954884707927704, "eval_runtime": 11.452, "eval_samples_per_second": 21.83, "eval_steps_per_second": 2.794, "step": 105 }, { "epoch": 1.696, "grad_norm": 10.119157791137695, "learning_rate": 9.836065573770493e-06, "loss": 0.4945, "step": 106 }, { "epoch": 1.696, "eval_accuracy": 0.796, "eval_loss": 0.39326009154319763, "eval_runtime": 11.4475, "eval_samples_per_second": 21.839, "eval_steps_per_second": 2.795, "step": 106 }, { "epoch": 1.712, "grad_norm": 12.016386032104492, "learning_rate": 9.426229508196721e-06, "loss": 0.2257, "step": 107 }, { "epoch": 1.712, "eval_accuracy": 0.82, "eval_loss": 0.38625067472457886, "eval_runtime": 11.4441, "eval_samples_per_second": 21.845, "eval_steps_per_second": 2.796, "step": 107 }, { "epoch": 1.728, "grad_norm": 37.06383514404297, "learning_rate": 9.016393442622952e-06, "loss": 0.8669, "step": 108 }, { "epoch": 1.728, "eval_accuracy": 0.804, "eval_loss": 0.388757586479187, "eval_runtime": 11.59, "eval_samples_per_second": 21.57, "eval_steps_per_second": 2.761, "step": 108 }, { "epoch": 1.744, "grad_norm": 19.463485717773438, "learning_rate": 8.60655737704918e-06, "loss": 0.4496, "step": 109 }, { "epoch": 1.744, "eval_accuracy": 0.82, "eval_loss": 0.3835999667644501, "eval_runtime": 11.3837, "eval_samples_per_second": 21.961, "eval_steps_per_second": 2.811, "step": 109 }, { "epoch": 1.76, "grad_norm": 9.631560325622559, "learning_rate": 8.196721311475409e-06, "loss": 0.3514, "step": 110 }, { "epoch": 1.76, "eval_accuracy": 0.804, "eval_loss": 0.3807854652404785, "eval_runtime": 11.3909, "eval_samples_per_second": 21.947, "eval_steps_per_second": 2.809, "step": 110 }, { "epoch": 1.776, "grad_norm": 15.87090015411377, "learning_rate": 7.78688524590164e-06, "loss": 0.2494, "step": 111 }, { "epoch": 1.776, "eval_accuracy": 0.784, "eval_loss": 0.39890381693840027, "eval_runtime": 11.3662, "eval_samples_per_second": 21.995, "eval_steps_per_second": 2.815, "step": 111 }, { "epoch": 1.792, "grad_norm": 13.411320686340332, "learning_rate": 7.3770491803278695e-06, "loss": 0.5852, "step": 112 }, { "epoch": 1.792, "eval_accuracy": 0.812, "eval_loss": 0.37704914808273315, "eval_runtime": 11.496, "eval_samples_per_second": 21.747, "eval_steps_per_second": 2.784, "step": 112 }, { "epoch": 1.808, "grad_norm": 10.2308931350708, "learning_rate": 6.967213114754098e-06, "loss": 0.2353, "step": 113 }, { "epoch": 1.808, "eval_accuracy": 0.804, "eval_loss": 0.3881246745586395, "eval_runtime": 11.6252, "eval_samples_per_second": 21.505, "eval_steps_per_second": 2.753, "step": 113 }, { "epoch": 1.8239999999999998, "grad_norm": 35.862152099609375, "learning_rate": 6.557377049180328e-06, "loss": 0.347, "step": 114 }, { "epoch": 1.8239999999999998, "eval_accuracy": 0.82, "eval_loss": 0.38348227739334106, "eval_runtime": 11.3665, "eval_samples_per_second": 21.995, "eval_steps_per_second": 2.815, "step": 114 }, { "epoch": 1.8399999999999999, "grad_norm": 16.910600662231445, "learning_rate": 6.147540983606558e-06, "loss": 0.5338, "step": 115 }, { "epoch": 1.8399999999999999, "eval_accuracy": 0.788, "eval_loss": 0.39259979128837585, "eval_runtime": 11.3572, "eval_samples_per_second": 22.013, "eval_steps_per_second": 2.818, "step": 115 }, { "epoch": 1.8559999999999999, "grad_norm": 14.902595520019531, "learning_rate": 5.737704918032787e-06, "loss": 0.3718, "step": 116 }, { "epoch": 1.8559999999999999, "eval_accuracy": 0.812, "eval_loss": 0.37997427582740784, "eval_runtime": 11.3538, "eval_samples_per_second": 22.019, "eval_steps_per_second": 2.818, "step": 116 }, { "epoch": 1.8719999999999999, "grad_norm": 13.893790245056152, "learning_rate": 5.327868852459016e-06, "loss": 0.1954, "step": 117 }, { "epoch": 1.8719999999999999, "eval_accuracy": 0.82, "eval_loss": 0.3953521251678467, "eval_runtime": 11.6285, "eval_samples_per_second": 21.499, "eval_steps_per_second": 2.752, "step": 117 }, { "epoch": 1.888, "grad_norm": 20.8792667388916, "learning_rate": 4.918032786885246e-06, "loss": 0.3679, "step": 118 }, { "epoch": 1.888, "eval_accuracy": 0.788, "eval_loss": 0.4041662812232971, "eval_runtime": 11.7007, "eval_samples_per_second": 21.366, "eval_steps_per_second": 2.735, "step": 118 }, { "epoch": 1.904, "grad_norm": 10.44711685180664, "learning_rate": 4.508196721311476e-06, "loss": 0.2971, "step": 119 }, { "epoch": 1.904, "eval_accuracy": 0.796, "eval_loss": 0.39307668805122375, "eval_runtime": 11.5223, "eval_samples_per_second": 21.697, "eval_steps_per_second": 2.777, "step": 119 }, { "epoch": 1.92, "grad_norm": 17.487539291381836, "learning_rate": 4.098360655737704e-06, "loss": 0.2742, "step": 120 }, { "epoch": 1.92, "eval_accuracy": 0.804, "eval_loss": 0.38854384422302246, "eval_runtime": 11.3276, "eval_samples_per_second": 22.07, "eval_steps_per_second": 2.825, "step": 120 }, { "epoch": 1.936, "grad_norm": 33.492523193359375, "learning_rate": 3.6885245901639347e-06, "loss": 0.5166, "step": 121 }, { "epoch": 1.936, "eval_accuracy": 0.776, "eval_loss": 0.39685389399528503, "eval_runtime": 11.3571, "eval_samples_per_second": 22.013, "eval_steps_per_second": 2.818, "step": 121 }, { "epoch": 1.952, "grad_norm": 16.691682815551758, "learning_rate": 3.278688524590164e-06, "loss": 0.2628, "step": 122 }, { "epoch": 1.952, "eval_accuracy": 0.796, "eval_loss": 0.39687633514404297, "eval_runtime": 11.3028, "eval_samples_per_second": 22.118, "eval_steps_per_second": 2.831, "step": 122 }, { "epoch": 1.968, "grad_norm": 6.603124618530273, "learning_rate": 2.8688524590163937e-06, "loss": 0.2998, "step": 123 }, { "epoch": 1.968, "eval_accuracy": 0.824, "eval_loss": 0.3869865834712982, "eval_runtime": 11.2907, "eval_samples_per_second": 22.142, "eval_steps_per_second": 2.834, "step": 123 }, { "epoch": 1.984, "grad_norm": 20.69419288635254, "learning_rate": 2.459016393442623e-06, "loss": 0.3104, "step": 124 }, { "epoch": 1.984, "eval_accuracy": 0.824, "eval_loss": 0.38347548246383667, "eval_runtime": 11.3038, "eval_samples_per_second": 22.117, "eval_steps_per_second": 2.831, "step": 124 }, { "epoch": 1.984, "step": 124, "total_flos": 1.6196776411267072e+16, "train_loss": 0.6168801505719462, "train_runtime": 1813.1247, "train_samples_per_second": 1.103, "train_steps_per_second": 0.068 } ], "logging_steps": 1, "max_steps": 124, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 1.6196776411267072e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }