{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998717955292536, "eval_steps": 400, "global_step": 6093, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004923051676658069, "grad_norm": 52.76170349121094, "learning_rate": 2.9508196721311474e-06, "loss": 10.8937, "num_input_tokens_seen": 393216, "step": 3 }, { "epoch": 0.0009846103353316137, "grad_norm": 28.751510620117188, "learning_rate": 5.901639344262295e-06, "loss": 10.5303, "num_input_tokens_seen": 786432, "step": 6 }, { "epoch": 0.0014769155029974205, "grad_norm": 11.043315887451172, "learning_rate": 8.852459016393442e-06, "loss": 9.5849, "num_input_tokens_seen": 1179648, "step": 9 }, { "epoch": 0.0019692206706632275, "grad_norm": 6.976105213165283, "learning_rate": 1.180327868852459e-05, "loss": 9.0558, "num_input_tokens_seen": 1572864, "step": 12 }, { "epoch": 0.0024615258383290342, "grad_norm": 4.415246486663818, "learning_rate": 1.4754098360655736e-05, "loss": 8.6827, "num_input_tokens_seen": 1966080, "step": 15 }, { "epoch": 0.002953831005994841, "grad_norm": 2.8027164936065674, "learning_rate": 1.7704918032786883e-05, "loss": 8.3359, "num_input_tokens_seen": 2359296, "step": 18 }, { "epoch": 0.0034461361736606477, "grad_norm": 1.793583631515503, "learning_rate": 2.065573770491803e-05, "loss": 8.0831, "num_input_tokens_seen": 2752512, "step": 21 }, { "epoch": 0.003938441341326455, "grad_norm": 1.6349351406097412, "learning_rate": 2.360655737704918e-05, "loss": 7.8729, "num_input_tokens_seen": 3145728, "step": 24 }, { "epoch": 0.004430746508992262, "grad_norm": 1.0413860082626343, "learning_rate": 2.6557377049180323e-05, "loss": 7.7278, "num_input_tokens_seen": 3538944, "step": 27 }, { "epoch": 0.0049230516766580685, "grad_norm": 0.8472433090209961, "learning_rate": 2.950819672131147e-05, "loss": 7.5833, "num_input_tokens_seen": 3932160, "step": 30 }, { "epoch": 0.005415356844323875, "grad_norm": 0.8556374907493591, "learning_rate": 3.245901639344262e-05, "loss": 7.4282, "num_input_tokens_seen": 4325376, "step": 33 }, { "epoch": 0.005907662011989682, "grad_norm": 0.7362769842147827, "learning_rate": 3.540983606557377e-05, "loss": 7.3597, "num_input_tokens_seen": 4718592, "step": 36 }, { "epoch": 0.006399967179655489, "grad_norm": 0.5901055335998535, "learning_rate": 3.836065573770491e-05, "loss": 7.2626, "num_input_tokens_seen": 5111808, "step": 39 }, { "epoch": 0.0068922723473212955, "grad_norm": 0.6458311676979065, "learning_rate": 4.131147540983606e-05, "loss": 7.2543, "num_input_tokens_seen": 5505024, "step": 42 }, { "epoch": 0.007384577514987102, "grad_norm": 0.5546324253082275, "learning_rate": 4.4262295081967207e-05, "loss": 7.2457, "num_input_tokens_seen": 5898240, "step": 45 }, { "epoch": 0.00787688268265291, "grad_norm": 0.5260677337646484, "learning_rate": 4.721311475409836e-05, "loss": 7.1727, "num_input_tokens_seen": 6291456, "step": 48 }, { "epoch": 0.008369187850318717, "grad_norm": 0.55791175365448, "learning_rate": 5.01639344262295e-05, "loss": 7.1299, "num_input_tokens_seen": 6684672, "step": 51 }, { "epoch": 0.008861493017984523, "grad_norm": 0.5637151002883911, "learning_rate": 5.3114754098360647e-05, "loss": 7.1604, "num_input_tokens_seen": 7077888, "step": 54 }, { "epoch": 0.00935379818565033, "grad_norm": 0.54989093542099, "learning_rate": 5.60655737704918e-05, "loss": 7.0051, "num_input_tokens_seen": 7471104, "step": 57 }, { "epoch": 0.009846103353316137, "grad_norm": 0.46848881244659424, "learning_rate": 5.901639344262294e-05, "loss": 7.02, "num_input_tokens_seen": 7864320, "step": 60 }, { "epoch": 0.010338408520981944, "grad_norm": 0.4989602565765381, "learning_rate": 6.19672131147541e-05, "loss": 6.9589, "num_input_tokens_seen": 8257536, "step": 63 }, { "epoch": 0.01083071368864775, "grad_norm": 0.5755507349967957, "learning_rate": 6.491803278688524e-05, "loss": 7.008, "num_input_tokens_seen": 8650752, "step": 66 }, { "epoch": 0.011323018856313557, "grad_norm": 0.543915867805481, "learning_rate": 6.786885245901639e-05, "loss": 6.9092, "num_input_tokens_seen": 9043968, "step": 69 }, { "epoch": 0.011815324023979364, "grad_norm": 0.5782525539398193, "learning_rate": 7.081967213114753e-05, "loss": 6.8488, "num_input_tokens_seen": 9437184, "step": 72 }, { "epoch": 0.01230762919164517, "grad_norm": 0.5549654364585876, "learning_rate": 7.377049180327868e-05, "loss": 6.8617, "num_input_tokens_seen": 9830400, "step": 75 }, { "epoch": 0.012799934359310977, "grad_norm": 0.4978010058403015, "learning_rate": 7.672131147540982e-05, "loss": 6.8245, "num_input_tokens_seen": 10223616, "step": 78 }, { "epoch": 0.013292239526976784, "grad_norm": 0.5749168395996094, "learning_rate": 7.967213114754097e-05, "loss": 6.8257, "num_input_tokens_seen": 10616832, "step": 81 }, { "epoch": 0.013784544694642591, "grad_norm": 0.4636499285697937, "learning_rate": 8.262295081967212e-05, "loss": 6.829, "num_input_tokens_seen": 11010048, "step": 84 }, { "epoch": 0.014276849862308398, "grad_norm": 0.5625278949737549, "learning_rate": 8.557377049180327e-05, "loss": 6.8155, "num_input_tokens_seen": 11403264, "step": 87 }, { "epoch": 0.014769155029974204, "grad_norm": 0.5715211629867554, "learning_rate": 8.852459016393441e-05, "loss": 6.8136, "num_input_tokens_seen": 11796480, "step": 90 }, { "epoch": 0.015261460197640011, "grad_norm": 0.6819092035293579, "learning_rate": 9.147540983606556e-05, "loss": 6.7438, "num_input_tokens_seen": 12189696, "step": 93 }, { "epoch": 0.01575376536530582, "grad_norm": 0.5896216034889221, "learning_rate": 9.442622950819672e-05, "loss": 6.7681, "num_input_tokens_seen": 12582912, "step": 96 }, { "epoch": 0.016246070532971627, "grad_norm": 0.6056619882583618, "learning_rate": 9.737704918032786e-05, "loss": 6.7136, "num_input_tokens_seen": 12976128, "step": 99 }, { "epoch": 0.016738375700637433, "grad_norm": 0.6384982466697693, "learning_rate": 0.000100327868852459, "loss": 6.7073, "num_input_tokens_seen": 13369344, "step": 102 }, { "epoch": 0.01723068086830324, "grad_norm": 0.5903695821762085, "learning_rate": 0.00010327868852459015, "loss": 6.7011, "num_input_tokens_seen": 13762560, "step": 105 }, { "epoch": 0.017722986035969047, "grad_norm": 0.5709877014160156, "learning_rate": 0.00010622950819672129, "loss": 6.7258, "num_input_tokens_seen": 14155776, "step": 108 }, { "epoch": 0.018215291203634854, "grad_norm": 0.621760904788971, "learning_rate": 0.00010918032786885245, "loss": 6.6358, "num_input_tokens_seen": 14548992, "step": 111 }, { "epoch": 0.01870759637130066, "grad_norm": 0.5689477920532227, "learning_rate": 0.0001121311475409836, "loss": 6.608, "num_input_tokens_seen": 14942208, "step": 114 }, { "epoch": 0.019199901538966467, "grad_norm": 0.5650547742843628, "learning_rate": 0.00011508196721311474, "loss": 6.6641, "num_input_tokens_seen": 15335424, "step": 117 }, { "epoch": 0.019692206706632274, "grad_norm": 0.6167349219322205, "learning_rate": 0.00011803278688524588, "loss": 6.5976, "num_input_tokens_seen": 15728640, "step": 120 }, { "epoch": 0.02018451187429808, "grad_norm": 0.5514015555381775, "learning_rate": 0.00012098360655737703, "loss": 6.5955, "num_input_tokens_seen": 16121856, "step": 123 }, { "epoch": 0.020676817041963887, "grad_norm": 0.8216041326522827, "learning_rate": 0.0001239344262295082, "loss": 6.6051, "num_input_tokens_seen": 16515072, "step": 126 }, { "epoch": 0.021169122209629694, "grad_norm": 0.6293138265609741, "learning_rate": 0.00012688524590163933, "loss": 6.6251, "num_input_tokens_seen": 16908288, "step": 129 }, { "epoch": 0.0216614273772955, "grad_norm": 0.6225654482841492, "learning_rate": 0.0001298360655737705, "loss": 6.5213, "num_input_tokens_seen": 17301504, "step": 132 }, { "epoch": 0.022153732544961308, "grad_norm": 0.7211737632751465, "learning_rate": 0.00013278688524590162, "loss": 6.5802, "num_input_tokens_seen": 17694720, "step": 135 }, { "epoch": 0.022646037712627114, "grad_norm": 0.9310851097106934, "learning_rate": 0.00013573770491803278, "loss": 6.5739, "num_input_tokens_seen": 18087936, "step": 138 }, { "epoch": 0.02313834288029292, "grad_norm": 0.9796333909034729, "learning_rate": 0.00013868852459016394, "loss": 6.6066, "num_input_tokens_seen": 18481152, "step": 141 }, { "epoch": 0.023630648047958728, "grad_norm": 0.9219833016395569, "learning_rate": 0.00014163934426229507, "loss": 6.5536, "num_input_tokens_seen": 18874368, "step": 144 }, { "epoch": 0.024122953215624535, "grad_norm": 0.9721167087554932, "learning_rate": 0.00014459016393442622, "loss": 6.5368, "num_input_tokens_seen": 19267584, "step": 147 }, { "epoch": 0.02461525838329034, "grad_norm": 0.9795536994934082, "learning_rate": 0.00014754098360655736, "loss": 6.5266, "num_input_tokens_seen": 19660800, "step": 150 }, { "epoch": 0.025107563550956148, "grad_norm": 0.7936388850212097, "learning_rate": 0.0001504918032786885, "loss": 6.5445, "num_input_tokens_seen": 20054016, "step": 153 }, { "epoch": 0.025599868718621955, "grad_norm": 0.6322927474975586, "learning_rate": 0.00015344262295081964, "loss": 6.5434, "num_input_tokens_seen": 20447232, "step": 156 }, { "epoch": 0.02609217388628776, "grad_norm": 0.6971911191940308, "learning_rate": 0.0001563934426229508, "loss": 6.5021, "num_input_tokens_seen": 20840448, "step": 159 }, { "epoch": 0.02658447905395357, "grad_norm": 0.6323833465576172, "learning_rate": 0.00015934426229508193, "loss": 6.4679, "num_input_tokens_seen": 21233664, "step": 162 }, { "epoch": 0.027076784221619375, "grad_norm": 0.590775728225708, "learning_rate": 0.00016229508196721312, "loss": 6.4743, "num_input_tokens_seen": 21626880, "step": 165 }, { "epoch": 0.027569089389285182, "grad_norm": 0.8365766406059265, "learning_rate": 0.00016524590163934425, "loss": 6.4743, "num_input_tokens_seen": 22020096, "step": 168 }, { "epoch": 0.02806139455695099, "grad_norm": 0.9162667393684387, "learning_rate": 0.0001681967213114754, "loss": 6.4554, "num_input_tokens_seen": 22413312, "step": 171 }, { "epoch": 0.028553699724616795, "grad_norm": 0.8129913210868835, "learning_rate": 0.00017114754098360654, "loss": 6.4733, "num_input_tokens_seen": 22806528, "step": 174 }, { "epoch": 0.029046004892282602, "grad_norm": 1.118039608001709, "learning_rate": 0.0001740983606557377, "loss": 6.4847, "num_input_tokens_seen": 23199744, "step": 177 }, { "epoch": 0.02953831005994841, "grad_norm": 1.1731743812561035, "learning_rate": 0.00017704918032786883, "loss": 6.469, "num_input_tokens_seen": 23592960, "step": 180 }, { "epoch": 0.030030615227614216, "grad_norm": 0.8692623972892761, "learning_rate": 0.00017999999999999998, "loss": 6.4902, "num_input_tokens_seen": 23986176, "step": 183 }, { "epoch": 0.030522920395280023, "grad_norm": 0.924551784992218, "learning_rate": 0.00018295081967213112, "loss": 6.4257, "num_input_tokens_seen": 24379392, "step": 186 }, { "epoch": 0.031015225562945833, "grad_norm": 0.7523306012153625, "learning_rate": 0.00018590163934426227, "loss": 6.4386, "num_input_tokens_seen": 24772608, "step": 189 }, { "epoch": 0.03150753073061164, "grad_norm": 0.7796934843063354, "learning_rate": 0.00018885245901639343, "loss": 6.4095, "num_input_tokens_seen": 25165824, "step": 192 }, { "epoch": 0.03199983589827744, "grad_norm": 1.026790738105774, "learning_rate": 0.0001918032786885246, "loss": 6.3698, "num_input_tokens_seen": 25559040, "step": 195 }, { "epoch": 0.03249214106594325, "grad_norm": 0.9334837794303894, "learning_rate": 0.00019475409836065572, "loss": 6.3773, "num_input_tokens_seen": 25952256, "step": 198 }, { "epoch": 0.032984446233609056, "grad_norm": 1.4499056339263916, "learning_rate": 0.00019770491803278688, "loss": 6.4132, "num_input_tokens_seen": 26345472, "step": 201 }, { "epoch": 0.03347675140127487, "grad_norm": 1.2045769691467285, "learning_rate": 0.000200655737704918, "loss": 6.4163, "num_input_tokens_seen": 26738688, "step": 204 }, { "epoch": 0.03396905656894067, "grad_norm": 1.3456003665924072, "learning_rate": 0.00020360655737704917, "loss": 6.4426, "num_input_tokens_seen": 27131904, "step": 207 }, { "epoch": 0.03446136173660648, "grad_norm": 1.1916530132293701, "learning_rate": 0.0002065573770491803, "loss": 6.4234, "num_input_tokens_seen": 27525120, "step": 210 }, { "epoch": 0.03495366690427228, "grad_norm": 1.1489585638046265, "learning_rate": 0.00020950819672131146, "loss": 6.4218, "num_input_tokens_seen": 27918336, "step": 213 }, { "epoch": 0.035445972071938094, "grad_norm": 0.6667603850364685, "learning_rate": 0.00021245901639344259, "loss": 6.3939, "num_input_tokens_seen": 28311552, "step": 216 }, { "epoch": 0.0359382772396039, "grad_norm": 0.8607211112976074, "learning_rate": 0.00021540983606557374, "loss": 6.3799, "num_input_tokens_seen": 28704768, "step": 219 }, { "epoch": 0.03643058240726971, "grad_norm": 0.837247908115387, "learning_rate": 0.0002183606557377049, "loss": 6.3738, "num_input_tokens_seen": 29097984, "step": 222 }, { "epoch": 0.03692288757493551, "grad_norm": 0.7893315553665161, "learning_rate": 0.00022131147540983606, "loss": 6.3407, "num_input_tokens_seen": 29491200, "step": 225 }, { "epoch": 0.03741519274260132, "grad_norm": 0.8632616400718689, "learning_rate": 0.0002242622950819672, "loss": 6.3659, "num_input_tokens_seen": 29884416, "step": 228 }, { "epoch": 0.037907497910267124, "grad_norm": 0.8238740563392639, "learning_rate": 0.00022721311475409835, "loss": 6.3464, "num_input_tokens_seen": 30277632, "step": 231 }, { "epoch": 0.038399803077932934, "grad_norm": 0.9048452377319336, "learning_rate": 0.00023016393442622948, "loss": 6.3345, "num_input_tokens_seen": 30670848, "step": 234 }, { "epoch": 0.03889210824559874, "grad_norm": 0.8803463578224182, "learning_rate": 0.00023311475409836064, "loss": 6.3739, "num_input_tokens_seen": 31064064, "step": 237 }, { "epoch": 0.03938441341326455, "grad_norm": 0.9749881029129028, "learning_rate": 0.00023606557377049177, "loss": 6.3295, "num_input_tokens_seen": 31457280, "step": 240 }, { "epoch": 0.03987671858093035, "grad_norm": 0.903581976890564, "learning_rate": 0.00023901639344262293, "loss": 6.3686, "num_input_tokens_seen": 31850496, "step": 243 }, { "epoch": 0.04036902374859616, "grad_norm": 0.7235903143882751, "learning_rate": 0.00024196721311475406, "loss": 6.3202, "num_input_tokens_seen": 32243712, "step": 246 }, { "epoch": 0.040861328916261964, "grad_norm": 0.9854725003242493, "learning_rate": 0.0002449180327868852, "loss": 6.35, "num_input_tokens_seen": 32636928, "step": 249 }, { "epoch": 0.041353634083927775, "grad_norm": 1.2127262353897095, "learning_rate": 0.0002478688524590164, "loss": 6.3013, "num_input_tokens_seen": 33030144, "step": 252 }, { "epoch": 0.041845939251593585, "grad_norm": 1.090453028678894, "learning_rate": 0.00025081967213114756, "loss": 6.3094, "num_input_tokens_seen": 33423360, "step": 255 }, { "epoch": 0.04233824441925939, "grad_norm": 1.2343584299087524, "learning_rate": 0.00025377049180327866, "loss": 6.3217, "num_input_tokens_seen": 33816576, "step": 258 }, { "epoch": 0.0428305495869252, "grad_norm": 1.1911264657974243, "learning_rate": 0.0002567213114754098, "loss": 6.2728, "num_input_tokens_seen": 34209792, "step": 261 }, { "epoch": 0.043322854754591, "grad_norm": 1.042060375213623, "learning_rate": 0.000259672131147541, "loss": 6.3082, "num_input_tokens_seen": 34603008, "step": 264 }, { "epoch": 0.04381515992225681, "grad_norm": 0.9093752503395081, "learning_rate": 0.0002626229508196721, "loss": 6.266, "num_input_tokens_seen": 34996224, "step": 267 }, { "epoch": 0.044307465089922615, "grad_norm": 1.0649739503860474, "learning_rate": 0.00026557377049180324, "loss": 6.2528, "num_input_tokens_seen": 35389440, "step": 270 }, { "epoch": 0.044799770257588425, "grad_norm": 1.3432960510253906, "learning_rate": 0.0002685245901639344, "loss": 6.2903, "num_input_tokens_seen": 35782656, "step": 273 }, { "epoch": 0.04529207542525423, "grad_norm": 1.2440747022628784, "learning_rate": 0.00027147540983606556, "loss": 6.274, "num_input_tokens_seen": 36175872, "step": 276 }, { "epoch": 0.04578438059292004, "grad_norm": 1.3492511510849, "learning_rate": 0.00027442622950819666, "loss": 6.2834, "num_input_tokens_seen": 36569088, "step": 279 }, { "epoch": 0.04627668576058584, "grad_norm": 1.2173079252243042, "learning_rate": 0.00027737704918032787, "loss": 6.2886, "num_input_tokens_seen": 36962304, "step": 282 }, { "epoch": 0.04676899092825165, "grad_norm": 1.230082631111145, "learning_rate": 0.00028032786885245903, "loss": 6.2568, "num_input_tokens_seen": 37355520, "step": 285 }, { "epoch": 0.047261296095917456, "grad_norm": 1.3385438919067383, "learning_rate": 0.00028327868852459013, "loss": 6.2438, "num_input_tokens_seen": 37748736, "step": 288 }, { "epoch": 0.047753601263583266, "grad_norm": 1.4713610410690308, "learning_rate": 0.0002862295081967213, "loss": 6.307, "num_input_tokens_seen": 38141952, "step": 291 }, { "epoch": 0.04824590643124907, "grad_norm": 1.210629940032959, "learning_rate": 0.00028918032786885245, "loss": 6.2704, "num_input_tokens_seen": 38535168, "step": 294 }, { "epoch": 0.04873821159891488, "grad_norm": 1.1790496110916138, "learning_rate": 0.00029213114754098355, "loss": 6.1878, "num_input_tokens_seen": 38928384, "step": 297 }, { "epoch": 0.04923051676658068, "grad_norm": 1.0269746780395508, "learning_rate": 0.0002950819672131147, "loss": 6.279, "num_input_tokens_seen": 39321600, "step": 300 }, { "epoch": 0.04972282193424649, "grad_norm": 1.2352854013442993, "learning_rate": 0.00029803278688524587, "loss": 6.2379, "num_input_tokens_seen": 39714816, "step": 303 }, { "epoch": 0.050215127101912296, "grad_norm": 1.5501960515975952, "learning_rate": 0.00029950940277884624, "loss": 6.2097, "num_input_tokens_seen": 40108032, "step": 306 }, { "epoch": 0.05070743226957811, "grad_norm": 1.5629328489303589, "learning_rate": 0.0002980519274494139, "loss": 6.1982, "num_input_tokens_seen": 40501248, "step": 309 }, { "epoch": 0.05119973743724391, "grad_norm": 1.323115587234497, "learning_rate": 0.0002966155242578669, "loss": 6.2333, "num_input_tokens_seen": 40894464, "step": 312 }, { "epoch": 0.05169204260490972, "grad_norm": 2.0143845081329346, "learning_rate": 0.00029519969028245457, "loss": 6.2445, "num_input_tokens_seen": 41287680, "step": 315 }, { "epoch": 0.05218434777257552, "grad_norm": 1.3097857236862183, "learning_rate": 0.0002938039392468745, "loss": 6.188, "num_input_tokens_seen": 41680896, "step": 318 }, { "epoch": 0.052676652940241334, "grad_norm": 1.5996229648590088, "learning_rate": 0.000292427800818576, "loss": 6.1792, "num_input_tokens_seen": 42074112, "step": 321 }, { "epoch": 0.05316895810790714, "grad_norm": 1.2864151000976562, "learning_rate": 0.000291070819942883, "loss": 6.1811, "num_input_tokens_seen": 42467328, "step": 324 }, { "epoch": 0.05366126327557295, "grad_norm": 1.683950662612915, "learning_rate": 0.00028973255621079304, "loss": 6.1795, "num_input_tokens_seen": 42860544, "step": 327 }, { "epoch": 0.05415356844323875, "grad_norm": 1.5458773374557495, "learning_rate": 0.0002884125832584601, "loss": 6.1799, "num_input_tokens_seen": 43253760, "step": 330 }, { "epoch": 0.05464587361090456, "grad_norm": 1.4698423147201538, "learning_rate": 0.0002871104881964997, "loss": 6.1903, "num_input_tokens_seen": 43646976, "step": 333 }, { "epoch": 0.055138178778570364, "grad_norm": 1.4713280200958252, "learning_rate": 0.0002858258710673835, "loss": 6.2024, "num_input_tokens_seen": 44040192, "step": 336 }, { "epoch": 0.055630483946236174, "grad_norm": 1.2734639644622803, "learning_rate": 0.000284558344329302, "loss": 6.1246, "num_input_tokens_seen": 44433408, "step": 339 }, { "epoch": 0.05612278911390198, "grad_norm": 1.0039596557617188, "learning_rate": 0.00028330753236498467, "loss": 6.1914, "num_input_tokens_seen": 44826624, "step": 342 }, { "epoch": 0.05661509428156779, "grad_norm": 1.0492082834243774, "learning_rate": 0.0002820730710140625, "loss": 6.1344, "num_input_tokens_seen": 45219840, "step": 345 }, { "epoch": 0.05710739944923359, "grad_norm": 0.9763434529304504, "learning_rate": 0.0002808546071276517, "loss": 6.159, "num_input_tokens_seen": 45613056, "step": 348 }, { "epoch": 0.0575997046168994, "grad_norm": 0.7468030452728271, "learning_rate": 0.00027965179814392076, "loss": 6.1567, "num_input_tokens_seen": 46006272, "step": 351 }, { "epoch": 0.058092009784565204, "grad_norm": 0.8997909426689148, "learning_rate": 0.0002784643116834829, "loss": 6.1655, "num_input_tokens_seen": 46399488, "step": 354 }, { "epoch": 0.058584314952231015, "grad_norm": 0.9638291001319885, "learning_rate": 0.00027729182516352875, "loss": 6.0947, "num_input_tokens_seen": 46792704, "step": 357 }, { "epoch": 0.05907662011989682, "grad_norm": 0.84872967004776, "learning_rate": 0.0002761340254296815, "loss": 6.1313, "num_input_tokens_seen": 47185920, "step": 360 }, { "epoch": 0.05956892528756263, "grad_norm": 2.2599406242370605, "learning_rate": 0.0002749906084046213, "loss": 6.1251, "num_input_tokens_seen": 47579136, "step": 363 }, { "epoch": 0.06006123045522843, "grad_norm": 2.0698065757751465, "learning_rate": 0.00027386127875258305, "loss": 6.1651, "num_input_tokens_seen": 47972352, "step": 366 }, { "epoch": 0.06055353562289424, "grad_norm": 0.8866053223609924, "learning_rate": 0.0002727457495588868, "loss": 6.1123, "num_input_tokens_seen": 48365568, "step": 369 }, { "epoch": 0.061045840790560045, "grad_norm": 1.2379621267318726, "learning_rate": 0.0002716437420237123, "loss": 6.1493, "num_input_tokens_seen": 48758784, "step": 372 }, { "epoch": 0.061538145958225855, "grad_norm": 1.01674222946167, "learning_rate": 0.00027055498516937365, "loss": 6.1375, "num_input_tokens_seen": 49152000, "step": 375 }, { "epoch": 0.062030451125891665, "grad_norm": 0.9889538884162903, "learning_rate": 0.0002694792155603983, "loss": 6.0978, "num_input_tokens_seen": 49545216, "step": 378 }, { "epoch": 0.06252275629355747, "grad_norm": 0.6733151078224182, "learning_rate": 0.00026841617703575205, "loss": 6.134, "num_input_tokens_seen": 49938432, "step": 381 }, { "epoch": 0.06301506146122328, "grad_norm": 0.9297388792037964, "learning_rate": 0.00026736562045259293, "loss": 6.1135, "num_input_tokens_seen": 50331648, "step": 384 }, { "epoch": 0.06350736662888909, "grad_norm": 0.6994606256484985, "learning_rate": 0.0002663273034409719, "loss": 6.0961, "num_input_tokens_seen": 50724864, "step": 387 }, { "epoch": 0.06399967179655489, "grad_norm": 0.9161903262138367, "learning_rate": 0.0002653009901689313, "loss": 6.1339, "num_input_tokens_seen": 51118080, "step": 390 }, { "epoch": 0.0644919769642207, "grad_norm": 0.8527827262878418, "learning_rate": 0.000264286451117485, "loss": 6.0692, "num_input_tokens_seen": 51511296, "step": 393 }, { "epoch": 0.0649842821318865, "grad_norm": 0.7189248204231262, "learning_rate": 0.0002632834628649923, "loss": 6.0884, "num_input_tokens_seen": 51904512, "step": 396 }, { "epoch": 0.06547658729955232, "grad_norm": 0.9880132079124451, "learning_rate": 0.00026229180788046543, "loss": 6.0703, "num_input_tokens_seen": 52297728, "step": 399 }, { "epoch": 0.06564068902210758, "eval_accuracy": 0.1701481843347989, "eval_loss": 6.233191013336182, "eval_runtime": 110.6814, "eval_samples_per_second": 2.71, "eval_steps_per_second": 1.355, "num_input_tokens_seen": 52428800, "step": 400 }, { "epoch": 0.06596889246721811, "grad_norm": 1.2372077703475952, "learning_rate": 0.0002613112743253766, "loss": 6.0785, "num_input_tokens_seen": 52690944, "step": 402 }, { "epoch": 0.06646119763488392, "grad_norm": 1.0689289569854736, "learning_rate": 0.0002603416558635551, "loss": 6.0829, "num_input_tokens_seen": 53084160, "step": 405 }, { "epoch": 0.06695350280254973, "grad_norm": 1.0362907648086548, "learning_rate": 0.0002593827514787864, "loss": 6.0684, "num_input_tokens_seen": 53477376, "step": 408 }, { "epoch": 0.06744580797021554, "grad_norm": 0.9470035433769226, "learning_rate": 0.00025843436529974725, "loss": 6.0331, "num_input_tokens_seen": 53870592, "step": 411 }, { "epoch": 0.06793811313788134, "grad_norm": 0.9663071036338806, "learning_rate": 0.00025749630643193106, "loss": 6.0694, "num_input_tokens_seen": 54263808, "step": 414 }, { "epoch": 0.06843041830554715, "grad_norm": 1.1301859617233276, "learning_rate": 0.0002565683887962357, "loss": 6.0433, "num_input_tokens_seen": 54657024, "step": 417 }, { "epoch": 0.06892272347321296, "grad_norm": 1.1818194389343262, "learning_rate": 0.000255650430973904, "loss": 6.0824, "num_input_tokens_seen": 55050240, "step": 420 }, { "epoch": 0.06941502864087877, "grad_norm": 1.000831127166748, "learning_rate": 0.00025474225605752297, "loss": 6.0066, "num_input_tokens_seen": 55443456, "step": 423 }, { "epoch": 0.06990733380854457, "grad_norm": 1.0827128887176514, "learning_rate": 0.00025384369150780535, "loss": 6.0375, "num_input_tokens_seen": 55836672, "step": 426 }, { "epoch": 0.07039963897621038, "grad_norm": 0.8842981457710266, "learning_rate": 0.00025295456901588867, "loss": 6.031, "num_input_tokens_seen": 56229888, "step": 429 }, { "epoch": 0.07089194414387619, "grad_norm": 0.734250545501709, "learning_rate": 0.00025207472437090286, "loss": 6.0575, "num_input_tokens_seen": 56623104, "step": 432 }, { "epoch": 0.071384249311542, "grad_norm": 1.13999605178833, "learning_rate": 0.0002512039973325704, "loss": 6.0268, "num_input_tokens_seen": 57016320, "step": 435 }, { "epoch": 0.0718765544792078, "grad_norm": 1.5524462461471558, "learning_rate": 0.0002503422315086136, "loss": 6.0097, "num_input_tokens_seen": 57409536, "step": 438 }, { "epoch": 0.0723688596468736, "grad_norm": 1.8856819868087769, "learning_rate": 0.0002494892742367568, "loss": 6.0882, "num_input_tokens_seen": 57802752, "step": 441 }, { "epoch": 0.07286116481453941, "grad_norm": 1.1143220663070679, "learning_rate": 0.000248644976471121, "loss": 5.9855, "num_input_tokens_seen": 58195968, "step": 444 }, { "epoch": 0.07335346998220522, "grad_norm": 1.0051558017730713, "learning_rate": 0.00024780919267281904, "loss": 6.0529, "num_input_tokens_seen": 58589184, "step": 447 }, { "epoch": 0.07384577514987102, "grad_norm": 0.9793909788131714, "learning_rate": 0.00024698178070456936, "loss": 6.0412, "num_input_tokens_seen": 58982400, "step": 450 }, { "epoch": 0.07433808031753683, "grad_norm": 1.0926892757415771, "learning_rate": 0.00024616260172915426, "loss": 6.0872, "num_input_tokens_seen": 59375616, "step": 453 }, { "epoch": 0.07483038548520264, "grad_norm": 0.8595672845840454, "learning_rate": 0.00024535152011155874, "loss": 6.004, "num_input_tokens_seen": 59768832, "step": 456 }, { "epoch": 0.07532269065286845, "grad_norm": 0.7526682615280151, "learning_rate": 0.00024454840332463316, "loss": 6.0032, "num_input_tokens_seen": 60162048, "step": 459 }, { "epoch": 0.07581499582053425, "grad_norm": 1.0615681409835815, "learning_rate": 0.00024375312185813004, "loss": 6.0034, "num_input_tokens_seen": 60555264, "step": 462 }, { "epoch": 0.07630730098820006, "grad_norm": 1.3442180156707764, "learning_rate": 0.00024296554913097476, "loss": 6.0288, "num_input_tokens_seen": 60948480, "step": 465 }, { "epoch": 0.07679960615586587, "grad_norm": 1.1294552087783813, "learning_rate": 0.00024218556140663327, "loss": 5.9879, "num_input_tokens_seen": 61341696, "step": 468 }, { "epoch": 0.07729191132353168, "grad_norm": 1.0565030574798584, "learning_rate": 0.00024141303771145015, "loss": 5.9733, "num_input_tokens_seen": 61734912, "step": 471 }, { "epoch": 0.07778421649119747, "grad_norm": 0.8977353572845459, "learning_rate": 0.00024064785975583342, "loss": 5.9904, "num_input_tokens_seen": 62128128, "step": 474 }, { "epoch": 0.07827652165886329, "grad_norm": 1.3904660940170288, "learning_rate": 0.00023988991185817037, "loss": 6.0155, "num_input_tokens_seen": 62521344, "step": 477 }, { "epoch": 0.0787688268265291, "grad_norm": 1.113961935043335, "learning_rate": 0.0002391390808713624, "loss": 5.9818, "num_input_tokens_seen": 62914560, "step": 480 }, { "epoch": 0.0792611319941949, "grad_norm": 0.8899008631706238, "learning_rate": 0.00023839525611187392, "loss": 5.9606, "num_input_tokens_seen": 63307776, "step": 483 }, { "epoch": 0.0797534371618607, "grad_norm": 0.802760660648346, "learning_rate": 0.00023765832929119373, "loss": 5.9345, "num_input_tokens_seen": 63700992, "step": 486 }, { "epoch": 0.08024574232952651, "grad_norm": 0.7717273235321045, "learning_rate": 0.00023692819444961244, "loss": 6.0008, "num_input_tokens_seen": 64094208, "step": 489 }, { "epoch": 0.08073804749719232, "grad_norm": 0.7769405245780945, "learning_rate": 0.00023620474789222436, "loss": 5.9624, "num_input_tokens_seen": 64487424, "step": 492 }, { "epoch": 0.08123035266485813, "grad_norm": 0.8420572876930237, "learning_rate": 0.00023548788812706575, "loss": 5.9664, "num_input_tokens_seen": 64880640, "step": 495 }, { "epoch": 0.08172265783252393, "grad_norm": 0.6086786389350891, "learning_rate": 0.00023477751580530627, "loss": 5.952, "num_input_tokens_seen": 65273856, "step": 498 }, { "epoch": 0.08221496300018974, "grad_norm": 0.9240081906318665, "learning_rate": 0.00023407353366341235, "loss": 5.9447, "num_input_tokens_seen": 65667072, "step": 501 }, { "epoch": 0.08270726816785555, "grad_norm": 0.9431899189949036, "learning_rate": 0.0002333758464672077, "loss": 5.9525, "num_input_tokens_seen": 66060288, "step": 504 }, { "epoch": 0.08319957333552136, "grad_norm": 0.8989746570587158, "learning_rate": 0.0002326843609577565, "loss": 5.9485, "num_input_tokens_seen": 66453504, "step": 507 }, { "epoch": 0.08369187850318717, "grad_norm": 0.6996486783027649, "learning_rate": 0.00023199898579900018, "loss": 5.9664, "num_input_tokens_seen": 66846720, "step": 510 }, { "epoch": 0.08418418367085297, "grad_norm": 0.8220193386077881, "learning_rate": 0.00023131963152708105, "loss": 5.9256, "num_input_tokens_seen": 67239936, "step": 513 }, { "epoch": 0.08467648883851878, "grad_norm": 0.8553633093833923, "learning_rate": 0.0002306462105012884, "loss": 5.9676, "num_input_tokens_seen": 67633152, "step": 516 }, { "epoch": 0.08516879400618459, "grad_norm": 0.6745681166648865, "learning_rate": 0.00022997863685656676, "loss": 5.9794, "num_input_tokens_seen": 68026368, "step": 519 }, { "epoch": 0.0856610991738504, "grad_norm": 0.638691246509552, "learning_rate": 0.00022931682645752736, "loss": 5.9337, "num_input_tokens_seen": 68419584, "step": 522 }, { "epoch": 0.0861534043415162, "grad_norm": 0.7927042841911316, "learning_rate": 0.00022866069685390685, "loss": 5.933, "num_input_tokens_seen": 68812800, "step": 525 }, { "epoch": 0.086645709509182, "grad_norm": 1.2133303880691528, "learning_rate": 0.00022801016723742026, "loss": 5.9536, "num_input_tokens_seen": 69206016, "step": 528 }, { "epoch": 0.08713801467684781, "grad_norm": 0.8164889812469482, "learning_rate": 0.00022736515839995644, "loss": 5.9201, "num_input_tokens_seen": 69599232, "step": 531 }, { "epoch": 0.08763031984451362, "grad_norm": 1.072871208190918, "learning_rate": 0.00022672559269306688, "loss": 5.9214, "num_input_tokens_seen": 69992448, "step": 534 }, { "epoch": 0.08812262501217942, "grad_norm": 1.145111322402954, "learning_rate": 0.00022609139398870132, "loss": 5.9051, "num_input_tokens_seen": 70385664, "step": 537 }, { "epoch": 0.08861493017984523, "grad_norm": 1.074684739112854, "learning_rate": 0.00022546248764114467, "loss": 5.9146, "num_input_tokens_seen": 70778880, "step": 540 }, { "epoch": 0.08910723534751104, "grad_norm": 1.3423213958740234, "learning_rate": 0.000224838800450112, "loss": 5.8908, "num_input_tokens_seen": 71172096, "step": 543 }, { "epoch": 0.08959954051517685, "grad_norm": 0.901797354221344, "learning_rate": 0.00022422026062496062, "loss": 5.8608, "num_input_tokens_seen": 71565312, "step": 546 }, { "epoch": 0.09009184568284265, "grad_norm": 1.848212480545044, "learning_rate": 0.00022360679774997895, "loss": 5.9256, "num_input_tokens_seen": 71958528, "step": 549 }, { "epoch": 0.09058415085050846, "grad_norm": 1.1397554874420166, "learning_rate": 0.00022299834275071466, "loss": 5.9315, "num_input_tokens_seen": 72351744, "step": 552 }, { "epoch": 0.09107645601817427, "grad_norm": 1.3297491073608398, "learning_rate": 0.00022239482786130492, "loss": 5.8853, "num_input_tokens_seen": 72744960, "step": 555 }, { "epoch": 0.09156876118584008, "grad_norm": 1.0274791717529297, "learning_rate": 0.00022179618659277431, "loss": 5.9317, "num_input_tokens_seen": 73138176, "step": 558 }, { "epoch": 0.09206106635350587, "grad_norm": 0.9891603589057922, "learning_rate": 0.00022120235370226617, "loss": 5.8753, "num_input_tokens_seen": 73531392, "step": 561 }, { "epoch": 0.09255337152117168, "grad_norm": 1.1428287029266357, "learning_rate": 0.00022061326516317517, "loss": 5.8625, "num_input_tokens_seen": 73924608, "step": 564 }, { "epoch": 0.0930456766888375, "grad_norm": 1.1694408655166626, "learning_rate": 0.00022002885813615086, "loss": 5.882, "num_input_tokens_seen": 74317824, "step": 567 }, { "epoch": 0.0935379818565033, "grad_norm": 0.9096398949623108, "learning_rate": 0.00021944907094094087, "loss": 5.874, "num_input_tokens_seen": 74711040, "step": 570 }, { "epoch": 0.0940302870241691, "grad_norm": 0.8736022710800171, "learning_rate": 0.00021887384302904644, "loss": 5.9158, "num_input_tokens_seen": 75104256, "step": 573 }, { "epoch": 0.09452259219183491, "grad_norm": 0.6410363912582397, "learning_rate": 0.00021830311495716224, "loss": 5.8954, "num_input_tokens_seen": 75497472, "step": 576 }, { "epoch": 0.09501489735950072, "grad_norm": 0.8881443738937378, "learning_rate": 0.00021773682836137405, "loss": 5.8734, "num_input_tokens_seen": 75890688, "step": 579 }, { "epoch": 0.09550720252716653, "grad_norm": 0.7709605097770691, "learning_rate": 0.00021717492593208875, "loss": 5.8888, "num_input_tokens_seen": 76283904, "step": 582 }, { "epoch": 0.09599950769483233, "grad_norm": 0.6798803210258484, "learning_rate": 0.00021661735138967265, "loss": 5.8759, "num_input_tokens_seen": 76677120, "step": 585 }, { "epoch": 0.09649181286249814, "grad_norm": 0.5487522482872009, "learning_rate": 0.0002160640494607739, "loss": 5.8661, "num_input_tokens_seen": 77070336, "step": 588 }, { "epoch": 0.09698411803016395, "grad_norm": 0.6725640892982483, "learning_rate": 0.00021551496585530715, "loss": 5.8954, "num_input_tokens_seen": 77463552, "step": 591 }, { "epoch": 0.09747642319782976, "grad_norm": 0.6947436928749084, "learning_rate": 0.00021497004724407818, "loss": 5.8171, "num_input_tokens_seen": 77856768, "step": 594 }, { "epoch": 0.09796872836549556, "grad_norm": 0.9774389266967773, "learning_rate": 0.00021442924123702773, "loss": 5.8996, "num_input_tokens_seen": 78249984, "step": 597 }, { "epoch": 0.09846103353316137, "grad_norm": 1.041482925415039, "learning_rate": 0.00021389249636207436, "loss": 5.8522, "num_input_tokens_seen": 78643200, "step": 600 }, { "epoch": 0.09895333870082718, "grad_norm": 0.7540446519851685, "learning_rate": 0.0002133597620445371, "loss": 5.7978, "num_input_tokens_seen": 79036416, "step": 603 }, { "epoch": 0.09944564386849299, "grad_norm": 0.7133435606956482, "learning_rate": 0.00021283098858711878, "loss": 5.8597, "num_input_tokens_seen": 79429632, "step": 606 }, { "epoch": 0.09993794903615878, "grad_norm": 0.9211872220039368, "learning_rate": 0.00021230612715043284, "loss": 5.827, "num_input_tokens_seen": 79822848, "step": 609 }, { "epoch": 0.10043025420382459, "grad_norm": 0.9481165409088135, "learning_rate": 0.00021178512973405518, "loss": 5.8291, "num_input_tokens_seen": 80216064, "step": 612 }, { "epoch": 0.1009225593714904, "grad_norm": 0.9099063873291016, "learning_rate": 0.00021126794915808552, "loss": 5.8853, "num_input_tokens_seen": 80609280, "step": 615 }, { "epoch": 0.10141486453915621, "grad_norm": 1.089982032775879, "learning_rate": 0.00021075453904520141, "loss": 5.8042, "num_input_tokens_seen": 81002496, "step": 618 }, { "epoch": 0.10190716970682201, "grad_norm": 1.2257575988769531, "learning_rate": 0.00021024485380318974, "loss": 5.8316, "num_input_tokens_seen": 81395712, "step": 621 }, { "epoch": 0.10239947487448782, "grad_norm": 0.922932505607605, "learning_rate": 0.00020973884860794057, "loss": 5.8464, "num_input_tokens_seen": 81788928, "step": 624 }, { "epoch": 0.10289178004215363, "grad_norm": 0.9160616397857666, "learning_rate": 0.00020923647938688914, "loss": 5.8233, "num_input_tokens_seen": 82182144, "step": 627 }, { "epoch": 0.10338408520981944, "grad_norm": 1.4857609272003174, "learning_rate": 0.00020873770280289224, "loss": 5.8783, "num_input_tokens_seen": 82575360, "step": 630 }, { "epoch": 0.10387639037748525, "grad_norm": 0.8238282799720764, "learning_rate": 0.00020824247623852486, "loss": 5.8476, "num_input_tokens_seen": 82968576, "step": 633 }, { "epoch": 0.10436869554515105, "grad_norm": 0.9078794717788696, "learning_rate": 0.0002077507577807854, "loss": 5.8344, "num_input_tokens_seen": 83361792, "step": 636 }, { "epoch": 0.10486100071281686, "grad_norm": 0.8983127474784851, "learning_rate": 0.0002072625062061955, "loss": 5.8672, "num_input_tokens_seen": 83755008, "step": 639 }, { "epoch": 0.10535330588048267, "grad_norm": 0.9999786019325256, "learning_rate": 0.00020677768096628412, "loss": 5.8052, "num_input_tokens_seen": 84148224, "step": 642 }, { "epoch": 0.10584561104814848, "grad_norm": 0.9309632778167725, "learning_rate": 0.0002062962421734427, "loss": 5.8597, "num_input_tokens_seen": 84541440, "step": 645 }, { "epoch": 0.10633791621581427, "grad_norm": 0.9224460124969482, "learning_rate": 0.00020581815058714115, "loss": 5.8526, "num_input_tokens_seen": 84934656, "step": 648 }, { "epoch": 0.10683022138348008, "grad_norm": 0.7043266892433167, "learning_rate": 0.00020534336760049378, "loss": 5.7704, "num_input_tokens_seen": 85327872, "step": 651 }, { "epoch": 0.1073225265511459, "grad_norm": 0.7217307090759277, "learning_rate": 0.00020487185522716434, "loss": 5.8222, "num_input_tokens_seen": 85721088, "step": 654 }, { "epoch": 0.1078148317188117, "grad_norm": 0.7779012322425842, "learning_rate": 0.0002044035760886003, "loss": 5.8596, "num_input_tokens_seen": 86114304, "step": 657 }, { "epoch": 0.1083071368864775, "grad_norm": 0.7242317795753479, "learning_rate": 0.00020393849340158684, "loss": 5.8273, "num_input_tokens_seen": 86507520, "step": 660 }, { "epoch": 0.10879944205414331, "grad_norm": 0.8740100264549255, "learning_rate": 0.00020347657096611072, "loss": 5.7984, "num_input_tokens_seen": 86900736, "step": 663 }, { "epoch": 0.10929174722180912, "grad_norm": 0.9838325381278992, "learning_rate": 0.0002030177731535252, "loss": 5.851, "num_input_tokens_seen": 87293952, "step": 666 }, { "epoch": 0.10978405238947493, "grad_norm": 0.989205002784729, "learning_rate": 0.0002025620648950073, "loss": 5.8132, "num_input_tokens_seen": 87687168, "step": 669 }, { "epoch": 0.11027635755714073, "grad_norm": 1.137012004852295, "learning_rate": 0.00020210941167029872, "loss": 5.7769, "num_input_tokens_seen": 88080384, "step": 672 }, { "epoch": 0.11076866272480654, "grad_norm": 1.0733401775360107, "learning_rate": 0.00020165977949672233, "loss": 5.7877, "num_input_tokens_seen": 88473600, "step": 675 }, { "epoch": 0.11126096789247235, "grad_norm": 1.1243098974227905, "learning_rate": 0.00020121313491846602, "loss": 5.7843, "num_input_tokens_seen": 88866816, "step": 678 }, { "epoch": 0.11175327306013816, "grad_norm": 0.9308670163154602, "learning_rate": 0.000200769444996127, "loss": 5.8465, "num_input_tokens_seen": 89260032, "step": 681 }, { "epoch": 0.11224557822780395, "grad_norm": 1.067434549331665, "learning_rate": 0.00020032867729650794, "loss": 5.7991, "num_input_tokens_seen": 89653248, "step": 684 }, { "epoch": 0.11273788339546977, "grad_norm": 0.9211784601211548, "learning_rate": 0.00019989079988265906, "loss": 5.7806, "num_input_tokens_seen": 90046464, "step": 687 }, { "epoch": 0.11323018856313558, "grad_norm": 0.8653129935264587, "learning_rate": 0.00019945578130415816, "loss": 5.8365, "num_input_tokens_seen": 90439680, "step": 690 }, { "epoch": 0.11372249373080139, "grad_norm": 1.8020168542861938, "learning_rate": 0.00019902359058762258, "loss": 5.8169, "num_input_tokens_seen": 90832896, "step": 693 }, { "epoch": 0.11421479889846718, "grad_norm": 1.38017737865448, "learning_rate": 0.00019859419722744617, "loss": 5.7989, "num_input_tokens_seen": 91226112, "step": 696 }, { "epoch": 0.11470710406613299, "grad_norm": 1.2626079320907593, "learning_rate": 0.0001981675711767554, "loss": 5.7555, "num_input_tokens_seen": 91619328, "step": 699 }, { "epoch": 0.1151994092337988, "grad_norm": 0.9266194105148315, "learning_rate": 0.00019774368283857792, "loss": 5.8277, "num_input_tokens_seen": 92012544, "step": 702 }, { "epoch": 0.11569171440146461, "grad_norm": 1.0578334331512451, "learning_rate": 0.00019732250305721835, "loss": 5.7924, "num_input_tokens_seen": 92405760, "step": 705 }, { "epoch": 0.11618401956913041, "grad_norm": 0.6349201798439026, "learning_rate": 0.00019690400310983514, "loss": 5.7968, "num_input_tokens_seen": 92798976, "step": 708 }, { "epoch": 0.11667632473679622, "grad_norm": 0.6735845804214478, "learning_rate": 0.0001964881546982129, "loss": 5.7552, "num_input_tokens_seen": 93192192, "step": 711 }, { "epoch": 0.11716862990446203, "grad_norm": 0.706881582736969, "learning_rate": 0.0001960749299407257, "loss": 5.7713, "num_input_tokens_seen": 93585408, "step": 714 }, { "epoch": 0.11766093507212784, "grad_norm": 0.7729910612106323, "learning_rate": 0.00019566430136448468, "loss": 5.8106, "num_input_tokens_seen": 93978624, "step": 717 }, { "epoch": 0.11815324023979364, "grad_norm": 0.7226433157920837, "learning_rate": 0.00019525624189766633, "loss": 5.7983, "num_input_tokens_seen": 94371840, "step": 720 }, { "epoch": 0.11864554540745945, "grad_norm": 0.6069265604019165, "learning_rate": 0.0001948507248620161, "loss": 5.7355, "num_input_tokens_seen": 94765056, "step": 723 }, { "epoch": 0.11913785057512526, "grad_norm": 0.7411084175109863, "learning_rate": 0.00019444772396552212, "loss": 5.7673, "num_input_tokens_seen": 95158272, "step": 726 }, { "epoch": 0.11963015574279107, "grad_norm": 0.9155630469322205, "learning_rate": 0.0001940472132952553, "loss": 5.7936, "num_input_tokens_seen": 95551488, "step": 729 }, { "epoch": 0.12012246091045686, "grad_norm": 0.8562064170837402, "learning_rate": 0.00019364916731037083, "loss": 5.7453, "num_input_tokens_seen": 95944704, "step": 732 }, { "epoch": 0.12061476607812267, "grad_norm": 0.9439576268196106, "learning_rate": 0.0001932535608352669, "loss": 5.7545, "num_input_tokens_seen": 96337920, "step": 735 }, { "epoch": 0.12110707124578848, "grad_norm": 0.9385275840759277, "learning_rate": 0.00019286036905289666, "loss": 5.7467, "num_input_tokens_seen": 96731136, "step": 738 }, { "epoch": 0.1215993764134543, "grad_norm": 0.7505801320075989, "learning_rate": 0.00019246956749822933, "loss": 5.7087, "num_input_tokens_seen": 97124352, "step": 741 }, { "epoch": 0.12209168158112009, "grad_norm": 0.6224293112754822, "learning_rate": 0.0001920811320518561, "loss": 5.7848, "num_input_tokens_seen": 97517568, "step": 744 }, { "epoch": 0.1225839867487859, "grad_norm": 0.5917067527770996, "learning_rate": 0.00019169503893373772, "loss": 5.7164, "num_input_tokens_seen": 97910784, "step": 747 }, { "epoch": 0.12307629191645171, "grad_norm": 0.6723655462265015, "learning_rate": 0.00019131126469708987, "loss": 5.7364, "num_input_tokens_seen": 98304000, "step": 750 }, { "epoch": 0.12356859708411752, "grad_norm": 0.6307470202445984, "learning_rate": 0.00019092978622240234, "loss": 5.7312, "num_input_tokens_seen": 98697216, "step": 753 }, { "epoch": 0.12406090225178333, "grad_norm": 0.6715870499610901, "learning_rate": 0.00019055058071158903, "loss": 5.7555, "num_input_tokens_seen": 99090432, "step": 756 }, { "epoch": 0.12455320741944913, "grad_norm": 0.7827840447425842, "learning_rate": 0.00019017362568226525, "loss": 5.7412, "num_input_tokens_seen": 99483648, "step": 759 }, { "epoch": 0.12504551258711494, "grad_norm": 0.7358651757240295, "learning_rate": 0.0001897988989621491, "loss": 5.7403, "num_input_tokens_seen": 99876864, "step": 762 }, { "epoch": 0.12553781775478073, "grad_norm": 0.6782851219177246, "learning_rate": 0.00018942637868358373, "loss": 5.7582, "num_input_tokens_seen": 100270080, "step": 765 }, { "epoch": 0.12603012292244656, "grad_norm": 0.675395667552948, "learning_rate": 0.00018905604327817716, "loss": 5.7459, "num_input_tokens_seen": 100663296, "step": 768 }, { "epoch": 0.12652242809011235, "grad_norm": 0.6478227972984314, "learning_rate": 0.0001886878714715573, "loss": 5.7242, "num_input_tokens_seen": 101056512, "step": 771 }, { "epoch": 0.12701473325777818, "grad_norm": 0.5236144661903381, "learning_rate": 0.00018832184227823856, "loss": 5.7381, "num_input_tokens_seen": 101449728, "step": 774 }, { "epoch": 0.12750703842544397, "grad_norm": 0.6938157081604004, "learning_rate": 0.0001879579349965979, "loss": 5.7178, "num_input_tokens_seen": 101842944, "step": 777 }, { "epoch": 0.12799934359310977, "grad_norm": 0.735215425491333, "learning_rate": 0.00018759612920395688, "loss": 5.7341, "num_input_tokens_seen": 102236160, "step": 780 }, { "epoch": 0.1284916487607756, "grad_norm": 0.7181572914123535, "learning_rate": 0.0001872364047517678, "loss": 5.7262, "num_input_tokens_seen": 102629376, "step": 783 }, { "epoch": 0.1289839539284414, "grad_norm": 0.5627852082252502, "learning_rate": 0.00018687874176090066, "loss": 5.7525, "num_input_tokens_seen": 103022592, "step": 786 }, { "epoch": 0.1294762590961072, "grad_norm": 0.6639107465744019, "learning_rate": 0.0001865231206170292, "loss": 5.7159, "num_input_tokens_seen": 103415808, "step": 789 }, { "epoch": 0.129968564263773, "grad_norm": 0.6468229293823242, "learning_rate": 0.00018616952196611267, "loss": 5.7392, "num_input_tokens_seen": 103809024, "step": 792 }, { "epoch": 0.1304608694314388, "grad_norm": 0.657605767250061, "learning_rate": 0.00018581792670997177, "loss": 5.7256, "num_input_tokens_seen": 104202240, "step": 795 }, { "epoch": 0.13095317459910463, "grad_norm": 0.6212555766105652, "learning_rate": 0.00018546831600195623, "loss": 5.723, "num_input_tokens_seen": 104595456, "step": 798 }, { "epoch": 0.13128137804421516, "eval_accuracy": 0.18926233512457255, "eval_loss": 5.911603927612305, "eval_runtime": 111.0852, "eval_samples_per_second": 2.701, "eval_steps_per_second": 1.35, "num_input_tokens_seen": 104857600, "step": 800 }, { "epoch": 0.13144547976677043, "grad_norm": 0.6446166634559631, "learning_rate": 0.00018512067124270133, "loss": 5.7414, "num_input_tokens_seen": 104988672, "step": 801 }, { "epoch": 0.13193778493443623, "grad_norm": 0.7650989294052124, "learning_rate": 0.00018477497407597197, "loss": 5.7188, "num_input_tokens_seen": 105381888, "step": 804 }, { "epoch": 0.13243009010210205, "grad_norm": 0.6955244541168213, "learning_rate": 0.00018443120638459164, "loss": 5.7054, "num_input_tokens_seen": 105775104, "step": 807 }, { "epoch": 0.13292239526976785, "grad_norm": 0.7522872090339661, "learning_rate": 0.00018408935028645438, "loss": 5.7231, "num_input_tokens_seen": 106168320, "step": 810 }, { "epoch": 0.13341470043743364, "grad_norm": 0.7220354080200195, "learning_rate": 0.00018374938813061763, "loss": 5.7101, "num_input_tokens_seen": 106561536, "step": 813 }, { "epoch": 0.13390700560509947, "grad_norm": 0.7726113796234131, "learning_rate": 0.00018341130249347484, "loss": 5.78, "num_input_tokens_seen": 106954752, "step": 816 }, { "epoch": 0.13439931077276526, "grad_norm": 0.8380802869796753, "learning_rate": 0.000183075076175004, "loss": 5.7309, "num_input_tokens_seen": 107347968, "step": 819 }, { "epoch": 0.1348916159404311, "grad_norm": 0.871356725692749, "learning_rate": 0.0001827406921950927, "loss": 5.6989, "num_input_tokens_seen": 107741184, "step": 822 }, { "epoch": 0.13538392110809688, "grad_norm": 0.8443216681480408, "learning_rate": 0.0001824081337899362, "loss": 5.7078, "num_input_tokens_seen": 108134400, "step": 825 }, { "epoch": 0.13587622627576268, "grad_norm": 0.8105941414833069, "learning_rate": 0.00018207738440850766, "loss": 5.757, "num_input_tokens_seen": 108527616, "step": 828 }, { "epoch": 0.1363685314434285, "grad_norm": 0.9102969765663147, "learning_rate": 0.00018174842770909803, "loss": 5.6674, "num_input_tokens_seen": 108920832, "step": 831 }, { "epoch": 0.1368608366110943, "grad_norm": 0.9995675086975098, "learning_rate": 0.00018142124755592492, "loss": 5.7259, "num_input_tokens_seen": 109314048, "step": 834 }, { "epoch": 0.1373531417787601, "grad_norm": 0.9919428825378418, "learning_rate": 0.00018109582801580817, "loss": 5.7217, "num_input_tokens_seen": 109707264, "step": 837 }, { "epoch": 0.13784544694642592, "grad_norm": 0.7615512609481812, "learning_rate": 0.0001807721533549109, "loss": 5.66, "num_input_tokens_seen": 110100480, "step": 840 }, { "epoch": 0.13833775211409172, "grad_norm": 0.6651118397712708, "learning_rate": 0.0001804502080355442, "loss": 5.6899, "num_input_tokens_seen": 110493696, "step": 843 }, { "epoch": 0.13883005728175754, "grad_norm": 0.8231689929962158, "learning_rate": 0.00018012997671303435, "loss": 5.7131, "num_input_tokens_seen": 110886912, "step": 846 }, { "epoch": 0.13932236244942334, "grad_norm": 0.8676853179931641, "learning_rate": 0.00017981144423265112, "loss": 5.6746, "num_input_tokens_seen": 111280128, "step": 849 }, { "epoch": 0.13981466761708913, "grad_norm": 1.0326123237609863, "learning_rate": 0.00017949459562659518, "loss": 5.7139, "num_input_tokens_seen": 111673344, "step": 852 }, { "epoch": 0.14030697278475496, "grad_norm": 1.2157782316207886, "learning_rate": 0.00017917941611104426, "loss": 5.7368, "num_input_tokens_seen": 112066560, "step": 855 }, { "epoch": 0.14079927795242075, "grad_norm": 1.1053582429885864, "learning_rate": 0.0001788658910832554, "loss": 5.7108, "num_input_tokens_seen": 112459776, "step": 858 }, { "epoch": 0.14129158312008655, "grad_norm": 1.0346806049346924, "learning_rate": 0.0001785540061187239, "loss": 5.6973, "num_input_tokens_seen": 112852992, "step": 861 }, { "epoch": 0.14178388828775237, "grad_norm": 0.9909515976905823, "learning_rate": 0.0001782437469683953, "loss": 5.6803, "num_input_tokens_seen": 113246208, "step": 864 }, { "epoch": 0.14227619345541817, "grad_norm": 0.8728073835372925, "learning_rate": 0.00017793509955593145, "loss": 5.6894, "num_input_tokens_seen": 113639424, "step": 867 }, { "epoch": 0.142768498623084, "grad_norm": 0.8351977467536926, "learning_rate": 0.00017762804997502798, "loss": 5.6577, "num_input_tokens_seen": 114032640, "step": 870 }, { "epoch": 0.1432608037907498, "grad_norm": 0.8890995979309082, "learning_rate": 0.00017732258448678262, "loss": 5.7118, "num_input_tokens_seen": 114425856, "step": 873 }, { "epoch": 0.1437531089584156, "grad_norm": 0.6228556036949158, "learning_rate": 0.0001770186895171133, "loss": 5.6714, "num_input_tokens_seen": 114819072, "step": 876 }, { "epoch": 0.1442454141260814, "grad_norm": 0.6140576004981995, "learning_rate": 0.00017671635165422445, "loss": 5.6817, "num_input_tokens_seen": 115212288, "step": 879 }, { "epoch": 0.1447377192937472, "grad_norm": 0.5662062168121338, "learning_rate": 0.00017641555764612098, "loss": 5.713, "num_input_tokens_seen": 115605504, "step": 882 }, { "epoch": 0.14523002446141303, "grad_norm": 0.578356146812439, "learning_rate": 0.00017611629439816853, "loss": 5.69, "num_input_tokens_seen": 115998720, "step": 885 }, { "epoch": 0.14572232962907883, "grad_norm": 0.5570440888404846, "learning_rate": 0.0001758185489706992, "loss": 5.6513, "num_input_tokens_seen": 116391936, "step": 888 }, { "epoch": 0.14621463479674462, "grad_norm": 0.5778940320014954, "learning_rate": 0.00017552230857666157, "loss": 5.6607, "num_input_tokens_seen": 116785152, "step": 891 }, { "epoch": 0.14670693996441045, "grad_norm": 0.5713542699813843, "learning_rate": 0.00017522756057931406, "loss": 5.6619, "num_input_tokens_seen": 117178368, "step": 894 }, { "epoch": 0.14719924513207625, "grad_norm": 0.53873211145401, "learning_rate": 0.00017493429248996095, "loss": 5.6388, "num_input_tokens_seen": 117571584, "step": 897 }, { "epoch": 0.14769155029974204, "grad_norm": 0.6463914513587952, "learning_rate": 0.0001746424919657298, "loss": 5.6553, "num_input_tokens_seen": 117964800, "step": 900 }, { "epoch": 0.14818385546740787, "grad_norm": 0.5670692920684814, "learning_rate": 0.00017435214680738953, "loss": 5.6801, "num_input_tokens_seen": 118358016, "step": 903 }, { "epoch": 0.14867616063507366, "grad_norm": 0.6744683980941772, "learning_rate": 0.00017406324495720832, "loss": 5.6817, "num_input_tokens_seen": 118751232, "step": 906 }, { "epoch": 0.14916846580273949, "grad_norm": 0.6172819137573242, "learning_rate": 0.0001737757744968504, "loss": 5.6584, "num_input_tokens_seen": 119144448, "step": 909 }, { "epoch": 0.14966077097040528, "grad_norm": 0.7147213220596313, "learning_rate": 0.0001734897236453108, "loss": 5.6536, "num_input_tokens_seen": 119537664, "step": 912 }, { "epoch": 0.15015307613807108, "grad_norm": 0.5693617463111877, "learning_rate": 0.00017320508075688773, "loss": 5.6305, "num_input_tokens_seen": 119930880, "step": 915 }, { "epoch": 0.1506453813057369, "grad_norm": 0.7820476293563843, "learning_rate": 0.00017292183431919094, "loss": 5.6358, "num_input_tokens_seen": 120324096, "step": 918 }, { "epoch": 0.1511376864734027, "grad_norm": 0.7270592451095581, "learning_rate": 0.00017263997295118624, "loss": 5.6412, "num_input_tokens_seen": 120717312, "step": 921 }, { "epoch": 0.1516299916410685, "grad_norm": 0.6628842353820801, "learning_rate": 0.00017235948540127462, "loss": 5.6695, "num_input_tokens_seen": 121110528, "step": 924 }, { "epoch": 0.15212229680873432, "grad_norm": 0.7470645308494568, "learning_rate": 0.00017208036054540591, "loss": 5.6533, "num_input_tokens_seen": 121503744, "step": 927 }, { "epoch": 0.15261460197640012, "grad_norm": 0.6823071241378784, "learning_rate": 0.00017180258738522556, "loss": 5.6321, "num_input_tokens_seen": 121896960, "step": 930 }, { "epoch": 0.15310690714406594, "grad_norm": 0.6298269033432007, "learning_rate": 0.0001715261550462546, "loss": 5.69, "num_input_tokens_seen": 122290176, "step": 933 }, { "epoch": 0.15359921231173174, "grad_norm": 0.6772837042808533, "learning_rate": 0.00017125105277610142, "loss": 5.6243, "num_input_tokens_seen": 122683392, "step": 936 }, { "epoch": 0.15409151747939753, "grad_norm": 0.9537835121154785, "learning_rate": 0.00017097726994270523, "loss": 5.6595, "num_input_tokens_seen": 123076608, "step": 939 }, { "epoch": 0.15458382264706336, "grad_norm": 0.9860975742340088, "learning_rate": 0.00017070479603261012, "loss": 5.6391, "num_input_tokens_seen": 123469824, "step": 942 }, { "epoch": 0.15507612781472915, "grad_norm": 0.7856841683387756, "learning_rate": 0.00017043362064926934, "loss": 5.627, "num_input_tokens_seen": 123863040, "step": 945 }, { "epoch": 0.15556843298239495, "grad_norm": 0.8125623464584351, "learning_rate": 0.00017016373351137908, "loss": 5.6797, "num_input_tokens_seen": 124256256, "step": 948 }, { "epoch": 0.15606073815006077, "grad_norm": 0.8572544455528259, "learning_rate": 0.0001698951244512415, "loss": 5.6211, "num_input_tokens_seen": 124649472, "step": 951 }, { "epoch": 0.15655304331772657, "grad_norm": 0.6495417356491089, "learning_rate": 0.0001696277834131554, "loss": 5.6775, "num_input_tokens_seen": 125042688, "step": 954 }, { "epoch": 0.1570453484853924, "grad_norm": 0.7361529469490051, "learning_rate": 0.00016936170045183562, "loss": 5.6332, "num_input_tokens_seen": 125435904, "step": 957 }, { "epoch": 0.1575376536530582, "grad_norm": 0.6628533601760864, "learning_rate": 0.0001690968657308585, "loss": 5.6219, "num_input_tokens_seen": 125829120, "step": 960 }, { "epoch": 0.158029958820724, "grad_norm": 0.6602532267570496, "learning_rate": 0.00016883326952113513, "loss": 5.6377, "num_input_tokens_seen": 126222336, "step": 963 }, { "epoch": 0.1585222639883898, "grad_norm": 0.7363507151603699, "learning_rate": 0.0001685709021994098, "loss": 5.6573, "num_input_tokens_seen": 126615552, "step": 966 }, { "epoch": 0.1590145691560556, "grad_norm": 0.6305029988288879, "learning_rate": 0.00016830975424678453, "loss": 5.6764, "num_input_tokens_seen": 127008768, "step": 969 }, { "epoch": 0.1595068743237214, "grad_norm": 0.7641071081161499, "learning_rate": 0.0001680498162472686, "loss": 5.6473, "num_input_tokens_seen": 127401984, "step": 972 }, { "epoch": 0.15999917949138723, "grad_norm": 0.7363148331642151, "learning_rate": 0.00016779107888635245, "loss": 5.6144, "num_input_tokens_seen": 127795200, "step": 975 }, { "epoch": 0.16049148465905302, "grad_norm": 0.8142803311347961, "learning_rate": 0.0001675335329496059, "loss": 5.6206, "num_input_tokens_seen": 128188416, "step": 978 }, { "epoch": 0.16098378982671885, "grad_norm": 0.7114639282226562, "learning_rate": 0.00016727716932129973, "loss": 5.6103, "num_input_tokens_seen": 128581632, "step": 981 }, { "epoch": 0.16147609499438464, "grad_norm": 0.8105225563049316, "learning_rate": 0.0001670219789830507, "loss": 5.6128, "num_input_tokens_seen": 128974848, "step": 984 }, { "epoch": 0.16196840016205044, "grad_norm": 0.8542172312736511, "learning_rate": 0.00016676795301248881, "loss": 5.6622, "num_input_tokens_seen": 129368064, "step": 987 }, { "epoch": 0.16246070532971627, "grad_norm": 0.848969578742981, "learning_rate": 0.00016651508258194728, "loss": 5.644, "num_input_tokens_seen": 129761280, "step": 990 }, { "epoch": 0.16295301049738206, "grad_norm": 0.7594742774963379, "learning_rate": 0.0001662633589571739, "loss": 5.6315, "num_input_tokens_seen": 130154496, "step": 993 }, { "epoch": 0.16344531566504786, "grad_norm": 0.5903003811836243, "learning_rate": 0.0001660127734960639, "loss": 5.6235, "num_input_tokens_seen": 130547712, "step": 996 }, { "epoch": 0.16393762083271368, "grad_norm": 0.680317759513855, "learning_rate": 0.00016576331764741402, "loss": 5.6303, "num_input_tokens_seen": 130940928, "step": 999 }, { "epoch": 0.16442992600037948, "grad_norm": 0.6790865659713745, "learning_rate": 0.00016551498294969648, "loss": 5.6244, "num_input_tokens_seen": 131334144, "step": 1002 }, { "epoch": 0.1649222311680453, "grad_norm": 0.77605140209198, "learning_rate": 0.00016526776102985388, "loss": 5.6269, "num_input_tokens_seen": 131727360, "step": 1005 }, { "epoch": 0.1654145363357111, "grad_norm": 0.7890040874481201, "learning_rate": 0.00016502164360211315, "loss": 5.6388, "num_input_tokens_seen": 132120576, "step": 1008 }, { "epoch": 0.1659068415033769, "grad_norm": 0.7248251438140869, "learning_rate": 0.0001647766224668193, "loss": 5.6087, "num_input_tokens_seen": 132513792, "step": 1011 }, { "epoch": 0.16639914667104272, "grad_norm": 0.7000757455825806, "learning_rate": 0.00016453268950928797, "loss": 5.61, "num_input_tokens_seen": 132907008, "step": 1014 }, { "epoch": 0.16689145183870852, "grad_norm": 0.6521958112716675, "learning_rate": 0.00016428983669867676, "loss": 5.6407, "num_input_tokens_seen": 133300224, "step": 1017 }, { "epoch": 0.16738375700637434, "grad_norm": 0.7919925451278687, "learning_rate": 0.00016404805608687456, "loss": 5.6145, "num_input_tokens_seen": 133693440, "step": 1020 }, { "epoch": 0.16787606217404014, "grad_norm": 1.0986133813858032, "learning_rate": 0.0001638073398074093, "loss": 5.6294, "num_input_tokens_seen": 134086656, "step": 1023 }, { "epoch": 0.16836836734170593, "grad_norm": 0.9454318881034851, "learning_rate": 0.0001635676800743725, "loss": 5.6277, "num_input_tokens_seen": 134479872, "step": 1026 }, { "epoch": 0.16886067250937176, "grad_norm": 0.8960498571395874, "learning_rate": 0.000163329069181362, "loss": 5.5897, "num_input_tokens_seen": 134873088, "step": 1029 }, { "epoch": 0.16935297767703755, "grad_norm": 0.7676910758018494, "learning_rate": 0.00016309149950044093, "loss": 5.6505, "num_input_tokens_seen": 135266304, "step": 1032 }, { "epoch": 0.16984528284470335, "grad_norm": 0.6834097504615784, "learning_rate": 0.0001628549634811134, "loss": 5.6003, "num_input_tokens_seen": 135659520, "step": 1035 }, { "epoch": 0.17033758801236917, "grad_norm": 0.7149432301521301, "learning_rate": 0.00016261945364931684, "loss": 5.599, "num_input_tokens_seen": 136052736, "step": 1038 }, { "epoch": 0.17082989318003497, "grad_norm": 0.635908842086792, "learning_rate": 0.00016238496260642988, "loss": 5.5852, "num_input_tokens_seen": 136445952, "step": 1041 }, { "epoch": 0.1713221983477008, "grad_norm": 0.7218221426010132, "learning_rate": 0.0001621514830282963, "loss": 5.6205, "num_input_tokens_seen": 136839168, "step": 1044 }, { "epoch": 0.1718145035153666, "grad_norm": 0.5910034775733948, "learning_rate": 0.00016191900766426384, "loss": 5.5992, "num_input_tokens_seen": 137232384, "step": 1047 }, { "epoch": 0.1723068086830324, "grad_norm": 0.729131281375885, "learning_rate": 0.000161687529336239, "loss": 5.6396, "num_input_tokens_seen": 137625600, "step": 1050 }, { "epoch": 0.1727991138506982, "grad_norm": 0.5640085935592651, "learning_rate": 0.00016145704093775551, "loss": 5.5828, "num_input_tokens_seen": 138018816, "step": 1053 }, { "epoch": 0.173291419018364, "grad_norm": 0.6498822569847107, "learning_rate": 0.00016122753543305863, "loss": 5.6024, "num_input_tokens_seen": 138412032, "step": 1056 }, { "epoch": 0.1737837241860298, "grad_norm": 0.6818384528160095, "learning_rate": 0.00016099900585620256, "loss": 5.6227, "num_input_tokens_seen": 138805248, "step": 1059 }, { "epoch": 0.17427602935369563, "grad_norm": 0.6664714813232422, "learning_rate": 0.00016077144531016272, "loss": 5.5499, "num_input_tokens_seen": 139198464, "step": 1062 }, { "epoch": 0.17476833452136142, "grad_norm": 0.5847983956336975, "learning_rate": 0.00016054484696596133, "loss": 5.5775, "num_input_tokens_seen": 139591680, "step": 1065 }, { "epoch": 0.17526063968902725, "grad_norm": 0.7117697596549988, "learning_rate": 0.00016031920406180673, "loss": 5.5939, "num_input_tokens_seen": 139984896, "step": 1068 }, { "epoch": 0.17575294485669304, "grad_norm": 0.6359485983848572, "learning_rate": 0.00016009450990224597, "loss": 5.5776, "num_input_tokens_seen": 140378112, "step": 1071 }, { "epoch": 0.17624525002435884, "grad_norm": 0.7390355467796326, "learning_rate": 0.00015987075785733017, "loss": 5.5825, "num_input_tokens_seen": 140771328, "step": 1074 }, { "epoch": 0.17673755519202466, "grad_norm": 0.6650616526603699, "learning_rate": 0.000159647941361793, "loss": 5.5637, "num_input_tokens_seen": 141164544, "step": 1077 }, { "epoch": 0.17722986035969046, "grad_norm": 0.642583429813385, "learning_rate": 0.0001594260539142416, "loss": 5.5931, "num_input_tokens_seen": 141557760, "step": 1080 }, { "epoch": 0.17772216552735626, "grad_norm": 0.7720558047294617, "learning_rate": 0.0001592050890763597, "loss": 5.6194, "num_input_tokens_seen": 141950976, "step": 1083 }, { "epoch": 0.17821447069502208, "grad_norm": 0.9935165047645569, "learning_rate": 0.00015898504047212318, "loss": 5.5858, "num_input_tokens_seen": 142344192, "step": 1086 }, { "epoch": 0.17870677586268788, "grad_norm": 1.5645318031311035, "learning_rate": 0.00015876590178702708, "loss": 5.5929, "num_input_tokens_seen": 142737408, "step": 1089 }, { "epoch": 0.1791990810303537, "grad_norm": 0.6664535403251648, "learning_rate": 0.0001585476667673247, "loss": 5.5807, "num_input_tokens_seen": 143130624, "step": 1092 }, { "epoch": 0.1796913861980195, "grad_norm": 0.889173686504364, "learning_rate": 0.0001583303292192779, "loss": 5.6241, "num_input_tokens_seen": 143523840, "step": 1095 }, { "epoch": 0.1801836913656853, "grad_norm": 0.9507172703742981, "learning_rate": 0.00015811388300841897, "loss": 5.6112, "num_input_tokens_seen": 143917056, "step": 1098 }, { "epoch": 0.18067599653335112, "grad_norm": 0.7204018831253052, "learning_rate": 0.00015789832205882312, "loss": 5.5713, "num_input_tokens_seen": 144310272, "step": 1101 }, { "epoch": 0.18116830170101691, "grad_norm": 0.7509574890136719, "learning_rate": 0.0001576836403523923, "loss": 5.5944, "num_input_tokens_seen": 144703488, "step": 1104 }, { "epoch": 0.1816606068686827, "grad_norm": 0.9350889921188354, "learning_rate": 0.00015746983192814956, "loss": 5.5641, "num_input_tokens_seen": 145096704, "step": 1107 }, { "epoch": 0.18215291203634854, "grad_norm": 1.061766266822815, "learning_rate": 0.00015725689088154365, "loss": 5.5659, "num_input_tokens_seen": 145489920, "step": 1110 }, { "epoch": 0.18264521720401433, "grad_norm": 0.8999969363212585, "learning_rate": 0.00015704481136376432, "loss": 5.5741, "num_input_tokens_seen": 145883136, "step": 1113 }, { "epoch": 0.18313752237168016, "grad_norm": 0.6309444308280945, "learning_rate": 0.00015683358758106756, "loss": 5.566, "num_input_tokens_seen": 146276352, "step": 1116 }, { "epoch": 0.18362982753934595, "grad_norm": 0.5857992768287659, "learning_rate": 0.0001566232137941107, "loss": 5.5473, "num_input_tokens_seen": 146669568, "step": 1119 }, { "epoch": 0.18412213270701175, "grad_norm": 0.6874331831932068, "learning_rate": 0.0001564136843172976, "loss": 5.5815, "num_input_tokens_seen": 147062784, "step": 1122 }, { "epoch": 0.18461443787467757, "grad_norm": 0.9051715135574341, "learning_rate": 0.00015620499351813306, "loss": 5.5711, "num_input_tokens_seen": 147456000, "step": 1125 }, { "epoch": 0.18510674304234337, "grad_norm": 0.77443528175354, "learning_rate": 0.0001559971358165871, "loss": 5.5898, "num_input_tokens_seen": 147849216, "step": 1128 }, { "epoch": 0.1855990482100092, "grad_norm": 0.5776637196540833, "learning_rate": 0.00015579010568446804, "loss": 5.6116, "num_input_tokens_seen": 148242432, "step": 1131 }, { "epoch": 0.186091353377675, "grad_norm": 0.6061506271362305, "learning_rate": 0.00015558389764480516, "loss": 5.5514, "num_input_tokens_seen": 148635648, "step": 1134 }, { "epoch": 0.18658365854534079, "grad_norm": 0.7481436729431152, "learning_rate": 0.0001553785062712401, "loss": 5.5469, "num_input_tokens_seen": 149028864, "step": 1137 }, { "epoch": 0.1870759637130066, "grad_norm": 0.7162047028541565, "learning_rate": 0.00015517392618742703, "loss": 5.5208, "num_input_tokens_seen": 149422080, "step": 1140 }, { "epoch": 0.1875682688806724, "grad_norm": 0.6034492254257202, "learning_rate": 0.00015497015206644168, "loss": 5.5255, "num_input_tokens_seen": 149815296, "step": 1143 }, { "epoch": 0.1880605740483382, "grad_norm": 0.7142524719238281, "learning_rate": 0.00015476717863019868, "loss": 5.5561, "num_input_tokens_seen": 150208512, "step": 1146 }, { "epoch": 0.18855287921600403, "grad_norm": 0.6338310241699219, "learning_rate": 0.0001545650006488774, "loss": 5.524, "num_input_tokens_seen": 150601728, "step": 1149 }, { "epoch": 0.18904518438366982, "grad_norm": 0.7229697108268738, "learning_rate": 0.00015436361294035586, "loss": 5.5738, "num_input_tokens_seen": 150994944, "step": 1152 }, { "epoch": 0.18953748955133565, "grad_norm": 0.8610914349555969, "learning_rate": 0.00015416301036965307, "loss": 5.5242, "num_input_tokens_seen": 151388160, "step": 1155 }, { "epoch": 0.19002979471900144, "grad_norm": 0.8543524742126465, "learning_rate": 0.00015396318784837899, "loss": 5.5251, "num_input_tokens_seen": 151781376, "step": 1158 }, { "epoch": 0.19052209988666724, "grad_norm": 0.9472544193267822, "learning_rate": 0.00015376414033419227, "loss": 5.5143, "num_input_tokens_seen": 152174592, "step": 1161 }, { "epoch": 0.19101440505433306, "grad_norm": 0.8483441472053528, "learning_rate": 0.00015356586283026615, "loss": 5.5312, "num_input_tokens_seen": 152567808, "step": 1164 }, { "epoch": 0.19150671022199886, "grad_norm": 0.71060711145401, "learning_rate": 0.00015336835038476135, "loss": 5.5537, "num_input_tokens_seen": 152961024, "step": 1167 }, { "epoch": 0.19199901538966466, "grad_norm": 0.5833298563957214, "learning_rate": 0.00015317159809030676, "loss": 5.5685, "num_input_tokens_seen": 153354240, "step": 1170 }, { "epoch": 0.19249132055733048, "grad_norm": 0.6280092597007751, "learning_rate": 0.0001529756010834872, "loss": 5.5238, "num_input_tokens_seen": 153747456, "step": 1173 }, { "epoch": 0.19298362572499628, "grad_norm": 0.6287539005279541, "learning_rate": 0.00015278035454433883, "loss": 5.5883, "num_input_tokens_seen": 154140672, "step": 1176 }, { "epoch": 0.1934759308926621, "grad_norm": 0.5246036648750305, "learning_rate": 0.00015258585369585086, "loss": 5.4878, "num_input_tokens_seen": 154533888, "step": 1179 }, { "epoch": 0.1939682360603279, "grad_norm": 0.5769249796867371, "learning_rate": 0.00015239209380347492, "loss": 5.5545, "num_input_tokens_seen": 154927104, "step": 1182 }, { "epoch": 0.1944605412279937, "grad_norm": 0.5999164581298828, "learning_rate": 0.00015219907017464103, "loss": 5.5295, "num_input_tokens_seen": 155320320, "step": 1185 }, { "epoch": 0.19495284639565952, "grad_norm": 0.7166159749031067, "learning_rate": 0.00015200677815828016, "loss": 5.5673, "num_input_tokens_seen": 155713536, "step": 1188 }, { "epoch": 0.19544515156332531, "grad_norm": 0.6548975110054016, "learning_rate": 0.0001518152131443535, "loss": 5.5044, "num_input_tokens_seen": 156106752, "step": 1191 }, { "epoch": 0.1959374567309911, "grad_norm": 0.6346669793128967, "learning_rate": 0.00015162437056338838, "loss": 5.5459, "num_input_tokens_seen": 156499968, "step": 1194 }, { "epoch": 0.19642976189865693, "grad_norm": 0.7255611419677734, "learning_rate": 0.00015143424588602033, "loss": 5.5296, "num_input_tokens_seen": 156893184, "step": 1197 }, { "epoch": 0.19692206706632273, "grad_norm": 0.7328153848648071, "learning_rate": 0.0001512448346225417, "loss": 5.5106, "num_input_tokens_seen": 157286400, "step": 1200 }, { "epoch": 0.19692206706632273, "eval_accuracy": 0.1976241654453672, "eval_loss": 5.751555919647217, "eval_runtime": 110.6176, "eval_samples_per_second": 2.712, "eval_steps_per_second": 1.356, "num_input_tokens_seen": 157286400, "step": 1200 }, { "epoch": 0.19741437223398856, "grad_norm": 0.6580715179443359, "learning_rate": 0.00015105613232245638, "loss": 5.545, "num_input_tokens_seen": 157679616, "step": 1203 }, { "epoch": 0.19790667740165435, "grad_norm": 0.5751286745071411, "learning_rate": 0.00015086813457404033, "loss": 5.5439, "num_input_tokens_seen": 158072832, "step": 1206 }, { "epoch": 0.19839898256932015, "grad_norm": 0.6533809900283813, "learning_rate": 0.00015068083700390872, "loss": 5.5193, "num_input_tokens_seen": 158466048, "step": 1209 }, { "epoch": 0.19889128773698597, "grad_norm": 0.6799106597900391, "learning_rate": 0.0001504942352765884, "loss": 5.5718, "num_input_tokens_seen": 158859264, "step": 1212 }, { "epoch": 0.19938359290465177, "grad_norm": 0.6222509145736694, "learning_rate": 0.00015030832509409646, "loss": 5.5177, "num_input_tokens_seen": 159252480, "step": 1215 }, { "epoch": 0.19987589807231756, "grad_norm": 0.6463878750801086, "learning_rate": 0.00015012310219552445, "loss": 5.519, "num_input_tokens_seen": 159645696, "step": 1218 }, { "epoch": 0.2003682032399834, "grad_norm": 0.6764931678771973, "learning_rate": 0.00014993856235662816, "loss": 5.5437, "num_input_tokens_seen": 160038912, "step": 1221 }, { "epoch": 0.20086050840764919, "grad_norm": 0.6322119235992432, "learning_rate": 0.00014975470138942312, "loss": 5.5726, "num_input_tokens_seen": 160432128, "step": 1224 }, { "epoch": 0.201352813575315, "grad_norm": 0.7915647029876709, "learning_rate": 0.00014957151514178522, "loss": 5.5265, "num_input_tokens_seen": 160825344, "step": 1227 }, { "epoch": 0.2018451187429808, "grad_norm": 0.9680132269859314, "learning_rate": 0.00014938899949705703, "loss": 5.5909, "num_input_tokens_seen": 161218560, "step": 1230 }, { "epoch": 0.2023374239106466, "grad_norm": 0.910897433757782, "learning_rate": 0.00014920715037365913, "loss": 5.5585, "num_input_tokens_seen": 161611776, "step": 1233 }, { "epoch": 0.20282972907831243, "grad_norm": 0.7097917199134827, "learning_rate": 0.00014902596372470695, "loss": 5.5273, "num_input_tokens_seen": 162004992, "step": 1236 }, { "epoch": 0.20332203424597822, "grad_norm": 0.8189403414726257, "learning_rate": 0.00014884543553763215, "loss": 5.4965, "num_input_tokens_seen": 162398208, "step": 1239 }, { "epoch": 0.20381433941364402, "grad_norm": 0.6247571110725403, "learning_rate": 0.00014866556183380976, "loss": 5.4886, "num_input_tokens_seen": 162791424, "step": 1242 }, { "epoch": 0.20430664458130984, "grad_norm": 0.7881289124488831, "learning_rate": 0.0001484863386681897, "loss": 5.5525, "num_input_tokens_seen": 163184640, "step": 1245 }, { "epoch": 0.20479894974897564, "grad_norm": 0.8643481731414795, "learning_rate": 0.00014830776212893345, "loss": 5.5544, "num_input_tokens_seen": 163577856, "step": 1248 }, { "epoch": 0.20529125491664146, "grad_norm": 0.7672863602638245, "learning_rate": 0.0001481298283370553, "loss": 5.5501, "num_input_tokens_seen": 163971072, "step": 1251 }, { "epoch": 0.20578356008430726, "grad_norm": 0.7655563950538635, "learning_rate": 0.0001479525334460686, "loss": 5.5266, "num_input_tokens_seen": 164364288, "step": 1254 }, { "epoch": 0.20627586525197306, "grad_norm": 0.6450381278991699, "learning_rate": 0.00014777587364163652, "loss": 5.5056, "num_input_tokens_seen": 164757504, "step": 1257 }, { "epoch": 0.20676817041963888, "grad_norm": 0.7719587683677673, "learning_rate": 0.00014759984514122729, "loss": 5.4903, "num_input_tokens_seen": 165150720, "step": 1260 }, { "epoch": 0.20726047558730468, "grad_norm": 0.7485530972480774, "learning_rate": 0.00014742444419377413, "loss": 5.5165, "num_input_tokens_seen": 165543936, "step": 1263 }, { "epoch": 0.2077527807549705, "grad_norm": 0.6037641763687134, "learning_rate": 0.00014724966707933943, "loss": 5.5313, "num_input_tokens_seen": 165937152, "step": 1266 }, { "epoch": 0.2082450859226363, "grad_norm": 0.6211085915565491, "learning_rate": 0.00014707551010878346, "loss": 5.4773, "num_input_tokens_seen": 166330368, "step": 1269 }, { "epoch": 0.2087373910903021, "grad_norm": 0.6303486824035645, "learning_rate": 0.00014690196962343724, "loss": 5.5021, "num_input_tokens_seen": 166723584, "step": 1272 }, { "epoch": 0.20922969625796792, "grad_norm": 0.5903136730194092, "learning_rate": 0.00014672904199477987, "loss": 5.5041, "num_input_tokens_seen": 167116800, "step": 1275 }, { "epoch": 0.20972200142563371, "grad_norm": 0.6164759397506714, "learning_rate": 0.00014655672362411974, "loss": 5.4536, "num_input_tokens_seen": 167510016, "step": 1278 }, { "epoch": 0.2102143065932995, "grad_norm": 0.7181910872459412, "learning_rate": 0.00014638501094227995, "loss": 5.4979, "num_input_tokens_seen": 167903232, "step": 1281 }, { "epoch": 0.21070661176096533, "grad_norm": 0.8058461546897888, "learning_rate": 0.000146213900409288, "loss": 5.564, "num_input_tokens_seen": 168296448, "step": 1284 }, { "epoch": 0.21119891692863113, "grad_norm": 0.8044641613960266, "learning_rate": 0.00014604338851406909, "loss": 5.4864, "num_input_tokens_seen": 168689664, "step": 1287 }, { "epoch": 0.21169122209629695, "grad_norm": 0.5943960547447205, "learning_rate": 0.00014587347177414357, "loss": 5.4307, "num_input_tokens_seen": 169082880, "step": 1290 }, { "epoch": 0.21218352726396275, "grad_norm": 0.6272845268249512, "learning_rate": 0.0001457041467353283, "loss": 5.5211, "num_input_tokens_seen": 169476096, "step": 1293 }, { "epoch": 0.21267583243162855, "grad_norm": 0.6123335361480713, "learning_rate": 0.0001455354099714415, "loss": 5.5025, "num_input_tokens_seen": 169869312, "step": 1296 }, { "epoch": 0.21316813759929437, "grad_norm": 0.5472930669784546, "learning_rate": 0.00014536725808401196, "loss": 5.5156, "num_input_tokens_seen": 170262528, "step": 1299 }, { "epoch": 0.21366044276696017, "grad_norm": 0.6367286443710327, "learning_rate": 0.00014519968770199113, "loss": 5.4662, "num_input_tokens_seen": 170655744, "step": 1302 }, { "epoch": 0.21415274793462596, "grad_norm": 0.662023663520813, "learning_rate": 0.0001450326954814696, "loss": 5.5199, "num_input_tokens_seen": 171048960, "step": 1305 }, { "epoch": 0.2146450531022918, "grad_norm": 0.8450511693954468, "learning_rate": 0.00014486627810539652, "loss": 5.4729, "num_input_tokens_seen": 171442176, "step": 1308 }, { "epoch": 0.21513735826995758, "grad_norm": 0.7732052803039551, "learning_rate": 0.00014470043228330322, "loss": 5.4919, "num_input_tokens_seen": 171835392, "step": 1311 }, { "epoch": 0.2156296634376234, "grad_norm": 0.6616522073745728, "learning_rate": 0.00014453515475102972, "loss": 5.5101, "num_input_tokens_seen": 172228608, "step": 1314 }, { "epoch": 0.2161219686052892, "grad_norm": 0.726275622844696, "learning_rate": 0.00014437044227045488, "loss": 5.5317, "num_input_tokens_seen": 172621824, "step": 1317 }, { "epoch": 0.216614273772955, "grad_norm": 0.8850323557853699, "learning_rate": 0.00014420629162923004, "loss": 5.4694, "num_input_tokens_seen": 173015040, "step": 1320 }, { "epoch": 0.21710657894062083, "grad_norm": 0.8814117908477783, "learning_rate": 0.00014404269964051592, "loss": 5.5393, "num_input_tokens_seen": 173408256, "step": 1323 }, { "epoch": 0.21759888410828662, "grad_norm": 0.7925286889076233, "learning_rate": 0.00014387966314272267, "loss": 5.4806, "num_input_tokens_seen": 173801472, "step": 1326 }, { "epoch": 0.21809118927595242, "grad_norm": 0.5634031891822815, "learning_rate": 0.00014371717899925318, "loss": 5.5134, "num_input_tokens_seen": 174194688, "step": 1329 }, { "epoch": 0.21858349444361824, "grad_norm": 0.7209234237670898, "learning_rate": 0.00014355524409824985, "loss": 5.4813, "num_input_tokens_seen": 174587904, "step": 1332 }, { "epoch": 0.21907579961128404, "grad_norm": 0.891487717628479, "learning_rate": 0.00014339385535234412, "loss": 5.4888, "num_input_tokens_seen": 174981120, "step": 1335 }, { "epoch": 0.21956810477894986, "grad_norm": 0.7084378600120544, "learning_rate": 0.00014323300969840914, "loss": 5.4947, "num_input_tokens_seen": 175374336, "step": 1338 }, { "epoch": 0.22006040994661566, "grad_norm": 0.6455901265144348, "learning_rate": 0.0001430727040973159, "loss": 5.5063, "num_input_tokens_seen": 175767552, "step": 1341 }, { "epoch": 0.22055271511428146, "grad_norm": 0.72601717710495, "learning_rate": 0.00014291293553369175, "loss": 5.4397, "num_input_tokens_seen": 176160768, "step": 1344 }, { "epoch": 0.22104502028194728, "grad_norm": 0.8337487578392029, "learning_rate": 0.00014275370101568235, "loss": 5.491, "num_input_tokens_seen": 176553984, "step": 1347 }, { "epoch": 0.22153732544961308, "grad_norm": 0.6962984800338745, "learning_rate": 0.00014259499757471626, "loss": 5.514, "num_input_tokens_seen": 176947200, "step": 1350 }, { "epoch": 0.22202963061727887, "grad_norm": 0.6092506051063538, "learning_rate": 0.00014243682226527246, "loss": 5.44, "num_input_tokens_seen": 177340416, "step": 1353 }, { "epoch": 0.2225219357849447, "grad_norm": 0.663375735282898, "learning_rate": 0.000142279172164651, "loss": 5.4647, "num_input_tokens_seen": 177733632, "step": 1356 }, { "epoch": 0.2230142409526105, "grad_norm": 0.7483349442481995, "learning_rate": 0.00014212204437274583, "loss": 5.4908, "num_input_tokens_seen": 178126848, "step": 1359 }, { "epoch": 0.22350654612027632, "grad_norm": 0.5030447244644165, "learning_rate": 0.00014196543601182097, "loss": 5.4734, "num_input_tokens_seen": 178520064, "step": 1362 }, { "epoch": 0.2239988512879421, "grad_norm": 0.6785706877708435, "learning_rate": 0.00014180934422628892, "loss": 5.5189, "num_input_tokens_seen": 178913280, "step": 1365 }, { "epoch": 0.2244911564556079, "grad_norm": 0.6665587425231934, "learning_rate": 0.00014165376618249234, "loss": 5.5011, "num_input_tokens_seen": 179306496, "step": 1368 }, { "epoch": 0.22498346162327373, "grad_norm": 0.6461958885192871, "learning_rate": 0.00014149869906848755, "loss": 5.4507, "num_input_tokens_seen": 179699712, "step": 1371 }, { "epoch": 0.22547576679093953, "grad_norm": 0.8757199048995972, "learning_rate": 0.00014134414009383135, "loss": 5.4327, "num_input_tokens_seen": 180092928, "step": 1374 }, { "epoch": 0.22596807195860535, "grad_norm": 0.7910983562469482, "learning_rate": 0.0001411900864893701, "loss": 5.4511, "num_input_tokens_seen": 180486144, "step": 1377 }, { "epoch": 0.22646037712627115, "grad_norm": 0.6167607307434082, "learning_rate": 0.00014103653550703125, "loss": 5.4763, "num_input_tokens_seen": 180879360, "step": 1380 }, { "epoch": 0.22695268229393695, "grad_norm": 0.6035879850387573, "learning_rate": 0.00014088348441961742, "loss": 5.516, "num_input_tokens_seen": 181272576, "step": 1383 }, { "epoch": 0.22744498746160277, "grad_norm": 0.6464037299156189, "learning_rate": 0.00014073093052060305, "loss": 5.5, "num_input_tokens_seen": 181665792, "step": 1386 }, { "epoch": 0.22793729262926857, "grad_norm": 0.6020700335502625, "learning_rate": 0.0001405788711239334, "loss": 5.4977, "num_input_tokens_seen": 182059008, "step": 1389 }, { "epoch": 0.22842959779693436, "grad_norm": 0.5898153185844421, "learning_rate": 0.00014042730356382584, "loss": 5.4852, "num_input_tokens_seen": 182452224, "step": 1392 }, { "epoch": 0.2289219029646002, "grad_norm": 0.6911790370941162, "learning_rate": 0.00014027622519457354, "loss": 5.4513, "num_input_tokens_seen": 182845440, "step": 1395 }, { "epoch": 0.22941420813226598, "grad_norm": 0.7918622493743896, "learning_rate": 0.00014012563339035157, "loss": 5.5059, "num_input_tokens_seen": 183238656, "step": 1398 }, { "epoch": 0.2299065132999318, "grad_norm": 0.7177248597145081, "learning_rate": 0.00013997552554502517, "loss": 5.4782, "num_input_tokens_seen": 183631872, "step": 1401 }, { "epoch": 0.2303988184675976, "grad_norm": 0.5545403361320496, "learning_rate": 0.00013982589907196038, "loss": 5.4598, "num_input_tokens_seen": 184025088, "step": 1404 }, { "epoch": 0.2308911236352634, "grad_norm": 0.5726358294487, "learning_rate": 0.00013967675140383676, "loss": 5.4737, "num_input_tokens_seen": 184418304, "step": 1407 }, { "epoch": 0.23138342880292923, "grad_norm": 0.6597428917884827, "learning_rate": 0.00013952807999246237, "loss": 5.4761, "num_input_tokens_seen": 184811520, "step": 1410 }, { "epoch": 0.23187573397059502, "grad_norm": 0.7973009347915649, "learning_rate": 0.000139379882308591, "loss": 5.4572, "num_input_tokens_seen": 185204736, "step": 1413 }, { "epoch": 0.23236803913826082, "grad_norm": 0.797472357749939, "learning_rate": 0.00013923215584174146, "loss": 5.456, "num_input_tokens_seen": 185597952, "step": 1416 }, { "epoch": 0.23286034430592664, "grad_norm": 0.6406548023223877, "learning_rate": 0.00013908489810001876, "loss": 5.4899, "num_input_tokens_seen": 185991168, "step": 1419 }, { "epoch": 0.23335264947359244, "grad_norm": 0.6479483246803284, "learning_rate": 0.00013893810660993777, "loss": 5.451, "num_input_tokens_seen": 186384384, "step": 1422 }, { "epoch": 0.23384495464125826, "grad_norm": 0.7050771713256836, "learning_rate": 0.00013879177891624862, "loss": 5.4301, "num_input_tokens_seen": 186777600, "step": 1425 }, { "epoch": 0.23433725980892406, "grad_norm": 0.7287415862083435, "learning_rate": 0.00013864591258176437, "loss": 5.4478, "num_input_tokens_seen": 187170816, "step": 1428 }, { "epoch": 0.23482956497658986, "grad_norm": 0.7213960289955139, "learning_rate": 0.00013850050518719026, "loss": 5.4639, "num_input_tokens_seen": 187564032, "step": 1431 }, { "epoch": 0.23532187014425568, "grad_norm": 0.6890702843666077, "learning_rate": 0.00013835555433095535, "loss": 5.4457, "num_input_tokens_seen": 187957248, "step": 1434 }, { "epoch": 0.23581417531192148, "grad_norm": 0.7307390570640564, "learning_rate": 0.0001382110576290459, "loss": 5.4538, "num_input_tokens_seen": 188350464, "step": 1437 }, { "epoch": 0.23630648047958727, "grad_norm": 0.578571617603302, "learning_rate": 0.00013806701271484075, "loss": 5.4692, "num_input_tokens_seen": 188743680, "step": 1440 }, { "epoch": 0.2367987856472531, "grad_norm": 0.7552468180656433, "learning_rate": 0.0001379234172389483, "loss": 5.4891, "num_input_tokens_seen": 189136896, "step": 1443 }, { "epoch": 0.2372910908149189, "grad_norm": 1.0629295110702515, "learning_rate": 0.00013778026886904584, "loss": 5.3936, "num_input_tokens_seen": 189530112, "step": 1446 }, { "epoch": 0.23778339598258472, "grad_norm": 1.178006649017334, "learning_rate": 0.00013763756528972017, "loss": 5.4754, "num_input_tokens_seen": 189923328, "step": 1449 }, { "epoch": 0.2382757011502505, "grad_norm": 0.9626088738441467, "learning_rate": 0.00013749530420231065, "loss": 5.4676, "num_input_tokens_seen": 190316544, "step": 1452 }, { "epoch": 0.2387680063179163, "grad_norm": 0.7882675528526306, "learning_rate": 0.00013735348332475335, "loss": 5.4157, "num_input_tokens_seen": 190709760, "step": 1455 }, { "epoch": 0.23926031148558213, "grad_norm": 0.6379857659339905, "learning_rate": 0.0001372121003914274, "loss": 5.4558, "num_input_tokens_seen": 191102976, "step": 1458 }, { "epoch": 0.23975261665324793, "grad_norm": 0.7245479822158813, "learning_rate": 0.00013707115315300314, "loss": 5.4405, "num_input_tokens_seen": 191496192, "step": 1461 }, { "epoch": 0.24024492182091373, "grad_norm": 0.7578230500221252, "learning_rate": 0.00013693063937629153, "loss": 5.4516, "num_input_tokens_seen": 191889408, "step": 1464 }, { "epoch": 0.24073722698857955, "grad_norm": 0.6436443328857422, "learning_rate": 0.00013679055684409573, "loss": 5.46, "num_input_tokens_seen": 192282624, "step": 1467 }, { "epoch": 0.24122953215624535, "grad_norm": 0.6767442226409912, "learning_rate": 0.00013665090335506422, "loss": 5.472, "num_input_tokens_seen": 192675840, "step": 1470 }, { "epoch": 0.24172183732391117, "grad_norm": 0.7930536270141602, "learning_rate": 0.00013651167672354525, "loss": 5.4666, "num_input_tokens_seen": 193069056, "step": 1473 }, { "epoch": 0.24221414249157697, "grad_norm": 0.8025949001312256, "learning_rate": 0.0001363728747794434, "loss": 5.423, "num_input_tokens_seen": 193462272, "step": 1476 }, { "epoch": 0.24270644765924276, "grad_norm": 0.79132479429245, "learning_rate": 0.00013623449536807747, "loss": 5.5005, "num_input_tokens_seen": 193855488, "step": 1479 }, { "epoch": 0.2431987528269086, "grad_norm": 0.7200051546096802, "learning_rate": 0.00013609653635003992, "loss": 5.4393, "num_input_tokens_seen": 194248704, "step": 1482 }, { "epoch": 0.24369105799457438, "grad_norm": 0.8778846263885498, "learning_rate": 0.0001359589956010579, "loss": 5.4721, "num_input_tokens_seen": 194641920, "step": 1485 }, { "epoch": 0.24418336316224018, "grad_norm": 0.7225459814071655, "learning_rate": 0.00013582187101185615, "loss": 5.4733, "num_input_tokens_seen": 195035136, "step": 1488 }, { "epoch": 0.244675668329906, "grad_norm": 0.7456029057502747, "learning_rate": 0.00013568516048802077, "loss": 5.4594, "num_input_tokens_seen": 195428352, "step": 1491 }, { "epoch": 0.2451679734975718, "grad_norm": 0.7556261420249939, "learning_rate": 0.0001355488619498652, "loss": 5.4692, "num_input_tokens_seen": 195821568, "step": 1494 }, { "epoch": 0.24566027866523762, "grad_norm": 0.6962366104125977, "learning_rate": 0.00013541297333229701, "loss": 5.4188, "num_input_tokens_seen": 196214784, "step": 1497 }, { "epoch": 0.24615258383290342, "grad_norm": 0.5996794104576111, "learning_rate": 0.00013527749258468682, "loss": 5.483, "num_input_tokens_seen": 196608000, "step": 1500 }, { "epoch": 0.24664488900056922, "grad_norm": 0.8026195764541626, "learning_rate": 0.00013514241767073804, "loss": 5.4271, "num_input_tokens_seen": 197001216, "step": 1503 }, { "epoch": 0.24713719416823504, "grad_norm": 0.7598974704742432, "learning_rate": 0.00013500774656835854, "loss": 5.457, "num_input_tokens_seen": 197394432, "step": 1506 }, { "epoch": 0.24762949933590084, "grad_norm": 0.6312159895896912, "learning_rate": 0.00013487347726953342, "loss": 5.4753, "num_input_tokens_seen": 197787648, "step": 1509 }, { "epoch": 0.24812180450356666, "grad_norm": 0.7285313606262207, "learning_rate": 0.00013473960778019915, "loss": 5.4443, "num_input_tokens_seen": 198180864, "step": 1512 }, { "epoch": 0.24861410967123246, "grad_norm": 0.7714123725891113, "learning_rate": 0.0001346061361201194, "loss": 5.4059, "num_input_tokens_seen": 198574080, "step": 1515 }, { "epoch": 0.24910641483889825, "grad_norm": 0.790026843547821, "learning_rate": 0.00013447306032276192, "loss": 5.3898, "num_input_tokens_seen": 198967296, "step": 1518 }, { "epoch": 0.24959872000656408, "grad_norm": 0.6501436233520508, "learning_rate": 0.00013434037843517677, "loss": 5.4489, "num_input_tokens_seen": 199360512, "step": 1521 }, { "epoch": 0.2500910251742299, "grad_norm": 0.6346362829208374, "learning_rate": 0.00013420808851787603, "loss": 5.4448, "num_input_tokens_seen": 199753728, "step": 1524 }, { "epoch": 0.25058333034189567, "grad_norm": 0.6744604706764221, "learning_rate": 0.00013407618864471469, "loss": 5.434, "num_input_tokens_seen": 200146944, "step": 1527 }, { "epoch": 0.25107563550956147, "grad_norm": 0.7855573296546936, "learning_rate": 0.00013394467690277295, "loss": 5.4304, "num_input_tokens_seen": 200540160, "step": 1530 }, { "epoch": 0.2515679406772273, "grad_norm": 0.5886304974555969, "learning_rate": 0.0001338135513922395, "loss": 5.428, "num_input_tokens_seen": 200933376, "step": 1533 }, { "epoch": 0.2520602458448931, "grad_norm": 0.6847313046455383, "learning_rate": 0.00013368281022629647, "loss": 5.4176, "num_input_tokens_seen": 201326592, "step": 1536 }, { "epoch": 0.2525525510125589, "grad_norm": 0.600709080696106, "learning_rate": 0.0001335524515310053, "loss": 5.4369, "num_input_tokens_seen": 201719808, "step": 1539 }, { "epoch": 0.2530448561802247, "grad_norm": 0.5900317430496216, "learning_rate": 0.00013342247344519384, "loss": 5.4217, "num_input_tokens_seen": 202113024, "step": 1542 }, { "epoch": 0.2535371613478905, "grad_norm": 0.6889588236808777, "learning_rate": 0.00013329287412034498, "loss": 5.4327, "num_input_tokens_seen": 202506240, "step": 1545 }, { "epoch": 0.25402946651555636, "grad_norm": 0.699284017086029, "learning_rate": 0.00013316365172048595, "loss": 5.4391, "num_input_tokens_seen": 202899456, "step": 1548 }, { "epoch": 0.25452177168322215, "grad_norm": 0.7852107286453247, "learning_rate": 0.0001330348044220793, "loss": 5.4022, "num_input_tokens_seen": 203292672, "step": 1551 }, { "epoch": 0.25501407685088795, "grad_norm": 0.9291219115257263, "learning_rate": 0.00013290633041391467, "loss": 5.4094, "num_input_tokens_seen": 203685888, "step": 1554 }, { "epoch": 0.25550638201855375, "grad_norm": 0.7158473134040833, "learning_rate": 0.000132778227897002, "loss": 5.397, "num_input_tokens_seen": 204079104, "step": 1557 }, { "epoch": 0.25599868718621954, "grad_norm": 0.761669933795929, "learning_rate": 0.00013265049508446564, "loss": 5.4425, "num_input_tokens_seen": 204472320, "step": 1560 }, { "epoch": 0.25649099235388534, "grad_norm": 0.7269110679626465, "learning_rate": 0.0001325231302014396, "loss": 5.4098, "num_input_tokens_seen": 204865536, "step": 1563 }, { "epoch": 0.2569832975215512, "grad_norm": 0.8132617473602295, "learning_rate": 0.0001323961314849641, "loss": 5.4133, "num_input_tokens_seen": 205258752, "step": 1566 }, { "epoch": 0.257475602689217, "grad_norm": 0.7336274981498718, "learning_rate": 0.00013226949718388306, "loss": 5.4255, "num_input_tokens_seen": 205651968, "step": 1569 }, { "epoch": 0.2579679078568828, "grad_norm": 0.6194922924041748, "learning_rate": 0.0001321432255587425, "loss": 5.4495, "num_input_tokens_seen": 206045184, "step": 1572 }, { "epoch": 0.2584602130245486, "grad_norm": 0.6153448820114136, "learning_rate": 0.00013201731488169053, "loss": 5.4278, "num_input_tokens_seen": 206438400, "step": 1575 }, { "epoch": 0.2589525181922144, "grad_norm": 0.6616208553314209, "learning_rate": 0.0001318917634363777, "loss": 5.3997, "num_input_tokens_seen": 206831616, "step": 1578 }, { "epoch": 0.25944482335988023, "grad_norm": 0.670545756816864, "learning_rate": 0.00013176656951785888, "loss": 5.4721, "num_input_tokens_seen": 207224832, "step": 1581 }, { "epoch": 0.259937128527546, "grad_norm": 0.6420415639877319, "learning_rate": 0.00013164173143249616, "loss": 5.3955, "num_input_tokens_seen": 207618048, "step": 1584 }, { "epoch": 0.2604294336952118, "grad_norm": 0.6072604060173035, "learning_rate": 0.00013151724749786237, "loss": 5.4327, "num_input_tokens_seen": 208011264, "step": 1587 }, { "epoch": 0.2609217388628776, "grad_norm": 0.6046293377876282, "learning_rate": 0.00013139311604264595, "loss": 5.4234, "num_input_tokens_seen": 208404480, "step": 1590 }, { "epoch": 0.2614140440305434, "grad_norm": 0.6301759481430054, "learning_rate": 0.00013126933540655674, "loss": 5.412, "num_input_tokens_seen": 208797696, "step": 1593 }, { "epoch": 0.26190634919820927, "grad_norm": 0.5329933762550354, "learning_rate": 0.00013114590394023272, "loss": 5.4476, "num_input_tokens_seen": 209190912, "step": 1596 }, { "epoch": 0.26239865436587506, "grad_norm": 0.6598846912384033, "learning_rate": 0.0001310228200051476, "loss": 5.455, "num_input_tokens_seen": 209584128, "step": 1599 }, { "epoch": 0.2625627560884303, "eval_accuracy": 0.20324051457417358, "eval_loss": 5.642736911773682, "eval_runtime": 109.8552, "eval_samples_per_second": 2.731, "eval_steps_per_second": 1.365, "num_input_tokens_seen": 209715200, "step": 1600 }, { "epoch": 0.26289095953354086, "grad_norm": 0.5440322756767273, "learning_rate": 0.00013090008197351962, "loss": 5.4318, "num_input_tokens_seen": 209977344, "step": 1602 }, { "epoch": 0.26338326470120665, "grad_norm": 0.5659436583518982, "learning_rate": 0.0001307776882282209, "loss": 5.4397, "num_input_tokens_seen": 210370560, "step": 1605 }, { "epoch": 0.26387556986887245, "grad_norm": 0.6262729167938232, "learning_rate": 0.0001306556371626883, "loss": 5.3882, "num_input_tokens_seen": 210763776, "step": 1608 }, { "epoch": 0.26436787503653825, "grad_norm": 0.6548302173614502, "learning_rate": 0.00013053392718083447, "loss": 5.4368, "num_input_tokens_seen": 211156992, "step": 1611 }, { "epoch": 0.2648601802042041, "grad_norm": 0.6563411355018616, "learning_rate": 0.00013041255669696042, "loss": 5.3665, "num_input_tokens_seen": 211550208, "step": 1614 }, { "epoch": 0.2653524853718699, "grad_norm": 0.6907044649124146, "learning_rate": 0.00013029152413566872, "loss": 5.4226, "num_input_tokens_seen": 211943424, "step": 1617 }, { "epoch": 0.2658447905395357, "grad_norm": 0.715350866317749, "learning_rate": 0.00013017082793177756, "loss": 5.4769, "num_input_tokens_seen": 212336640, "step": 1620 }, { "epoch": 0.2663370957072015, "grad_norm": 0.7537038326263428, "learning_rate": 0.0001300504665302358, "loss": 5.4195, "num_input_tokens_seen": 212729856, "step": 1623 }, { "epoch": 0.2668294008748673, "grad_norm": 0.7719680070877075, "learning_rate": 0.00012993043838603865, "loss": 5.4237, "num_input_tokens_seen": 213123072, "step": 1626 }, { "epoch": 0.26732170604253314, "grad_norm": 0.6925044059753418, "learning_rate": 0.00012981074196414472, "loss": 5.3933, "num_input_tokens_seen": 213516288, "step": 1629 }, { "epoch": 0.26781401121019893, "grad_norm": 0.6691201329231262, "learning_rate": 0.0001296913757393932, "loss": 5.4641, "num_input_tokens_seen": 213909504, "step": 1632 }, { "epoch": 0.26830631637786473, "grad_norm": 0.6009002923965454, "learning_rate": 0.00012957233819642244, "loss": 5.3632, "num_input_tokens_seen": 214302720, "step": 1635 }, { "epoch": 0.2687986215455305, "grad_norm": 0.5620663166046143, "learning_rate": 0.00012945362782958907, "loss": 5.4091, "num_input_tokens_seen": 214695936, "step": 1638 }, { "epoch": 0.2692909267131963, "grad_norm": 0.585953414440155, "learning_rate": 0.0001293352431428881, "loss": 5.3819, "num_input_tokens_seen": 215089152, "step": 1641 }, { "epoch": 0.2697832318808622, "grad_norm": 0.604200541973114, "learning_rate": 0.00012921718264987363, "loss": 5.4161, "num_input_tokens_seen": 215482368, "step": 1644 }, { "epoch": 0.27027553704852797, "grad_norm": 0.7634391188621521, "learning_rate": 0.00012909944487358055, "loss": 5.3987, "num_input_tokens_seen": 215875584, "step": 1647 }, { "epoch": 0.27076784221619377, "grad_norm": 0.6789970993995667, "learning_rate": 0.0001289820283464469, "loss": 5.3772, "num_input_tokens_seen": 216268800, "step": 1650 }, { "epoch": 0.27126014738385956, "grad_norm": 0.615770697593689, "learning_rate": 0.00012886493161023702, "loss": 5.403, "num_input_tokens_seen": 216662016, "step": 1653 }, { "epoch": 0.27175245255152536, "grad_norm": 0.821854829788208, "learning_rate": 0.00012874815321596553, "loss": 5.3783, "num_input_tokens_seen": 217055232, "step": 1656 }, { "epoch": 0.2722447577191912, "grad_norm": 0.7120591998100281, "learning_rate": 0.00012863169172382195, "loss": 5.4067, "num_input_tokens_seen": 217448448, "step": 1659 }, { "epoch": 0.272737062886857, "grad_norm": 0.561850368976593, "learning_rate": 0.00012851554570309626, "loss": 5.4273, "num_input_tokens_seen": 217841664, "step": 1662 }, { "epoch": 0.2732293680545228, "grad_norm": 0.6624509692192078, "learning_rate": 0.0001283997137321049, "loss": 5.4209, "num_input_tokens_seen": 218234880, "step": 1665 }, { "epoch": 0.2737216732221886, "grad_norm": 0.6127106547355652, "learning_rate": 0.00012828419439811785, "loss": 5.3708, "num_input_tokens_seen": 218628096, "step": 1668 }, { "epoch": 0.2742139783898544, "grad_norm": 0.6014697551727295, "learning_rate": 0.00012816898629728628, "loss": 5.4795, "num_input_tokens_seen": 219021312, "step": 1671 }, { "epoch": 0.2747062835575202, "grad_norm": 0.5605149865150452, "learning_rate": 0.0001280540880345707, "loss": 5.3716, "num_input_tokens_seen": 219414528, "step": 1674 }, { "epoch": 0.27519858872518604, "grad_norm": 0.680549144744873, "learning_rate": 0.00012793949822367017, "loss": 5.4007, "num_input_tokens_seen": 219807744, "step": 1677 }, { "epoch": 0.27569089389285184, "grad_norm": 0.584894597530365, "learning_rate": 0.000127825215486952, "loss": 5.3907, "num_input_tokens_seen": 220200960, "step": 1680 }, { "epoch": 0.27618319906051764, "grad_norm": 0.5957493782043457, "learning_rate": 0.00012771123845538215, "loss": 5.4456, "num_input_tokens_seen": 220594176, "step": 1683 }, { "epoch": 0.27667550422818343, "grad_norm": 0.6230633854866028, "learning_rate": 0.00012759756576845652, "loss": 5.4259, "num_input_tokens_seen": 220987392, "step": 1686 }, { "epoch": 0.27716780939584923, "grad_norm": 0.6842202544212341, "learning_rate": 0.00012748419607413246, "loss": 5.3754, "num_input_tokens_seen": 221380608, "step": 1689 }, { "epoch": 0.2776601145635151, "grad_norm": 0.7493085861206055, "learning_rate": 0.00012737112802876149, "loss": 5.3766, "num_input_tokens_seen": 221773824, "step": 1692 }, { "epoch": 0.2781524197311809, "grad_norm": 0.8707619905471802, "learning_rate": 0.00012725836029702222, "loss": 5.376, "num_input_tokens_seen": 222167040, "step": 1695 }, { "epoch": 0.2786447248988467, "grad_norm": 0.9452652335166931, "learning_rate": 0.00012714589155185432, "loss": 5.4282, "num_input_tokens_seen": 222560256, "step": 1698 }, { "epoch": 0.27913703006651247, "grad_norm": 0.7085415124893188, "learning_rate": 0.00012703372047439269, "loss": 5.4535, "num_input_tokens_seen": 222953472, "step": 1701 }, { "epoch": 0.27962933523417827, "grad_norm": 0.6184890866279602, "learning_rate": 0.00012692184575390268, "loss": 5.3622, "num_input_tokens_seen": 223346688, "step": 1704 }, { "epoch": 0.2801216404018441, "grad_norm": 0.8037028312683105, "learning_rate": 0.0001268102660877157, "loss": 5.3098, "num_input_tokens_seen": 223739904, "step": 1707 }, { "epoch": 0.2806139455695099, "grad_norm": 0.8075725436210632, "learning_rate": 0.00012669898018116552, "loss": 5.421, "num_input_tokens_seen": 224133120, "step": 1710 }, { "epoch": 0.2811062507371757, "grad_norm": 0.8237155079841614, "learning_rate": 0.0001265879867475251, "loss": 5.3636, "num_input_tokens_seen": 224526336, "step": 1713 }, { "epoch": 0.2815985559048415, "grad_norm": 0.7250863313674927, "learning_rate": 0.00012647728450794433, "loss": 5.4245, "num_input_tokens_seen": 224919552, "step": 1716 }, { "epoch": 0.2820908610725073, "grad_norm": 0.5800032019615173, "learning_rate": 0.00012636687219138784, "loss": 5.3722, "num_input_tokens_seen": 225312768, "step": 1719 }, { "epoch": 0.2825831662401731, "grad_norm": 0.6191242933273315, "learning_rate": 0.00012625674853457394, "loss": 5.3959, "num_input_tokens_seen": 225705984, "step": 1722 }, { "epoch": 0.28307547140783895, "grad_norm": 0.6112586259841919, "learning_rate": 0.00012614691228191385, "loss": 5.386, "num_input_tokens_seen": 226099200, "step": 1725 }, { "epoch": 0.28356777657550475, "grad_norm": 0.5698121190071106, "learning_rate": 0.00012603736218545143, "loss": 5.3502, "num_input_tokens_seen": 226492416, "step": 1728 }, { "epoch": 0.28406008174317054, "grad_norm": 0.5981942415237427, "learning_rate": 0.00012592809700480388, "loss": 5.3705, "num_input_tokens_seen": 226885632, "step": 1731 }, { "epoch": 0.28455238691083634, "grad_norm": 0.5503790378570557, "learning_rate": 0.00012581911550710255, "loss": 5.364, "num_input_tokens_seen": 227278848, "step": 1734 }, { "epoch": 0.28504469207850214, "grad_norm": 0.7332367897033691, "learning_rate": 0.00012571041646693466, "loss": 5.3551, "num_input_tokens_seen": 227672064, "step": 1737 }, { "epoch": 0.285536997246168, "grad_norm": 0.6840054392814636, "learning_rate": 0.0001256019986662852, "loss": 5.3328, "num_input_tokens_seen": 228065280, "step": 1740 }, { "epoch": 0.2860293024138338, "grad_norm": 0.6030425429344177, "learning_rate": 0.00012549386089447998, "loss": 5.3826, "num_input_tokens_seen": 228458496, "step": 1743 }, { "epoch": 0.2865216075814996, "grad_norm": 0.689045786857605, "learning_rate": 0.0001253860019481285, "loss": 5.3991, "num_input_tokens_seen": 228851712, "step": 1746 }, { "epoch": 0.2870139127491654, "grad_norm": 0.6273123621940613, "learning_rate": 0.0001252784206310678, "loss": 5.3241, "num_input_tokens_seen": 229244928, "step": 1749 }, { "epoch": 0.2875062179168312, "grad_norm": 0.7061653137207031, "learning_rate": 0.0001251711157543068, "loss": 5.3797, "num_input_tokens_seen": 229638144, "step": 1752 }, { "epoch": 0.287998523084497, "grad_norm": 0.6948421001434326, "learning_rate": 0.00012506408613597125, "loss": 5.4058, "num_input_tokens_seen": 230031360, "step": 1755 }, { "epoch": 0.2884908282521628, "grad_norm": 0.6483265161514282, "learning_rate": 0.00012495733060124866, "loss": 5.3982, "num_input_tokens_seen": 230424576, "step": 1758 }, { "epoch": 0.2889831334198286, "grad_norm": 0.6624945998191833, "learning_rate": 0.00012485084798233452, "loss": 5.3528, "num_input_tokens_seen": 230817792, "step": 1761 }, { "epoch": 0.2894754385874944, "grad_norm": 0.7422268390655518, "learning_rate": 0.0001247446371183784, "loss": 5.3745, "num_input_tokens_seen": 231211008, "step": 1764 }, { "epoch": 0.2899677437551602, "grad_norm": 0.5802010893821716, "learning_rate": 0.00012463869685543102, "loss": 5.3446, "num_input_tokens_seen": 231604224, "step": 1767 }, { "epoch": 0.29046004892282606, "grad_norm": 0.6384250521659851, "learning_rate": 0.00012453302604639133, "loss": 5.3833, "num_input_tokens_seen": 231997440, "step": 1770 }, { "epoch": 0.29095235409049186, "grad_norm": 0.6214941740036011, "learning_rate": 0.00012442762355095458, "loss": 5.3835, "num_input_tokens_seen": 232390656, "step": 1773 }, { "epoch": 0.29144465925815766, "grad_norm": 0.6470031142234802, "learning_rate": 0.0001243224882355605, "loss": 5.3594, "num_input_tokens_seen": 232783872, "step": 1776 }, { "epoch": 0.29193696442582345, "grad_norm": 0.5898450016975403, "learning_rate": 0.00012421761897334212, "loss": 5.3479, "num_input_tokens_seen": 233177088, "step": 1779 }, { "epoch": 0.29242926959348925, "grad_norm": 0.625322699546814, "learning_rate": 0.00012411301464407512, "loss": 5.3949, "num_input_tokens_seen": 233570304, "step": 1782 }, { "epoch": 0.29292157476115505, "grad_norm": 0.7103887796401978, "learning_rate": 0.0001240086741341274, "loss": 5.3434, "num_input_tokens_seen": 233963520, "step": 1785 }, { "epoch": 0.2934138799288209, "grad_norm": 0.6434682011604309, "learning_rate": 0.00012390459633640952, "loss": 5.3855, "num_input_tokens_seen": 234356736, "step": 1788 }, { "epoch": 0.2939061850964867, "grad_norm": 0.6767953038215637, "learning_rate": 0.00012380078015032517, "loss": 5.3375, "num_input_tokens_seen": 234749952, "step": 1791 }, { "epoch": 0.2943984902641525, "grad_norm": 0.6974107623100281, "learning_rate": 0.00012369722448172233, "loss": 5.3822, "num_input_tokens_seen": 235143168, "step": 1794 }, { "epoch": 0.2948907954318183, "grad_norm": 0.5709061026573181, "learning_rate": 0.0001235939282428449, "loss": 5.355, "num_input_tokens_seen": 235536384, "step": 1797 }, { "epoch": 0.2953831005994841, "grad_norm": 0.5687236189842224, "learning_rate": 0.00012349089035228468, "loss": 5.3811, "num_input_tokens_seen": 235929600, "step": 1800 }, { "epoch": 0.29587540576714993, "grad_norm": 0.5439820885658264, "learning_rate": 0.0001233881097349338, "loss": 5.3716, "num_input_tokens_seen": 236322816, "step": 1803 }, { "epoch": 0.29636771093481573, "grad_norm": 0.5737081170082092, "learning_rate": 0.0001232855853219376, "loss": 5.3847, "num_input_tokens_seen": 236716032, "step": 1806 }, { "epoch": 0.2968600161024815, "grad_norm": 0.663943350315094, "learning_rate": 0.000123183316050648, "loss": 5.3875, "num_input_tokens_seen": 237109248, "step": 1809 }, { "epoch": 0.2973523212701473, "grad_norm": 0.6561760902404785, "learning_rate": 0.00012308130086457713, "loss": 5.3778, "num_input_tokens_seen": 237502464, "step": 1812 }, { "epoch": 0.2978446264378131, "grad_norm": 0.6318316459655762, "learning_rate": 0.00012297953871335165, "loss": 5.3343, "num_input_tokens_seen": 237895680, "step": 1815 }, { "epoch": 0.29833693160547897, "grad_norm": 0.5666148066520691, "learning_rate": 0.00012287802855266721, "loss": 5.3317, "num_input_tokens_seen": 238288896, "step": 1818 }, { "epoch": 0.29882923677314477, "grad_norm": 0.7399240732192993, "learning_rate": 0.00012277676934424343, "loss": 5.3615, "num_input_tokens_seen": 238682112, "step": 1821 }, { "epoch": 0.29932154194081056, "grad_norm": 0.7315434813499451, "learning_rate": 0.00012267576005577937, "loss": 5.3207, "num_input_tokens_seen": 239075328, "step": 1824 }, { "epoch": 0.29981384710847636, "grad_norm": 0.6648403406143188, "learning_rate": 0.00012257499966090933, "loss": 5.387, "num_input_tokens_seen": 239468544, "step": 1827 }, { "epoch": 0.30030615227614216, "grad_norm": 0.6215511560440063, "learning_rate": 0.00012247448713915892, "loss": 5.3734, "num_input_tokens_seen": 239861760, "step": 1830 }, { "epoch": 0.30079845744380795, "grad_norm": 0.6501013040542603, "learning_rate": 0.0001223742214759018, "loss": 5.3226, "num_input_tokens_seen": 240254976, "step": 1833 }, { "epoch": 0.3012907626114738, "grad_norm": 0.5968300104141235, "learning_rate": 0.00012227420166231658, "loss": 5.3354, "num_input_tokens_seen": 240648192, "step": 1836 }, { "epoch": 0.3017830677791396, "grad_norm": 0.6109431385993958, "learning_rate": 0.00012217442669534425, "loss": 5.3623, "num_input_tokens_seen": 241041408, "step": 1839 }, { "epoch": 0.3022753729468054, "grad_norm": 0.7393885254859924, "learning_rate": 0.00012207489557764593, "loss": 5.3629, "num_input_tokens_seen": 241434624, "step": 1842 }, { "epoch": 0.3027676781144712, "grad_norm": 0.7628598213195801, "learning_rate": 0.00012197560731756083, "loss": 5.3368, "num_input_tokens_seen": 241827840, "step": 1845 }, { "epoch": 0.303259983282137, "grad_norm": 0.8370330929756165, "learning_rate": 0.00012187656092906502, "loss": 5.3438, "num_input_tokens_seen": 242221056, "step": 1848 }, { "epoch": 0.30375228844980284, "grad_norm": 0.7718897461891174, "learning_rate": 0.0001217777554317301, "loss": 5.376, "num_input_tokens_seen": 242614272, "step": 1851 }, { "epoch": 0.30424459361746864, "grad_norm": 0.6558852791786194, "learning_rate": 0.00012167918985068255, "loss": 5.3692, "num_input_tokens_seen": 243007488, "step": 1854 }, { "epoch": 0.30473689878513444, "grad_norm": 0.647803008556366, "learning_rate": 0.00012158086321656318, "loss": 5.3617, "num_input_tokens_seen": 243400704, "step": 1857 }, { "epoch": 0.30522920395280023, "grad_norm": 0.5871676206588745, "learning_rate": 0.00012148277456548738, "loss": 5.3675, "num_input_tokens_seen": 243793920, "step": 1860 }, { "epoch": 0.30572150912046603, "grad_norm": 0.690800130367279, "learning_rate": 0.00012138492293900513, "loss": 5.3202, "num_input_tokens_seen": 244187136, "step": 1863 }, { "epoch": 0.3062138142881319, "grad_norm": 0.6760239005088806, "learning_rate": 0.00012128730738406176, "loss": 5.3634, "num_input_tokens_seen": 244580352, "step": 1866 }, { "epoch": 0.3067061194557977, "grad_norm": 0.8584226369857788, "learning_rate": 0.00012118992695295909, "loss": 5.3221, "num_input_tokens_seen": 244973568, "step": 1869 }, { "epoch": 0.3071984246234635, "grad_norm": 0.7716052532196045, "learning_rate": 0.00012109278070331664, "loss": 5.378, "num_input_tokens_seen": 245366784, "step": 1872 }, { "epoch": 0.30769072979112927, "grad_norm": 0.5956524610519409, "learning_rate": 0.0001209958676980334, "loss": 5.2991, "num_input_tokens_seen": 245760000, "step": 1875 }, { "epoch": 0.30818303495879507, "grad_norm": 0.6713026762008667, "learning_rate": 0.00012089918700524974, "loss": 5.3229, "num_input_tokens_seen": 246153216, "step": 1878 }, { "epoch": 0.3086753401264609, "grad_norm": 0.8650434017181396, "learning_rate": 0.00012080273769831004, "loss": 5.3466, "num_input_tokens_seen": 246546432, "step": 1881 }, { "epoch": 0.3091676452941267, "grad_norm": 0.7258638143539429, "learning_rate": 0.00012070651885572507, "loss": 5.3342, "num_input_tokens_seen": 246939648, "step": 1884 }, { "epoch": 0.3096599504617925, "grad_norm": 0.6096934676170349, "learning_rate": 0.00012061052956113527, "loss": 5.327, "num_input_tokens_seen": 247332864, "step": 1887 }, { "epoch": 0.3101522556294583, "grad_norm": 0.7494550943374634, "learning_rate": 0.00012051476890327393, "loss": 5.3402, "num_input_tokens_seen": 247726080, "step": 1890 }, { "epoch": 0.3106445607971241, "grad_norm": 0.586797833442688, "learning_rate": 0.00012041923597593093, "loss": 5.3722, "num_input_tokens_seen": 248119296, "step": 1893 }, { "epoch": 0.3111368659647899, "grad_norm": 0.6233952641487122, "learning_rate": 0.00012032392987791671, "loss": 5.3152, "num_input_tokens_seen": 248512512, "step": 1896 }, { "epoch": 0.31162917113245575, "grad_norm": 0.7030310034751892, "learning_rate": 0.0001202288497130266, "loss": 5.2975, "num_input_tokens_seen": 248905728, "step": 1899 }, { "epoch": 0.31212147630012155, "grad_norm": 0.603950560092926, "learning_rate": 0.00012013399459000527, "loss": 5.3444, "num_input_tokens_seen": 249298944, "step": 1902 }, { "epoch": 0.31261378146778734, "grad_norm": 0.5920315384864807, "learning_rate": 0.00012003936362251192, "loss": 5.3394, "num_input_tokens_seen": 249692160, "step": 1905 }, { "epoch": 0.31310608663545314, "grad_norm": 0.5622957944869995, "learning_rate": 0.00011994495592908519, "loss": 5.3122, "num_input_tokens_seen": 250085376, "step": 1908 }, { "epoch": 0.31359839180311894, "grad_norm": 0.5327597260475159, "learning_rate": 0.0001198507706331089, "loss": 5.3086, "num_input_tokens_seen": 250478592, "step": 1911 }, { "epoch": 0.3140906969707848, "grad_norm": 0.7794554233551025, "learning_rate": 0.00011975680686277773, "loss": 5.3564, "num_input_tokens_seen": 250871808, "step": 1914 }, { "epoch": 0.3145830021384506, "grad_norm": 0.7415952682495117, "learning_rate": 0.00011966306375106347, "loss": 5.3224, "num_input_tokens_seen": 251265024, "step": 1917 }, { "epoch": 0.3150753073061164, "grad_norm": 0.7701389789581299, "learning_rate": 0.0001195695404356812, "loss": 5.3842, "num_input_tokens_seen": 251658240, "step": 1920 }, { "epoch": 0.3155676124737822, "grad_norm": 0.6872450113296509, "learning_rate": 0.00011947623605905617, "loss": 5.3254, "num_input_tokens_seen": 252051456, "step": 1923 }, { "epoch": 0.316059917641448, "grad_norm": 0.7382259964942932, "learning_rate": 0.0001193831497682907, "loss": 5.3764, "num_input_tokens_seen": 252444672, "step": 1926 }, { "epoch": 0.3165522228091138, "grad_norm": 0.7021927237510681, "learning_rate": 0.00011929028071513144, "loss": 5.3697, "num_input_tokens_seen": 252837888, "step": 1929 }, { "epoch": 0.3170445279767796, "grad_norm": 0.5273075699806213, "learning_rate": 0.00011919762805593696, "loss": 5.3248, "num_input_tokens_seen": 253231104, "step": 1932 }, { "epoch": 0.3175368331444454, "grad_norm": 0.6100038290023804, "learning_rate": 0.00011910519095164537, "loss": 5.3604, "num_input_tokens_seen": 253624320, "step": 1935 }, { "epoch": 0.3180291383121112, "grad_norm": 0.608025848865509, "learning_rate": 0.00011901296856774264, "loss": 5.3192, "num_input_tokens_seen": 254017536, "step": 1938 }, { "epoch": 0.318521443479777, "grad_norm": 0.8330180048942566, "learning_rate": 0.00011892096007423088, "loss": 5.2825, "num_input_tokens_seen": 254410752, "step": 1941 }, { "epoch": 0.3190137486474428, "grad_norm": 0.7766706347465515, "learning_rate": 0.00011882916464559686, "loss": 5.329, "num_input_tokens_seen": 254803968, "step": 1944 }, { "epoch": 0.31950605381510866, "grad_norm": 0.7442476749420166, "learning_rate": 0.00011873758146078108, "loss": 5.3182, "num_input_tokens_seen": 255197184, "step": 1947 }, { "epoch": 0.31999835898277446, "grad_norm": 0.7740505337715149, "learning_rate": 0.00011864620970314674, "loss": 5.3424, "num_input_tokens_seen": 255590400, "step": 1950 }, { "epoch": 0.32049066415044025, "grad_norm": 0.7652367949485779, "learning_rate": 0.00011855504856044936, "loss": 5.3002, "num_input_tokens_seen": 255983616, "step": 1953 }, { "epoch": 0.32098296931810605, "grad_norm": 0.683326780796051, "learning_rate": 0.00011846409722480622, "loss": 5.359, "num_input_tokens_seen": 256376832, "step": 1956 }, { "epoch": 0.32147527448577184, "grad_norm": 0.6694085597991943, "learning_rate": 0.0001183733548926665, "loss": 5.2737, "num_input_tokens_seen": 256770048, "step": 1959 }, { "epoch": 0.3219675796534377, "grad_norm": 0.653356671333313, "learning_rate": 0.00011828282076478137, "loss": 5.3363, "num_input_tokens_seen": 257163264, "step": 1962 }, { "epoch": 0.3224598848211035, "grad_norm": 0.6444686055183411, "learning_rate": 0.00011819249404617434, "loss": 5.3049, "num_input_tokens_seen": 257556480, "step": 1965 }, { "epoch": 0.3229521899887693, "grad_norm": 0.6730981469154358, "learning_rate": 0.00011810237394611218, "loss": 5.3095, "num_input_tokens_seen": 257949696, "step": 1968 }, { "epoch": 0.3234444951564351, "grad_norm": 0.5876457691192627, "learning_rate": 0.00011801245967807553, "loss": 5.301, "num_input_tokens_seen": 258342912, "step": 1971 }, { "epoch": 0.3239368003241009, "grad_norm": 0.7190306782722473, "learning_rate": 0.00011792275045973037, "loss": 5.3266, "num_input_tokens_seen": 258736128, "step": 1974 }, { "epoch": 0.32442910549176673, "grad_norm": 0.9150959253311157, "learning_rate": 0.00011783324551289922, "loss": 5.3359, "num_input_tokens_seen": 259129344, "step": 1977 }, { "epoch": 0.32492141065943253, "grad_norm": 0.6939476728439331, "learning_rate": 0.00011774394406353287, "loss": 5.3251, "num_input_tokens_seen": 259522560, "step": 1980 }, { "epoch": 0.3254137158270983, "grad_norm": 0.6032900810241699, "learning_rate": 0.00011765484534168232, "loss": 5.3072, "num_input_tokens_seen": 259915776, "step": 1983 }, { "epoch": 0.3259060209947641, "grad_norm": 0.674830436706543, "learning_rate": 0.00011756594858147075, "loss": 5.3009, "num_input_tokens_seen": 260308992, "step": 1986 }, { "epoch": 0.3263983261624299, "grad_norm": 0.5875729918479919, "learning_rate": 0.000117477253021066, "loss": 5.35, "num_input_tokens_seen": 260702208, "step": 1989 }, { "epoch": 0.3268906313300957, "grad_norm": 0.7125598788261414, "learning_rate": 0.00011738875790265313, "loss": 5.3612, "num_input_tokens_seen": 261095424, "step": 1992 }, { "epoch": 0.32738293649776157, "grad_norm": 0.6010739207267761, "learning_rate": 0.00011730046247240715, "loss": 5.3347, "num_input_tokens_seen": 261488640, "step": 1995 }, { "epoch": 0.32787524166542736, "grad_norm": 0.6644861698150635, "learning_rate": 0.00011721236598046614, "loss": 5.3236, "num_input_tokens_seen": 261881856, "step": 1998 }, { "epoch": 0.3282034451105379, "eval_accuracy": 0.2103419638495359, "eval_loss": 5.556689262390137, "eval_runtime": 109.9195, "eval_samples_per_second": 2.729, "eval_steps_per_second": 1.365, "num_input_tokens_seen": 262144000, "step": 2000 }, { "epoch": 0.32836754683309316, "grad_norm": 0.6464180946350098, "learning_rate": 0.00011712446768090445, "loss": 5.3375, "num_input_tokens_seen": 262275072, "step": 2001 }, { "epoch": 0.32885985200075896, "grad_norm": 0.6498216390609741, "learning_rate": 0.00011703676683170618, "loss": 5.3931, "num_input_tokens_seen": 262668288, "step": 2004 }, { "epoch": 0.32935215716842475, "grad_norm": 0.6177734732627869, "learning_rate": 0.00011694926269473891, "loss": 5.3224, "num_input_tokens_seen": 263061504, "step": 2007 }, { "epoch": 0.3298444623360906, "grad_norm": 0.6429308652877808, "learning_rate": 0.00011686195453572751, "loss": 5.3532, "num_input_tokens_seen": 263454720, "step": 2010 }, { "epoch": 0.3303367675037564, "grad_norm": 0.5821199417114258, "learning_rate": 0.00011677484162422844, "loss": 5.357, "num_input_tokens_seen": 263847936, "step": 2013 }, { "epoch": 0.3308290726714222, "grad_norm": 0.6923580169677734, "learning_rate": 0.00011668792323360385, "loss": 5.3397, "num_input_tokens_seen": 264241152, "step": 2016 }, { "epoch": 0.331321377839088, "grad_norm": 0.6655700206756592, "learning_rate": 0.00011660119864099633, "loss": 5.339, "num_input_tokens_seen": 264634368, "step": 2019 }, { "epoch": 0.3318136830067538, "grad_norm": 0.5696009397506714, "learning_rate": 0.00011651466712730354, "loss": 5.332, "num_input_tokens_seen": 265027584, "step": 2022 }, { "epoch": 0.33230598817441964, "grad_norm": 0.6818378567695618, "learning_rate": 0.0001164283279771532, "loss": 5.2651, "num_input_tokens_seen": 265420800, "step": 2025 }, { "epoch": 0.33279829334208544, "grad_norm": 0.6137866377830505, "learning_rate": 0.00011634218047887825, "loss": 5.2808, "num_input_tokens_seen": 265814016, "step": 2028 }, { "epoch": 0.33329059850975123, "grad_norm": 0.9078963994979858, "learning_rate": 0.00011625622392449224, "loss": 5.3266, "num_input_tokens_seen": 266207232, "step": 2031 }, { "epoch": 0.33378290367741703, "grad_norm": 0.7636304497718811, "learning_rate": 0.00011617045760966484, "loss": 5.3276, "num_input_tokens_seen": 266600448, "step": 2034 }, { "epoch": 0.3342752088450828, "grad_norm": 0.590513288974762, "learning_rate": 0.00011608488083369763, "loss": 5.3353, "num_input_tokens_seen": 266993664, "step": 2037 }, { "epoch": 0.3347675140127487, "grad_norm": 0.6252188682556152, "learning_rate": 0.00011599949289950009, "loss": 5.329, "num_input_tokens_seen": 267386880, "step": 2040 }, { "epoch": 0.3352598191804145, "grad_norm": 0.5734694600105286, "learning_rate": 0.00011591429311356567, "loss": 5.3274, "num_input_tokens_seen": 267780096, "step": 2043 }, { "epoch": 0.33575212434808027, "grad_norm": 0.5329940319061279, "learning_rate": 0.00011582928078594821, "loss": 5.3475, "num_input_tokens_seen": 268173312, "step": 2046 }, { "epoch": 0.33624442951574607, "grad_norm": 0.5748163461685181, "learning_rate": 0.00011574445523023836, "loss": 5.2668, "num_input_tokens_seen": 268566528, "step": 2049 }, { "epoch": 0.33673673468341186, "grad_norm": 0.5230370759963989, "learning_rate": 0.00011565981576354052, "loss": 5.3068, "num_input_tokens_seen": 268959744, "step": 2052 }, { "epoch": 0.33722903985107766, "grad_norm": 0.6396874189376831, "learning_rate": 0.00011557536170644955, "loss": 5.294, "num_input_tokens_seen": 269352960, "step": 2055 }, { "epoch": 0.3377213450187435, "grad_norm": 0.5591505169868469, "learning_rate": 0.00011549109238302785, "loss": 5.3216, "num_input_tokens_seen": 269746176, "step": 2058 }, { "epoch": 0.3382136501864093, "grad_norm": 0.6073426604270935, "learning_rate": 0.00011540700712078282, "loss": 5.3153, "num_input_tokens_seen": 270139392, "step": 2061 }, { "epoch": 0.3387059553540751, "grad_norm": 0.5904248356819153, "learning_rate": 0.0001153231052506442, "loss": 5.2798, "num_input_tokens_seen": 270532608, "step": 2064 }, { "epoch": 0.3391982605217409, "grad_norm": 0.570007860660553, "learning_rate": 0.0001152393861069417, "loss": 5.2725, "num_input_tokens_seen": 270925824, "step": 2067 }, { "epoch": 0.3396905656894067, "grad_norm": 0.686757504940033, "learning_rate": 0.00011515584902738283, "loss": 5.3031, "num_input_tokens_seen": 271319040, "step": 2070 }, { "epoch": 0.34018287085707255, "grad_norm": 0.7036386132240295, "learning_rate": 0.00011507249335303097, "loss": 5.2997, "num_input_tokens_seen": 271712256, "step": 2073 }, { "epoch": 0.34067517602473835, "grad_norm": 0.5441813468933105, "learning_rate": 0.00011498931842828338, "loss": 5.3026, "num_input_tokens_seen": 272105472, "step": 2076 }, { "epoch": 0.34116748119240414, "grad_norm": 0.5833845138549805, "learning_rate": 0.00011490632360084974, "loss": 5.2784, "num_input_tokens_seen": 272498688, "step": 2079 }, { "epoch": 0.34165978636006994, "grad_norm": 0.6387719511985779, "learning_rate": 0.00011482350822173052, "loss": 5.32, "num_input_tokens_seen": 272891904, "step": 2082 }, { "epoch": 0.34215209152773574, "grad_norm": 0.530159056186676, "learning_rate": 0.00011474087164519571, "loss": 5.2776, "num_input_tokens_seen": 273285120, "step": 2085 }, { "epoch": 0.3426443966954016, "grad_norm": 0.5628234148025513, "learning_rate": 0.00011465841322876368, "loss": 5.3392, "num_input_tokens_seen": 273678336, "step": 2088 }, { "epoch": 0.3431367018630674, "grad_norm": 0.6385974884033203, "learning_rate": 0.00011457613233318018, "loss": 5.3113, "num_input_tokens_seen": 274071552, "step": 2091 }, { "epoch": 0.3436290070307332, "grad_norm": 0.5761107802391052, "learning_rate": 0.00011449402832239752, "loss": 5.3004, "num_input_tokens_seen": 274464768, "step": 2094 }, { "epoch": 0.344121312198399, "grad_norm": 0.6140743494033813, "learning_rate": 0.00011441210056355405, "loss": 5.3376, "num_input_tokens_seen": 274857984, "step": 2097 }, { "epoch": 0.3446136173660648, "grad_norm": 0.7067747116088867, "learning_rate": 0.00011433034842695343, "loss": 5.2842, "num_input_tokens_seen": 275251200, "step": 2100 }, { "epoch": 0.34510592253373057, "grad_norm": 0.5610840916633606, "learning_rate": 0.00011424877128604445, "loss": 5.2966, "num_input_tokens_seen": 275644416, "step": 2103 }, { "epoch": 0.3455982277013964, "grad_norm": 0.6917890310287476, "learning_rate": 0.00011416736851740093, "loss": 5.3038, "num_input_tokens_seen": 276037632, "step": 2106 }, { "epoch": 0.3460905328690622, "grad_norm": 0.7923979759216309, "learning_rate": 0.00011408613950070158, "loss": 5.2936, "num_input_tokens_seen": 276430848, "step": 2109 }, { "epoch": 0.346582838036728, "grad_norm": 0.6658058762550354, "learning_rate": 0.00011400508361871013, "loss": 5.284, "num_input_tokens_seen": 276824064, "step": 2112 }, { "epoch": 0.3470751432043938, "grad_norm": 0.5722377896308899, "learning_rate": 0.00011392420025725577, "loss": 5.3168, "num_input_tokens_seen": 277217280, "step": 2115 }, { "epoch": 0.3475674483720596, "grad_norm": 0.6443458199501038, "learning_rate": 0.00011384348880521352, "loss": 5.3339, "num_input_tokens_seen": 277610496, "step": 2118 }, { "epoch": 0.34805975353972546, "grad_norm": 0.6641397476196289, "learning_rate": 0.00011376294865448479, "loss": 5.3043, "num_input_tokens_seen": 278003712, "step": 2121 }, { "epoch": 0.34855205870739125, "grad_norm": 0.7289448976516724, "learning_rate": 0.00011368257919997822, "loss": 5.2867, "num_input_tokens_seen": 278396928, "step": 2124 }, { "epoch": 0.34904436387505705, "grad_norm": 0.7701984643936157, "learning_rate": 0.00011360237983959058, "loss": 5.2322, "num_input_tokens_seen": 278790144, "step": 2127 }, { "epoch": 0.34953666904272285, "grad_norm": 0.7202409505844116, "learning_rate": 0.00011352234997418777, "loss": 5.2508, "num_input_tokens_seen": 279183360, "step": 2130 }, { "epoch": 0.35002897421038864, "grad_norm": 0.6271010637283325, "learning_rate": 0.00011344248900758605, "loss": 5.2954, "num_input_tokens_seen": 279576576, "step": 2133 }, { "epoch": 0.3505212793780545, "grad_norm": 0.6194177865982056, "learning_rate": 0.00011336279634653344, "loss": 5.2592, "num_input_tokens_seen": 279969792, "step": 2136 }, { "epoch": 0.3510135845457203, "grad_norm": 0.6322209239006042, "learning_rate": 0.00011328327140069108, "loss": 5.2867, "num_input_tokens_seen": 280363008, "step": 2139 }, { "epoch": 0.3515058897133861, "grad_norm": 0.6329113245010376, "learning_rate": 0.000113203913582615, "loss": 5.3342, "num_input_tokens_seen": 280756224, "step": 2142 }, { "epoch": 0.3519981948810519, "grad_norm": 0.6119714379310608, "learning_rate": 0.00011312472230773781, "loss": 5.2948, "num_input_tokens_seen": 281149440, "step": 2145 }, { "epoch": 0.3524905000487177, "grad_norm": 0.6921229362487793, "learning_rate": 0.00011304569699435066, "loss": 5.2664, "num_input_tokens_seen": 281542656, "step": 2148 }, { "epoch": 0.35298280521638353, "grad_norm": 0.6127797961235046, "learning_rate": 0.00011296683706358528, "loss": 5.3308, "num_input_tokens_seen": 281935872, "step": 2151 }, { "epoch": 0.35347511038404933, "grad_norm": 0.6175393462181091, "learning_rate": 0.00011288814193939612, "loss": 5.3327, "num_input_tokens_seen": 282329088, "step": 2154 }, { "epoch": 0.3539674155517151, "grad_norm": 0.7570388317108154, "learning_rate": 0.00011280961104854276, "loss": 5.2763, "num_input_tokens_seen": 282722304, "step": 2157 }, { "epoch": 0.3544597207193809, "grad_norm": 0.6202073693275452, "learning_rate": 0.00011273124382057234, "loss": 5.2544, "num_input_tokens_seen": 283115520, "step": 2160 }, { "epoch": 0.3549520258870467, "grad_norm": 0.6008228063583374, "learning_rate": 0.00011265303968780214, "loss": 5.2426, "num_input_tokens_seen": 283508736, "step": 2163 }, { "epoch": 0.3554443310547125, "grad_norm": 0.6557415127754211, "learning_rate": 0.00011257499808530228, "loss": 5.2976, "num_input_tokens_seen": 283901952, "step": 2166 }, { "epoch": 0.35593663622237837, "grad_norm": 0.7166002988815308, "learning_rate": 0.00011249711845087871, "loss": 5.3111, "num_input_tokens_seen": 284295168, "step": 2169 }, { "epoch": 0.35642894139004416, "grad_norm": 0.5834003686904907, "learning_rate": 0.000112419400225056, "loss": 5.2879, "num_input_tokens_seen": 284688384, "step": 2172 }, { "epoch": 0.35692124655770996, "grad_norm": 0.6676968932151794, "learning_rate": 0.00011234184285106067, "loss": 5.3055, "num_input_tokens_seen": 285081600, "step": 2175 }, { "epoch": 0.35741355172537576, "grad_norm": 0.6959543824195862, "learning_rate": 0.00011226444577480424, "loss": 5.2653, "num_input_tokens_seen": 285474816, "step": 2178 }, { "epoch": 0.35790585689304155, "grad_norm": 0.7726870775222778, "learning_rate": 0.00011218720844486681, "loss": 5.3342, "num_input_tokens_seen": 285868032, "step": 2181 }, { "epoch": 0.3583981620607074, "grad_norm": 0.7076993584632874, "learning_rate": 0.00011211013031248031, "loss": 5.2969, "num_input_tokens_seen": 286261248, "step": 2184 }, { "epoch": 0.3588904672283732, "grad_norm": 0.6197788715362549, "learning_rate": 0.0001120332108315124, "loss": 5.2731, "num_input_tokens_seen": 286654464, "step": 2187 }, { "epoch": 0.359382772396039, "grad_norm": 0.6030928492546082, "learning_rate": 0.00011195644945844996, "loss": 5.303, "num_input_tokens_seen": 287047680, "step": 2190 }, { "epoch": 0.3598750775637048, "grad_norm": 0.6310069561004639, "learning_rate": 0.00011187984565238322, "loss": 5.2488, "num_input_tokens_seen": 287440896, "step": 2193 }, { "epoch": 0.3603673827313706, "grad_norm": 0.6047622561454773, "learning_rate": 0.00011180339887498948, "loss": 5.2663, "num_input_tokens_seen": 287834112, "step": 2196 }, { "epoch": 0.36085968789903644, "grad_norm": 0.551209568977356, "learning_rate": 0.0001117271085905174, "loss": 5.2805, "num_input_tokens_seen": 288227328, "step": 2199 }, { "epoch": 0.36135199306670224, "grad_norm": 0.6387552618980408, "learning_rate": 0.00011165097426577122, "loss": 5.3208, "num_input_tokens_seen": 288620544, "step": 2202 }, { "epoch": 0.36184429823436803, "grad_norm": 0.5512715578079224, "learning_rate": 0.00011157499537009505, "loss": 5.29, "num_input_tokens_seen": 289013760, "step": 2205 }, { "epoch": 0.36233660340203383, "grad_norm": 0.6393409967422485, "learning_rate": 0.00011149917137535733, "loss": 5.2846, "num_input_tokens_seen": 289406976, "step": 2208 }, { "epoch": 0.3628289085696996, "grad_norm": 0.6270577907562256, "learning_rate": 0.00011142350175593546, "loss": 5.2922, "num_input_tokens_seen": 289800192, "step": 2211 }, { "epoch": 0.3633212137373654, "grad_norm": 0.6736873388290405, "learning_rate": 0.00011134798598870045, "loss": 5.2724, "num_input_tokens_seen": 290193408, "step": 2214 }, { "epoch": 0.3638135189050313, "grad_norm": 0.5813255906105042, "learning_rate": 0.00011127262355300185, "loss": 5.2424, "num_input_tokens_seen": 290586624, "step": 2217 }, { "epoch": 0.36430582407269707, "grad_norm": 0.6181595325469971, "learning_rate": 0.00011119741393065246, "loss": 5.3019, "num_input_tokens_seen": 290979840, "step": 2220 }, { "epoch": 0.36479812924036287, "grad_norm": 0.6010147929191589, "learning_rate": 0.00011112235660591355, "loss": 5.2262, "num_input_tokens_seen": 291373056, "step": 2223 }, { "epoch": 0.36529043440802866, "grad_norm": 0.5609191656112671, "learning_rate": 0.00011104745106547993, "loss": 5.3074, "num_input_tokens_seen": 291766272, "step": 2226 }, { "epoch": 0.36578273957569446, "grad_norm": 0.5455970168113708, "learning_rate": 0.0001109726967984652, "loss": 5.2696, "num_input_tokens_seen": 292159488, "step": 2229 }, { "epoch": 0.3662750447433603, "grad_norm": 0.6548165082931519, "learning_rate": 0.00011089809329638716, "loss": 5.2895, "num_input_tokens_seen": 292552704, "step": 2232 }, { "epoch": 0.3667673499110261, "grad_norm": 0.6180770397186279, "learning_rate": 0.00011082364005315322, "loss": 5.2736, "num_input_tokens_seen": 292945920, "step": 2235 }, { "epoch": 0.3672596550786919, "grad_norm": 0.6706951856613159, "learning_rate": 0.00011074933656504608, "loss": 5.3579, "num_input_tokens_seen": 293339136, "step": 2238 }, { "epoch": 0.3677519602463577, "grad_norm": 0.6596742868423462, "learning_rate": 0.00011067518233070927, "loss": 5.2774, "num_input_tokens_seen": 293732352, "step": 2241 }, { "epoch": 0.3682442654140235, "grad_norm": 0.6398391723632812, "learning_rate": 0.00011060117685113308, "loss": 5.2773, "num_input_tokens_seen": 294125568, "step": 2244 }, { "epoch": 0.36873657058168935, "grad_norm": 0.6828597784042358, "learning_rate": 0.00011052731962964036, "loss": 5.2476, "num_input_tokens_seen": 294518784, "step": 2247 }, { "epoch": 0.36922887574935515, "grad_norm": 0.6363155245780945, "learning_rate": 0.0001104536101718726, "loss": 5.3068, "num_input_tokens_seen": 294912000, "step": 2250 }, { "epoch": 0.36972118091702094, "grad_norm": 0.6510285139083862, "learning_rate": 0.00011038004798577598, "loss": 5.3018, "num_input_tokens_seen": 295305216, "step": 2253 }, { "epoch": 0.37021348608468674, "grad_norm": 0.6058257222175598, "learning_rate": 0.00011030663258158759, "loss": 5.2721, "num_input_tokens_seen": 295698432, "step": 2256 }, { "epoch": 0.37070579125235253, "grad_norm": 0.6524608731269836, "learning_rate": 0.0001102333634718217, "loss": 5.3041, "num_input_tokens_seen": 296091648, "step": 2259 }, { "epoch": 0.3711980964200184, "grad_norm": 0.6795722842216492, "learning_rate": 0.00011016024017125623, "loss": 5.2912, "num_input_tokens_seen": 296484864, "step": 2262 }, { "epoch": 0.3716904015876842, "grad_norm": 0.5707486867904663, "learning_rate": 0.00011008726219691922, "loss": 5.3142, "num_input_tokens_seen": 296878080, "step": 2265 }, { "epoch": 0.37218270675535, "grad_norm": 0.5843520760536194, "learning_rate": 0.00011001442906807543, "loss": 5.2925, "num_input_tokens_seen": 297271296, "step": 2268 }, { "epoch": 0.3726750119230158, "grad_norm": 0.5435792803764343, "learning_rate": 0.00010994174030621302, "loss": 5.254, "num_input_tokens_seen": 297664512, "step": 2271 }, { "epoch": 0.37316731709068157, "grad_norm": 0.5673099160194397, "learning_rate": 0.00010986919543503034, "loss": 5.2462, "num_input_tokens_seen": 298057728, "step": 2274 }, { "epoch": 0.37365962225834737, "grad_norm": 0.6960703134536743, "learning_rate": 0.00010979679398042297, "loss": 5.2829, "num_input_tokens_seen": 298450944, "step": 2277 }, { "epoch": 0.3741519274260132, "grad_norm": 0.6623157858848572, "learning_rate": 0.00010972453547047044, "loss": 5.259, "num_input_tokens_seen": 298844160, "step": 2280 }, { "epoch": 0.374644232593679, "grad_norm": 0.5777830481529236, "learning_rate": 0.00010965241943542353, "loss": 5.2606, "num_input_tokens_seen": 299237376, "step": 2283 }, { "epoch": 0.3751365377613448, "grad_norm": 0.6100009679794312, "learning_rate": 0.00010958044540769138, "loss": 5.2787, "num_input_tokens_seen": 299630592, "step": 2286 }, { "epoch": 0.3756288429290106, "grad_norm": 0.6693081259727478, "learning_rate": 0.0001095086129218287, "loss": 5.2721, "num_input_tokens_seen": 300023808, "step": 2289 }, { "epoch": 0.3761211480966764, "grad_norm": 0.7031441330909729, "learning_rate": 0.00010943692151452322, "loss": 5.265, "num_input_tokens_seen": 300417024, "step": 2292 }, { "epoch": 0.37661345326434226, "grad_norm": 0.6610632538795471, "learning_rate": 0.00010936537072458307, "loss": 5.2384, "num_input_tokens_seen": 300810240, "step": 2295 }, { "epoch": 0.37710575843200805, "grad_norm": 0.5596596002578735, "learning_rate": 0.00010929396009292432, "loss": 5.2726, "num_input_tokens_seen": 301203456, "step": 2298 }, { "epoch": 0.37759806359967385, "grad_norm": 0.599071741104126, "learning_rate": 0.00010922268916255866, "loss": 5.256, "num_input_tokens_seen": 301596672, "step": 2301 }, { "epoch": 0.37809036876733965, "grad_norm": 0.5803152322769165, "learning_rate": 0.00010915155747858112, "loss": 5.2565, "num_input_tokens_seen": 301989888, "step": 2304 }, { "epoch": 0.37858267393500544, "grad_norm": 0.6008164882659912, "learning_rate": 0.00010908056458815778, "loss": 5.2591, "num_input_tokens_seen": 302383104, "step": 2307 }, { "epoch": 0.3790749791026713, "grad_norm": 0.6585036516189575, "learning_rate": 0.00010900971004051374, "loss": 5.2829, "num_input_tokens_seen": 302776320, "step": 2310 }, { "epoch": 0.3795672842703371, "grad_norm": 0.628734290599823, "learning_rate": 0.00010893899338692111, "loss": 5.2621, "num_input_tokens_seen": 303169536, "step": 2313 }, { "epoch": 0.3800595894380029, "grad_norm": 0.6740299463272095, "learning_rate": 0.00010886841418068702, "loss": 5.2701, "num_input_tokens_seen": 303562752, "step": 2316 }, { "epoch": 0.3805518946056687, "grad_norm": 0.7948876023292542, "learning_rate": 0.00010879797197714182, "loss": 5.2949, "num_input_tokens_seen": 303955968, "step": 2319 }, { "epoch": 0.3810441997733345, "grad_norm": 0.8644827008247375, "learning_rate": 0.00010872766633362728, "loss": 5.2677, "num_input_tokens_seen": 304349184, "step": 2322 }, { "epoch": 0.3815365049410003, "grad_norm": 0.8210058808326721, "learning_rate": 0.00010865749680948491, "loss": 5.2736, "num_input_tokens_seen": 304742400, "step": 2325 }, { "epoch": 0.38202881010866613, "grad_norm": 0.7495512366294861, "learning_rate": 0.00010858746296604438, "loss": 5.2665, "num_input_tokens_seen": 305135616, "step": 2328 }, { "epoch": 0.3825211152763319, "grad_norm": 0.7410356402397156, "learning_rate": 0.00010851756436661199, "loss": 5.2467, "num_input_tokens_seen": 305528832, "step": 2331 }, { "epoch": 0.3830134204439977, "grad_norm": 0.6325095891952515, "learning_rate": 0.0001084478005764592, "loss": 5.2277, "num_input_tokens_seen": 305922048, "step": 2334 }, { "epoch": 0.3835057256116635, "grad_norm": 0.6482600569725037, "learning_rate": 0.00010837817116281135, "loss": 5.2298, "num_input_tokens_seen": 306315264, "step": 2337 }, { "epoch": 0.3839980307793293, "grad_norm": 0.6560254693031311, "learning_rate": 0.00010830867569483633, "loss": 5.2803, "num_input_tokens_seen": 306708480, "step": 2340 }, { "epoch": 0.38449033594699517, "grad_norm": 0.5692320466041565, "learning_rate": 0.00010823931374363337, "loss": 5.2488, "num_input_tokens_seen": 307101696, "step": 2343 }, { "epoch": 0.38498264111466096, "grad_norm": 0.5976374745368958, "learning_rate": 0.00010817008488222198, "loss": 5.3247, "num_input_tokens_seen": 307494912, "step": 2346 }, { "epoch": 0.38547494628232676, "grad_norm": 0.663112998008728, "learning_rate": 0.00010810098868553085, "loss": 5.2795, "num_input_tokens_seen": 307888128, "step": 2349 }, { "epoch": 0.38596725144999255, "grad_norm": 0.6092638969421387, "learning_rate": 0.00010803202473038695, "loss": 5.279, "num_input_tokens_seen": 308281344, "step": 2352 }, { "epoch": 0.38645955661765835, "grad_norm": 0.5389038920402527, "learning_rate": 0.00010796319259550458, "loss": 5.2489, "num_input_tokens_seen": 308674560, "step": 2355 }, { "epoch": 0.3869518617853242, "grad_norm": 0.5788781046867371, "learning_rate": 0.00010789449186147456, "loss": 5.2873, "num_input_tokens_seen": 309067776, "step": 2358 }, { "epoch": 0.38744416695299, "grad_norm": 0.6551850438117981, "learning_rate": 0.0001078259221107536, "loss": 5.26, "num_input_tokens_seen": 309460992, "step": 2361 }, { "epoch": 0.3879364721206558, "grad_norm": 0.7511112093925476, "learning_rate": 0.00010775748292765357, "loss": 5.236, "num_input_tokens_seen": 309854208, "step": 2364 }, { "epoch": 0.3884287772883216, "grad_norm": 0.5426856875419617, "learning_rate": 0.00010768917389833085, "loss": 5.2236, "num_input_tokens_seen": 310247424, "step": 2367 }, { "epoch": 0.3889210824559874, "grad_norm": 0.6398828029632568, "learning_rate": 0.00010762099461077592, "loss": 5.2549, "num_input_tokens_seen": 310640640, "step": 2370 }, { "epoch": 0.3894133876236532, "grad_norm": 0.5986840128898621, "learning_rate": 0.00010755294465480287, "loss": 5.2431, "num_input_tokens_seen": 311033856, "step": 2373 }, { "epoch": 0.38990569279131904, "grad_norm": 0.5876027941703796, "learning_rate": 0.00010748502362203909, "loss": 5.1995, "num_input_tokens_seen": 311427072, "step": 2376 }, { "epoch": 0.39039799795898483, "grad_norm": 0.5282983183860779, "learning_rate": 0.00010741723110591491, "loss": 5.2399, "num_input_tokens_seen": 311820288, "step": 2379 }, { "epoch": 0.39089030312665063, "grad_norm": 0.5736381411552429, "learning_rate": 0.00010734956670165345, "loss": 5.2859, "num_input_tokens_seen": 312213504, "step": 2382 }, { "epoch": 0.3913826082943164, "grad_norm": 0.5055151581764221, "learning_rate": 0.00010728203000626037, "loss": 5.269, "num_input_tokens_seen": 312606720, "step": 2385 }, { "epoch": 0.3918749134619822, "grad_norm": 0.5718653202056885, "learning_rate": 0.00010721462061851386, "loss": 5.2439, "num_input_tokens_seen": 312999936, "step": 2388 }, { "epoch": 0.3923672186296481, "grad_norm": 0.5438402891159058, "learning_rate": 0.00010714733813895464, "loss": 5.2659, "num_input_tokens_seen": 313393152, "step": 2391 }, { "epoch": 0.39285952379731387, "grad_norm": 0.550797700881958, "learning_rate": 0.00010708018216987601, "loss": 5.2214, "num_input_tokens_seen": 313786368, "step": 2394 }, { "epoch": 0.39335182896497967, "grad_norm": 0.6071121096611023, "learning_rate": 0.00010701315231531391, "loss": 5.2348, "num_input_tokens_seen": 314179584, "step": 2397 }, { "epoch": 0.39384413413264546, "grad_norm": 0.5938403606414795, "learning_rate": 0.00010694624818103718, "loss": 5.2764, "num_input_tokens_seen": 314572800, "step": 2400 }, { "epoch": 0.39384413413264546, "eval_accuracy": 0.21509688975736851, "eval_loss": 5.491861343383789, "eval_runtime": 110.2644, "eval_samples_per_second": 2.721, "eval_steps_per_second": 1.36, "num_input_tokens_seen": 314572800, "step": 2400 }, { "epoch": 0.39433643930031126, "grad_norm": 0.6181114912033081, "learning_rate": 0.00010687946937453784, "loss": 5.2461, "num_input_tokens_seen": 314966016, "step": 2403 }, { "epoch": 0.3948287444679771, "grad_norm": 0.8083803653717041, "learning_rate": 0.00010681281550502132, "loss": 5.2669, "num_input_tokens_seen": 315359232, "step": 2406 }, { "epoch": 0.3953210496356429, "grad_norm": 0.6575871109962463, "learning_rate": 0.00010674628618339699, "loss": 5.2582, "num_input_tokens_seen": 315752448, "step": 2409 }, { "epoch": 0.3958133548033087, "grad_norm": 0.6498743891716003, "learning_rate": 0.00010667988102226855, "loss": 5.2495, "num_input_tokens_seen": 316145664, "step": 2412 }, { "epoch": 0.3963056599709745, "grad_norm": 0.6563365459442139, "learning_rate": 0.00010661359963592445, "loss": 5.2312, "num_input_tokens_seen": 316538880, "step": 2415 }, { "epoch": 0.3967979651386403, "grad_norm": 0.6259288191795349, "learning_rate": 0.00010654744164032871, "loss": 5.2296, "num_input_tokens_seen": 316932096, "step": 2418 }, { "epoch": 0.39729027030630615, "grad_norm": 0.6058263778686523, "learning_rate": 0.00010648140665311141, "loss": 5.2278, "num_input_tokens_seen": 317325312, "step": 2421 }, { "epoch": 0.39778257547397194, "grad_norm": 0.6280511617660522, "learning_rate": 0.00010641549429355939, "loss": 5.2237, "num_input_tokens_seen": 317718528, "step": 2424 }, { "epoch": 0.39827488064163774, "grad_norm": 0.7042283415794373, "learning_rate": 0.00010634970418260718, "loss": 5.2665, "num_input_tokens_seen": 318111744, "step": 2427 }, { "epoch": 0.39876718580930354, "grad_norm": 0.673143208026886, "learning_rate": 0.00010628403594282772, "loss": 5.2682, "num_input_tokens_seen": 318504960, "step": 2430 }, { "epoch": 0.39925949097696933, "grad_norm": 0.6177608370780945, "learning_rate": 0.00010621848919842326, "loss": 5.2381, "num_input_tokens_seen": 318898176, "step": 2433 }, { "epoch": 0.39975179614463513, "grad_norm": 0.5575879812240601, "learning_rate": 0.00010615306357521642, "loss": 5.2435, "num_input_tokens_seen": 319291392, "step": 2436 }, { "epoch": 0.400244101312301, "grad_norm": 0.6958820223808289, "learning_rate": 0.00010608775870064112, "loss": 5.2666, "num_input_tokens_seen": 319684608, "step": 2439 }, { "epoch": 0.4007364064799668, "grad_norm": 0.7199931144714355, "learning_rate": 0.00010602257420373379, "loss": 5.2485, "num_input_tokens_seen": 320077824, "step": 2442 }, { "epoch": 0.4012287116476326, "grad_norm": 0.6789767146110535, "learning_rate": 0.00010595750971512437, "loss": 5.2245, "num_input_tokens_seen": 320471040, "step": 2445 }, { "epoch": 0.40172101681529837, "grad_norm": 0.5763687491416931, "learning_rate": 0.00010589256486702759, "loss": 5.2227, "num_input_tokens_seen": 320864256, "step": 2448 }, { "epoch": 0.40221332198296417, "grad_norm": 0.5604720115661621, "learning_rate": 0.0001058277392932343, "loss": 5.2863, "num_input_tokens_seen": 321257472, "step": 2451 }, { "epoch": 0.40270562715063, "grad_norm": 0.5740640163421631, "learning_rate": 0.00010576303262910272, "loss": 5.2624, "num_input_tokens_seen": 321650688, "step": 2454 }, { "epoch": 0.4031979323182958, "grad_norm": 0.5708188414573669, "learning_rate": 0.00010569844451154979, "loss": 5.2093, "num_input_tokens_seen": 322043904, "step": 2457 }, { "epoch": 0.4036902374859616, "grad_norm": 0.5198714137077332, "learning_rate": 0.00010563397457904276, "loss": 5.2506, "num_input_tokens_seen": 322437120, "step": 2460 }, { "epoch": 0.4041825426536274, "grad_norm": 0.7625793814659119, "learning_rate": 0.00010556962247159053, "loss": 5.2606, "num_input_tokens_seen": 322830336, "step": 2463 }, { "epoch": 0.4046748478212932, "grad_norm": 0.6418973803520203, "learning_rate": 0.00010550538783073529, "loss": 5.2228, "num_input_tokens_seen": 323223552, "step": 2466 }, { "epoch": 0.40516715298895906, "grad_norm": 0.5481504797935486, "learning_rate": 0.00010544127029954414, "loss": 5.2283, "num_input_tokens_seen": 323616768, "step": 2469 }, { "epoch": 0.40565945815662485, "grad_norm": 0.5938975214958191, "learning_rate": 0.00010537726952260071, "loss": 5.2308, "num_input_tokens_seen": 324009984, "step": 2472 }, { "epoch": 0.40615176332429065, "grad_norm": 0.6005613803863525, "learning_rate": 0.00010531338514599695, "loss": 5.2342, "num_input_tokens_seen": 324403200, "step": 2475 }, { "epoch": 0.40664406849195645, "grad_norm": 0.6536208987236023, "learning_rate": 0.00010524961681732482, "loss": 5.2289, "num_input_tokens_seen": 324796416, "step": 2478 }, { "epoch": 0.40713637365962224, "grad_norm": 0.5923652052879333, "learning_rate": 0.00010518596418566824, "loss": 5.2962, "num_input_tokens_seen": 325189632, "step": 2481 }, { "epoch": 0.40762867882728804, "grad_norm": 0.6256945133209229, "learning_rate": 0.00010512242690159487, "loss": 5.2679, "num_input_tokens_seen": 325582848, "step": 2484 }, { "epoch": 0.4081209839949539, "grad_norm": 0.6368097066879272, "learning_rate": 0.00010505900461714815, "loss": 5.1868, "num_input_tokens_seen": 325976064, "step": 2487 }, { "epoch": 0.4086132891626197, "grad_norm": 0.7050658464431763, "learning_rate": 0.00010499569698583921, "loss": 5.2024, "num_input_tokens_seen": 326369280, "step": 2490 }, { "epoch": 0.4091055943302855, "grad_norm": 0.6694772839546204, "learning_rate": 0.000104932503662639, "loss": 5.222, "num_input_tokens_seen": 326762496, "step": 2493 }, { "epoch": 0.4095978994979513, "grad_norm": 0.5945850014686584, "learning_rate": 0.00010486942430397028, "loss": 5.2364, "num_input_tokens_seen": 327155712, "step": 2496 }, { "epoch": 0.4100902046656171, "grad_norm": 0.5960330367088318, "learning_rate": 0.00010480645856769992, "loss": 5.2292, "num_input_tokens_seen": 327548928, "step": 2499 }, { "epoch": 0.4105825098332829, "grad_norm": 0.550009548664093, "learning_rate": 0.00010474360611313098, "loss": 5.2305, "num_input_tokens_seen": 327942144, "step": 2502 }, { "epoch": 0.4110748150009487, "grad_norm": 0.6489304900169373, "learning_rate": 0.00010468086660099509, "loss": 5.2478, "num_input_tokens_seen": 328335360, "step": 2505 }, { "epoch": 0.4115671201686145, "grad_norm": 0.6759921312332153, "learning_rate": 0.00010461823969344457, "loss": 5.2794, "num_input_tokens_seen": 328728576, "step": 2508 }, { "epoch": 0.4120594253362803, "grad_norm": 0.7441422939300537, "learning_rate": 0.00010455572505404502, "loss": 5.2267, "num_input_tokens_seen": 329121792, "step": 2511 }, { "epoch": 0.4125517305039461, "grad_norm": 0.6741816997528076, "learning_rate": 0.00010449332234776757, "loss": 5.2342, "num_input_tokens_seen": 329515008, "step": 2514 }, { "epoch": 0.41304403567161196, "grad_norm": 0.6322781443595886, "learning_rate": 0.00010443103124098138, "loss": 5.2423, "num_input_tokens_seen": 329908224, "step": 2517 }, { "epoch": 0.41353634083927776, "grad_norm": 0.6095474362373352, "learning_rate": 0.00010436885140144612, "loss": 5.2405, "num_input_tokens_seen": 330301440, "step": 2520 }, { "epoch": 0.41402864600694356, "grad_norm": 0.6490298509597778, "learning_rate": 0.00010430678249830464, "loss": 5.2183, "num_input_tokens_seen": 330694656, "step": 2523 }, { "epoch": 0.41452095117460935, "grad_norm": 0.6782544255256653, "learning_rate": 0.00010424482420207543, "loss": 5.2436, "num_input_tokens_seen": 331087872, "step": 2526 }, { "epoch": 0.41501325634227515, "grad_norm": 0.6933419108390808, "learning_rate": 0.00010418297618464539, "loss": 5.2103, "num_input_tokens_seen": 331481088, "step": 2529 }, { "epoch": 0.415505561509941, "grad_norm": 0.7551579475402832, "learning_rate": 0.00010412123811926243, "loss": 5.2309, "num_input_tokens_seen": 331874304, "step": 2532 }, { "epoch": 0.4159978666776068, "grad_norm": 0.5901357531547546, "learning_rate": 0.00010405960968052833, "loss": 5.2296, "num_input_tokens_seen": 332267520, "step": 2535 }, { "epoch": 0.4164901718452726, "grad_norm": 0.6693659424781799, "learning_rate": 0.0001039980905443914, "loss": 5.2299, "num_input_tokens_seen": 332660736, "step": 2538 }, { "epoch": 0.4169824770129384, "grad_norm": 0.6914536356925964, "learning_rate": 0.00010393668038813947, "loss": 5.2062, "num_input_tokens_seen": 333053952, "step": 2541 }, { "epoch": 0.4174747821806042, "grad_norm": 0.8672217130661011, "learning_rate": 0.0001038753788903927, "loss": 5.2562, "num_input_tokens_seen": 333447168, "step": 2544 }, { "epoch": 0.41796708734827, "grad_norm": 0.7356572151184082, "learning_rate": 0.0001038141857310965, "loss": 5.2203, "num_input_tokens_seen": 333840384, "step": 2547 }, { "epoch": 0.41845939251593584, "grad_norm": 0.5476716756820679, "learning_rate": 0.00010375310059151456, "loss": 5.2411, "num_input_tokens_seen": 334233600, "step": 2550 }, { "epoch": 0.41895169768360163, "grad_norm": 0.8849159479141235, "learning_rate": 0.00010369212315422186, "loss": 5.2273, "num_input_tokens_seen": 334626816, "step": 2553 }, { "epoch": 0.41944400285126743, "grad_norm": 0.8213227987289429, "learning_rate": 0.00010363125310309775, "loss": 5.1921, "num_input_tokens_seen": 335020032, "step": 2556 }, { "epoch": 0.4199363080189332, "grad_norm": 0.806098461151123, "learning_rate": 0.00010357049012331902, "loss": 5.2061, "num_input_tokens_seen": 335413248, "step": 2559 }, { "epoch": 0.420428613186599, "grad_norm": 0.8755928874015808, "learning_rate": 0.00010350983390135311, "loss": 5.228, "num_input_tokens_seen": 335806464, "step": 2562 }, { "epoch": 0.4209209183542649, "grad_norm": 0.7494667172431946, "learning_rate": 0.00010344928412495135, "loss": 5.1867, "num_input_tokens_seen": 336199680, "step": 2565 }, { "epoch": 0.42141322352193067, "grad_norm": 0.7980634570121765, "learning_rate": 0.00010338884048314206, "loss": 5.2597, "num_input_tokens_seen": 336592896, "step": 2568 }, { "epoch": 0.42190552868959647, "grad_norm": 0.606177568435669, "learning_rate": 0.00010332850266622407, "loss": 5.2364, "num_input_tokens_seen": 336986112, "step": 2571 }, { "epoch": 0.42239783385726226, "grad_norm": 0.59128737449646, "learning_rate": 0.0001032682703657598, "loss": 5.2291, "num_input_tokens_seen": 337379328, "step": 2574 }, { "epoch": 0.42289013902492806, "grad_norm": 0.8499715328216553, "learning_rate": 0.00010320814327456885, "loss": 5.2151, "num_input_tokens_seen": 337772544, "step": 2577 }, { "epoch": 0.4233824441925939, "grad_norm": 0.5828485488891602, "learning_rate": 0.00010314812108672135, "loss": 5.1643, "num_input_tokens_seen": 338165760, "step": 2580 }, { "epoch": 0.4238747493602597, "grad_norm": 0.5701519250869751, "learning_rate": 0.00010308820349753134, "loss": 5.1919, "num_input_tokens_seen": 338558976, "step": 2583 }, { "epoch": 0.4243670545279255, "grad_norm": 0.6905439496040344, "learning_rate": 0.00010302839020355037, "loss": 5.2067, "num_input_tokens_seen": 338952192, "step": 2586 }, { "epoch": 0.4248593596955913, "grad_norm": 0.6845247149467468, "learning_rate": 0.00010296868090256107, "loss": 5.1829, "num_input_tokens_seen": 339345408, "step": 2589 }, { "epoch": 0.4253516648632571, "grad_norm": 0.5913321375846863, "learning_rate": 0.00010290907529357057, "loss": 5.2025, "num_input_tokens_seen": 339738624, "step": 2592 }, { "epoch": 0.4258439700309229, "grad_norm": 0.627149760723114, "learning_rate": 0.00010284957307680437, "loss": 5.1632, "num_input_tokens_seen": 340131840, "step": 2595 }, { "epoch": 0.42633627519858874, "grad_norm": 0.574410617351532, "learning_rate": 0.0001027901739536998, "loss": 5.2493, "num_input_tokens_seen": 340525056, "step": 2598 }, { "epoch": 0.42682858036625454, "grad_norm": 0.5851385593414307, "learning_rate": 0.00010273087762689989, "loss": 5.2052, "num_input_tokens_seen": 340918272, "step": 2601 }, { "epoch": 0.42732088553392034, "grad_norm": 0.7069036960601807, "learning_rate": 0.00010267168380024689, "loss": 5.2107, "num_input_tokens_seen": 341311488, "step": 2604 }, { "epoch": 0.42781319070158613, "grad_norm": 0.5939748883247375, "learning_rate": 0.00010261259217877632, "loss": 5.2134, "num_input_tokens_seen": 341704704, "step": 2607 }, { "epoch": 0.42830549586925193, "grad_norm": 0.7017332315444946, "learning_rate": 0.0001025536024687107, "loss": 5.1983, "num_input_tokens_seen": 342097920, "step": 2610 }, { "epoch": 0.4287978010369178, "grad_norm": 0.7420720458030701, "learning_rate": 0.00010249471437745328, "loss": 5.2393, "num_input_tokens_seen": 342491136, "step": 2613 }, { "epoch": 0.4292901062045836, "grad_norm": 0.6406148672103882, "learning_rate": 0.00010243592761358217, "loss": 5.2169, "num_input_tokens_seen": 342884352, "step": 2616 }, { "epoch": 0.4297824113722494, "grad_norm": 0.6864999532699585, "learning_rate": 0.00010237724188684409, "loss": 5.22, "num_input_tokens_seen": 343277568, "step": 2619 }, { "epoch": 0.43027471653991517, "grad_norm": 0.7687135934829712, "learning_rate": 0.00010231865690814853, "loss": 5.2909, "num_input_tokens_seen": 343670784, "step": 2622 }, { "epoch": 0.43076702170758097, "grad_norm": 0.6317099332809448, "learning_rate": 0.0001022601723895616, "loss": 5.2036, "num_input_tokens_seen": 344064000, "step": 2625 }, { "epoch": 0.4312593268752468, "grad_norm": 0.6858761310577393, "learning_rate": 0.00010220178804430015, "loss": 5.1692, "num_input_tokens_seen": 344457216, "step": 2628 }, { "epoch": 0.4317516320429126, "grad_norm": 0.7586376667022705, "learning_rate": 0.00010214350358672594, "loss": 5.1816, "num_input_tokens_seen": 344850432, "step": 2631 }, { "epoch": 0.4322439372105784, "grad_norm": 0.643675684928894, "learning_rate": 0.00010208531873233962, "loss": 5.2572, "num_input_tokens_seen": 345243648, "step": 2634 }, { "epoch": 0.4327362423782442, "grad_norm": 0.5361793637275696, "learning_rate": 0.00010202723319777505, "loss": 5.2143, "num_input_tokens_seen": 345636864, "step": 2637 }, { "epoch": 0.43322854754591, "grad_norm": 0.597233772277832, "learning_rate": 0.00010196924670079342, "loss": 5.2139, "num_input_tokens_seen": 346030080, "step": 2640 }, { "epoch": 0.43372085271357586, "grad_norm": 0.6196411848068237, "learning_rate": 0.00010191135896027748, "loss": 5.2171, "num_input_tokens_seen": 346423296, "step": 2643 }, { "epoch": 0.43421315788124165, "grad_norm": 0.5964427590370178, "learning_rate": 0.00010185356969622588, "loss": 5.2317, "num_input_tokens_seen": 346816512, "step": 2646 }, { "epoch": 0.43470546304890745, "grad_norm": 0.5954868197441101, "learning_rate": 0.00010179587862974739, "loss": 5.1779, "num_input_tokens_seen": 347209728, "step": 2649 }, { "epoch": 0.43519776821657324, "grad_norm": 0.6737942099571228, "learning_rate": 0.00010173828548305536, "loss": 5.2293, "num_input_tokens_seen": 347602944, "step": 2652 }, { "epoch": 0.43569007338423904, "grad_norm": 0.6102513074874878, "learning_rate": 0.00010168078997946198, "loss": 5.1969, "num_input_tokens_seen": 347996160, "step": 2655 }, { "epoch": 0.43618237855190484, "grad_norm": 0.5308377742767334, "learning_rate": 0.00010162339184337281, "loss": 5.1909, "num_input_tokens_seen": 348389376, "step": 2658 }, { "epoch": 0.4366746837195707, "grad_norm": 0.6343761682510376, "learning_rate": 0.00010156609080028115, "loss": 5.1856, "num_input_tokens_seen": 348782592, "step": 2661 }, { "epoch": 0.4371669888872365, "grad_norm": 0.666679322719574, "learning_rate": 0.0001015088865767626, "loss": 5.2063, "num_input_tokens_seen": 349175808, "step": 2664 }, { "epoch": 0.4376592940549023, "grad_norm": 0.5723670125007629, "learning_rate": 0.00010145177890046946, "loss": 5.1875, "num_input_tokens_seen": 349569024, "step": 2667 }, { "epoch": 0.4381515992225681, "grad_norm": 0.6312902569770813, "learning_rate": 0.00010139476750012542, "loss": 5.2351, "num_input_tokens_seen": 349962240, "step": 2670 }, { "epoch": 0.4386439043902339, "grad_norm": 0.6692864298820496, "learning_rate": 0.00010133785210552012, "loss": 5.2162, "num_input_tokens_seen": 350355456, "step": 2673 }, { "epoch": 0.4391362095578997, "grad_norm": 0.5867645740509033, "learning_rate": 0.00010128103244750365, "loss": 5.2184, "num_input_tokens_seen": 350748672, "step": 2676 }, { "epoch": 0.4396285147255655, "grad_norm": 0.5672877430915833, "learning_rate": 0.00010122430825798135, "loss": 5.2469, "num_input_tokens_seen": 351141888, "step": 2679 }, { "epoch": 0.4401208198932313, "grad_norm": 0.7089299559593201, "learning_rate": 0.00010116767926990843, "loss": 5.1769, "num_input_tokens_seen": 351535104, "step": 2682 }, { "epoch": 0.4406131250608971, "grad_norm": 0.677344560623169, "learning_rate": 0.0001011111452172847, "loss": 5.2314, "num_input_tokens_seen": 351928320, "step": 2685 }, { "epoch": 0.4411054302285629, "grad_norm": 0.5708670616149902, "learning_rate": 0.00010105470583514936, "loss": 5.1938, "num_input_tokens_seen": 352321536, "step": 2688 }, { "epoch": 0.44159773539622876, "grad_norm": 0.5892400145530701, "learning_rate": 0.00010099836085957568, "loss": 5.177, "num_input_tokens_seen": 352714752, "step": 2691 }, { "epoch": 0.44209004056389456, "grad_norm": 0.6531316041946411, "learning_rate": 0.00010094211002766593, "loss": 5.2086, "num_input_tokens_seen": 353107968, "step": 2694 }, { "epoch": 0.44258234573156036, "grad_norm": 0.5908800363540649, "learning_rate": 0.00010088595307754617, "loss": 5.1706, "num_input_tokens_seen": 353501184, "step": 2697 }, { "epoch": 0.44307465089922615, "grad_norm": 0.5883475542068481, "learning_rate": 0.00010082988974836116, "loss": 5.1982, "num_input_tokens_seen": 353894400, "step": 2700 }, { "epoch": 0.44356695606689195, "grad_norm": 0.6906709671020508, "learning_rate": 0.0001007739197802692, "loss": 5.1891, "num_input_tokens_seen": 354287616, "step": 2703 }, { "epoch": 0.44405926123455775, "grad_norm": 0.7396822571754456, "learning_rate": 0.00010071804291443717, "loss": 5.2075, "num_input_tokens_seen": 354680832, "step": 2706 }, { "epoch": 0.4445515664022236, "grad_norm": 0.5959869027137756, "learning_rate": 0.00010066225889303549, "loss": 5.1908, "num_input_tokens_seen": 355074048, "step": 2709 }, { "epoch": 0.4450438715698894, "grad_norm": 0.5678725242614746, "learning_rate": 0.00010060656745923301, "loss": 5.2184, "num_input_tokens_seen": 355467264, "step": 2712 }, { "epoch": 0.4455361767375552, "grad_norm": 0.693349301815033, "learning_rate": 0.00010055096835719215, "loss": 5.2408, "num_input_tokens_seen": 355860480, "step": 2715 }, { "epoch": 0.446028481905221, "grad_norm": 0.6372326016426086, "learning_rate": 0.000100495461332064, "loss": 5.2121, "num_input_tokens_seen": 356253696, "step": 2718 }, { "epoch": 0.4465207870728868, "grad_norm": 0.6225445866584778, "learning_rate": 0.00010044004612998325, "loss": 5.1776, "num_input_tokens_seen": 356646912, "step": 2721 }, { "epoch": 0.44701309224055263, "grad_norm": 0.6060691475868225, "learning_rate": 0.0001003847224980635, "loss": 5.215, "num_input_tokens_seen": 357040128, "step": 2724 }, { "epoch": 0.44750539740821843, "grad_norm": 0.5323381423950195, "learning_rate": 0.00010032949018439226, "loss": 5.1929, "num_input_tokens_seen": 357433344, "step": 2727 }, { "epoch": 0.4479977025758842, "grad_norm": 0.5871007442474365, "learning_rate": 0.00010027434893802628, "loss": 5.1655, "num_input_tokens_seen": 357826560, "step": 2730 }, { "epoch": 0.44849000774355, "grad_norm": 0.5575119853019714, "learning_rate": 0.00010021929850898662, "loss": 5.2098, "num_input_tokens_seen": 358219776, "step": 2733 }, { "epoch": 0.4489823129112158, "grad_norm": 0.6533513069152832, "learning_rate": 0.00010016433864825397, "loss": 5.2318, "num_input_tokens_seen": 358612992, "step": 2736 }, { "epoch": 0.44947461807888167, "grad_norm": 0.6353456377983093, "learning_rate": 0.00010010946910776388, "loss": 5.1966, "num_input_tokens_seen": 359006208, "step": 2739 }, { "epoch": 0.44996692324654747, "grad_norm": 0.5887168049812317, "learning_rate": 0.00010005468964040215, "loss": 5.1984, "num_input_tokens_seen": 359399424, "step": 2742 }, { "epoch": 0.45045922841421326, "grad_norm": 0.5238946080207825, "learning_rate": 9.999999999999999e-05, "loss": 5.1752, "num_input_tokens_seen": 359792640, "step": 2745 }, { "epoch": 0.45095153358187906, "grad_norm": 0.5709421038627625, "learning_rate": 9.994539994132953e-05, "loss": 5.2082, "num_input_tokens_seen": 360185856, "step": 2748 }, { "epoch": 0.45144383874954486, "grad_norm": 0.5375566482543945, "learning_rate": 9.989088922009912e-05, "loss": 5.1813, "num_input_tokens_seen": 360579072, "step": 2751 }, { "epoch": 0.4519361439172107, "grad_norm": 0.6022065877914429, "learning_rate": 9.983646759294876e-05, "loss": 5.2037, "num_input_tokens_seen": 360972288, "step": 2754 }, { "epoch": 0.4524284490848765, "grad_norm": 0.6292663216590881, "learning_rate": 9.978213481744552e-05, "loss": 5.2479, "num_input_tokens_seen": 361365504, "step": 2757 }, { "epoch": 0.4529207542525423, "grad_norm": 0.7360475659370422, "learning_rate": 9.972789065207908e-05, "loss": 5.1351, "num_input_tokens_seen": 361758720, "step": 2760 }, { "epoch": 0.4534130594202081, "grad_norm": 0.5845218896865845, "learning_rate": 9.967373485625708e-05, "loss": 5.1405, "num_input_tokens_seen": 362151936, "step": 2763 }, { "epoch": 0.4539053645878739, "grad_norm": 0.5955424308776855, "learning_rate": 9.961966719030078e-05, "loss": 5.1566, "num_input_tokens_seen": 362545152, "step": 2766 }, { "epoch": 0.4543976697555397, "grad_norm": 0.568569004535675, "learning_rate": 9.95656874154405e-05, "loss": 5.1994, "num_input_tokens_seen": 362938368, "step": 2769 }, { "epoch": 0.45488997492320554, "grad_norm": 0.5579524636268616, "learning_rate": 9.951179529381129e-05, "loss": 5.2114, "num_input_tokens_seen": 363331584, "step": 2772 }, { "epoch": 0.45538228009087134, "grad_norm": 0.6336788535118103, "learning_rate": 9.945799058844839e-05, "loss": 5.1544, "num_input_tokens_seen": 363724800, "step": 2775 }, { "epoch": 0.45587458525853713, "grad_norm": 0.5949918627738953, "learning_rate": 9.940427306328304e-05, "loss": 5.2101, "num_input_tokens_seen": 364118016, "step": 2778 }, { "epoch": 0.45636689042620293, "grad_norm": 0.6075280904769897, "learning_rate": 9.935064248313794e-05, "loss": 5.201, "num_input_tokens_seen": 364511232, "step": 2781 }, { "epoch": 0.4568591955938687, "grad_norm": 0.580106794834137, "learning_rate": 9.929709861372308e-05, "loss": 5.1616, "num_input_tokens_seen": 364904448, "step": 2784 }, { "epoch": 0.4573515007615346, "grad_norm": 0.6350911855697632, "learning_rate": 9.924364122163132e-05, "loss": 5.2079, "num_input_tokens_seen": 365297664, "step": 2787 }, { "epoch": 0.4578438059292004, "grad_norm": 0.6221060156822205, "learning_rate": 9.919027007433417e-05, "loss": 5.211, "num_input_tokens_seen": 365690880, "step": 2790 }, { "epoch": 0.45833611109686617, "grad_norm": 0.6097714304924011, "learning_rate": 9.913698494017759e-05, "loss": 5.1863, "num_input_tokens_seen": 366084096, "step": 2793 }, { "epoch": 0.45882841626453197, "grad_norm": 0.6037392020225525, "learning_rate": 9.90837855883777e-05, "loss": 5.1546, "num_input_tokens_seen": 366477312, "step": 2796 }, { "epoch": 0.45932072143219776, "grad_norm": 0.6046000719070435, "learning_rate": 9.903067178901658e-05, "loss": 5.1625, "num_input_tokens_seen": 366870528, "step": 2799 }, { "epoch": 0.45948482315475303, "eval_accuracy": 0.21759485425826414, "eval_loss": 5.443637371063232, "eval_runtime": 107.9801, "eval_samples_per_second": 2.778, "eval_steps_per_second": 1.389, "num_input_tokens_seen": 367001600, "step": 2800 }, { "epoch": 0.4598130265998636, "grad_norm": 0.623935878276825, "learning_rate": 9.89776433130381e-05, "loss": 5.1864, "num_input_tokens_seen": 367263744, "step": 2802 }, { "epoch": 0.4603053317675294, "grad_norm": 0.6677658557891846, "learning_rate": 9.892469993224388e-05, "loss": 5.2048, "num_input_tokens_seen": 367656960, "step": 2805 }, { "epoch": 0.4607976369351952, "grad_norm": 0.7684961557388306, "learning_rate": 9.887184141928896e-05, "loss": 5.1797, "num_input_tokens_seen": 368050176, "step": 2808 }, { "epoch": 0.461289942102861, "grad_norm": 0.7386724352836609, "learning_rate": 9.881906754767789e-05, "loss": 5.169, "num_input_tokens_seen": 368443392, "step": 2811 }, { "epoch": 0.4617822472705268, "grad_norm": 0.6707373261451721, "learning_rate": 9.876637809176057e-05, "loss": 5.2031, "num_input_tokens_seen": 368836608, "step": 2814 }, { "epoch": 0.4622745524381926, "grad_norm": 0.6488140821456909, "learning_rate": 9.871377282672818e-05, "loss": 5.204, "num_input_tokens_seen": 369229824, "step": 2817 }, { "epoch": 0.46276685760585845, "grad_norm": 0.5540419816970825, "learning_rate": 9.866125152860918e-05, "loss": 5.1672, "num_input_tokens_seen": 369623040, "step": 2820 }, { "epoch": 0.46325916277352425, "grad_norm": 0.753920316696167, "learning_rate": 9.860881397426531e-05, "loss": 5.2168, "num_input_tokens_seen": 370016256, "step": 2823 }, { "epoch": 0.46375146794119004, "grad_norm": 0.7269083857536316, "learning_rate": 9.855645994138763e-05, "loss": 5.1643, "num_input_tokens_seen": 370409472, "step": 2826 }, { "epoch": 0.46424377310885584, "grad_norm": 0.7552649974822998, "learning_rate": 9.850418920849244e-05, "loss": 5.1711, "num_input_tokens_seen": 370802688, "step": 2829 }, { "epoch": 0.46473607827652164, "grad_norm": 0.7487272024154663, "learning_rate": 9.845200155491757e-05, "loss": 5.2207, "num_input_tokens_seen": 371195904, "step": 2832 }, { "epoch": 0.4652283834441875, "grad_norm": 0.7448633909225464, "learning_rate": 9.839989676081821e-05, "loss": 5.2232, "num_input_tokens_seen": 371589120, "step": 2835 }, { "epoch": 0.4657206886118533, "grad_norm": 0.5762555003166199, "learning_rate": 9.834787460716322e-05, "loss": 5.1584, "num_input_tokens_seen": 371982336, "step": 2838 }, { "epoch": 0.4662129937795191, "grad_norm": 0.6208467483520508, "learning_rate": 9.829593487573116e-05, "loss": 5.2084, "num_input_tokens_seen": 372375552, "step": 2841 }, { "epoch": 0.4667052989471849, "grad_norm": 0.5821816325187683, "learning_rate": 9.824407734910645e-05, "loss": 5.1722, "num_input_tokens_seen": 372768768, "step": 2844 }, { "epoch": 0.4671976041148507, "grad_norm": 0.5190629363059998, "learning_rate": 9.819230181067567e-05, "loss": 5.1523, "num_input_tokens_seen": 373161984, "step": 2847 }, { "epoch": 0.4676899092825165, "grad_norm": 0.6430248618125916, "learning_rate": 9.814060804462351e-05, "loss": 5.2025, "num_input_tokens_seen": 373555200, "step": 2850 }, { "epoch": 0.4681822144501823, "grad_norm": 0.6955888271331787, "learning_rate": 9.808899583592925e-05, "loss": 5.1616, "num_input_tokens_seen": 373948416, "step": 2853 }, { "epoch": 0.4686745196178481, "grad_norm": 0.6991474032402039, "learning_rate": 9.803746497036285e-05, "loss": 5.1637, "num_input_tokens_seen": 374341632, "step": 2856 }, { "epoch": 0.4691668247855139, "grad_norm": 0.7030759453773499, "learning_rate": 9.798601523448131e-05, "loss": 5.1855, "num_input_tokens_seen": 374734848, "step": 2859 }, { "epoch": 0.4696591299531797, "grad_norm": 0.6420966982841492, "learning_rate": 9.793464641562482e-05, "loss": 5.213, "num_input_tokens_seen": 375128064, "step": 2862 }, { "epoch": 0.4701514351208455, "grad_norm": 0.6913998126983643, "learning_rate": 9.788335830191324e-05, "loss": 5.2171, "num_input_tokens_seen": 375521280, "step": 2865 }, { "epoch": 0.47064374028851136, "grad_norm": 0.5875546336174011, "learning_rate": 9.783215068224234e-05, "loss": 5.1719, "num_input_tokens_seen": 375914496, "step": 2868 }, { "epoch": 0.47113604545617715, "grad_norm": 0.6099095344543457, "learning_rate": 9.778102334628006e-05, "loss": 5.1223, "num_input_tokens_seen": 376307712, "step": 2871 }, { "epoch": 0.47162835062384295, "grad_norm": 0.5603676438331604, "learning_rate": 9.772997608446309e-05, "loss": 5.1571, "num_input_tokens_seen": 376700928, "step": 2874 }, { "epoch": 0.47212065579150875, "grad_norm": 0.6761665940284729, "learning_rate": 9.767900868799307e-05, "loss": 5.1816, "num_input_tokens_seen": 377094144, "step": 2877 }, { "epoch": 0.47261296095917454, "grad_norm": 0.5453407764434814, "learning_rate": 9.762812094883316e-05, "loss": 5.1818, "num_input_tokens_seen": 377487360, "step": 2880 }, { "epoch": 0.4731052661268404, "grad_norm": 0.5907162427902222, "learning_rate": 9.757731265970434e-05, "loss": 5.1711, "num_input_tokens_seen": 377880576, "step": 2883 }, { "epoch": 0.4735975712945062, "grad_norm": 0.6772825121879578, "learning_rate": 9.752658361408191e-05, "loss": 5.1518, "num_input_tokens_seen": 378273792, "step": 2886 }, { "epoch": 0.474089876462172, "grad_norm": 0.6370013356208801, "learning_rate": 9.7475933606192e-05, "loss": 5.1785, "num_input_tokens_seen": 378667008, "step": 2889 }, { "epoch": 0.4745821816298378, "grad_norm": 0.6495605707168579, "learning_rate": 9.742536243100805e-05, "loss": 5.1962, "num_input_tokens_seen": 379060224, "step": 2892 }, { "epoch": 0.4750744867975036, "grad_norm": 0.5345600843429565, "learning_rate": 9.737486988424731e-05, "loss": 5.2161, "num_input_tokens_seen": 379453440, "step": 2895 }, { "epoch": 0.47556679196516943, "grad_norm": 0.5919015407562256, "learning_rate": 9.73244557623673e-05, "loss": 5.148, "num_input_tokens_seen": 379846656, "step": 2898 }, { "epoch": 0.47605909713283523, "grad_norm": 0.6196090579032898, "learning_rate": 9.727411986256259e-05, "loss": 5.1709, "num_input_tokens_seen": 380239872, "step": 2901 }, { "epoch": 0.476551402300501, "grad_norm": 0.6973289847373962, "learning_rate": 9.722386198276106e-05, "loss": 5.2245, "num_input_tokens_seen": 380633088, "step": 2904 }, { "epoch": 0.4770437074681668, "grad_norm": 0.5490657687187195, "learning_rate": 9.717368192162079e-05, "loss": 5.1915, "num_input_tokens_seen": 381026304, "step": 2907 }, { "epoch": 0.4775360126358326, "grad_norm": 0.5101742148399353, "learning_rate": 9.712357947852647e-05, "loss": 5.1612, "num_input_tokens_seen": 381419520, "step": 2910 }, { "epoch": 0.47802831780349847, "grad_norm": 0.5948254466056824, "learning_rate": 9.707355445358611e-05, "loss": 5.1209, "num_input_tokens_seen": 381812736, "step": 2913 }, { "epoch": 0.47852062297116427, "grad_norm": 0.6882492303848267, "learning_rate": 9.702360664762765e-05, "loss": 5.1987, "num_input_tokens_seen": 382205952, "step": 2916 }, { "epoch": 0.47901292813883006, "grad_norm": 0.6364412903785706, "learning_rate": 9.697373586219577e-05, "loss": 5.2198, "num_input_tokens_seen": 382599168, "step": 2919 }, { "epoch": 0.47950523330649586, "grad_norm": 0.5922911167144775, "learning_rate": 9.692394189954834e-05, "loss": 5.1431, "num_input_tokens_seen": 382992384, "step": 2922 }, { "epoch": 0.47999753847416166, "grad_norm": 0.7349563241004944, "learning_rate": 9.687422456265331e-05, "loss": 5.1448, "num_input_tokens_seen": 383385600, "step": 2925 }, { "epoch": 0.48048984364182745, "grad_norm": 0.5603036284446716, "learning_rate": 9.682458365518541e-05, "loss": 5.1633, "num_input_tokens_seen": 383778816, "step": 2928 }, { "epoch": 0.4809821488094933, "grad_norm": 0.5058417320251465, "learning_rate": 9.677501898152282e-05, "loss": 5.14, "num_input_tokens_seen": 384172032, "step": 2931 }, { "epoch": 0.4814744539771591, "grad_norm": 0.5244634747505188, "learning_rate": 9.6725530346744e-05, "loss": 5.1999, "num_input_tokens_seen": 384565248, "step": 2934 }, { "epoch": 0.4819667591448249, "grad_norm": 0.5387030243873596, "learning_rate": 9.667611755662445e-05, "loss": 5.151, "num_input_tokens_seen": 384958464, "step": 2937 }, { "epoch": 0.4824590643124907, "grad_norm": 0.5727872848510742, "learning_rate": 9.662678041763345e-05, "loss": 5.1763, "num_input_tokens_seen": 385351680, "step": 2940 }, { "epoch": 0.4829513694801565, "grad_norm": 0.6338843703269958, "learning_rate": 9.657751873693102e-05, "loss": 5.1369, "num_input_tokens_seen": 385744896, "step": 2943 }, { "epoch": 0.48344367464782234, "grad_norm": 0.6214335560798645, "learning_rate": 9.652833232236462e-05, "loss": 5.1523, "num_input_tokens_seen": 386138112, "step": 2946 }, { "epoch": 0.48393597981548814, "grad_norm": 0.6365739703178406, "learning_rate": 9.647922098246606e-05, "loss": 5.1471, "num_input_tokens_seen": 386531328, "step": 2949 }, { "epoch": 0.48442828498315393, "grad_norm": 0.6559484601020813, "learning_rate": 9.643018452644833e-05, "loss": 5.1941, "num_input_tokens_seen": 386924544, "step": 2952 }, { "epoch": 0.48492059015081973, "grad_norm": 0.6113404035568237, "learning_rate": 9.638122276420258e-05, "loss": 5.1769, "num_input_tokens_seen": 387317760, "step": 2955 }, { "epoch": 0.4854128953184855, "grad_norm": 0.6409229636192322, "learning_rate": 9.633233550629488e-05, "loss": 5.2046, "num_input_tokens_seen": 387710976, "step": 2958 }, { "epoch": 0.4859052004861514, "grad_norm": 0.5778577327728271, "learning_rate": 9.628352256396328e-05, "loss": 5.2029, "num_input_tokens_seen": 388104192, "step": 2961 }, { "epoch": 0.4863975056538172, "grad_norm": 0.5613879561424255, "learning_rate": 9.623478374911467e-05, "loss": 5.19, "num_input_tokens_seen": 388497408, "step": 2964 }, { "epoch": 0.48688981082148297, "grad_norm": 0.5774136781692505, "learning_rate": 9.618611887432175e-05, "loss": 5.1534, "num_input_tokens_seen": 388890624, "step": 2967 }, { "epoch": 0.48738211598914877, "grad_norm": 0.6295081377029419, "learning_rate": 9.613752775282003e-05, "loss": 5.1471, "num_input_tokens_seen": 389283840, "step": 2970 }, { "epoch": 0.48787442115681456, "grad_norm": 0.5557025074958801, "learning_rate": 9.608901019850477e-05, "loss": 5.1841, "num_input_tokens_seen": 389677056, "step": 2973 }, { "epoch": 0.48836672632448036, "grad_norm": 0.6536008715629578, "learning_rate": 9.604056602592805e-05, "loss": 5.1354, "num_input_tokens_seen": 390070272, "step": 2976 }, { "epoch": 0.4888590314921462, "grad_norm": 0.6432478427886963, "learning_rate": 9.599219505029571e-05, "loss": 5.174, "num_input_tokens_seen": 390463488, "step": 2979 }, { "epoch": 0.489351336659812, "grad_norm": 0.6222785711288452, "learning_rate": 9.594389708746449e-05, "loss": 5.11, "num_input_tokens_seen": 390856704, "step": 2982 }, { "epoch": 0.4898436418274778, "grad_norm": 0.5929418206214905, "learning_rate": 9.589567195393901e-05, "loss": 5.1108, "num_input_tokens_seen": 391249920, "step": 2985 }, { "epoch": 0.4903359469951436, "grad_norm": 0.629441499710083, "learning_rate": 9.584751946686886e-05, "loss": 5.1573, "num_input_tokens_seen": 391643136, "step": 2988 }, { "epoch": 0.4908282521628094, "grad_norm": 0.5835034251213074, "learning_rate": 9.57994394440457e-05, "loss": 5.1485, "num_input_tokens_seen": 392036352, "step": 2991 }, { "epoch": 0.49132055733047525, "grad_norm": 0.5538569092750549, "learning_rate": 9.575143170390034e-05, "loss": 5.1571, "num_input_tokens_seen": 392429568, "step": 2994 }, { "epoch": 0.49181286249814105, "grad_norm": 0.7033787369728088, "learning_rate": 9.57034960654999e-05, "loss": 5.1544, "num_input_tokens_seen": 392822784, "step": 2997 }, { "epoch": 0.49230516766580684, "grad_norm": 0.6497957706451416, "learning_rate": 9.565563234854494e-05, "loss": 5.1979, "num_input_tokens_seen": 393216000, "step": 3000 }, { "epoch": 0.49279747283347264, "grad_norm": 0.6436689496040344, "learning_rate": 9.560784037336655e-05, "loss": 5.1928, "num_input_tokens_seen": 393609216, "step": 3003 }, { "epoch": 0.49328977800113843, "grad_norm": 0.6518992781639099, "learning_rate": 9.556011996092359e-05, "loss": 5.1835, "num_input_tokens_seen": 394002432, "step": 3006 }, { "epoch": 0.4937820831688043, "grad_norm": 0.6519846320152283, "learning_rate": 9.551247093279984e-05, "loss": 5.1321, "num_input_tokens_seen": 394395648, "step": 3009 }, { "epoch": 0.4942743883364701, "grad_norm": 0.5854218006134033, "learning_rate": 9.546489311120117e-05, "loss": 5.1535, "num_input_tokens_seen": 394788864, "step": 3012 }, { "epoch": 0.4947666935041359, "grad_norm": 0.6409735083580017, "learning_rate": 9.541738631895289e-05, "loss": 5.1168, "num_input_tokens_seen": 395182080, "step": 3015 }, { "epoch": 0.4952589986718017, "grad_norm": 0.6534507870674133, "learning_rate": 9.536995037949675e-05, "loss": 5.1957, "num_input_tokens_seen": 395575296, "step": 3018 }, { "epoch": 0.49575130383946747, "grad_norm": 0.5633962154388428, "learning_rate": 9.53225851168884e-05, "loss": 5.1291, "num_input_tokens_seen": 395968512, "step": 3021 }, { "epoch": 0.4962436090071333, "grad_norm": 0.646615207195282, "learning_rate": 9.527529035579451e-05, "loss": 5.14, "num_input_tokens_seen": 396361728, "step": 3024 }, { "epoch": 0.4967359141747991, "grad_norm": 0.6950668096542358, "learning_rate": 9.522806592149013e-05, "loss": 5.153, "num_input_tokens_seen": 396754944, "step": 3027 }, { "epoch": 0.4972282193424649, "grad_norm": 0.5976789593696594, "learning_rate": 9.518091163985591e-05, "loss": 5.1526, "num_input_tokens_seen": 397148160, "step": 3030 }, { "epoch": 0.4977205245101307, "grad_norm": 0.7048370838165283, "learning_rate": 9.513382733737545e-05, "loss": 5.0725, "num_input_tokens_seen": 397541376, "step": 3033 }, { "epoch": 0.4982128296777965, "grad_norm": 0.6047758460044861, "learning_rate": 9.508681284113262e-05, "loss": 5.1815, "num_input_tokens_seen": 397934592, "step": 3036 }, { "epoch": 0.4987051348454623, "grad_norm": 0.581046462059021, "learning_rate": 9.503986797880886e-05, "loss": 5.1895, "num_input_tokens_seen": 398327808, "step": 3039 }, { "epoch": 0.49919744001312816, "grad_norm": 0.5836671590805054, "learning_rate": 9.499299257868052e-05, "loss": 5.1375, "num_input_tokens_seen": 398721024, "step": 3042 }, { "epoch": 0.49968974518079395, "grad_norm": 0.6080839037895203, "learning_rate": 9.494618646961631e-05, "loss": 5.1807, "num_input_tokens_seen": 399114240, "step": 3045 }, { "epoch": 0.5001820503484598, "grad_norm": 0.603197455406189, "learning_rate": 9.489944948107455e-05, "loss": 5.1038, "num_input_tokens_seen": 399507456, "step": 3048 }, { "epoch": 0.5006743555161256, "grad_norm": 0.5664237141609192, "learning_rate": 9.485278144310068e-05, "loss": 5.132, "num_input_tokens_seen": 399900672, "step": 3051 }, { "epoch": 0.5011666606837913, "grad_norm": 0.6530909538269043, "learning_rate": 9.480618218632454e-05, "loss": 5.1332, "num_input_tokens_seen": 400293888, "step": 3054 }, { "epoch": 0.5016589658514572, "grad_norm": 0.6573035717010498, "learning_rate": 9.475965154195791e-05, "loss": 5.2157, "num_input_tokens_seen": 400687104, "step": 3057 }, { "epoch": 0.5021512710191229, "grad_norm": 0.6319076418876648, "learning_rate": 9.471318934179186e-05, "loss": 5.1244, "num_input_tokens_seen": 401080320, "step": 3060 }, { "epoch": 0.5026435761867888, "grad_norm": 0.6034519076347351, "learning_rate": 9.466679541819426e-05, "loss": 5.1375, "num_input_tokens_seen": 401473536, "step": 3063 }, { "epoch": 0.5031358813544546, "grad_norm": 0.6358224749565125, "learning_rate": 9.462046960410713e-05, "loss": 5.1482, "num_input_tokens_seen": 401866752, "step": 3066 }, { "epoch": 0.5036281865221204, "grad_norm": 0.5579473376274109, "learning_rate": 9.457421173304426e-05, "loss": 5.1301, "num_input_tokens_seen": 402259968, "step": 3069 }, { "epoch": 0.5041204916897862, "grad_norm": 0.582528829574585, "learning_rate": 9.452802163908858e-05, "loss": 5.1343, "num_input_tokens_seen": 402653184, "step": 3072 }, { "epoch": 0.504612796857452, "grad_norm": 0.5620675683021545, "learning_rate": 9.448189915688972e-05, "loss": 5.1392, "num_input_tokens_seen": 403046400, "step": 3075 }, { "epoch": 0.5051051020251178, "grad_norm": 0.5927409529685974, "learning_rate": 9.443584412166155e-05, "loss": 5.1243, "num_input_tokens_seen": 403439616, "step": 3078 }, { "epoch": 0.5055974071927837, "grad_norm": 0.6298641562461853, "learning_rate": 9.438985636917958e-05, "loss": 5.1756, "num_input_tokens_seen": 403832832, "step": 3081 }, { "epoch": 0.5060897123604494, "grad_norm": 0.5789916515350342, "learning_rate": 9.434393573577864e-05, "loss": 5.0964, "num_input_tokens_seen": 404226048, "step": 3084 }, { "epoch": 0.5065820175281153, "grad_norm": 0.6592864394187927, "learning_rate": 9.429808205835036e-05, "loss": 5.1169, "num_input_tokens_seen": 404619264, "step": 3087 }, { "epoch": 0.507074322695781, "grad_norm": 0.5415318608283997, "learning_rate": 9.42522951743408e-05, "loss": 5.0733, "num_input_tokens_seen": 405012480, "step": 3090 }, { "epoch": 0.5075666278634469, "grad_norm": 0.6288437247276306, "learning_rate": 9.420657492174793e-05, "loss": 5.1799, "num_input_tokens_seen": 405405696, "step": 3093 }, { "epoch": 0.5080589330311127, "grad_norm": 0.5948610901832581, "learning_rate": 9.416092113911928e-05, "loss": 5.1405, "num_input_tokens_seen": 405798912, "step": 3096 }, { "epoch": 0.5085512381987785, "grad_norm": 0.5589770078659058, "learning_rate": 9.411533366554959e-05, "loss": 5.1584, "num_input_tokens_seen": 406192128, "step": 3099 }, { "epoch": 0.5090435433664443, "grad_norm": 0.6298828721046448, "learning_rate": 9.406981234067836e-05, "loss": 5.1886, "num_input_tokens_seen": 406585344, "step": 3102 }, { "epoch": 0.50953584853411, "grad_norm": 0.5577113628387451, "learning_rate": 9.40243570046875e-05, "loss": 5.145, "num_input_tokens_seen": 406978560, "step": 3105 }, { "epoch": 0.5100281537017759, "grad_norm": 0.6395171284675598, "learning_rate": 9.397896749829895e-05, "loss": 5.0892, "num_input_tokens_seen": 407371776, "step": 3108 }, { "epoch": 0.5105204588694416, "grad_norm": 0.7225360870361328, "learning_rate": 9.393364366277242e-05, "loss": 5.126, "num_input_tokens_seen": 407764992, "step": 3111 }, { "epoch": 0.5110127640371075, "grad_norm": 0.7004207968711853, "learning_rate": 9.388838533990295e-05, "loss": 5.148, "num_input_tokens_seen": 408158208, "step": 3114 }, { "epoch": 0.5115050692047733, "grad_norm": 0.705188512802124, "learning_rate": 9.384319237201867e-05, "loss": 5.1684, "num_input_tokens_seen": 408551424, "step": 3117 }, { "epoch": 0.5119973743724391, "grad_norm": 0.6463566422462463, "learning_rate": 9.379806460197844e-05, "loss": 5.1332, "num_input_tokens_seen": 408944640, "step": 3120 }, { "epoch": 0.5124896795401049, "grad_norm": 0.8040825724601746, "learning_rate": 9.375300187316961e-05, "loss": 5.1332, "num_input_tokens_seen": 409337856, "step": 3123 }, { "epoch": 0.5129819847077707, "grad_norm": 0.6625365018844604, "learning_rate": 9.37080040295057e-05, "loss": 5.1883, "num_input_tokens_seen": 409731072, "step": 3126 }, { "epoch": 0.5134742898754365, "grad_norm": 0.5686776041984558, "learning_rate": 9.36630709154241e-05, "loss": 5.108, "num_input_tokens_seen": 410124288, "step": 3129 }, { "epoch": 0.5139665950431024, "grad_norm": 0.6858258247375488, "learning_rate": 9.36182023758839e-05, "loss": 5.1547, "num_input_tokens_seen": 410517504, "step": 3132 }, { "epoch": 0.5144589002107681, "grad_norm": 0.7715529203414917, "learning_rate": 9.357339825636354e-05, "loss": 5.1571, "num_input_tokens_seen": 410910720, "step": 3135 }, { "epoch": 0.514951205378434, "grad_norm": 0.6023756265640259, "learning_rate": 9.352865840285866e-05, "loss": 5.1242, "num_input_tokens_seen": 411303936, "step": 3138 }, { "epoch": 0.5154435105460997, "grad_norm": 0.5971417427062988, "learning_rate": 9.348398266187983e-05, "loss": 5.1568, "num_input_tokens_seen": 411697152, "step": 3141 }, { "epoch": 0.5159358157137656, "grad_norm": 0.6816672682762146, "learning_rate": 9.343937088045033e-05, "loss": 5.0748, "num_input_tokens_seen": 412090368, "step": 3144 }, { "epoch": 0.5164281208814314, "grad_norm": 0.6150919795036316, "learning_rate": 9.339482290610404e-05, "loss": 5.1536, "num_input_tokens_seen": 412483584, "step": 3147 }, { "epoch": 0.5169204260490972, "grad_norm": 0.5576636791229248, "learning_rate": 9.335033858688308e-05, "loss": 5.1204, "num_input_tokens_seen": 412876800, "step": 3150 }, { "epoch": 0.517412731216763, "grad_norm": 0.6487387418746948, "learning_rate": 9.330591777133583e-05, "loss": 5.1259, "num_input_tokens_seen": 413270016, "step": 3153 }, { "epoch": 0.5179050363844288, "grad_norm": 0.5932300090789795, "learning_rate": 9.32615603085146e-05, "loss": 5.142, "num_input_tokens_seen": 413663232, "step": 3156 }, { "epoch": 0.5183973415520946, "grad_norm": 0.595696210861206, "learning_rate": 9.321726604797357e-05, "loss": 5.1505, "num_input_tokens_seen": 414056448, "step": 3159 }, { "epoch": 0.5188896467197605, "grad_norm": 0.5437079668045044, "learning_rate": 9.317303483976665e-05, "loss": 5.0969, "num_input_tokens_seen": 414449664, "step": 3162 }, { "epoch": 0.5193819518874262, "grad_norm": 0.6077457666397095, "learning_rate": 9.312886653444527e-05, "loss": 5.175, "num_input_tokens_seen": 414842880, "step": 3165 }, { "epoch": 0.519874257055092, "grad_norm": 0.5742005109786987, "learning_rate": 9.308476098305633e-05, "loss": 5.1699, "num_input_tokens_seen": 415236096, "step": 3168 }, { "epoch": 0.5203665622227578, "grad_norm": 0.5593070983886719, "learning_rate": 9.304071803714007e-05, "loss": 5.1766, "num_input_tokens_seen": 415629312, "step": 3171 }, { "epoch": 0.5208588673904236, "grad_norm": 0.6762742400169373, "learning_rate": 9.299673754872799e-05, "loss": 5.1687, "num_input_tokens_seen": 416022528, "step": 3174 }, { "epoch": 0.5213511725580895, "grad_norm": 0.6270245909690857, "learning_rate": 9.295281937034069e-05, "loss": 5.142, "num_input_tokens_seen": 416415744, "step": 3177 }, { "epoch": 0.5218434777257552, "grad_norm": 0.6227966547012329, "learning_rate": 9.290896335498588e-05, "loss": 5.1429, "num_input_tokens_seen": 416808960, "step": 3180 }, { "epoch": 0.5223357828934211, "grad_norm": 0.6371598839759827, "learning_rate": 9.286516935615632e-05, "loss": 5.1405, "num_input_tokens_seen": 417202176, "step": 3183 }, { "epoch": 0.5228280880610868, "grad_norm": 0.5920236706733704, "learning_rate": 9.282143722782764e-05, "loss": 5.1078, "num_input_tokens_seen": 417595392, "step": 3186 }, { "epoch": 0.5233203932287527, "grad_norm": 0.715696394443512, "learning_rate": 9.277776682445643e-05, "loss": 5.1564, "num_input_tokens_seen": 417988608, "step": 3189 }, { "epoch": 0.5238126983964185, "grad_norm": 0.7197597026824951, "learning_rate": 9.273415800097812e-05, "loss": 5.1441, "num_input_tokens_seen": 418381824, "step": 3192 }, { "epoch": 0.5243050035640843, "grad_norm": 0.6280763745307922, "learning_rate": 9.269061061280504e-05, "loss": 5.1577, "num_input_tokens_seen": 418775040, "step": 3195 }, { "epoch": 0.5247973087317501, "grad_norm": 0.5797785520553589, "learning_rate": 9.264712451582432e-05, "loss": 5.1851, "num_input_tokens_seen": 419168256, "step": 3198 }, { "epoch": 0.5251255121768607, "eval_accuracy": 0.22059110893991207, "eval_loss": 5.397455215454102, "eval_runtime": 110.5008, "eval_samples_per_second": 2.715, "eval_steps_per_second": 1.357, "num_input_tokens_seen": 419430400, "step": 3200 }, { "epoch": 0.5252896138994159, "grad_norm": 0.6440132260322571, "learning_rate": 9.260369956639594e-05, "loss": 5.1808, "num_input_tokens_seen": 419561472, "step": 3201 }, { "epoch": 0.5257819190670817, "grad_norm": 0.563518762588501, "learning_rate": 9.256033562135067e-05, "loss": 5.1735, "num_input_tokens_seen": 419954688, "step": 3204 }, { "epoch": 0.5262742242347476, "grad_norm": 0.6111391186714172, "learning_rate": 9.251703253798821e-05, "loss": 5.1588, "num_input_tokens_seen": 420347904, "step": 3207 }, { "epoch": 0.5267665294024133, "grad_norm": 0.723850429058075, "learning_rate": 9.247379017407515e-05, "loss": 5.1685, "num_input_tokens_seen": 420741120, "step": 3210 }, { "epoch": 0.5272588345700792, "grad_norm": 0.6539550423622131, "learning_rate": 9.24306083878429e-05, "loss": 5.1203, "num_input_tokens_seen": 421134336, "step": 3213 }, { "epoch": 0.5277511397377449, "grad_norm": 0.6335772275924683, "learning_rate": 9.238748703798599e-05, "loss": 5.1889, "num_input_tokens_seen": 421527552, "step": 3216 }, { "epoch": 0.5282434449054108, "grad_norm": 0.6394396424293518, "learning_rate": 9.234442598365984e-05, "loss": 5.0933, "num_input_tokens_seen": 421920768, "step": 3219 }, { "epoch": 0.5287357500730765, "grad_norm": 0.5733250975608826, "learning_rate": 9.230142508447905e-05, "loss": 5.1459, "num_input_tokens_seen": 422313984, "step": 3222 }, { "epoch": 0.5292280552407423, "grad_norm": 0.6153180003166199, "learning_rate": 9.225848420051536e-05, "loss": 5.1234, "num_input_tokens_seen": 422707200, "step": 3225 }, { "epoch": 0.5297203604084082, "grad_norm": 0.5874418616294861, "learning_rate": 9.221560319229582e-05, "loss": 5.1364, "num_input_tokens_seen": 423100416, "step": 3228 }, { "epoch": 0.5302126655760739, "grad_norm": 0.5672056674957275, "learning_rate": 9.217278192080077e-05, "loss": 5.1172, "num_input_tokens_seen": 423493632, "step": 3231 }, { "epoch": 0.5307049707437398, "grad_norm": 0.5874174237251282, "learning_rate": 9.213002024746207e-05, "loss": 5.1071, "num_input_tokens_seen": 423886848, "step": 3234 }, { "epoch": 0.5311972759114055, "grad_norm": 0.5822425484657288, "learning_rate": 9.208731803416115e-05, "loss": 5.1241, "num_input_tokens_seen": 424280064, "step": 3237 }, { "epoch": 0.5316895810790714, "grad_norm": 0.7283825278282166, "learning_rate": 9.204467514322719e-05, "loss": 5.1217, "num_input_tokens_seen": 424673280, "step": 3240 }, { "epoch": 0.5321818862467372, "grad_norm": 0.6498001217842102, "learning_rate": 9.200209143743507e-05, "loss": 5.0671, "num_input_tokens_seen": 425066496, "step": 3243 }, { "epoch": 0.532674191414403, "grad_norm": 0.6793512105941772, "learning_rate": 9.195956678000385e-05, "loss": 5.1119, "num_input_tokens_seen": 425459712, "step": 3246 }, { "epoch": 0.5331664965820688, "grad_norm": 0.5778270959854126, "learning_rate": 9.191710103459461e-05, "loss": 5.1512, "num_input_tokens_seen": 425852928, "step": 3249 }, { "epoch": 0.5336588017497346, "grad_norm": 0.6366815567016602, "learning_rate": 9.187469406530882e-05, "loss": 5.1113, "num_input_tokens_seen": 426246144, "step": 3252 }, { "epoch": 0.5341511069174004, "grad_norm": 0.5294430255889893, "learning_rate": 9.183234573668638e-05, "loss": 5.1288, "num_input_tokens_seen": 426639360, "step": 3255 }, { "epoch": 0.5346434120850663, "grad_norm": 0.5754626989364624, "learning_rate": 9.179005591370386e-05, "loss": 5.139, "num_input_tokens_seen": 427032576, "step": 3258 }, { "epoch": 0.535135717252732, "grad_norm": 0.6076295375823975, "learning_rate": 9.174782446177271e-05, "loss": 5.1094, "num_input_tokens_seen": 427425792, "step": 3261 }, { "epoch": 0.5356280224203979, "grad_norm": 0.6213559508323669, "learning_rate": 9.170565124673742e-05, "loss": 5.1228, "num_input_tokens_seen": 427819008, "step": 3264 }, { "epoch": 0.5361203275880636, "grad_norm": 0.5182236433029175, "learning_rate": 9.166353613487377e-05, "loss": 5.1282, "num_input_tokens_seen": 428212224, "step": 3267 }, { "epoch": 0.5366126327557295, "grad_norm": 0.5979652404785156, "learning_rate": 9.162147899288702e-05, "loss": 5.1215, "num_input_tokens_seen": 428605440, "step": 3270 }, { "epoch": 0.5371049379233953, "grad_norm": 0.5665757060050964, "learning_rate": 9.15794796879101e-05, "loss": 5.1216, "num_input_tokens_seen": 428998656, "step": 3273 }, { "epoch": 0.537597243091061, "grad_norm": 0.7053664922714233, "learning_rate": 9.1537538087502e-05, "loss": 5.1368, "num_input_tokens_seen": 429391872, "step": 3276 }, { "epoch": 0.5380895482587269, "grad_norm": 0.6297977566719055, "learning_rate": 9.149565405964579e-05, "loss": 5.1418, "num_input_tokens_seen": 429785088, "step": 3279 }, { "epoch": 0.5385818534263926, "grad_norm": 0.6866833567619324, "learning_rate": 9.145382747274708e-05, "loss": 5.0998, "num_input_tokens_seen": 430178304, "step": 3282 }, { "epoch": 0.5390741585940585, "grad_norm": 0.5878569483757019, "learning_rate": 9.141205819563218e-05, "loss": 5.1253, "num_input_tokens_seen": 430571520, "step": 3285 }, { "epoch": 0.5395664637617243, "grad_norm": 0.5595548748970032, "learning_rate": 9.137034609754635e-05, "loss": 5.1524, "num_input_tokens_seen": 430964736, "step": 3288 }, { "epoch": 0.5400587689293901, "grad_norm": 0.5856724381446838, "learning_rate": 9.132869104815211e-05, "loss": 5.1267, "num_input_tokens_seen": 431357952, "step": 3291 }, { "epoch": 0.5405510740970559, "grad_norm": 0.5340684652328491, "learning_rate": 9.128709291752767e-05, "loss": 5.1085, "num_input_tokens_seen": 431751168, "step": 3294 }, { "epoch": 0.5410433792647217, "grad_norm": 0.5203419923782349, "learning_rate": 9.124555157616496e-05, "loss": 5.061, "num_input_tokens_seen": 432144384, "step": 3297 }, { "epoch": 0.5415356844323875, "grad_norm": 0.6170658469200134, "learning_rate": 9.12040668949681e-05, "loss": 5.098, "num_input_tokens_seen": 432537600, "step": 3300 }, { "epoch": 0.5420279896000534, "grad_norm": 0.6464248299598694, "learning_rate": 9.116263874525175e-05, "loss": 5.0687, "num_input_tokens_seen": 432930816, "step": 3303 }, { "epoch": 0.5425202947677191, "grad_norm": 0.6081656217575073, "learning_rate": 9.112126699873929e-05, "loss": 5.1405, "num_input_tokens_seen": 433324032, "step": 3306 }, { "epoch": 0.543012599935385, "grad_norm": 0.7273305058479309, "learning_rate": 9.10799515275613e-05, "loss": 5.134, "num_input_tokens_seen": 433717248, "step": 3309 }, { "epoch": 0.5435049051030507, "grad_norm": 0.5423167943954468, "learning_rate": 9.103869220425383e-05, "loss": 5.0982, "num_input_tokens_seen": 434110464, "step": 3312 }, { "epoch": 0.5439972102707166, "grad_norm": 0.6296489238739014, "learning_rate": 9.099748890175672e-05, "loss": 5.1654, "num_input_tokens_seen": 434503680, "step": 3315 }, { "epoch": 0.5444895154383824, "grad_norm": 0.5735786557197571, "learning_rate": 9.0956341493412e-05, "loss": 5.1322, "num_input_tokens_seen": 434896896, "step": 3318 }, { "epoch": 0.5449818206060482, "grad_norm": 0.5762762427330017, "learning_rate": 9.091524985296227e-05, "loss": 5.1361, "num_input_tokens_seen": 435290112, "step": 3321 }, { "epoch": 0.545474125773714, "grad_norm": 0.5767176747322083, "learning_rate": 9.087421385454902e-05, "loss": 5.1571, "num_input_tokens_seen": 435683328, "step": 3324 }, { "epoch": 0.5459664309413798, "grad_norm": 0.579581081867218, "learning_rate": 9.083323337271104e-05, "loss": 5.1392, "num_input_tokens_seen": 436076544, "step": 3327 }, { "epoch": 0.5464587361090456, "grad_norm": 0.6044655442237854, "learning_rate": 9.079230828238284e-05, "loss": 5.1106, "num_input_tokens_seen": 436469760, "step": 3330 }, { "epoch": 0.5469510412767113, "grad_norm": 0.5802748203277588, "learning_rate": 9.075143845889296e-05, "loss": 5.1174, "num_input_tokens_seen": 436862976, "step": 3333 }, { "epoch": 0.5474433464443772, "grad_norm": 0.6058889031410217, "learning_rate": 9.071062377796246e-05, "loss": 5.0931, "num_input_tokens_seen": 437256192, "step": 3336 }, { "epoch": 0.547935651612043, "grad_norm": 0.5531756281852722, "learning_rate": 9.066986411570333e-05, "loss": 5.1264, "num_input_tokens_seen": 437649408, "step": 3339 }, { "epoch": 0.5484279567797088, "grad_norm": 0.5685104131698608, "learning_rate": 9.062915934861684e-05, "loss": 5.0803, "num_input_tokens_seen": 438042624, "step": 3342 }, { "epoch": 0.5489202619473746, "grad_norm": 0.6249758005142212, "learning_rate": 9.058850935359201e-05, "loss": 5.1444, "num_input_tokens_seen": 438435840, "step": 3345 }, { "epoch": 0.5494125671150404, "grad_norm": 0.6605463624000549, "learning_rate": 9.054791400790408e-05, "loss": 5.1445, "num_input_tokens_seen": 438829056, "step": 3348 }, { "epoch": 0.5499048722827062, "grad_norm": 0.5551895499229431, "learning_rate": 9.050737318921291e-05, "loss": 5.1388, "num_input_tokens_seen": 439222272, "step": 3351 }, { "epoch": 0.5503971774503721, "grad_norm": 0.6781189441680908, "learning_rate": 9.046688677556144e-05, "loss": 5.1199, "num_input_tokens_seen": 439615488, "step": 3354 }, { "epoch": 0.5508894826180378, "grad_norm": 0.7572456002235413, "learning_rate": 9.042645464537411e-05, "loss": 5.0652, "num_input_tokens_seen": 440008704, "step": 3357 }, { "epoch": 0.5513817877857037, "grad_norm": 0.642903208732605, "learning_rate": 9.038607667745545e-05, "loss": 5.1682, "num_input_tokens_seen": 440401920, "step": 3360 }, { "epoch": 0.5518740929533694, "grad_norm": 0.6645106673240662, "learning_rate": 9.03457527509884e-05, "loss": 5.1355, "num_input_tokens_seen": 440795136, "step": 3363 }, { "epoch": 0.5523663981210353, "grad_norm": 0.7782042026519775, "learning_rate": 9.03054827455329e-05, "loss": 5.0755, "num_input_tokens_seen": 441188352, "step": 3366 }, { "epoch": 0.5528587032887011, "grad_norm": 0.6609596014022827, "learning_rate": 9.026526654102436e-05, "loss": 5.1567, "num_input_tokens_seen": 441581568, "step": 3369 }, { "epoch": 0.5533510084563669, "grad_norm": 0.6382880210876465, "learning_rate": 9.02251040177721e-05, "loss": 5.0827, "num_input_tokens_seen": 441974784, "step": 3372 }, { "epoch": 0.5538433136240327, "grad_norm": 0.6710590720176697, "learning_rate": 9.018499505645787e-05, "loss": 5.0826, "num_input_tokens_seen": 442368000, "step": 3375 }, { "epoch": 0.5543356187916985, "grad_norm": 0.6333158612251282, "learning_rate": 9.014493953813449e-05, "loss": 5.1026, "num_input_tokens_seen": 442761216, "step": 3378 }, { "epoch": 0.5548279239593643, "grad_norm": 0.6261973977088928, "learning_rate": 9.010493734422417e-05, "loss": 5.1521, "num_input_tokens_seen": 443154432, "step": 3381 }, { "epoch": 0.5553202291270302, "grad_norm": 0.6474428772926331, "learning_rate": 9.006498835651718e-05, "loss": 5.1015, "num_input_tokens_seen": 443547648, "step": 3384 }, { "epoch": 0.5558125342946959, "grad_norm": 0.606153666973114, "learning_rate": 9.002509245717025e-05, "loss": 5.0685, "num_input_tokens_seen": 443940864, "step": 3387 }, { "epoch": 0.5563048394623618, "grad_norm": 0.6537392139434814, "learning_rate": 8.998524952870532e-05, "loss": 5.1488, "num_input_tokens_seen": 444334080, "step": 3390 }, { "epoch": 0.5567971446300275, "grad_norm": 0.5912271738052368, "learning_rate": 8.994545945400785e-05, "loss": 5.1406, "num_input_tokens_seen": 444727296, "step": 3393 }, { "epoch": 0.5572894497976933, "grad_norm": 0.5567827820777893, "learning_rate": 8.990572211632556e-05, "loss": 5.0851, "num_input_tokens_seen": 445120512, "step": 3396 }, { "epoch": 0.5577817549653592, "grad_norm": 0.5647953152656555, "learning_rate": 8.986603739926683e-05, "loss": 5.1124, "num_input_tokens_seen": 445513728, "step": 3399 }, { "epoch": 0.5582740601330249, "grad_norm": 0.5637276768684387, "learning_rate": 8.982640518679943e-05, "loss": 5.0972, "num_input_tokens_seen": 445906944, "step": 3402 }, { "epoch": 0.5587663653006908, "grad_norm": 0.5551077127456665, "learning_rate": 8.978682536324898e-05, "loss": 5.1073, "num_input_tokens_seen": 446300160, "step": 3405 }, { "epoch": 0.5592586704683565, "grad_norm": 0.5078896880149841, "learning_rate": 8.974729781329759e-05, "loss": 5.0471, "num_input_tokens_seen": 446693376, "step": 3408 }, { "epoch": 0.5597509756360224, "grad_norm": 0.5744684338569641, "learning_rate": 8.970782242198242e-05, "loss": 5.0684, "num_input_tokens_seen": 447086592, "step": 3411 }, { "epoch": 0.5602432808036882, "grad_norm": 0.587356448173523, "learning_rate": 8.966839907469425e-05, "loss": 5.1192, "num_input_tokens_seen": 447479808, "step": 3414 }, { "epoch": 0.560735585971354, "grad_norm": 0.5351012349128723, "learning_rate": 8.962902765717617e-05, "loss": 5.0918, "num_input_tokens_seen": 447873024, "step": 3417 }, { "epoch": 0.5612278911390198, "grad_norm": 0.5939494967460632, "learning_rate": 8.958970805552213e-05, "loss": 5.096, "num_input_tokens_seen": 448266240, "step": 3420 }, { "epoch": 0.5617201963066856, "grad_norm": 0.6274368166923523, "learning_rate": 8.955044015617547e-05, "loss": 5.0876, "num_input_tokens_seen": 448659456, "step": 3423 }, { "epoch": 0.5622125014743514, "grad_norm": 0.5798152089118958, "learning_rate": 8.951122384592781e-05, "loss": 5.0891, "num_input_tokens_seen": 449052672, "step": 3426 }, { "epoch": 0.5627048066420173, "grad_norm": 0.652629017829895, "learning_rate": 8.947205901191733e-05, "loss": 5.1373, "num_input_tokens_seen": 449445888, "step": 3429 }, { "epoch": 0.563197111809683, "grad_norm": 0.5612648725509644, "learning_rate": 8.94329455416277e-05, "loss": 5.1606, "num_input_tokens_seen": 449839104, "step": 3432 }, { "epoch": 0.5636894169773489, "grad_norm": 0.6374395489692688, "learning_rate": 8.939388332288653e-05, "loss": 5.1123, "num_input_tokens_seen": 450232320, "step": 3435 }, { "epoch": 0.5641817221450146, "grad_norm": 0.5904387831687927, "learning_rate": 8.93548722438641e-05, "loss": 5.0859, "num_input_tokens_seen": 450625536, "step": 3438 }, { "epoch": 0.5646740273126805, "grad_norm": 0.6425766944885254, "learning_rate": 8.931591219307205e-05, "loss": 5.0884, "num_input_tokens_seen": 451018752, "step": 3441 }, { "epoch": 0.5651663324803462, "grad_norm": 0.6238811612129211, "learning_rate": 8.927700305936195e-05, "loss": 5.0994, "num_input_tokens_seen": 451411968, "step": 3444 }, { "epoch": 0.565658637648012, "grad_norm": 0.6211217045783997, "learning_rate": 8.923814473192402e-05, "loss": 5.0887, "num_input_tokens_seen": 451805184, "step": 3447 }, { "epoch": 0.5661509428156779, "grad_norm": 0.6308383941650391, "learning_rate": 8.919933710028586e-05, "loss": 5.1025, "num_input_tokens_seen": 452198400, "step": 3450 }, { "epoch": 0.5666432479833436, "grad_norm": 0.6414578557014465, "learning_rate": 8.916058005431099e-05, "loss": 5.1159, "num_input_tokens_seen": 452591616, "step": 3453 }, { "epoch": 0.5671355531510095, "grad_norm": 0.6173880100250244, "learning_rate": 8.912187348419765e-05, "loss": 5.0787, "num_input_tokens_seen": 452984832, "step": 3456 }, { "epoch": 0.5676278583186752, "grad_norm": 0.567276656627655, "learning_rate": 8.908321728047749e-05, "loss": 5.0937, "num_input_tokens_seen": 453378048, "step": 3459 }, { "epoch": 0.5681201634863411, "grad_norm": 0.7052215337753296, "learning_rate": 8.904461133401418e-05, "loss": 5.1027, "num_input_tokens_seen": 453771264, "step": 3462 }, { "epoch": 0.5686124686540069, "grad_norm": 0.6379631161689758, "learning_rate": 8.900605553600226e-05, "loss": 5.1089, "num_input_tokens_seen": 454164480, "step": 3465 }, { "epoch": 0.5691047738216727, "grad_norm": 0.6003096699714661, "learning_rate": 8.896754977796572e-05, "loss": 5.1104, "num_input_tokens_seen": 454557696, "step": 3468 }, { "epoch": 0.5695970789893385, "grad_norm": 0.798956036567688, "learning_rate": 8.892909395175676e-05, "loss": 5.1038, "num_input_tokens_seen": 454950912, "step": 3471 }, { "epoch": 0.5700893841570043, "grad_norm": 0.6860202550888062, "learning_rate": 8.889068794955451e-05, "loss": 5.0823, "num_input_tokens_seen": 455344128, "step": 3474 }, { "epoch": 0.5705816893246701, "grad_norm": 0.6046934723854065, "learning_rate": 8.885233166386384e-05, "loss": 5.0855, "num_input_tokens_seen": 455737344, "step": 3477 }, { "epoch": 0.571073994492336, "grad_norm": 0.5969598293304443, "learning_rate": 8.881402498751399e-05, "loss": 5.0868, "num_input_tokens_seen": 456130560, "step": 3480 }, { "epoch": 0.5715662996600017, "grad_norm": 0.6891276240348816, "learning_rate": 8.877576781365732e-05, "loss": 5.1207, "num_input_tokens_seen": 456523776, "step": 3483 }, { "epoch": 0.5720586048276676, "grad_norm": 0.6835845112800598, "learning_rate": 8.87375600357681e-05, "loss": 5.0923, "num_input_tokens_seen": 456916992, "step": 3486 }, { "epoch": 0.5725509099953333, "grad_norm": 0.6710256338119507, "learning_rate": 8.869940154764131e-05, "loss": 5.105, "num_input_tokens_seen": 457310208, "step": 3489 }, { "epoch": 0.5730432151629992, "grad_norm": 0.5943020582199097, "learning_rate": 8.866129224339131e-05, "loss": 5.1724, "num_input_tokens_seen": 457703424, "step": 3492 }, { "epoch": 0.573535520330665, "grad_norm": 0.6241229176521301, "learning_rate": 8.862323201745062e-05, "loss": 5.0805, "num_input_tokens_seen": 458096640, "step": 3495 }, { "epoch": 0.5740278254983308, "grad_norm": 0.6597952842712402, "learning_rate": 8.85852207645687e-05, "loss": 5.051, "num_input_tokens_seen": 458489856, "step": 3498 }, { "epoch": 0.5745201306659966, "grad_norm": 0.5514165759086609, "learning_rate": 8.854725837981081e-05, "loss": 5.1251, "num_input_tokens_seen": 458883072, "step": 3501 }, { "epoch": 0.5750124358336623, "grad_norm": 0.5667795538902283, "learning_rate": 8.850934475855665e-05, "loss": 5.0993, "num_input_tokens_seen": 459276288, "step": 3504 }, { "epoch": 0.5755047410013282, "grad_norm": 0.5708199739456177, "learning_rate": 8.847147979649926e-05, "loss": 5.1099, "num_input_tokens_seen": 459669504, "step": 3507 }, { "epoch": 0.575997046168994, "grad_norm": 0.584382176399231, "learning_rate": 8.843366338964375e-05, "loss": 5.0977, "num_input_tokens_seen": 460062720, "step": 3510 }, { "epoch": 0.5764893513366598, "grad_norm": 0.5634648203849792, "learning_rate": 8.839589543430617e-05, "loss": 5.0354, "num_input_tokens_seen": 460455936, "step": 3513 }, { "epoch": 0.5769816565043256, "grad_norm": 0.7195287942886353, "learning_rate": 8.835817582711223e-05, "loss": 5.1015, "num_input_tokens_seen": 460849152, "step": 3516 }, { "epoch": 0.5774739616719914, "grad_norm": 0.5957933664321899, "learning_rate": 8.832050446499615e-05, "loss": 5.0715, "num_input_tokens_seen": 461242368, "step": 3519 }, { "epoch": 0.5779662668396572, "grad_norm": 0.6452832818031311, "learning_rate": 8.828288124519953e-05, "loss": 5.1224, "num_input_tokens_seen": 461635584, "step": 3522 }, { "epoch": 0.5784585720073231, "grad_norm": 0.6192216277122498, "learning_rate": 8.824530606527006e-05, "loss": 5.0501, "num_input_tokens_seen": 462028800, "step": 3525 }, { "epoch": 0.5789508771749888, "grad_norm": 0.6137304902076721, "learning_rate": 8.820777882306049e-05, "loss": 5.1225, "num_input_tokens_seen": 462422016, "step": 3528 }, { "epoch": 0.5794431823426547, "grad_norm": 0.5840086936950684, "learning_rate": 8.81702994167273e-05, "loss": 5.095, "num_input_tokens_seen": 462815232, "step": 3531 }, { "epoch": 0.5799354875103204, "grad_norm": 0.6205256581306458, "learning_rate": 8.81328677447297e-05, "loss": 5.094, "num_input_tokens_seen": 463208448, "step": 3534 }, { "epoch": 0.5804277926779863, "grad_norm": 0.5789561867713928, "learning_rate": 8.809548370582834e-05, "loss": 5.127, "num_input_tokens_seen": 463601664, "step": 3537 }, { "epoch": 0.5809200978456521, "grad_norm": 0.5926523208618164, "learning_rate": 8.805814719908426e-05, "loss": 5.0785, "num_input_tokens_seen": 463994880, "step": 3540 }, { "epoch": 0.5814124030133179, "grad_norm": 0.5335286855697632, "learning_rate": 8.80208581238577e-05, "loss": 5.1154, "num_input_tokens_seen": 464388096, "step": 3543 }, { "epoch": 0.5819047081809837, "grad_norm": 0.4992382824420929, "learning_rate": 8.798361637980696e-05, "loss": 5.0659, "num_input_tokens_seen": 464781312, "step": 3546 }, { "epoch": 0.5823970133486495, "grad_norm": 0.5407050251960754, "learning_rate": 8.794642186688725e-05, "loss": 5.0761, "num_input_tokens_seen": 465174528, "step": 3549 }, { "epoch": 0.5828893185163153, "grad_norm": 0.5841310024261475, "learning_rate": 8.79092744853496e-05, "loss": 5.0825, "num_input_tokens_seen": 465567744, "step": 3552 }, { "epoch": 0.583381623683981, "grad_norm": 0.5791674852371216, "learning_rate": 8.787217413573975e-05, "loss": 5.1143, "num_input_tokens_seen": 465960960, "step": 3555 }, { "epoch": 0.5838739288516469, "grad_norm": 0.6036920547485352, "learning_rate": 8.783512071889697e-05, "loss": 5.1271, "num_input_tokens_seen": 466354176, "step": 3558 }, { "epoch": 0.5843662340193128, "grad_norm": 0.5951531529426575, "learning_rate": 8.779811413595294e-05, "loss": 5.0859, "num_input_tokens_seen": 466747392, "step": 3561 }, { "epoch": 0.5848585391869785, "grad_norm": 0.7706826329231262, "learning_rate": 8.776115428833078e-05, "loss": 5.0744, "num_input_tokens_seen": 467140608, "step": 3564 }, { "epoch": 0.5853508443546444, "grad_norm": 0.6318513751029968, "learning_rate": 8.772424107774375e-05, "loss": 5.0896, "num_input_tokens_seen": 467533824, "step": 3567 }, { "epoch": 0.5858431495223101, "grad_norm": 0.6089410185813904, "learning_rate": 8.768737440619431e-05, "loss": 5.0848, "num_input_tokens_seen": 467927040, "step": 3570 }, { "epoch": 0.5863354546899759, "grad_norm": 0.7453569173812866, "learning_rate": 8.765055417597291e-05, "loss": 5.0298, "num_input_tokens_seen": 468320256, "step": 3573 }, { "epoch": 0.5868277598576418, "grad_norm": 0.6902857422828674, "learning_rate": 8.761378028965703e-05, "loss": 5.0966, "num_input_tokens_seen": 468713472, "step": 3576 }, { "epoch": 0.5873200650253075, "grad_norm": 0.5212551951408386, "learning_rate": 8.757705265010996e-05, "loss": 5.1311, "num_input_tokens_seen": 469106688, "step": 3579 }, { "epoch": 0.5878123701929734, "grad_norm": 0.679226815700531, "learning_rate": 8.754037116047984e-05, "loss": 5.0775, "num_input_tokens_seen": 469499904, "step": 3582 }, { "epoch": 0.5883046753606391, "grad_norm": 0.600480318069458, "learning_rate": 8.750373572419852e-05, "loss": 5.0304, "num_input_tokens_seen": 469893120, "step": 3585 }, { "epoch": 0.588796980528305, "grad_norm": 0.557918906211853, "learning_rate": 8.746714624498048e-05, "loss": 5.082, "num_input_tokens_seen": 470286336, "step": 3588 }, { "epoch": 0.5892892856959708, "grad_norm": 0.6487769484519958, "learning_rate": 8.743060262682181e-05, "loss": 5.0953, "num_input_tokens_seen": 470679552, "step": 3591 }, { "epoch": 0.5897815908636366, "grad_norm": 0.5848097801208496, "learning_rate": 8.739410477399918e-05, "loss": 5.1032, "num_input_tokens_seen": 471072768, "step": 3594 }, { "epoch": 0.5902738960313024, "grad_norm": 0.6712628602981567, "learning_rate": 8.735765259106869e-05, "loss": 5.104, "num_input_tokens_seen": 471465984, "step": 3597 }, { "epoch": 0.5907662011989682, "grad_norm": 0.7293500304222107, "learning_rate": 8.73212459828649e-05, "loss": 5.0618, "num_input_tokens_seen": 471859200, "step": 3600 }, { "epoch": 0.5907662011989682, "eval_accuracy": 0.21986158606090214, "eval_loss": 5.362356185913086, "eval_runtime": 110.5722, "eval_samples_per_second": 2.713, "eval_steps_per_second": 1.357, "num_input_tokens_seen": 471859200, "step": 3600 }, { "epoch": 0.591258506366634, "grad_norm": 0.6076777577400208, "learning_rate": 8.728488485449973e-05, "loss": 5.1032, "num_input_tokens_seen": 472252416, "step": 3603 }, { "epoch": 0.5917508115342999, "grad_norm": 0.5898135304450989, "learning_rate": 8.724856911136155e-05, "loss": 5.1125, "num_input_tokens_seen": 472645632, "step": 3606 }, { "epoch": 0.5922431167019656, "grad_norm": 0.6582708358764648, "learning_rate": 8.721229865911391e-05, "loss": 5.0623, "num_input_tokens_seen": 473038848, "step": 3609 }, { "epoch": 0.5927354218696315, "grad_norm": 0.6800294518470764, "learning_rate": 8.717607340369476e-05, "loss": 5.0556, "num_input_tokens_seen": 473432064, "step": 3612 }, { "epoch": 0.5932277270372972, "grad_norm": 0.6523749828338623, "learning_rate": 8.713989325131527e-05, "loss": 5.0741, "num_input_tokens_seen": 473825280, "step": 3615 }, { "epoch": 0.593720032204963, "grad_norm": 0.759653627872467, "learning_rate": 8.710375810845887e-05, "loss": 5.0508, "num_input_tokens_seen": 474218496, "step": 3618 }, { "epoch": 0.5942123373726289, "grad_norm": 0.6974580883979797, "learning_rate": 8.706766788188021e-05, "loss": 5.0766, "num_input_tokens_seen": 474611712, "step": 3621 }, { "epoch": 0.5947046425402946, "grad_norm": 0.5986417531967163, "learning_rate": 8.703162247860416e-05, "loss": 5.0783, "num_input_tokens_seen": 475004928, "step": 3624 }, { "epoch": 0.5951969477079605, "grad_norm": 0.8493059277534485, "learning_rate": 8.699562180592481e-05, "loss": 5.0947, "num_input_tokens_seen": 475398144, "step": 3627 }, { "epoch": 0.5956892528756262, "grad_norm": 0.6575183272361755, "learning_rate": 8.695966577140451e-05, "loss": 5.108, "num_input_tokens_seen": 475791360, "step": 3630 }, { "epoch": 0.5961815580432921, "grad_norm": 0.7009572982788086, "learning_rate": 8.692375428287271e-05, "loss": 5.099, "num_input_tokens_seen": 476184576, "step": 3633 }, { "epoch": 0.5966738632109579, "grad_norm": 0.8006022572517395, "learning_rate": 8.68878872484252e-05, "loss": 5.0507, "num_input_tokens_seen": 476577792, "step": 3636 }, { "epoch": 0.5971661683786237, "grad_norm": 0.5653481483459473, "learning_rate": 8.685206457642292e-05, "loss": 5.0688, "num_input_tokens_seen": 476971008, "step": 3639 }, { "epoch": 0.5976584735462895, "grad_norm": 0.7987883687019348, "learning_rate": 8.681628617549114e-05, "loss": 5.0972, "num_input_tokens_seen": 477364224, "step": 3642 }, { "epoch": 0.5981507787139553, "grad_norm": 0.6354634165763855, "learning_rate": 8.678055195451837e-05, "loss": 5.0745, "num_input_tokens_seen": 477757440, "step": 3645 }, { "epoch": 0.5986430838816211, "grad_norm": 0.6245685815811157, "learning_rate": 8.67448618226554e-05, "loss": 5.07, "num_input_tokens_seen": 478150656, "step": 3648 }, { "epoch": 0.599135389049287, "grad_norm": 0.6478342413902283, "learning_rate": 8.670921568931434e-05, "loss": 5.1107, "num_input_tokens_seen": 478543872, "step": 3651 }, { "epoch": 0.5996276942169527, "grad_norm": 0.6710765957832336, "learning_rate": 8.667361346416774e-05, "loss": 5.0968, "num_input_tokens_seen": 478937088, "step": 3654 }, { "epoch": 0.6001199993846186, "grad_norm": 0.5611339807510376, "learning_rate": 8.663805505714746e-05, "loss": 5.0798, "num_input_tokens_seen": 479330304, "step": 3657 }, { "epoch": 0.6006123045522843, "grad_norm": 0.593101441860199, "learning_rate": 8.660254037844386e-05, "loss": 5.1296, "num_input_tokens_seen": 479723520, "step": 3660 }, { "epoch": 0.6011046097199502, "grad_norm": 0.7235480546951294, "learning_rate": 8.656706933850477e-05, "loss": 5.0782, "num_input_tokens_seen": 480116736, "step": 3663 }, { "epoch": 0.6015969148876159, "grad_norm": 0.5439255237579346, "learning_rate": 8.653164184803456e-05, "loss": 5.0872, "num_input_tokens_seen": 480509952, "step": 3666 }, { "epoch": 0.6020892200552818, "grad_norm": 0.5817519426345825, "learning_rate": 8.649625781799325e-05, "loss": 5.1071, "num_input_tokens_seen": 480903168, "step": 3669 }, { "epoch": 0.6025815252229476, "grad_norm": 0.6808876395225525, "learning_rate": 8.646091715959547e-05, "loss": 5.0952, "num_input_tokens_seen": 481296384, "step": 3672 }, { "epoch": 0.6030738303906134, "grad_norm": 0.6932339668273926, "learning_rate": 8.642561978430955e-05, "loss": 5.0893, "num_input_tokens_seen": 481689600, "step": 3675 }, { "epoch": 0.6035661355582792, "grad_norm": 0.6061970591545105, "learning_rate": 8.63903656038567e-05, "loss": 5.0866, "num_input_tokens_seen": 482082816, "step": 3678 }, { "epoch": 0.604058440725945, "grad_norm": 0.5691749453544617, "learning_rate": 8.635515453020989e-05, "loss": 5.0906, "num_input_tokens_seen": 482476032, "step": 3681 }, { "epoch": 0.6045507458936108, "grad_norm": 0.5950030088424683, "learning_rate": 8.631998647559312e-05, "loss": 5.0537, "num_input_tokens_seen": 482869248, "step": 3684 }, { "epoch": 0.6050430510612766, "grad_norm": 0.6372474431991577, "learning_rate": 8.628486135248037e-05, "loss": 5.0622, "num_input_tokens_seen": 483262464, "step": 3687 }, { "epoch": 0.6055353562289424, "grad_norm": 0.7356825470924377, "learning_rate": 8.624977907359473e-05, "loss": 5.1061, "num_input_tokens_seen": 483655680, "step": 3690 }, { "epoch": 0.6060276613966082, "grad_norm": 0.604340136051178, "learning_rate": 8.621473955190753e-05, "loss": 5.0179, "num_input_tokens_seen": 484048896, "step": 3693 }, { "epoch": 0.606519966564274, "grad_norm": 0.665763258934021, "learning_rate": 8.617974270063731e-05, "loss": 5.0976, "num_input_tokens_seen": 484442112, "step": 3696 }, { "epoch": 0.6070122717319398, "grad_norm": 0.5729550719261169, "learning_rate": 8.614478843324907e-05, "loss": 5.1136, "num_input_tokens_seen": 484835328, "step": 3699 }, { "epoch": 0.6075045768996057, "grad_norm": 0.6027372479438782, "learning_rate": 8.61098766634533e-05, "loss": 5.0495, "num_input_tokens_seen": 485228544, "step": 3702 }, { "epoch": 0.6079968820672714, "grad_norm": 0.5599421858787537, "learning_rate": 8.607500730520499e-05, "loss": 5.1028, "num_input_tokens_seen": 485621760, "step": 3705 }, { "epoch": 0.6084891872349373, "grad_norm": 0.7194706201553345, "learning_rate": 8.604018027270296e-05, "loss": 5.0837, "num_input_tokens_seen": 486014976, "step": 3708 }, { "epoch": 0.608981492402603, "grad_norm": 0.5925415754318237, "learning_rate": 8.600539548038875e-05, "loss": 5.0818, "num_input_tokens_seen": 486408192, "step": 3711 }, { "epoch": 0.6094737975702689, "grad_norm": 0.5674649477005005, "learning_rate": 8.597065284294591e-05, "loss": 5.1209, "num_input_tokens_seen": 486801408, "step": 3714 }, { "epoch": 0.6099661027379347, "grad_norm": 0.5196229815483093, "learning_rate": 8.5935952275299e-05, "loss": 5.0793, "num_input_tokens_seen": 487194624, "step": 3717 }, { "epoch": 0.6104584079056005, "grad_norm": 0.5827131271362305, "learning_rate": 8.590129369261278e-05, "loss": 5.0466, "num_input_tokens_seen": 487587840, "step": 3720 }, { "epoch": 0.6109507130732663, "grad_norm": 0.6285468339920044, "learning_rate": 8.586667701029127e-05, "loss": 5.1242, "num_input_tokens_seen": 487981056, "step": 3723 }, { "epoch": 0.6114430182409321, "grad_norm": 0.5768407583236694, "learning_rate": 8.583210214397702e-05, "loss": 5.0845, "num_input_tokens_seen": 488374272, "step": 3726 }, { "epoch": 0.6119353234085979, "grad_norm": 0.6446322798728943, "learning_rate": 8.57975690095501e-05, "loss": 5.0994, "num_input_tokens_seen": 488767488, "step": 3729 }, { "epoch": 0.6124276285762638, "grad_norm": 0.5369075536727905, "learning_rate": 8.57630775231273e-05, "loss": 5.0974, "num_input_tokens_seen": 489160704, "step": 3732 }, { "epoch": 0.6129199337439295, "grad_norm": 0.6329900026321411, "learning_rate": 8.572862760106127e-05, "loss": 5.0761, "num_input_tokens_seen": 489553920, "step": 3735 }, { "epoch": 0.6134122389115954, "grad_norm": 0.578373908996582, "learning_rate": 8.569421915993972e-05, "loss": 5.086, "num_input_tokens_seen": 489947136, "step": 3738 }, { "epoch": 0.6139045440792611, "grad_norm": 0.598923921585083, "learning_rate": 8.565985211658447e-05, "loss": 5.055, "num_input_tokens_seen": 490340352, "step": 3741 }, { "epoch": 0.614396849246927, "grad_norm": 0.6580232977867126, "learning_rate": 8.562552638805071e-05, "loss": 5.0599, "num_input_tokens_seen": 490733568, "step": 3744 }, { "epoch": 0.6148891544145928, "grad_norm": 0.5640982985496521, "learning_rate": 8.559124189162605e-05, "loss": 5.0757, "num_input_tokens_seen": 491126784, "step": 3747 }, { "epoch": 0.6153814595822585, "grad_norm": 0.7263691425323486, "learning_rate": 8.555699854482974e-05, "loss": 5.0805, "num_input_tokens_seen": 491520000, "step": 3750 }, { "epoch": 0.6158737647499244, "grad_norm": 0.5789725184440613, "learning_rate": 8.552279626541192e-05, "loss": 5.0723, "num_input_tokens_seen": 491913216, "step": 3753 }, { "epoch": 0.6163660699175901, "grad_norm": 0.6067011952400208, "learning_rate": 8.548863497135262e-05, "loss": 5.0662, "num_input_tokens_seen": 492306432, "step": 3756 }, { "epoch": 0.616858375085256, "grad_norm": 0.6231025457382202, "learning_rate": 8.545451458086107e-05, "loss": 5.1009, "num_input_tokens_seen": 492699648, "step": 3759 }, { "epoch": 0.6173506802529218, "grad_norm": 0.7004355192184448, "learning_rate": 8.542043501237481e-05, "loss": 5.0582, "num_input_tokens_seen": 493092864, "step": 3762 }, { "epoch": 0.6178429854205876, "grad_norm": 0.6546366810798645, "learning_rate": 8.53863961845589e-05, "loss": 5.0931, "num_input_tokens_seen": 493486080, "step": 3765 }, { "epoch": 0.6183352905882534, "grad_norm": 0.6404768228530884, "learning_rate": 8.535239801630506e-05, "loss": 5.1405, "num_input_tokens_seen": 493879296, "step": 3768 }, { "epoch": 0.6188275957559192, "grad_norm": 0.6902008056640625, "learning_rate": 8.531844042673096e-05, "loss": 5.0707, "num_input_tokens_seen": 494272512, "step": 3771 }, { "epoch": 0.619319900923585, "grad_norm": 0.6566780209541321, "learning_rate": 8.528452333517929e-05, "loss": 5.1154, "num_input_tokens_seen": 494665728, "step": 3774 }, { "epoch": 0.6198122060912508, "grad_norm": 0.6058692932128906, "learning_rate": 8.525064666121706e-05, "loss": 5.0052, "num_input_tokens_seen": 495058944, "step": 3777 }, { "epoch": 0.6203045112589166, "grad_norm": 0.6108487844467163, "learning_rate": 8.521681032463467e-05, "loss": 5.0998, "num_input_tokens_seen": 495452160, "step": 3780 }, { "epoch": 0.6207968164265825, "grad_norm": 0.6402431130409241, "learning_rate": 8.518301424544526e-05, "loss": 5.0671, "num_input_tokens_seen": 495845376, "step": 3783 }, { "epoch": 0.6212891215942482, "grad_norm": 0.6639608144760132, "learning_rate": 8.514925834388382e-05, "loss": 5.0585, "num_input_tokens_seen": 496238592, "step": 3786 }, { "epoch": 0.6217814267619141, "grad_norm": 0.6030707955360413, "learning_rate": 8.511554254040647e-05, "loss": 5.1078, "num_input_tokens_seen": 496631808, "step": 3789 }, { "epoch": 0.6222737319295798, "grad_norm": 0.6333598494529724, "learning_rate": 8.508186675568954e-05, "loss": 5.0646, "num_input_tokens_seen": 497025024, "step": 3792 }, { "epoch": 0.6227660370972457, "grad_norm": 0.6044871211051941, "learning_rate": 8.504823091062899e-05, "loss": 5.1171, "num_input_tokens_seen": 497418240, "step": 3795 }, { "epoch": 0.6232583422649115, "grad_norm": 0.6012600064277649, "learning_rate": 8.501463492633939e-05, "loss": 5.045, "num_input_tokens_seen": 497811456, "step": 3798 }, { "epoch": 0.6237506474325772, "grad_norm": 0.6684429049491882, "learning_rate": 8.49810787241534e-05, "loss": 5.0726, "num_input_tokens_seen": 498204672, "step": 3801 }, { "epoch": 0.6242429526002431, "grad_norm": 0.5676344633102417, "learning_rate": 8.494756222562075e-05, "loss": 5.0832, "num_input_tokens_seen": 498597888, "step": 3804 }, { "epoch": 0.6247352577679088, "grad_norm": 0.606695294380188, "learning_rate": 8.491408535250763e-05, "loss": 5.0568, "num_input_tokens_seen": 498991104, "step": 3807 }, { "epoch": 0.6252275629355747, "grad_norm": 0.5882166028022766, "learning_rate": 8.488064802679595e-05, "loss": 5.0352, "num_input_tokens_seen": 499384320, "step": 3810 }, { "epoch": 0.6257198681032405, "grad_norm": 0.6526467800140381, "learning_rate": 8.484725017068234e-05, "loss": 5.0533, "num_input_tokens_seen": 499777536, "step": 3813 }, { "epoch": 0.6262121732709063, "grad_norm": 0.593462347984314, "learning_rate": 8.48138917065777e-05, "loss": 5.0718, "num_input_tokens_seen": 500170752, "step": 3816 }, { "epoch": 0.6267044784385721, "grad_norm": 0.5729905366897583, "learning_rate": 8.478057255710627e-05, "loss": 5.0125, "num_input_tokens_seen": 500563968, "step": 3819 }, { "epoch": 0.6271967836062379, "grad_norm": 0.6892577409744263, "learning_rate": 8.474729264510482e-05, "loss": 5.0931, "num_input_tokens_seen": 500957184, "step": 3822 }, { "epoch": 0.6276890887739037, "grad_norm": 0.5425217151641846, "learning_rate": 8.471405189362207e-05, "loss": 5.0371, "num_input_tokens_seen": 501350400, "step": 3825 }, { "epoch": 0.6281813939415696, "grad_norm": 0.6370276808738708, "learning_rate": 8.468085022591781e-05, "loss": 5.0494, "num_input_tokens_seen": 501743616, "step": 3828 }, { "epoch": 0.6286736991092353, "grad_norm": 0.6339290142059326, "learning_rate": 8.464768756546222e-05, "loss": 5.0355, "num_input_tokens_seen": 502136832, "step": 3831 }, { "epoch": 0.6291660042769012, "grad_norm": 0.5908637046813965, "learning_rate": 8.461456383593512e-05, "loss": 5.069, "num_input_tokens_seen": 502530048, "step": 3834 }, { "epoch": 0.6296583094445669, "grad_norm": 0.6436389088630676, "learning_rate": 8.458147896122517e-05, "loss": 5.0195, "num_input_tokens_seen": 502923264, "step": 3837 }, { "epoch": 0.6301506146122328, "grad_norm": 0.5766847133636475, "learning_rate": 8.454843286542926e-05, "loss": 5.0878, "num_input_tokens_seen": 503316480, "step": 3840 }, { "epoch": 0.6306429197798986, "grad_norm": 0.5753293037414551, "learning_rate": 8.451542547285164e-05, "loss": 5.1015, "num_input_tokens_seen": 503709696, "step": 3843 }, { "epoch": 0.6311352249475644, "grad_norm": 0.6848548650741577, "learning_rate": 8.448245670800332e-05, "loss": 5.0775, "num_input_tokens_seen": 504102912, "step": 3846 }, { "epoch": 0.6316275301152302, "grad_norm": 0.5692152380943298, "learning_rate": 8.444952649560123e-05, "loss": 5.0544, "num_input_tokens_seen": 504496128, "step": 3849 }, { "epoch": 0.632119835282896, "grad_norm": 0.7179746627807617, "learning_rate": 8.441663476056757e-05, "loss": 5.0092, "num_input_tokens_seen": 504889344, "step": 3852 }, { "epoch": 0.6326121404505618, "grad_norm": 0.7406941056251526, "learning_rate": 8.438378142802908e-05, "loss": 5.0623, "num_input_tokens_seen": 505282560, "step": 3855 }, { "epoch": 0.6331044456182277, "grad_norm": 0.5824952721595764, "learning_rate": 8.43509664233163e-05, "loss": 5.1089, "num_input_tokens_seen": 505675776, "step": 3858 }, { "epoch": 0.6335967507858934, "grad_norm": 0.7226423621177673, "learning_rate": 8.431818967196287e-05, "loss": 5.1065, "num_input_tokens_seen": 506068992, "step": 3861 }, { "epoch": 0.6340890559535592, "grad_norm": 0.6426088809967041, "learning_rate": 8.42854510997049e-05, "loss": 5.0762, "num_input_tokens_seen": 506462208, "step": 3864 }, { "epoch": 0.634581361121225, "grad_norm": 0.7668753266334534, "learning_rate": 8.425275063248005e-05, "loss": 5.0407, "num_input_tokens_seen": 506855424, "step": 3867 }, { "epoch": 0.6350736662888908, "grad_norm": 0.7967394590377808, "learning_rate": 8.422008819642705e-05, "loss": 5.0791, "num_input_tokens_seen": 507248640, "step": 3870 }, { "epoch": 0.6355659714565566, "grad_norm": 0.6385906934738159, "learning_rate": 8.418746371788493e-05, "loss": 5.0588, "num_input_tokens_seen": 507641856, "step": 3873 }, { "epoch": 0.6360582766242224, "grad_norm": 0.6711165308952332, "learning_rate": 8.415487712339226e-05, "loss": 5.0421, "num_input_tokens_seen": 508035072, "step": 3876 }, { "epoch": 0.6365505817918883, "grad_norm": 0.6327312588691711, "learning_rate": 8.412232833968649e-05, "loss": 5.0499, "num_input_tokens_seen": 508428288, "step": 3879 }, { "epoch": 0.637042886959554, "grad_norm": 0.5863744616508484, "learning_rate": 8.408981729370331e-05, "loss": 5.0552, "num_input_tokens_seen": 508821504, "step": 3882 }, { "epoch": 0.6375351921272199, "grad_norm": 0.6245995163917542, "learning_rate": 8.405734391257592e-05, "loss": 5.0508, "num_input_tokens_seen": 509214720, "step": 3885 }, { "epoch": 0.6380274972948856, "grad_norm": 0.7355886697769165, "learning_rate": 8.40249081236343e-05, "loss": 5.0546, "num_input_tokens_seen": 509607936, "step": 3888 }, { "epoch": 0.6385198024625515, "grad_norm": 0.5727904438972473, "learning_rate": 8.399250985440458e-05, "loss": 5.0456, "num_input_tokens_seen": 510001152, "step": 3891 }, { "epoch": 0.6390121076302173, "grad_norm": 0.6342762112617493, "learning_rate": 8.396014903260839e-05, "loss": 5.0593, "num_input_tokens_seen": 510394368, "step": 3894 }, { "epoch": 0.6395044127978831, "grad_norm": 0.7297379970550537, "learning_rate": 8.392782558616211e-05, "loss": 5.0787, "num_input_tokens_seen": 510787584, "step": 3897 }, { "epoch": 0.6399967179655489, "grad_norm": 0.5754993557929993, "learning_rate": 8.389553944317623e-05, "loss": 5.0372, "num_input_tokens_seen": 511180800, "step": 3900 }, { "epoch": 0.6404890231332147, "grad_norm": 0.6071540713310242, "learning_rate": 8.386329053195467e-05, "loss": 5.0784, "num_input_tokens_seen": 511574016, "step": 3903 }, { "epoch": 0.6409813283008805, "grad_norm": 0.5957722067832947, "learning_rate": 8.383107878099417e-05, "loss": 5.067, "num_input_tokens_seen": 511967232, "step": 3906 }, { "epoch": 0.6414736334685464, "grad_norm": 0.5961174964904785, "learning_rate": 8.379890411898351e-05, "loss": 5.0588, "num_input_tokens_seen": 512360448, "step": 3909 }, { "epoch": 0.6419659386362121, "grad_norm": 0.6179754734039307, "learning_rate": 8.376676647480295e-05, "loss": 5.0564, "num_input_tokens_seen": 512753664, "step": 3912 }, { "epoch": 0.642458243803878, "grad_norm": 0.5459384322166443, "learning_rate": 8.373466577752348e-05, "loss": 5.0854, "num_input_tokens_seen": 513146880, "step": 3915 }, { "epoch": 0.6429505489715437, "grad_norm": 0.7513952255249023, "learning_rate": 8.370260195640626e-05, "loss": 5.089, "num_input_tokens_seen": 513540096, "step": 3918 }, { "epoch": 0.6434428541392095, "grad_norm": 0.6562470197677612, "learning_rate": 8.367057494090192e-05, "loss": 5.052, "num_input_tokens_seen": 513933312, "step": 3921 }, { "epoch": 0.6439351593068754, "grad_norm": 0.6965239644050598, "learning_rate": 8.363858466064986e-05, "loss": 5.0638, "num_input_tokens_seen": 514326528, "step": 3924 }, { "epoch": 0.6444274644745411, "grad_norm": 0.5959606766700745, "learning_rate": 8.360663104547769e-05, "loss": 5.0521, "num_input_tokens_seen": 514719744, "step": 3927 }, { "epoch": 0.644919769642207, "grad_norm": 0.54644376039505, "learning_rate": 8.357471402540053e-05, "loss": 5.0629, "num_input_tokens_seen": 515112960, "step": 3930 }, { "epoch": 0.6454120748098727, "grad_norm": 0.5704362988471985, "learning_rate": 8.354283353062033e-05, "loss": 5.0272, "num_input_tokens_seen": 515506176, "step": 3933 }, { "epoch": 0.6459043799775386, "grad_norm": 0.522625744342804, "learning_rate": 8.351098949152536e-05, "loss": 5.0068, "num_input_tokens_seen": 515899392, "step": 3936 }, { "epoch": 0.6463966851452044, "grad_norm": 0.6017957329750061, "learning_rate": 8.347918183868937e-05, "loss": 5.0166, "num_input_tokens_seen": 516292608, "step": 3939 }, { "epoch": 0.6468889903128702, "grad_norm": 0.6562044024467468, "learning_rate": 8.344741050287123e-05, "loss": 5.0237, "num_input_tokens_seen": 516685824, "step": 3942 }, { "epoch": 0.647381295480536, "grad_norm": 0.6371638178825378, "learning_rate": 8.341567541501397e-05, "loss": 5.0629, "num_input_tokens_seen": 517079040, "step": 3945 }, { "epoch": 0.6478736006482018, "grad_norm": 0.5510682463645935, "learning_rate": 8.338397650624441e-05, "loss": 5.0254, "num_input_tokens_seen": 517472256, "step": 3948 }, { "epoch": 0.6483659058158676, "grad_norm": 0.6829474568367004, "learning_rate": 8.335231370787243e-05, "loss": 5.029, "num_input_tokens_seen": 517865472, "step": 3951 }, { "epoch": 0.6488582109835335, "grad_norm": 0.5510388016700745, "learning_rate": 8.332068695139037e-05, "loss": 5.0445, "num_input_tokens_seen": 518258688, "step": 3954 }, { "epoch": 0.6493505161511992, "grad_norm": 0.5481399893760681, "learning_rate": 8.328909616847231e-05, "loss": 5.0571, "num_input_tokens_seen": 518651904, "step": 3957 }, { "epoch": 0.6498428213188651, "grad_norm": 0.6369343400001526, "learning_rate": 8.325754129097364e-05, "loss": 5.08, "num_input_tokens_seen": 519045120, "step": 3960 }, { "epoch": 0.6503351264865308, "grad_norm": 0.5993247628211975, "learning_rate": 8.322602225093026e-05, "loss": 5.0518, "num_input_tokens_seen": 519438336, "step": 3963 }, { "epoch": 0.6508274316541967, "grad_norm": 0.64273601770401, "learning_rate": 8.319453898055805e-05, "loss": 5.0511, "num_input_tokens_seen": 519831552, "step": 3966 }, { "epoch": 0.6513197368218625, "grad_norm": 0.6449026465415955, "learning_rate": 8.316309141225228e-05, "loss": 5.0171, "num_input_tokens_seen": 520224768, "step": 3969 }, { "epoch": 0.6518120419895282, "grad_norm": 0.7018563151359558, "learning_rate": 8.313167947858695e-05, "loss": 5.0412, "num_input_tokens_seen": 520617984, "step": 3972 }, { "epoch": 0.6523043471571941, "grad_norm": 0.5524522066116333, "learning_rate": 8.310030311231415e-05, "loss": 5.1299, "num_input_tokens_seen": 521011200, "step": 3975 }, { "epoch": 0.6527966523248598, "grad_norm": 0.6293720006942749, "learning_rate": 8.306896224636362e-05, "loss": 5.0796, "num_input_tokens_seen": 521404416, "step": 3978 }, { "epoch": 0.6532889574925257, "grad_norm": 0.6255269050598145, "learning_rate": 8.303765681384188e-05, "loss": 5.0136, "num_input_tokens_seen": 521797632, "step": 3981 }, { "epoch": 0.6537812626601914, "grad_norm": 0.637909471988678, "learning_rate": 8.300638674803195e-05, "loss": 5.0853, "num_input_tokens_seen": 522190848, "step": 3984 }, { "epoch": 0.6542735678278573, "grad_norm": 0.6187140345573425, "learning_rate": 8.297515198239245e-05, "loss": 5.0677, "num_input_tokens_seen": 522584064, "step": 3987 }, { "epoch": 0.6547658729955231, "grad_norm": 0.5749291181564331, "learning_rate": 8.294395245055722e-05, "loss": 5.0805, "num_input_tokens_seen": 522977280, "step": 3990 }, { "epoch": 0.6552581781631889, "grad_norm": 0.5552260875701904, "learning_rate": 8.291278808633464e-05, "loss": 5.0614, "num_input_tokens_seen": 523370496, "step": 3993 }, { "epoch": 0.6557504833308547, "grad_norm": 0.5661478042602539, "learning_rate": 8.288165882370701e-05, "loss": 5.1098, "num_input_tokens_seen": 523763712, "step": 3996 }, { "epoch": 0.6562427884985205, "grad_norm": 0.6904596090316772, "learning_rate": 8.285056459683002e-05, "loss": 5.0278, "num_input_tokens_seen": 524156928, "step": 3999 }, { "epoch": 0.6564068902210758, "eval_accuracy": 0.22359713401726103, "eval_loss": 5.324248313903809, "eval_runtime": 113.075, "eval_samples_per_second": 2.653, "eval_steps_per_second": 1.327, "num_input_tokens_seen": 524288000, "step": 4000 }, { "epoch": 0.6567350936661863, "grad_norm": 0.6449172496795654, "learning_rate": 8.281950534003216e-05, "loss": 5.0487, "num_input_tokens_seen": 524550144, "step": 4002 }, { "epoch": 0.6572273988338522, "grad_norm": 0.6652653813362122, "learning_rate": 8.278848098781413e-05, "loss": 5.018, "num_input_tokens_seen": 524943360, "step": 4005 }, { "epoch": 0.6577197040015179, "grad_norm": 0.6469755172729492, "learning_rate": 8.275749147484824e-05, "loss": 5.0477, "num_input_tokens_seen": 525336576, "step": 4008 }, { "epoch": 0.6582120091691838, "grad_norm": 0.6589633822441101, "learning_rate": 8.272653673597785e-05, "loss": 5.0356, "num_input_tokens_seen": 525729792, "step": 4011 }, { "epoch": 0.6587043143368495, "grad_norm": 0.7347235083580017, "learning_rate": 8.269561670621681e-05, "loss": 5.0088, "num_input_tokens_seen": 526123008, "step": 4014 }, { "epoch": 0.6591966195045154, "grad_norm": 0.5741059184074402, "learning_rate": 8.266473132074881e-05, "loss": 5.0538, "num_input_tokens_seen": 526516224, "step": 4017 }, { "epoch": 0.6596889246721812, "grad_norm": 0.6932168006896973, "learning_rate": 8.263388051492694e-05, "loss": 5.0435, "num_input_tokens_seen": 526909440, "step": 4020 }, { "epoch": 0.660181229839847, "grad_norm": 0.6108718514442444, "learning_rate": 8.260306422427303e-05, "loss": 5.0209, "num_input_tokens_seen": 527302656, "step": 4023 }, { "epoch": 0.6606735350075128, "grad_norm": 0.7427350282669067, "learning_rate": 8.257228238447704e-05, "loss": 5.0025, "num_input_tokens_seen": 527695872, "step": 4026 }, { "epoch": 0.6611658401751785, "grad_norm": 0.6407040357589722, "learning_rate": 8.254153493139666e-05, "loss": 5.0381, "num_input_tokens_seen": 528089088, "step": 4029 }, { "epoch": 0.6616581453428444, "grad_norm": 0.6951069831848145, "learning_rate": 8.251082180105658e-05, "loss": 5.0644, "num_input_tokens_seen": 528482304, "step": 4032 }, { "epoch": 0.6621504505105102, "grad_norm": 0.64825439453125, "learning_rate": 8.248014292964801e-05, "loss": 5.0702, "num_input_tokens_seen": 528875520, "step": 4035 }, { "epoch": 0.662642755678176, "grad_norm": 0.631080687046051, "learning_rate": 8.244949825352815e-05, "loss": 5.0367, "num_input_tokens_seen": 529268736, "step": 4038 }, { "epoch": 0.6631350608458418, "grad_norm": 0.5960158109664917, "learning_rate": 8.241888770921956e-05, "loss": 5.0529, "num_input_tokens_seen": 529661952, "step": 4041 }, { "epoch": 0.6636273660135076, "grad_norm": 0.5926985144615173, "learning_rate": 8.238831123340965e-05, "loss": 5.0342, "num_input_tokens_seen": 530055168, "step": 4044 }, { "epoch": 0.6641196711811734, "grad_norm": 0.6134880185127258, "learning_rate": 8.235776876295013e-05, "loss": 5.0454, "num_input_tokens_seen": 530448384, "step": 4047 }, { "epoch": 0.6646119763488393, "grad_norm": 0.6768664121627808, "learning_rate": 8.232726023485646e-05, "loss": 5.0676, "num_input_tokens_seen": 530841600, "step": 4050 }, { "epoch": 0.665104281516505, "grad_norm": 0.6263231635093689, "learning_rate": 8.22967855863073e-05, "loss": 5.03, "num_input_tokens_seen": 531234816, "step": 4053 }, { "epoch": 0.6655965866841709, "grad_norm": 0.6137826442718506, "learning_rate": 8.226634475464398e-05, "loss": 5.0772, "num_input_tokens_seen": 531628032, "step": 4056 }, { "epoch": 0.6660888918518366, "grad_norm": 0.6005773544311523, "learning_rate": 8.223593767736994e-05, "loss": 5.0229, "num_input_tokens_seen": 532021248, "step": 4059 }, { "epoch": 0.6665811970195025, "grad_norm": 0.5843310356140137, "learning_rate": 8.22055642921502e-05, "loss": 5.0513, "num_input_tokens_seen": 532414464, "step": 4062 }, { "epoch": 0.6670735021871683, "grad_norm": 0.5929916501045227, "learning_rate": 8.217522453681083e-05, "loss": 5.0487, "num_input_tokens_seen": 532807680, "step": 4065 }, { "epoch": 0.6675658073548341, "grad_norm": 0.5393611788749695, "learning_rate": 8.214491834933838e-05, "loss": 5.0281, "num_input_tokens_seen": 533200896, "step": 4068 }, { "epoch": 0.6680581125224999, "grad_norm": 0.6220096945762634, "learning_rate": 8.21146456678794e-05, "loss": 5.025, "num_input_tokens_seen": 533594112, "step": 4071 }, { "epoch": 0.6685504176901657, "grad_norm": 0.6115317940711975, "learning_rate": 8.208440643073989e-05, "loss": 5.0254, "num_input_tokens_seen": 533987328, "step": 4074 }, { "epoch": 0.6690427228578315, "grad_norm": 0.6211367845535278, "learning_rate": 8.205420057638475e-05, "loss": 5.0665, "num_input_tokens_seen": 534380544, "step": 4077 }, { "epoch": 0.6695350280254974, "grad_norm": 0.5687063932418823, "learning_rate": 8.202402804343728e-05, "loss": 5.038, "num_input_tokens_seen": 534773760, "step": 4080 }, { "epoch": 0.6700273331931631, "grad_norm": 0.572433590888977, "learning_rate": 8.199388877067867e-05, "loss": 5.0709, "num_input_tokens_seen": 535166976, "step": 4083 }, { "epoch": 0.670519638360829, "grad_norm": 0.5648781061172485, "learning_rate": 8.196378269704742e-05, "loss": 5.0144, "num_input_tokens_seen": 535560192, "step": 4086 }, { "epoch": 0.6710119435284947, "grad_norm": 0.622725784778595, "learning_rate": 8.193370976163886e-05, "loss": 4.9888, "num_input_tokens_seen": 535953408, "step": 4089 }, { "epoch": 0.6715042486961605, "grad_norm": 0.6132870316505432, "learning_rate": 8.190366990370464e-05, "loss": 5.0087, "num_input_tokens_seen": 536346624, "step": 4092 }, { "epoch": 0.6719965538638263, "grad_norm": 0.5610283017158508, "learning_rate": 8.187366306265222e-05, "loss": 5.0339, "num_input_tokens_seen": 536739840, "step": 4095 }, { "epoch": 0.6724888590314921, "grad_norm": 0.5569362640380859, "learning_rate": 8.184368917804431e-05, "loss": 5.0178, "num_input_tokens_seen": 537133056, "step": 4098 }, { "epoch": 0.672981164199158, "grad_norm": 0.6829281449317932, "learning_rate": 8.181374818959841e-05, "loss": 5.0422, "num_input_tokens_seen": 537526272, "step": 4101 }, { "epoch": 0.6734734693668237, "grad_norm": 0.6499529480934143, "learning_rate": 8.178384003718625e-05, "loss": 5.0081, "num_input_tokens_seen": 537919488, "step": 4104 }, { "epoch": 0.6739657745344896, "grad_norm": 0.5391960740089417, "learning_rate": 8.175396466083337e-05, "loss": 5.026, "num_input_tokens_seen": 538312704, "step": 4107 }, { "epoch": 0.6744580797021553, "grad_norm": 0.5357514023780823, "learning_rate": 8.17241220007185e-05, "loss": 5.0033, "num_input_tokens_seen": 538705920, "step": 4110 }, { "epoch": 0.6749503848698212, "grad_norm": 0.5889911651611328, "learning_rate": 8.169431199717313e-05, "loss": 4.9921, "num_input_tokens_seen": 539099136, "step": 4113 }, { "epoch": 0.675442690037487, "grad_norm": 0.5937342643737793, "learning_rate": 8.1664534590681e-05, "loss": 5.0344, "num_input_tokens_seen": 539492352, "step": 4116 }, { "epoch": 0.6759349952051528, "grad_norm": 0.6347099542617798, "learning_rate": 8.163478972187763e-05, "loss": 5.033, "num_input_tokens_seen": 539885568, "step": 4119 }, { "epoch": 0.6764273003728186, "grad_norm": 0.5977573990821838, "learning_rate": 8.160507733154971e-05, "loss": 5.0467, "num_input_tokens_seen": 540278784, "step": 4122 }, { "epoch": 0.6769196055404844, "grad_norm": 0.5776995420455933, "learning_rate": 8.157539736063474e-05, "loss": 5.0784, "num_input_tokens_seen": 540672000, "step": 4125 }, { "epoch": 0.6774119107081502, "grad_norm": 0.5840444564819336, "learning_rate": 8.154574975022046e-05, "loss": 5.0125, "num_input_tokens_seen": 541065216, "step": 4128 }, { "epoch": 0.6779042158758161, "grad_norm": 0.6594420671463013, "learning_rate": 8.151613444154437e-05, "loss": 5.0195, "num_input_tokens_seen": 541458432, "step": 4131 }, { "epoch": 0.6783965210434818, "grad_norm": 0.6229584217071533, "learning_rate": 8.14865513759933e-05, "loss": 5.0545, "num_input_tokens_seen": 541851648, "step": 4134 }, { "epoch": 0.6788888262111477, "grad_norm": 0.5304208993911743, "learning_rate": 8.145700049510277e-05, "loss": 5.0066, "num_input_tokens_seen": 542244864, "step": 4137 }, { "epoch": 0.6793811313788134, "grad_norm": 0.7174444794654846, "learning_rate": 8.14274817405567e-05, "loss": 5.0549, "num_input_tokens_seen": 542638080, "step": 4140 }, { "epoch": 0.6798734365464792, "grad_norm": 0.5718116164207458, "learning_rate": 8.13979950541868e-05, "loss": 5.0239, "num_input_tokens_seen": 543031296, "step": 4143 }, { "epoch": 0.6803657417141451, "grad_norm": 0.6597188711166382, "learning_rate": 8.136854037797212e-05, "loss": 5.018, "num_input_tokens_seen": 543424512, "step": 4146 }, { "epoch": 0.6808580468818108, "grad_norm": 0.6705512404441833, "learning_rate": 8.133911765403855e-05, "loss": 5.0543, "num_input_tokens_seen": 543817728, "step": 4149 }, { "epoch": 0.6813503520494767, "grad_norm": 0.6903484463691711, "learning_rate": 8.130972682465842e-05, "loss": 5.0084, "num_input_tokens_seen": 544210944, "step": 4152 }, { "epoch": 0.6818426572171424, "grad_norm": 0.563361406326294, "learning_rate": 8.128036783224992e-05, "loss": 5.0479, "num_input_tokens_seen": 544604160, "step": 4155 }, { "epoch": 0.6823349623848083, "grad_norm": 0.5460705757141113, "learning_rate": 8.125104061937669e-05, "loss": 5.0404, "num_input_tokens_seen": 544997376, "step": 4158 }, { "epoch": 0.6828272675524741, "grad_norm": 0.5406615734100342, "learning_rate": 8.122174512874733e-05, "loss": 5.0741, "num_input_tokens_seen": 545390592, "step": 4161 }, { "epoch": 0.6833195727201399, "grad_norm": 0.6041008830070496, "learning_rate": 8.119248130321494e-05, "loss": 5.0259, "num_input_tokens_seen": 545783808, "step": 4164 }, { "epoch": 0.6838118778878057, "grad_norm": 0.6043838262557983, "learning_rate": 8.116324908577667e-05, "loss": 5.0465, "num_input_tokens_seen": 546177024, "step": 4167 }, { "epoch": 0.6843041830554715, "grad_norm": 0.6404688954353333, "learning_rate": 8.113404841957315e-05, "loss": 5.023, "num_input_tokens_seen": 546570240, "step": 4170 }, { "epoch": 0.6847964882231373, "grad_norm": 0.5821146965026855, "learning_rate": 8.110487924788816e-05, "loss": 5.0608, "num_input_tokens_seen": 546963456, "step": 4173 }, { "epoch": 0.6852887933908032, "grad_norm": 0.704936146736145, "learning_rate": 8.107574151414814e-05, "loss": 5.091, "num_input_tokens_seen": 547356672, "step": 4176 }, { "epoch": 0.6857810985584689, "grad_norm": 0.5520325899124146, "learning_rate": 8.104663516192164e-05, "loss": 5.0377, "num_input_tokens_seen": 547749888, "step": 4179 }, { "epoch": 0.6862734037261348, "grad_norm": 0.5666207075119019, "learning_rate": 8.101756013491894e-05, "loss": 5.014, "num_input_tokens_seen": 548143104, "step": 4182 }, { "epoch": 0.6867657088938005, "grad_norm": 0.5827277302742004, "learning_rate": 8.09885163769916e-05, "loss": 5.008, "num_input_tokens_seen": 548536320, "step": 4185 }, { "epoch": 0.6872580140614664, "grad_norm": 0.5662521719932556, "learning_rate": 8.095950383213192e-05, "loss": 5.0114, "num_input_tokens_seen": 548929536, "step": 4188 }, { "epoch": 0.6877503192291322, "grad_norm": 0.6049064993858337, "learning_rate": 8.093052244447264e-05, "loss": 5.0039, "num_input_tokens_seen": 549322752, "step": 4191 }, { "epoch": 0.688242624396798, "grad_norm": 0.6402872204780579, "learning_rate": 8.090157215828629e-05, "loss": 5.0325, "num_input_tokens_seen": 549715968, "step": 4194 }, { "epoch": 0.6887349295644638, "grad_norm": 0.582022488117218, "learning_rate": 8.08726529179849e-05, "loss": 5.0176, "num_input_tokens_seen": 550109184, "step": 4197 }, { "epoch": 0.6892272347321295, "grad_norm": 0.577924370765686, "learning_rate": 8.08437646681195e-05, "loss": 5.043, "num_input_tokens_seen": 550502400, "step": 4200 }, { "epoch": 0.6897195398997954, "grad_norm": 0.6571632623672485, "learning_rate": 8.081490735337961e-05, "loss": 4.9929, "num_input_tokens_seen": 550895616, "step": 4203 }, { "epoch": 0.6902118450674611, "grad_norm": 0.5845437049865723, "learning_rate": 8.078608091859296e-05, "loss": 5.0122, "num_input_tokens_seen": 551288832, "step": 4206 }, { "epoch": 0.690704150235127, "grad_norm": 0.5887883901596069, "learning_rate": 8.075728530872482e-05, "loss": 5.0236, "num_input_tokens_seen": 551682048, "step": 4209 }, { "epoch": 0.6911964554027928, "grad_norm": 0.5670571327209473, "learning_rate": 8.072852046887776e-05, "loss": 4.9989, "num_input_tokens_seen": 552075264, "step": 4212 }, { "epoch": 0.6916887605704586, "grad_norm": 0.6224859952926636, "learning_rate": 8.069978634429111e-05, "loss": 5.0138, "num_input_tokens_seen": 552468480, "step": 4215 }, { "epoch": 0.6921810657381244, "grad_norm": 0.610889732837677, "learning_rate": 8.067108288034053e-05, "loss": 5.0593, "num_input_tokens_seen": 552861696, "step": 4218 }, { "epoch": 0.6926733709057902, "grad_norm": 0.5440495610237122, "learning_rate": 8.064241002253757e-05, "loss": 5.0239, "num_input_tokens_seen": 553254912, "step": 4221 }, { "epoch": 0.693165676073456, "grad_norm": 0.5816980004310608, "learning_rate": 8.061376771652931e-05, "loss": 5.045, "num_input_tokens_seen": 553648128, "step": 4224 }, { "epoch": 0.6936579812411219, "grad_norm": 0.5322626233100891, "learning_rate": 8.058515590809782e-05, "loss": 5.0538, "num_input_tokens_seen": 554041344, "step": 4227 }, { "epoch": 0.6941502864087876, "grad_norm": 0.6580582857131958, "learning_rate": 8.055657454315977e-05, "loss": 5.0129, "num_input_tokens_seen": 554434560, "step": 4230 }, { "epoch": 0.6946425915764535, "grad_norm": 0.5960021615028381, "learning_rate": 8.052802356776606e-05, "loss": 5.0385, "num_input_tokens_seen": 554827776, "step": 4233 }, { "epoch": 0.6951348967441192, "grad_norm": 0.6201943159103394, "learning_rate": 8.049950292810128e-05, "loss": 5.0126, "num_input_tokens_seen": 555220992, "step": 4236 }, { "epoch": 0.6956272019117851, "grad_norm": 0.6654148697853088, "learning_rate": 8.047101257048339e-05, "loss": 5.0154, "num_input_tokens_seen": 555614208, "step": 4239 }, { "epoch": 0.6961195070794509, "grad_norm": 0.5792354941368103, "learning_rate": 8.044255244136322e-05, "loss": 5.0095, "num_input_tokens_seen": 556007424, "step": 4242 }, { "epoch": 0.6966118122471167, "grad_norm": 0.5518591403961182, "learning_rate": 8.041412248732407e-05, "loss": 4.9994, "num_input_tokens_seen": 556400640, "step": 4245 }, { "epoch": 0.6971041174147825, "grad_norm": 0.6194866299629211, "learning_rate": 8.038572265508136e-05, "loss": 5.0041, "num_input_tokens_seen": 556793856, "step": 4248 }, { "epoch": 0.6975964225824483, "grad_norm": 0.6971485614776611, "learning_rate": 8.035735289148207e-05, "loss": 5.0282, "num_input_tokens_seen": 557187072, "step": 4251 }, { "epoch": 0.6980887277501141, "grad_norm": 0.5932923555374146, "learning_rate": 8.032901314350443e-05, "loss": 5.111, "num_input_tokens_seen": 557580288, "step": 4254 }, { "epoch": 0.69858103291778, "grad_norm": 0.6528842449188232, "learning_rate": 8.030070335825747e-05, "loss": 5.0172, "num_input_tokens_seen": 557973504, "step": 4257 }, { "epoch": 0.6990733380854457, "grad_norm": 0.658224880695343, "learning_rate": 8.027242348298066e-05, "loss": 5.036, "num_input_tokens_seen": 558366720, "step": 4260 }, { "epoch": 0.6995656432531115, "grad_norm": 0.4988958537578583, "learning_rate": 8.024417346504334e-05, "loss": 4.9453, "num_input_tokens_seen": 558759936, "step": 4263 }, { "epoch": 0.7000579484207773, "grad_norm": 0.6576696038246155, "learning_rate": 8.021595325194448e-05, "loss": 5.0356, "num_input_tokens_seen": 559153152, "step": 4266 }, { "epoch": 0.7005502535884431, "grad_norm": 0.6395735144615173, "learning_rate": 8.01877627913122e-05, "loss": 5.041, "num_input_tokens_seen": 559546368, "step": 4269 }, { "epoch": 0.701042558756109, "grad_norm": 0.5595338940620422, "learning_rate": 8.015960203090336e-05, "loss": 4.9971, "num_input_tokens_seen": 559939584, "step": 4272 }, { "epoch": 0.7015348639237747, "grad_norm": 0.656607449054718, "learning_rate": 8.013147091860318e-05, "loss": 5.0286, "num_input_tokens_seen": 560332800, "step": 4275 }, { "epoch": 0.7020271690914406, "grad_norm": 0.6094218492507935, "learning_rate": 8.010336940242475e-05, "loss": 4.9691, "num_input_tokens_seen": 560726016, "step": 4278 }, { "epoch": 0.7025194742591063, "grad_norm": 0.632666289806366, "learning_rate": 8.007529743050875e-05, "loss": 5.0467, "num_input_tokens_seen": 561119232, "step": 4281 }, { "epoch": 0.7030117794267722, "grad_norm": 0.6173758506774902, "learning_rate": 8.004725495112299e-05, "loss": 5.021, "num_input_tokens_seen": 561512448, "step": 4284 }, { "epoch": 0.703504084594438, "grad_norm": 0.6932005286216736, "learning_rate": 8.001924191266195e-05, "loss": 5.0294, "num_input_tokens_seen": 561905664, "step": 4287 }, { "epoch": 0.7039963897621038, "grad_norm": 0.537021815776825, "learning_rate": 7.999125826364651e-05, "loss": 5.0358, "num_input_tokens_seen": 562298880, "step": 4290 }, { "epoch": 0.7044886949297696, "grad_norm": 0.6217671632766724, "learning_rate": 7.996330395272346e-05, "loss": 5.0181, "num_input_tokens_seen": 562692096, "step": 4293 }, { "epoch": 0.7049810000974354, "grad_norm": 0.6611473560333252, "learning_rate": 7.993537892866508e-05, "loss": 5.0083, "num_input_tokens_seen": 563085312, "step": 4296 }, { "epoch": 0.7054733052651012, "grad_norm": 0.6191869378089905, "learning_rate": 7.990748314036885e-05, "loss": 5.0469, "num_input_tokens_seen": 563478528, "step": 4299 }, { "epoch": 0.7059656104327671, "grad_norm": 0.6508629322052002, "learning_rate": 7.987961653685697e-05, "loss": 5.0245, "num_input_tokens_seen": 563871744, "step": 4302 }, { "epoch": 0.7064579156004328, "grad_norm": 0.5989941954612732, "learning_rate": 7.9851779067276e-05, "loss": 5.0273, "num_input_tokens_seen": 564264960, "step": 4305 }, { "epoch": 0.7069502207680987, "grad_norm": 0.6267138123512268, "learning_rate": 7.98239706808965e-05, "loss": 4.9692, "num_input_tokens_seen": 564658176, "step": 4308 }, { "epoch": 0.7074425259357644, "grad_norm": 0.6717625260353088, "learning_rate": 7.979619132711254e-05, "loss": 4.9929, "num_input_tokens_seen": 565051392, "step": 4311 }, { "epoch": 0.7079348311034303, "grad_norm": 0.7013946771621704, "learning_rate": 7.976844095544147e-05, "loss": 4.9909, "num_input_tokens_seen": 565444608, "step": 4314 }, { "epoch": 0.708427136271096, "grad_norm": 0.589557945728302, "learning_rate": 7.974071951552337e-05, "loss": 5.0382, "num_input_tokens_seen": 565837824, "step": 4317 }, { "epoch": 0.7089194414387618, "grad_norm": 0.6243999600410461, "learning_rate": 7.97130269571208e-05, "loss": 5.0769, "num_input_tokens_seen": 566231040, "step": 4320 }, { "epoch": 0.7094117466064277, "grad_norm": 0.6162127256393433, "learning_rate": 7.968536323011831e-05, "loss": 5.0046, "num_input_tokens_seen": 566624256, "step": 4323 }, { "epoch": 0.7099040517740934, "grad_norm": 0.598885178565979, "learning_rate": 7.965772828452217e-05, "loss": 5.0263, "num_input_tokens_seen": 567017472, "step": 4326 }, { "epoch": 0.7103963569417593, "grad_norm": 0.5507720708847046, "learning_rate": 7.963012207045987e-05, "loss": 5.0186, "num_input_tokens_seen": 567410688, "step": 4329 }, { "epoch": 0.710888662109425, "grad_norm": 0.5605682730674744, "learning_rate": 7.960254453817985e-05, "loss": 4.9507, "num_input_tokens_seen": 567803904, "step": 4332 }, { "epoch": 0.7113809672770909, "grad_norm": 0.6122700572013855, "learning_rate": 7.957499563805107e-05, "loss": 5.0315, "num_input_tokens_seen": 568197120, "step": 4335 }, { "epoch": 0.7118732724447567, "grad_norm": 0.7516876459121704, "learning_rate": 7.954747532056262e-05, "loss": 4.9979, "num_input_tokens_seen": 568590336, "step": 4338 }, { "epoch": 0.7123655776124225, "grad_norm": 0.5879480242729187, "learning_rate": 7.951998353632336e-05, "loss": 5.018, "num_input_tokens_seen": 568983552, "step": 4341 }, { "epoch": 0.7128578827800883, "grad_norm": 0.7155163884162903, "learning_rate": 7.949252023606159e-05, "loss": 5.0276, "num_input_tokens_seen": 569376768, "step": 4344 }, { "epoch": 0.7133501879477541, "grad_norm": 0.5591400265693665, "learning_rate": 7.946508537062463e-05, "loss": 5.0113, "num_input_tokens_seen": 569769984, "step": 4347 }, { "epoch": 0.7138424931154199, "grad_norm": 0.5849490761756897, "learning_rate": 7.943767889097847e-05, "loss": 4.9933, "num_input_tokens_seen": 570163200, "step": 4350 }, { "epoch": 0.7143347982830858, "grad_norm": 0.6266775727272034, "learning_rate": 7.941030074820736e-05, "loss": 5.0199, "num_input_tokens_seen": 570556416, "step": 4353 }, { "epoch": 0.7148271034507515, "grad_norm": 0.6590829491615295, "learning_rate": 7.938295089351354e-05, "loss": 5.0225, "num_input_tokens_seen": 570949632, "step": 4356 }, { "epoch": 0.7153194086184174, "grad_norm": 0.5544537901878357, "learning_rate": 7.935562927821676e-05, "loss": 5.0161, "num_input_tokens_seen": 571342848, "step": 4359 }, { "epoch": 0.7158117137860831, "grad_norm": 0.5898229479789734, "learning_rate": 7.932833585375402e-05, "loss": 5.0445, "num_input_tokens_seen": 571736064, "step": 4362 }, { "epoch": 0.716304018953749, "grad_norm": 0.6551963686943054, "learning_rate": 7.930107057167912e-05, "loss": 4.9816, "num_input_tokens_seen": 572129280, "step": 4365 }, { "epoch": 0.7167963241214148, "grad_norm": 0.530617356300354, "learning_rate": 7.927383338366234e-05, "loss": 5.0351, "num_input_tokens_seen": 572522496, "step": 4368 }, { "epoch": 0.7172886292890805, "grad_norm": 0.6499817371368408, "learning_rate": 7.924662424149012e-05, "loss": 5.0164, "num_input_tokens_seen": 572915712, "step": 4371 }, { "epoch": 0.7177809344567464, "grad_norm": 0.6318878531455994, "learning_rate": 7.921944309706458e-05, "loss": 5.0125, "num_input_tokens_seen": 573308928, "step": 4374 }, { "epoch": 0.7182732396244121, "grad_norm": 0.5628673434257507, "learning_rate": 7.919228990240331e-05, "loss": 4.9925, "num_input_tokens_seen": 573702144, "step": 4377 }, { "epoch": 0.718765544792078, "grad_norm": 0.7107114791870117, "learning_rate": 7.916516460963895e-05, "loss": 4.9837, "num_input_tokens_seen": 574095360, "step": 4380 }, { "epoch": 0.7192578499597438, "grad_norm": 0.6181208491325378, "learning_rate": 7.913806717101879e-05, "loss": 5.0152, "num_input_tokens_seen": 574488576, "step": 4383 }, { "epoch": 0.7197501551274096, "grad_norm": 0.6599327921867371, "learning_rate": 7.911099753890446e-05, "loss": 5.0476, "num_input_tokens_seen": 574881792, "step": 4386 }, { "epoch": 0.7202424602950754, "grad_norm": 0.5509748458862305, "learning_rate": 7.90839556657716e-05, "loss": 5.0521, "num_input_tokens_seen": 575275008, "step": 4389 }, { "epoch": 0.7207347654627412, "grad_norm": 0.5559970736503601, "learning_rate": 7.905694150420948e-05, "loss": 5.0064, "num_input_tokens_seen": 575668224, "step": 4392 }, { "epoch": 0.721227070630407, "grad_norm": 0.5684667825698853, "learning_rate": 7.902995500692065e-05, "loss": 5.0044, "num_input_tokens_seen": 576061440, "step": 4395 }, { "epoch": 0.7217193757980729, "grad_norm": 0.6226740479469299, "learning_rate": 7.900299612672062e-05, "loss": 5.0389, "num_input_tokens_seen": 576454656, "step": 4398 }, { "epoch": 0.7220475792431834, "eval_accuracy": 0.22640612278130598, "eval_loss": 5.292022228240967, "eval_runtime": 114.3223, "eval_samples_per_second": 2.624, "eval_steps_per_second": 1.312, "num_input_tokens_seen": 576716800, "step": 4400 }, { "epoch": 0.7222116809657386, "grad_norm": 0.6325856447219849, "learning_rate": 7.897606481653748e-05, "loss": 5.0615, "num_input_tokens_seen": 576847872, "step": 4401 }, { "epoch": 0.7227039861334045, "grad_norm": 0.607279896736145, "learning_rate": 7.894916102941156e-05, "loss": 4.9912, "num_input_tokens_seen": 577241088, "step": 4404 }, { "epoch": 0.7231962913010702, "grad_norm": 0.5997862219810486, "learning_rate": 7.892228471849507e-05, "loss": 5.0194, "num_input_tokens_seen": 577634304, "step": 4407 }, { "epoch": 0.7236885964687361, "grad_norm": 0.6025493741035461, "learning_rate": 7.889543583705186e-05, "loss": 4.9893, "num_input_tokens_seen": 578027520, "step": 4410 }, { "epoch": 0.7241809016364019, "grad_norm": 0.6023775339126587, "learning_rate": 7.886861433845691e-05, "loss": 4.9841, "num_input_tokens_seen": 578420736, "step": 4413 }, { "epoch": 0.7246732068040677, "grad_norm": 0.5165770649909973, "learning_rate": 7.884182017619615e-05, "loss": 4.9778, "num_input_tokens_seen": 578813952, "step": 4416 }, { "epoch": 0.7251655119717335, "grad_norm": 0.6931847333908081, "learning_rate": 7.881505330386602e-05, "loss": 4.9895, "num_input_tokens_seen": 579207168, "step": 4419 }, { "epoch": 0.7256578171393993, "grad_norm": 0.6051077842712402, "learning_rate": 7.878831367517315e-05, "loss": 4.9822, "num_input_tokens_seen": 579600384, "step": 4422 }, { "epoch": 0.7261501223070651, "grad_norm": 0.5695605874061584, "learning_rate": 7.876160124393405e-05, "loss": 4.9885, "num_input_tokens_seen": 579993600, "step": 4425 }, { "epoch": 0.7266424274747308, "grad_norm": 0.5569063425064087, "learning_rate": 7.873491596407478e-05, "loss": 4.9892, "num_input_tokens_seen": 580386816, "step": 4428 }, { "epoch": 0.7271347326423967, "grad_norm": 0.5842316746711731, "learning_rate": 7.870825778963058e-05, "loss": 4.9598, "num_input_tokens_seen": 580780032, "step": 4431 }, { "epoch": 0.7276270378100625, "grad_norm": 0.6461383104324341, "learning_rate": 7.868162667474556e-05, "loss": 4.989, "num_input_tokens_seen": 581173248, "step": 4434 }, { "epoch": 0.7281193429777283, "grad_norm": 0.6318100690841675, "learning_rate": 7.865502257367235e-05, "loss": 5.0184, "num_input_tokens_seen": 581566464, "step": 4437 }, { "epoch": 0.7286116481453941, "grad_norm": 0.6120467782020569, "learning_rate": 7.862844544077183e-05, "loss": 5.0424, "num_input_tokens_seen": 581959680, "step": 4440 }, { "epoch": 0.7291039533130599, "grad_norm": 0.6271392107009888, "learning_rate": 7.860189523051269e-05, "loss": 5.0165, "num_input_tokens_seen": 582352896, "step": 4443 }, { "epoch": 0.7295962584807257, "grad_norm": 0.5989521741867065, "learning_rate": 7.857537189747122e-05, "loss": 5.0157, "num_input_tokens_seen": 582746112, "step": 4446 }, { "epoch": 0.7300885636483916, "grad_norm": 0.5683810114860535, "learning_rate": 7.854887539633091e-05, "loss": 4.9685, "num_input_tokens_seen": 583139328, "step": 4449 }, { "epoch": 0.7305808688160573, "grad_norm": 0.598704993724823, "learning_rate": 7.852240568188216e-05, "loss": 5.0173, "num_input_tokens_seen": 583532544, "step": 4452 }, { "epoch": 0.7310731739837232, "grad_norm": 0.5917366743087769, "learning_rate": 7.849596270902193e-05, "loss": 5.0061, "num_input_tokens_seen": 583925760, "step": 4455 }, { "epoch": 0.7315654791513889, "grad_norm": 0.6544704437255859, "learning_rate": 7.846954643275341e-05, "loss": 4.9838, "num_input_tokens_seen": 584318976, "step": 4458 }, { "epoch": 0.7320577843190548, "grad_norm": 0.5870675444602966, "learning_rate": 7.844315680818579e-05, "loss": 4.988, "num_input_tokens_seen": 584712192, "step": 4461 }, { "epoch": 0.7325500894867206, "grad_norm": 0.7421653270721436, "learning_rate": 7.841679379053378e-05, "loss": 5.0049, "num_input_tokens_seen": 585105408, "step": 4464 }, { "epoch": 0.7330423946543864, "grad_norm": 0.5522703528404236, "learning_rate": 7.839045733511741e-05, "loss": 5.0234, "num_input_tokens_seen": 585498624, "step": 4467 }, { "epoch": 0.7335346998220522, "grad_norm": 0.6532583832740784, "learning_rate": 7.836414739736173e-05, "loss": 4.9713, "num_input_tokens_seen": 585891840, "step": 4470 }, { "epoch": 0.734027004989718, "grad_norm": 0.6363163590431213, "learning_rate": 7.833786393279637e-05, "loss": 5.0337, "num_input_tokens_seen": 586285056, "step": 4473 }, { "epoch": 0.7345193101573838, "grad_norm": 0.7302047610282898, "learning_rate": 7.831160689705535e-05, "loss": 5.0239, "num_input_tokens_seen": 586678272, "step": 4476 }, { "epoch": 0.7350116153250497, "grad_norm": 0.5957253575325012, "learning_rate": 7.828537624587667e-05, "loss": 5.0409, "num_input_tokens_seen": 587071488, "step": 4479 }, { "epoch": 0.7355039204927154, "grad_norm": 0.675913393497467, "learning_rate": 7.825917193510209e-05, "loss": 5.004, "num_input_tokens_seen": 587464704, "step": 4482 }, { "epoch": 0.7359962256603813, "grad_norm": 0.6078398823738098, "learning_rate": 7.823299392067672e-05, "loss": 5.012, "num_input_tokens_seen": 587857920, "step": 4485 }, { "epoch": 0.736488530828047, "grad_norm": 0.7093663215637207, "learning_rate": 7.82068421586488e-05, "loss": 4.9781, "num_input_tokens_seen": 588251136, "step": 4488 }, { "epoch": 0.7369808359957128, "grad_norm": 0.5911455154418945, "learning_rate": 7.81807166051693e-05, "loss": 5.0096, "num_input_tokens_seen": 588644352, "step": 4491 }, { "epoch": 0.7374731411633787, "grad_norm": 0.5189294815063477, "learning_rate": 7.815461721649169e-05, "loss": 5.0144, "num_input_tokens_seen": 589037568, "step": 4494 }, { "epoch": 0.7379654463310444, "grad_norm": 0.5418742299079895, "learning_rate": 7.812854394897162e-05, "loss": 5.0046, "num_input_tokens_seen": 589430784, "step": 4497 }, { "epoch": 0.7384577514987103, "grad_norm": 0.6336432099342346, "learning_rate": 7.810249675906653e-05, "loss": 4.998, "num_input_tokens_seen": 589824000, "step": 4500 }, { "epoch": 0.738950056666376, "grad_norm": 0.5630114078521729, "learning_rate": 7.807647560333547e-05, "loss": 5.0358, "num_input_tokens_seen": 590217216, "step": 4503 }, { "epoch": 0.7394423618340419, "grad_norm": 0.7701685428619385, "learning_rate": 7.80504804384387e-05, "loss": 5.008, "num_input_tokens_seen": 590610432, "step": 4506 }, { "epoch": 0.7399346670017077, "grad_norm": 0.7231709361076355, "learning_rate": 7.802451122113745e-05, "loss": 5.036, "num_input_tokens_seen": 591003648, "step": 4509 }, { "epoch": 0.7404269721693735, "grad_norm": 0.5655918717384338, "learning_rate": 7.799856790829355e-05, "loss": 4.991, "num_input_tokens_seen": 591396864, "step": 4512 }, { "epoch": 0.7409192773370393, "grad_norm": 0.6875422596931458, "learning_rate": 7.797265045686918e-05, "loss": 5.0266, "num_input_tokens_seen": 591790080, "step": 4515 }, { "epoch": 0.7414115825047051, "grad_norm": 0.7806153297424316, "learning_rate": 7.794675882392659e-05, "loss": 4.9771, "num_input_tokens_seen": 592183296, "step": 4518 }, { "epoch": 0.7419038876723709, "grad_norm": 0.5837035775184631, "learning_rate": 7.792089296662772e-05, "loss": 5.0107, "num_input_tokens_seen": 592576512, "step": 4521 }, { "epoch": 0.7423961928400368, "grad_norm": 0.6467218399047852, "learning_rate": 7.789505284223402e-05, "loss": 5.0342, "num_input_tokens_seen": 592969728, "step": 4524 }, { "epoch": 0.7428884980077025, "grad_norm": 0.7257503867149353, "learning_rate": 7.786923840810598e-05, "loss": 5.0151, "num_input_tokens_seen": 593362944, "step": 4527 }, { "epoch": 0.7433808031753684, "grad_norm": 0.5904244184494019, "learning_rate": 7.784344962170305e-05, "loss": 4.9642, "num_input_tokens_seen": 593756160, "step": 4530 }, { "epoch": 0.7438731083430341, "grad_norm": 0.6016355752944946, "learning_rate": 7.781768644058319e-05, "loss": 5.0225, "num_input_tokens_seen": 594149376, "step": 4533 }, { "epoch": 0.7443654135107, "grad_norm": 0.6705912947654724, "learning_rate": 7.779194882240258e-05, "loss": 5.0378, "num_input_tokens_seen": 594542592, "step": 4536 }, { "epoch": 0.7448577186783657, "grad_norm": 0.6506786942481995, "learning_rate": 7.776623672491541e-05, "loss": 5.0302, "num_input_tokens_seen": 594935808, "step": 4539 }, { "epoch": 0.7453500238460316, "grad_norm": 0.6254527568817139, "learning_rate": 7.77405501059736e-05, "loss": 4.9927, "num_input_tokens_seen": 595329024, "step": 4542 }, { "epoch": 0.7458423290136974, "grad_norm": 0.6810278296470642, "learning_rate": 7.771488892352636e-05, "loss": 4.9727, "num_input_tokens_seen": 595722240, "step": 4545 }, { "epoch": 0.7463346341813631, "grad_norm": 0.6676033139228821, "learning_rate": 7.768925313562004e-05, "loss": 4.952, "num_input_tokens_seen": 596115456, "step": 4548 }, { "epoch": 0.746826939349029, "grad_norm": 0.5896539688110352, "learning_rate": 7.766364270039782e-05, "loss": 4.972, "num_input_tokens_seen": 596508672, "step": 4551 }, { "epoch": 0.7473192445166947, "grad_norm": 0.5440996885299683, "learning_rate": 7.763805757609938e-05, "loss": 5.0342, "num_input_tokens_seen": 596901888, "step": 4554 }, { "epoch": 0.7478115496843606, "grad_norm": 0.6157152056694031, "learning_rate": 7.761249772106066e-05, "loss": 5.0039, "num_input_tokens_seen": 597295104, "step": 4557 }, { "epoch": 0.7483038548520264, "grad_norm": 0.5251792073249817, "learning_rate": 7.758696309371352e-05, "loss": 5.0235, "num_input_tokens_seen": 597688320, "step": 4560 }, { "epoch": 0.7487961600196922, "grad_norm": 0.5628390908241272, "learning_rate": 7.756145365258549e-05, "loss": 4.9777, "num_input_tokens_seen": 598081536, "step": 4563 }, { "epoch": 0.749288465187358, "grad_norm": 0.5079121589660645, "learning_rate": 7.753596935629956e-05, "loss": 4.975, "num_input_tokens_seen": 598474752, "step": 4566 }, { "epoch": 0.7497807703550238, "grad_norm": 0.5366020798683167, "learning_rate": 7.751051016357372e-05, "loss": 5.0269, "num_input_tokens_seen": 598867968, "step": 4569 }, { "epoch": 0.7502730755226896, "grad_norm": 0.5857319235801697, "learning_rate": 7.748507603322084e-05, "loss": 4.9907, "num_input_tokens_seen": 599261184, "step": 4572 }, { "epoch": 0.7507653806903555, "grad_norm": 0.631243884563446, "learning_rate": 7.745966692414832e-05, "loss": 5.0049, "num_input_tokens_seen": 599654400, "step": 4575 }, { "epoch": 0.7512576858580212, "grad_norm": 0.554707407951355, "learning_rate": 7.743428279535785e-05, "loss": 4.9768, "num_input_tokens_seen": 600047616, "step": 4578 }, { "epoch": 0.7517499910256871, "grad_norm": 0.5789132118225098, "learning_rate": 7.740892360594508e-05, "loss": 5.0059, "num_input_tokens_seen": 600440832, "step": 4581 }, { "epoch": 0.7522422961933528, "grad_norm": 0.6106734275817871, "learning_rate": 7.738358931509934e-05, "loss": 5.0244, "num_input_tokens_seen": 600834048, "step": 4584 }, { "epoch": 0.7527346013610187, "grad_norm": 0.7959611415863037, "learning_rate": 7.735827988210347e-05, "loss": 4.9899, "num_input_tokens_seen": 601227264, "step": 4587 }, { "epoch": 0.7532269065286845, "grad_norm": 0.6123759150505066, "learning_rate": 7.733299526633342e-05, "loss": 5.0084, "num_input_tokens_seen": 601620480, "step": 4590 }, { "epoch": 0.7537192116963503, "grad_norm": 0.8088337779045105, "learning_rate": 7.730773542725799e-05, "loss": 5.0053, "num_input_tokens_seen": 602013696, "step": 4593 }, { "epoch": 0.7542115168640161, "grad_norm": 0.6326596140861511, "learning_rate": 7.72825003244387e-05, "loss": 4.9837, "num_input_tokens_seen": 602406912, "step": 4596 }, { "epoch": 0.7547038220316818, "grad_norm": 0.8043537139892578, "learning_rate": 7.72572899175293e-05, "loss": 4.9436, "num_input_tokens_seen": 602800128, "step": 4599 }, { "epoch": 0.7551961271993477, "grad_norm": 0.70109623670578, "learning_rate": 7.723210416627567e-05, "loss": 5.0085, "num_input_tokens_seen": 603193344, "step": 4602 }, { "epoch": 0.7556884323670136, "grad_norm": 0.6619580984115601, "learning_rate": 7.720694303051547e-05, "loss": 5.0244, "num_input_tokens_seen": 603586560, "step": 4605 }, { "epoch": 0.7561807375346793, "grad_norm": 0.6413691639900208, "learning_rate": 7.718180647017793e-05, "loss": 4.9864, "num_input_tokens_seen": 603979776, "step": 4608 }, { "epoch": 0.7566730427023451, "grad_norm": 0.6121329665184021, "learning_rate": 7.715669444528345e-05, "loss": 4.9885, "num_input_tokens_seen": 604372992, "step": 4611 }, { "epoch": 0.7571653478700109, "grad_norm": 0.5942865014076233, "learning_rate": 7.713160691594354e-05, "loss": 4.9964, "num_input_tokens_seen": 604766208, "step": 4614 }, { "epoch": 0.7576576530376767, "grad_norm": 0.7060537934303284, "learning_rate": 7.710654384236036e-05, "loss": 5.0532, "num_input_tokens_seen": 605159424, "step": 4617 }, { "epoch": 0.7581499582053426, "grad_norm": 0.6082600355148315, "learning_rate": 7.708150518482654e-05, "loss": 5.0, "num_input_tokens_seen": 605552640, "step": 4620 }, { "epoch": 0.7586422633730083, "grad_norm": 0.5539378523826599, "learning_rate": 7.705649090372494e-05, "loss": 4.9649, "num_input_tokens_seen": 605945856, "step": 4623 }, { "epoch": 0.7591345685406742, "grad_norm": 0.6369094848632812, "learning_rate": 7.703150095952836e-05, "loss": 4.9854, "num_input_tokens_seen": 606339072, "step": 4626 }, { "epoch": 0.7596268737083399, "grad_norm": 0.6065540909767151, "learning_rate": 7.700653531279927e-05, "loss": 5.0692, "num_input_tokens_seen": 606732288, "step": 4629 }, { "epoch": 0.7601191788760058, "grad_norm": 0.6990041136741638, "learning_rate": 7.698159392418949e-05, "loss": 5.0129, "num_input_tokens_seen": 607125504, "step": 4632 }, { "epoch": 0.7606114840436716, "grad_norm": 0.5794108510017395, "learning_rate": 7.695667675444007e-05, "loss": 5.0128, "num_input_tokens_seen": 607518720, "step": 4635 }, { "epoch": 0.7611037892113374, "grad_norm": 0.6325658559799194, "learning_rate": 7.693178376438095e-05, "loss": 5.0184, "num_input_tokens_seen": 607911936, "step": 4638 }, { "epoch": 0.7615960943790032, "grad_norm": 0.6222713589668274, "learning_rate": 7.690691491493065e-05, "loss": 5.0287, "num_input_tokens_seen": 608305152, "step": 4641 }, { "epoch": 0.762088399546669, "grad_norm": 0.7314369082450867, "learning_rate": 7.688207016709613e-05, "loss": 5.0145, "num_input_tokens_seen": 608698368, "step": 4644 }, { "epoch": 0.7625807047143348, "grad_norm": 0.6101865768432617, "learning_rate": 7.685724948197242e-05, "loss": 5.0326, "num_input_tokens_seen": 609091584, "step": 4647 }, { "epoch": 0.7630730098820006, "grad_norm": 0.6560894846916199, "learning_rate": 7.683245282074242e-05, "loss": 4.9619, "num_input_tokens_seen": 609484800, "step": 4650 }, { "epoch": 0.7635653150496664, "grad_norm": 0.5975686311721802, "learning_rate": 7.68076801446767e-05, "loss": 5.0259, "num_input_tokens_seen": 609878016, "step": 4653 }, { "epoch": 0.7640576202173323, "grad_norm": 0.5379384160041809, "learning_rate": 7.678293141513307e-05, "loss": 4.9856, "num_input_tokens_seen": 610271232, "step": 4656 }, { "epoch": 0.764549925384998, "grad_norm": 0.5923475623130798, "learning_rate": 7.675820659355657e-05, "loss": 5.0146, "num_input_tokens_seen": 610664448, "step": 4659 }, { "epoch": 0.7650422305526638, "grad_norm": 0.5225929617881775, "learning_rate": 7.673350564147898e-05, "loss": 5.037, "num_input_tokens_seen": 611057664, "step": 4662 }, { "epoch": 0.7655345357203296, "grad_norm": 0.5816430449485779, "learning_rate": 7.670882852051876e-05, "loss": 4.9901, "num_input_tokens_seen": 611450880, "step": 4665 }, { "epoch": 0.7660268408879954, "grad_norm": 0.5520696043968201, "learning_rate": 7.668417519238068e-05, "loss": 4.9269, "num_input_tokens_seen": 611844096, "step": 4668 }, { "epoch": 0.7665191460556613, "grad_norm": 0.5480571985244751, "learning_rate": 7.66595456188556e-05, "loss": 4.9748, "num_input_tokens_seen": 612237312, "step": 4671 }, { "epoch": 0.767011451223327, "grad_norm": 0.609230101108551, "learning_rate": 7.663493976182024e-05, "loss": 4.9698, "num_input_tokens_seen": 612630528, "step": 4674 }, { "epoch": 0.7675037563909929, "grad_norm": 0.7071127891540527, "learning_rate": 7.661035758323693e-05, "loss": 5.0464, "num_input_tokens_seen": 613023744, "step": 4677 }, { "epoch": 0.7679960615586586, "grad_norm": 0.7234042286872864, "learning_rate": 7.658579904515338e-05, "loss": 4.9778, "num_input_tokens_seen": 613416960, "step": 4680 }, { "epoch": 0.7684883667263245, "grad_norm": 0.652027428150177, "learning_rate": 7.656126410970232e-05, "loss": 4.971, "num_input_tokens_seen": 613810176, "step": 4683 }, { "epoch": 0.7689806718939903, "grad_norm": 0.7037466168403625, "learning_rate": 7.653675273910141e-05, "loss": 5.0228, "num_input_tokens_seen": 614203392, "step": 4686 }, { "epoch": 0.7694729770616561, "grad_norm": 0.5814207196235657, "learning_rate": 7.651226489565297e-05, "loss": 4.9948, "num_input_tokens_seen": 614596608, "step": 4689 }, { "epoch": 0.7699652822293219, "grad_norm": 0.7240739464759827, "learning_rate": 7.64878005417436e-05, "loss": 5.0276, "num_input_tokens_seen": 614989824, "step": 4692 }, { "epoch": 0.7704575873969877, "grad_norm": 0.7340640425682068, "learning_rate": 7.646335963984411e-05, "loss": 4.984, "num_input_tokens_seen": 615383040, "step": 4695 }, { "epoch": 0.7709498925646535, "grad_norm": 0.6027811765670776, "learning_rate": 7.643894215250911e-05, "loss": 4.9804, "num_input_tokens_seen": 615776256, "step": 4698 }, { "epoch": 0.7714421977323194, "grad_norm": 0.5704403519630432, "learning_rate": 7.641454804237697e-05, "loss": 4.9516, "num_input_tokens_seen": 616169472, "step": 4701 }, { "epoch": 0.7719345028999851, "grad_norm": 0.5998306274414062, "learning_rate": 7.639017727216942e-05, "loss": 4.991, "num_input_tokens_seen": 616562688, "step": 4704 }, { "epoch": 0.772426808067651, "grad_norm": 0.6468780040740967, "learning_rate": 7.636582980469131e-05, "loss": 5.0081, "num_input_tokens_seen": 616955904, "step": 4707 }, { "epoch": 0.7729191132353167, "grad_norm": 0.5996934771537781, "learning_rate": 7.634150560283054e-05, "loss": 4.9508, "num_input_tokens_seen": 617349120, "step": 4710 }, { "epoch": 0.7734114184029826, "grad_norm": 0.7018175721168518, "learning_rate": 7.631720462955758e-05, "loss": 5.0056, "num_input_tokens_seen": 617742336, "step": 4713 }, { "epoch": 0.7739037235706484, "grad_norm": 0.6839625239372253, "learning_rate": 7.629292684792543e-05, "loss": 4.9534, "num_input_tokens_seen": 618135552, "step": 4716 }, { "epoch": 0.7743960287383141, "grad_norm": 0.652540922164917, "learning_rate": 7.62686722210693e-05, "loss": 4.965, "num_input_tokens_seen": 618528768, "step": 4719 }, { "epoch": 0.77488833390598, "grad_norm": 0.7679561972618103, "learning_rate": 7.624444071220638e-05, "loss": 4.9414, "num_input_tokens_seen": 618921984, "step": 4722 }, { "epoch": 0.7753806390736457, "grad_norm": 0.7038884162902832, "learning_rate": 7.62202322846356e-05, "loss": 4.9849, "num_input_tokens_seen": 619315200, "step": 4725 }, { "epoch": 0.7758729442413116, "grad_norm": 0.6602532267570496, "learning_rate": 7.619604690173746e-05, "loss": 5.0283, "num_input_tokens_seen": 619708416, "step": 4728 }, { "epoch": 0.7763652494089774, "grad_norm": 0.6303278803825378, "learning_rate": 7.617188452697369e-05, "loss": 4.9665, "num_input_tokens_seen": 620101632, "step": 4731 }, { "epoch": 0.7768575545766432, "grad_norm": 0.669486403465271, "learning_rate": 7.614774512388709e-05, "loss": 4.9801, "num_input_tokens_seen": 620494848, "step": 4734 }, { "epoch": 0.777349859744309, "grad_norm": 0.6238739490509033, "learning_rate": 7.61236286561013e-05, "loss": 5.0117, "num_input_tokens_seen": 620888064, "step": 4737 }, { "epoch": 0.7778421649119748, "grad_norm": 0.5390612483024597, "learning_rate": 7.609953508732051e-05, "loss": 4.966, "num_input_tokens_seen": 621281280, "step": 4740 }, { "epoch": 0.7783344700796406, "grad_norm": 0.6080785393714905, "learning_rate": 7.607546438132936e-05, "loss": 4.9709, "num_input_tokens_seen": 621674496, "step": 4743 }, { "epoch": 0.7788267752473064, "grad_norm": 0.5530922412872314, "learning_rate": 7.605141650199254e-05, "loss": 4.9836, "num_input_tokens_seen": 622067712, "step": 4746 }, { "epoch": 0.7793190804149722, "grad_norm": 0.5536843538284302, "learning_rate": 7.602739141325468e-05, "loss": 4.9132, "num_input_tokens_seen": 622460928, "step": 4749 }, { "epoch": 0.7798113855826381, "grad_norm": 0.6630847454071045, "learning_rate": 7.600338907914008e-05, "loss": 4.9765, "num_input_tokens_seen": 622854144, "step": 4752 }, { "epoch": 0.7803036907503038, "grad_norm": 0.5388240814208984, "learning_rate": 7.597940946375254e-05, "loss": 4.9774, "num_input_tokens_seen": 623247360, "step": 4755 }, { "epoch": 0.7807959959179697, "grad_norm": 0.5779323577880859, "learning_rate": 7.595545253127499e-05, "loss": 4.9994, "num_input_tokens_seen": 623640576, "step": 4758 }, { "epoch": 0.7812883010856354, "grad_norm": 0.622727632522583, "learning_rate": 7.593151824596948e-05, "loss": 5.0106, "num_input_tokens_seen": 624033792, "step": 4761 }, { "epoch": 0.7817806062533013, "grad_norm": 0.6555529236793518, "learning_rate": 7.590760657217675e-05, "loss": 4.9756, "num_input_tokens_seen": 624427008, "step": 4764 }, { "epoch": 0.7822729114209671, "grad_norm": 0.6330806612968445, "learning_rate": 7.588371747431617e-05, "loss": 5.0433, "num_input_tokens_seen": 624820224, "step": 4767 }, { "epoch": 0.7827652165886329, "grad_norm": 0.5727117657661438, "learning_rate": 7.585985091688538e-05, "loss": 4.9545, "num_input_tokens_seen": 625213440, "step": 4770 }, { "epoch": 0.7832575217562987, "grad_norm": 0.5896506309509277, "learning_rate": 7.583600686446016e-05, "loss": 4.9836, "num_input_tokens_seen": 625606656, "step": 4773 }, { "epoch": 0.7837498269239644, "grad_norm": 0.5335837602615356, "learning_rate": 7.581218528169419e-05, "loss": 4.9992, "num_input_tokens_seen": 625999872, "step": 4776 }, { "epoch": 0.7842421320916303, "grad_norm": 0.6446117758750916, "learning_rate": 7.578838613331881e-05, "loss": 4.9502, "num_input_tokens_seen": 626393088, "step": 4779 }, { "epoch": 0.7847344372592961, "grad_norm": 0.5382349491119385, "learning_rate": 7.576460938414284e-05, "loss": 4.9647, "num_input_tokens_seen": 626786304, "step": 4782 }, { "epoch": 0.7852267424269619, "grad_norm": 0.7169778943061829, "learning_rate": 7.574085499905227e-05, "loss": 4.9854, "num_input_tokens_seen": 627179520, "step": 4785 }, { "epoch": 0.7857190475946277, "grad_norm": 0.5940950512886047, "learning_rate": 7.571712294301016e-05, "loss": 5.0078, "num_input_tokens_seen": 627572736, "step": 4788 }, { "epoch": 0.7862113527622935, "grad_norm": 0.6756063103675842, "learning_rate": 7.56934131810564e-05, "loss": 4.9913, "num_input_tokens_seen": 627965952, "step": 4791 }, { "epoch": 0.7867036579299593, "grad_norm": 0.5659769177436829, "learning_rate": 7.566972567830736e-05, "loss": 5.0014, "num_input_tokens_seen": 628359168, "step": 4794 }, { "epoch": 0.7871959630976252, "grad_norm": 0.595694363117218, "learning_rate": 7.564606039995587e-05, "loss": 4.9739, "num_input_tokens_seen": 628752384, "step": 4797 }, { "epoch": 0.7876882682652909, "grad_norm": 0.6841439604759216, "learning_rate": 7.562241731127086e-05, "loss": 4.9732, "num_input_tokens_seen": 629145600, "step": 4800 }, { "epoch": 0.7876882682652909, "eval_accuracy": 0.22760788145253216, "eval_loss": 5.267364978790283, "eval_runtime": 115.6402, "eval_samples_per_second": 2.594, "eval_steps_per_second": 1.297, "num_input_tokens_seen": 629145600, "step": 4800 }, { "epoch": 0.7881805734329568, "grad_norm": 0.5944079160690308, "learning_rate": 7.559879637759725e-05, "loss": 4.956, "num_input_tokens_seen": 629538816, "step": 4803 }, { "epoch": 0.7886728786006225, "grad_norm": 0.6290807723999023, "learning_rate": 7.557519756435562e-05, "loss": 4.9765, "num_input_tokens_seen": 629932032, "step": 4806 }, { "epoch": 0.7891651837682884, "grad_norm": 0.6066911220550537, "learning_rate": 7.555162083704213e-05, "loss": 5.0129, "num_input_tokens_seen": 630325248, "step": 4809 }, { "epoch": 0.7896574889359542, "grad_norm": 0.5937833786010742, "learning_rate": 7.552806616122819e-05, "loss": 4.9783, "num_input_tokens_seen": 630718464, "step": 4812 }, { "epoch": 0.79014979410362, "grad_norm": 0.6029826402664185, "learning_rate": 7.550453350256031e-05, "loss": 4.9904, "num_input_tokens_seen": 631111680, "step": 4815 }, { "epoch": 0.7906420992712858, "grad_norm": 0.6355852484703064, "learning_rate": 7.548102282675988e-05, "loss": 4.9274, "num_input_tokens_seen": 631504896, "step": 4818 }, { "epoch": 0.7911344044389516, "grad_norm": 0.6588253378868103, "learning_rate": 7.545753409962299e-05, "loss": 4.9691, "num_input_tokens_seen": 631898112, "step": 4821 }, { "epoch": 0.7916267096066174, "grad_norm": 0.5973790884017944, "learning_rate": 7.543406728702016e-05, "loss": 4.9812, "num_input_tokens_seen": 632291328, "step": 4824 }, { "epoch": 0.7921190147742833, "grad_norm": 0.6492071151733398, "learning_rate": 7.541062235489617e-05, "loss": 4.9996, "num_input_tokens_seen": 632684544, "step": 4827 }, { "epoch": 0.792611319941949, "grad_norm": 0.5668359994888306, "learning_rate": 7.538719926926982e-05, "loss": 4.9706, "num_input_tokens_seen": 633077760, "step": 4830 }, { "epoch": 0.7931036251096149, "grad_norm": 0.6291391849517822, "learning_rate": 7.536379799623378e-05, "loss": 5.001, "num_input_tokens_seen": 633470976, "step": 4833 }, { "epoch": 0.7935959302772806, "grad_norm": 0.5828115940093994, "learning_rate": 7.534041850195436e-05, "loss": 4.9702, "num_input_tokens_seen": 633864192, "step": 4836 }, { "epoch": 0.7940882354449464, "grad_norm": 0.6520923376083374, "learning_rate": 7.531706075267127e-05, "loss": 4.9241, "num_input_tokens_seen": 634257408, "step": 4839 }, { "epoch": 0.7945805406126123, "grad_norm": 0.5931693911552429, "learning_rate": 7.529372471469743e-05, "loss": 4.9394, "num_input_tokens_seen": 634650624, "step": 4842 }, { "epoch": 0.795072845780278, "grad_norm": 0.684350311756134, "learning_rate": 7.527041035441882e-05, "loss": 4.9617, "num_input_tokens_seen": 635043840, "step": 4845 }, { "epoch": 0.7955651509479439, "grad_norm": 0.5431036949157715, "learning_rate": 7.52471176382942e-05, "loss": 4.9856, "num_input_tokens_seen": 635437056, "step": 4848 }, { "epoch": 0.7960574561156096, "grad_norm": 0.5861454606056213, "learning_rate": 7.522384653285496e-05, "loss": 4.972, "num_input_tokens_seen": 635830272, "step": 4851 }, { "epoch": 0.7965497612832755, "grad_norm": 0.6265558004379272, "learning_rate": 7.520059700470488e-05, "loss": 4.9853, "num_input_tokens_seen": 636223488, "step": 4854 }, { "epoch": 0.7970420664509412, "grad_norm": 0.6842302083969116, "learning_rate": 7.517736902051998e-05, "loss": 4.981, "num_input_tokens_seen": 636616704, "step": 4857 }, { "epoch": 0.7975343716186071, "grad_norm": 0.5554177761077881, "learning_rate": 7.515416254704823e-05, "loss": 4.9695, "num_input_tokens_seen": 637009920, "step": 4860 }, { "epoch": 0.7980266767862729, "grad_norm": 0.6592026352882385, "learning_rate": 7.513097755110948e-05, "loss": 4.9942, "num_input_tokens_seen": 637403136, "step": 4863 }, { "epoch": 0.7985189819539387, "grad_norm": 0.5816788077354431, "learning_rate": 7.510781399959514e-05, "loss": 5.0034, "num_input_tokens_seen": 637796352, "step": 4866 }, { "epoch": 0.7990112871216045, "grad_norm": 0.6009346842765808, "learning_rate": 7.508467185946803e-05, "loss": 4.9395, "num_input_tokens_seen": 638189568, "step": 4869 }, { "epoch": 0.7995035922892703, "grad_norm": 0.6724573373794556, "learning_rate": 7.506155109776222e-05, "loss": 5.0087, "num_input_tokens_seen": 638582784, "step": 4872 }, { "epoch": 0.7999958974569361, "grad_norm": 0.6114869117736816, "learning_rate": 7.503845168158274e-05, "loss": 4.9479, "num_input_tokens_seen": 638976000, "step": 4875 }, { "epoch": 0.800488202624602, "grad_norm": 0.7219544649124146, "learning_rate": 7.501537357810552e-05, "loss": 4.9821, "num_input_tokens_seen": 639369216, "step": 4878 }, { "epoch": 0.8009805077922677, "grad_norm": 0.6012791395187378, "learning_rate": 7.499231675457698e-05, "loss": 4.9551, "num_input_tokens_seen": 639762432, "step": 4881 }, { "epoch": 0.8014728129599336, "grad_norm": 0.6682071685791016, "learning_rate": 7.496928117831408e-05, "loss": 4.9587, "num_input_tokens_seen": 640155648, "step": 4884 }, { "epoch": 0.8019651181275993, "grad_norm": 0.6674060821533203, "learning_rate": 7.494626681670401e-05, "loss": 4.9892, "num_input_tokens_seen": 640548864, "step": 4887 }, { "epoch": 0.8024574232952651, "grad_norm": 0.6695443391799927, "learning_rate": 7.492327363720392e-05, "loss": 4.9558, "num_input_tokens_seen": 640942080, "step": 4890 }, { "epoch": 0.802949728462931, "grad_norm": 0.6352146863937378, "learning_rate": 7.490030160734087e-05, "loss": 5.0049, "num_input_tokens_seen": 641335296, "step": 4893 }, { "epoch": 0.8034420336305967, "grad_norm": 0.6130344271659851, "learning_rate": 7.487735069471156e-05, "loss": 5.0011, "num_input_tokens_seen": 641728512, "step": 4896 }, { "epoch": 0.8039343387982626, "grad_norm": 0.7044432163238525, "learning_rate": 7.485442086698213e-05, "loss": 4.9738, "num_input_tokens_seen": 642121728, "step": 4899 }, { "epoch": 0.8044266439659283, "grad_norm": 0.5631691217422485, "learning_rate": 7.483151209188803e-05, "loss": 4.9646, "num_input_tokens_seen": 642514944, "step": 4902 }, { "epoch": 0.8049189491335942, "grad_norm": 0.6033437848091125, "learning_rate": 7.480862433723372e-05, "loss": 4.9488, "num_input_tokens_seen": 642908160, "step": 4905 }, { "epoch": 0.80541125430126, "grad_norm": 0.577086329460144, "learning_rate": 7.478575757089261e-05, "loss": 4.9401, "num_input_tokens_seen": 643301376, "step": 4908 }, { "epoch": 0.8059035594689258, "grad_norm": 0.6318846940994263, "learning_rate": 7.47629117608068e-05, "loss": 5.0187, "num_input_tokens_seen": 643694592, "step": 4911 }, { "epoch": 0.8063958646365916, "grad_norm": 0.635599672794342, "learning_rate": 7.474008687498688e-05, "loss": 4.9588, "num_input_tokens_seen": 644087808, "step": 4914 }, { "epoch": 0.8068881698042574, "grad_norm": 0.5748466849327087, "learning_rate": 7.471728288151176e-05, "loss": 4.9887, "num_input_tokens_seen": 644481024, "step": 4917 }, { "epoch": 0.8073804749719232, "grad_norm": 0.5932797789573669, "learning_rate": 7.469449974852852e-05, "loss": 4.974, "num_input_tokens_seen": 644874240, "step": 4920 }, { "epoch": 0.8078727801395891, "grad_norm": 0.6433725953102112, "learning_rate": 7.467173744425214e-05, "loss": 4.9916, "num_input_tokens_seen": 645267456, "step": 4923 }, { "epoch": 0.8083650853072548, "grad_norm": 0.6424766778945923, "learning_rate": 7.46489959369654e-05, "loss": 5.0115, "num_input_tokens_seen": 645660672, "step": 4926 }, { "epoch": 0.8088573904749207, "grad_norm": 0.6171865463256836, "learning_rate": 7.462627519501864e-05, "loss": 5.0431, "num_input_tokens_seen": 646053888, "step": 4929 }, { "epoch": 0.8093496956425864, "grad_norm": 0.7216885089874268, "learning_rate": 7.460357518682957e-05, "loss": 4.988, "num_input_tokens_seen": 646447104, "step": 4932 }, { "epoch": 0.8098420008102523, "grad_norm": 0.7182767987251282, "learning_rate": 7.458089588088317e-05, "loss": 4.967, "num_input_tokens_seen": 646840320, "step": 4935 }, { "epoch": 0.8103343059779181, "grad_norm": 0.6302428841590881, "learning_rate": 7.455823724573136e-05, "loss": 4.9635, "num_input_tokens_seen": 647233536, "step": 4938 }, { "epoch": 0.8108266111455839, "grad_norm": 0.6370172500610352, "learning_rate": 7.453559924999299e-05, "loss": 4.9557, "num_input_tokens_seen": 647626752, "step": 4941 }, { "epoch": 0.8113189163132497, "grad_norm": 0.6621046662330627, "learning_rate": 7.451298186235347e-05, "loss": 4.9624, "num_input_tokens_seen": 648019968, "step": 4944 }, { "epoch": 0.8118112214809154, "grad_norm": 0.6274940967559814, "learning_rate": 7.449038505156474e-05, "loss": 4.9531, "num_input_tokens_seen": 648413184, "step": 4947 }, { "epoch": 0.8123035266485813, "grad_norm": 0.6614890098571777, "learning_rate": 7.446780878644506e-05, "loss": 4.9893, "num_input_tokens_seen": 648806400, "step": 4950 }, { "epoch": 0.8127958318162471, "grad_norm": 0.7011924386024475, "learning_rate": 7.444525303587875e-05, "loss": 4.9168, "num_input_tokens_seen": 649199616, "step": 4953 }, { "epoch": 0.8132881369839129, "grad_norm": 0.5855690836906433, "learning_rate": 7.442271776881608e-05, "loss": 4.9858, "num_input_tokens_seen": 649592832, "step": 4956 }, { "epoch": 0.8137804421515787, "grad_norm": 0.7736572027206421, "learning_rate": 7.440020295427308e-05, "loss": 4.9729, "num_input_tokens_seen": 649986048, "step": 4959 }, { "epoch": 0.8142727473192445, "grad_norm": 0.7245569229125977, "learning_rate": 7.437770856133132e-05, "loss": 4.9601, "num_input_tokens_seen": 650379264, "step": 4962 }, { "epoch": 0.8147650524869103, "grad_norm": 0.5991591811180115, "learning_rate": 7.435523455913788e-05, "loss": 4.9826, "num_input_tokens_seen": 650772480, "step": 4965 }, { "epoch": 0.8152573576545761, "grad_norm": 0.680020272731781, "learning_rate": 7.433278091690488e-05, "loss": 4.9621, "num_input_tokens_seen": 651165696, "step": 4968 }, { "epoch": 0.8157496628222419, "grad_norm": 0.5399065613746643, "learning_rate": 7.431034760390964e-05, "loss": 4.9357, "num_input_tokens_seen": 651558912, "step": 4971 }, { "epoch": 0.8162419679899078, "grad_norm": 0.7673788666725159, "learning_rate": 7.428793458949426e-05, "loss": 4.98, "num_input_tokens_seen": 651952128, "step": 4974 }, { "epoch": 0.8167342731575735, "grad_norm": 0.5334210395812988, "learning_rate": 7.426554184306555e-05, "loss": 4.9979, "num_input_tokens_seen": 652345344, "step": 4977 }, { "epoch": 0.8172265783252394, "grad_norm": 0.7658263444900513, "learning_rate": 7.424316933409486e-05, "loss": 4.994, "num_input_tokens_seen": 652738560, "step": 4980 }, { "epoch": 0.8177188834929051, "grad_norm": 0.6794727444648743, "learning_rate": 7.42208170321178e-05, "loss": 4.9735, "num_input_tokens_seen": 653131776, "step": 4983 }, { "epoch": 0.818211188660571, "grad_norm": 0.6948509812355042, "learning_rate": 7.419848490673427e-05, "loss": 4.9359, "num_input_tokens_seen": 653524992, "step": 4986 }, { "epoch": 0.8187034938282368, "grad_norm": 0.7119221687316895, "learning_rate": 7.417617292760803e-05, "loss": 4.9919, "num_input_tokens_seen": 653918208, "step": 4989 }, { "epoch": 0.8191957989959026, "grad_norm": 0.6246024966239929, "learning_rate": 7.415388106446673e-05, "loss": 4.9472, "num_input_tokens_seen": 654311424, "step": 4992 }, { "epoch": 0.8196881041635684, "grad_norm": 0.5838922262191772, "learning_rate": 7.413160928710163e-05, "loss": 4.9537, "num_input_tokens_seen": 654704640, "step": 4995 }, { "epoch": 0.8201804093312342, "grad_norm": 0.6566455960273743, "learning_rate": 7.410935756536754e-05, "loss": 4.9545, "num_input_tokens_seen": 655097856, "step": 4998 }, { "epoch": 0.8206727144989, "grad_norm": 0.6249347925186157, "learning_rate": 7.408712586918248e-05, "loss": 4.9567, "num_input_tokens_seen": 655491072, "step": 5001 }, { "epoch": 0.8211650196665659, "grad_norm": 0.604857325553894, "learning_rate": 7.406491416852765e-05, "loss": 4.955, "num_input_tokens_seen": 655884288, "step": 5004 }, { "epoch": 0.8216573248342316, "grad_norm": 0.6290884017944336, "learning_rate": 7.404272243344716e-05, "loss": 4.9201, "num_input_tokens_seen": 656277504, "step": 5007 }, { "epoch": 0.8221496300018974, "grad_norm": 0.5492717623710632, "learning_rate": 7.4020550634048e-05, "loss": 5.0007, "num_input_tokens_seen": 656670720, "step": 5010 }, { "epoch": 0.8226419351695632, "grad_norm": 0.5919643044471741, "learning_rate": 7.39983987404997e-05, "loss": 4.979, "num_input_tokens_seen": 657063936, "step": 5013 }, { "epoch": 0.823134240337229, "grad_norm": 0.6006259322166443, "learning_rate": 7.39762667230343e-05, "loss": 4.9535, "num_input_tokens_seen": 657457152, "step": 5016 }, { "epoch": 0.8236265455048949, "grad_norm": 0.5834858417510986, "learning_rate": 7.395415455194607e-05, "loss": 4.9279, "num_input_tokens_seen": 657850368, "step": 5019 }, { "epoch": 0.8241188506725606, "grad_norm": 0.5802402496337891, "learning_rate": 7.393206219759145e-05, "loss": 4.9848, "num_input_tokens_seen": 658243584, "step": 5022 }, { "epoch": 0.8246111558402265, "grad_norm": 0.6450897455215454, "learning_rate": 7.390998963038879e-05, "loss": 5.0166, "num_input_tokens_seen": 658636800, "step": 5025 }, { "epoch": 0.8251034610078922, "grad_norm": 0.5459268689155579, "learning_rate": 7.388793682081826e-05, "loss": 4.9584, "num_input_tokens_seen": 659030016, "step": 5028 }, { "epoch": 0.8255957661755581, "grad_norm": 0.6466140747070312, "learning_rate": 7.386590373942163e-05, "loss": 4.9798, "num_input_tokens_seen": 659423232, "step": 5031 }, { "epoch": 0.8260880713432239, "grad_norm": 0.6367524266242981, "learning_rate": 7.384389035680211e-05, "loss": 4.9722, "num_input_tokens_seen": 659816448, "step": 5034 }, { "epoch": 0.8265803765108897, "grad_norm": 0.5187392234802246, "learning_rate": 7.382189664362424e-05, "loss": 4.9654, "num_input_tokens_seen": 660209664, "step": 5037 }, { "epoch": 0.8270726816785555, "grad_norm": 0.5242530107498169, "learning_rate": 7.379992257061364e-05, "loss": 4.9528, "num_input_tokens_seen": 660602880, "step": 5040 }, { "epoch": 0.8275649868462213, "grad_norm": 0.7018054723739624, "learning_rate": 7.377796810855694e-05, "loss": 4.9836, "num_input_tokens_seen": 660996096, "step": 5043 }, { "epoch": 0.8280572920138871, "grad_norm": 0.6890904903411865, "learning_rate": 7.37560332283015e-05, "loss": 5.0295, "num_input_tokens_seen": 661389312, "step": 5046 }, { "epoch": 0.828549597181553, "grad_norm": 0.5935149192810059, "learning_rate": 7.373411790075538e-05, "loss": 5.0017, "num_input_tokens_seen": 661782528, "step": 5049 }, { "epoch": 0.8290419023492187, "grad_norm": 0.6082046031951904, "learning_rate": 7.371222209688707e-05, "loss": 4.9301, "num_input_tokens_seen": 662175744, "step": 5052 }, { "epoch": 0.8295342075168846, "grad_norm": 0.6416882872581482, "learning_rate": 7.36903457877254e-05, "loss": 4.9764, "num_input_tokens_seen": 662568960, "step": 5055 }, { "epoch": 0.8300265126845503, "grad_norm": 0.6466403603553772, "learning_rate": 7.366848894435935e-05, "loss": 4.9977, "num_input_tokens_seen": 662962176, "step": 5058 }, { "epoch": 0.8305188178522162, "grad_norm": 0.5742791891098022, "learning_rate": 7.364665153793785e-05, "loss": 4.932, "num_input_tokens_seen": 663355392, "step": 5061 }, { "epoch": 0.831011123019882, "grad_norm": 0.5598244667053223, "learning_rate": 7.362483353966971e-05, "loss": 4.9783, "num_input_tokens_seen": 663748608, "step": 5064 }, { "epoch": 0.8315034281875477, "grad_norm": 0.6468334794044495, "learning_rate": 7.360303492082339e-05, "loss": 4.9809, "num_input_tokens_seen": 664141824, "step": 5067 }, { "epoch": 0.8319957333552136, "grad_norm": 0.7428439855575562, "learning_rate": 7.358125565272689e-05, "loss": 4.9878, "num_input_tokens_seen": 664535040, "step": 5070 }, { "epoch": 0.8324880385228793, "grad_norm": 0.656428873538971, "learning_rate": 7.355949570676748e-05, "loss": 4.9558, "num_input_tokens_seen": 664928256, "step": 5073 }, { "epoch": 0.8329803436905452, "grad_norm": 0.6178997755050659, "learning_rate": 7.353775505439173e-05, "loss": 5.0094, "num_input_tokens_seen": 665321472, "step": 5076 }, { "epoch": 0.8334726488582109, "grad_norm": 0.6288952827453613, "learning_rate": 7.351603366710516e-05, "loss": 4.9676, "num_input_tokens_seen": 665714688, "step": 5079 }, { "epoch": 0.8339649540258768, "grad_norm": 0.5963147878646851, "learning_rate": 7.349433151647226e-05, "loss": 4.936, "num_input_tokens_seen": 666107904, "step": 5082 }, { "epoch": 0.8344572591935426, "grad_norm": 0.6190651655197144, "learning_rate": 7.347264857411617e-05, "loss": 4.9584, "num_input_tokens_seen": 666501120, "step": 5085 }, { "epoch": 0.8349495643612084, "grad_norm": 0.6707886457443237, "learning_rate": 7.345098481171862e-05, "loss": 4.9514, "num_input_tokens_seen": 666894336, "step": 5088 }, { "epoch": 0.8354418695288742, "grad_norm": 0.6272398829460144, "learning_rate": 7.342934020101978e-05, "loss": 4.9309, "num_input_tokens_seen": 667287552, "step": 5091 }, { "epoch": 0.83593417469654, "grad_norm": 0.6572047472000122, "learning_rate": 7.340771471381805e-05, "loss": 4.9722, "num_input_tokens_seen": 667680768, "step": 5094 }, { "epoch": 0.8364264798642058, "grad_norm": 0.644632875919342, "learning_rate": 7.338610832196996e-05, "loss": 4.982, "num_input_tokens_seen": 668073984, "step": 5097 }, { "epoch": 0.8369187850318717, "grad_norm": 0.6848341822624207, "learning_rate": 7.336452099738994e-05, "loss": 4.9322, "num_input_tokens_seen": 668467200, "step": 5100 }, { "epoch": 0.8374110901995374, "grad_norm": 0.576561450958252, "learning_rate": 7.334295271205029e-05, "loss": 4.9732, "num_input_tokens_seen": 668860416, "step": 5103 }, { "epoch": 0.8379033953672033, "grad_norm": 0.6817082166671753, "learning_rate": 7.332140343798091e-05, "loss": 4.9491, "num_input_tokens_seen": 669253632, "step": 5106 }, { "epoch": 0.838395700534869, "grad_norm": 0.716584324836731, "learning_rate": 7.329987314726919e-05, "loss": 4.9788, "num_input_tokens_seen": 669646848, "step": 5109 }, { "epoch": 0.8388880057025349, "grad_norm": 0.5894129872322083, "learning_rate": 7.327836181205987e-05, "loss": 4.9469, "num_input_tokens_seen": 670040064, "step": 5112 }, { "epoch": 0.8393803108702007, "grad_norm": 0.5741015672683716, "learning_rate": 7.325686940455489e-05, "loss": 5.0015, "num_input_tokens_seen": 670433280, "step": 5115 }, { "epoch": 0.8398726160378664, "grad_norm": 0.601706326007843, "learning_rate": 7.323539589701322e-05, "loss": 4.9758, "num_input_tokens_seen": 670826496, "step": 5118 }, { "epoch": 0.8403649212055323, "grad_norm": 0.6591434478759766, "learning_rate": 7.321394126175073e-05, "loss": 4.915, "num_input_tokens_seen": 671219712, "step": 5121 }, { "epoch": 0.840857226373198, "grad_norm": 0.6301112174987793, "learning_rate": 7.319250547113997e-05, "loss": 4.9975, "num_input_tokens_seen": 671612928, "step": 5124 }, { "epoch": 0.8413495315408639, "grad_norm": 0.6425953507423401, "learning_rate": 7.317108849761018e-05, "loss": 4.9452, "num_input_tokens_seen": 672006144, "step": 5127 }, { "epoch": 0.8418418367085297, "grad_norm": 0.6861438155174255, "learning_rate": 7.314969031364696e-05, "loss": 4.9237, "num_input_tokens_seen": 672399360, "step": 5130 }, { "epoch": 0.8423341418761955, "grad_norm": 0.6459963917732239, "learning_rate": 7.312831089179222e-05, "loss": 4.9467, "num_input_tokens_seen": 672792576, "step": 5133 }, { "epoch": 0.8428264470438613, "grad_norm": 0.5769563913345337, "learning_rate": 7.3106950204644e-05, "loss": 4.9517, "num_input_tokens_seen": 673185792, "step": 5136 }, { "epoch": 0.8433187522115271, "grad_norm": 0.6215202212333679, "learning_rate": 7.30856082248564e-05, "loss": 4.9513, "num_input_tokens_seen": 673579008, "step": 5139 }, { "epoch": 0.8438110573791929, "grad_norm": 0.5655343532562256, "learning_rate": 7.306428492513929e-05, "loss": 4.9578, "num_input_tokens_seen": 673972224, "step": 5142 }, { "epoch": 0.8443033625468588, "grad_norm": 0.6174660325050354, "learning_rate": 7.304298027825828e-05, "loss": 4.9143, "num_input_tokens_seen": 674365440, "step": 5145 }, { "epoch": 0.8447956677145245, "grad_norm": 0.5804635286331177, "learning_rate": 7.302169425703454e-05, "loss": 4.9371, "num_input_tokens_seen": 674758656, "step": 5148 }, { "epoch": 0.8452879728821904, "grad_norm": 0.6029345989227295, "learning_rate": 7.300042683434464e-05, "loss": 4.9934, "num_input_tokens_seen": 675151872, "step": 5151 }, { "epoch": 0.8457802780498561, "grad_norm": 0.5814540386199951, "learning_rate": 7.297917798312041e-05, "loss": 4.9335, "num_input_tokens_seen": 675545088, "step": 5154 }, { "epoch": 0.846272583217522, "grad_norm": 0.6849837303161621, "learning_rate": 7.29579476763488e-05, "loss": 4.9467, "num_input_tokens_seen": 675938304, "step": 5157 }, { "epoch": 0.8467648883851878, "grad_norm": 0.5766465663909912, "learning_rate": 7.293673588707179e-05, "loss": 4.9537, "num_input_tokens_seen": 676331520, "step": 5160 }, { "epoch": 0.8472571935528536, "grad_norm": 0.6713130474090576, "learning_rate": 7.291554258838608e-05, "loss": 4.9514, "num_input_tokens_seen": 676724736, "step": 5163 }, { "epoch": 0.8477494987205194, "grad_norm": 0.5405411720275879, "learning_rate": 7.289436775344317e-05, "loss": 4.97, "num_input_tokens_seen": 677117952, "step": 5166 }, { "epoch": 0.8482418038881852, "grad_norm": 0.6186428666114807, "learning_rate": 7.287321135544904e-05, "loss": 4.9795, "num_input_tokens_seen": 677511168, "step": 5169 }, { "epoch": 0.848734109055851, "grad_norm": 0.6150509715080261, "learning_rate": 7.285207336766414e-05, "loss": 5.0054, "num_input_tokens_seen": 677904384, "step": 5172 }, { "epoch": 0.8492264142235169, "grad_norm": 0.5928124785423279, "learning_rate": 7.283095376340306e-05, "loss": 4.9536, "num_input_tokens_seen": 678297600, "step": 5175 }, { "epoch": 0.8497187193911826, "grad_norm": 0.6393954753875732, "learning_rate": 7.280985251603467e-05, "loss": 4.976, "num_input_tokens_seen": 678690816, "step": 5178 }, { "epoch": 0.8502110245588484, "grad_norm": 0.583487331867218, "learning_rate": 7.278876959898168e-05, "loss": 4.98, "num_input_tokens_seen": 679084032, "step": 5181 }, { "epoch": 0.8507033297265142, "grad_norm": 0.6160304546356201, "learning_rate": 7.276770498572075e-05, "loss": 4.9436, "num_input_tokens_seen": 679477248, "step": 5184 }, { "epoch": 0.85119563489418, "grad_norm": 0.6791704893112183, "learning_rate": 7.274665864978213e-05, "loss": 4.9429, "num_input_tokens_seen": 679870464, "step": 5187 }, { "epoch": 0.8516879400618458, "grad_norm": 0.5873450636863708, "learning_rate": 7.272563056474974e-05, "loss": 4.9702, "num_input_tokens_seen": 680263680, "step": 5190 }, { "epoch": 0.8521802452295116, "grad_norm": 0.5966980457305908, "learning_rate": 7.270462070426084e-05, "loss": 4.9582, "num_input_tokens_seen": 680656896, "step": 5193 }, { "epoch": 0.8526725503971775, "grad_norm": 0.5830942392349243, "learning_rate": 7.268362904200598e-05, "loss": 4.9787, "num_input_tokens_seen": 681050112, "step": 5196 }, { "epoch": 0.8531648555648432, "grad_norm": 0.5953894853591919, "learning_rate": 7.266265555172888e-05, "loss": 4.9375, "num_input_tokens_seen": 681443328, "step": 5199 }, { "epoch": 0.8533289572873985, "eval_accuracy": 0.2291695163654128, "eval_loss": 5.2417755126953125, "eval_runtime": 124.158, "eval_samples_per_second": 2.416, "eval_steps_per_second": 1.208, "num_input_tokens_seen": 681574400, "step": 5200 }, { "epoch": 0.8536571607325091, "grad_norm": 0.6928351521492004, "learning_rate": 7.264170020722628e-05, "loss": 4.9173, "num_input_tokens_seen": 681836544, "step": 5202 }, { "epoch": 0.8541494659001748, "grad_norm": 0.5389966368675232, "learning_rate": 7.262076298234773e-05, "loss": 4.9654, "num_input_tokens_seen": 682229760, "step": 5205 }, { "epoch": 0.8546417710678407, "grad_norm": 0.6330162882804871, "learning_rate": 7.259984385099556e-05, "loss": 4.9327, "num_input_tokens_seen": 682622976, "step": 5208 }, { "epoch": 0.8551340762355065, "grad_norm": 0.5867341160774231, "learning_rate": 7.257894278712468e-05, "loss": 4.934, "num_input_tokens_seen": 683016192, "step": 5211 }, { "epoch": 0.8556263814031723, "grad_norm": 0.5803040862083435, "learning_rate": 7.255805976474242e-05, "loss": 4.942, "num_input_tokens_seen": 683409408, "step": 5214 }, { "epoch": 0.8561186865708381, "grad_norm": 0.6194251179695129, "learning_rate": 7.253719475790852e-05, "loss": 4.9473, "num_input_tokens_seen": 683802624, "step": 5217 }, { "epoch": 0.8566109917385039, "grad_norm": 0.638539731502533, "learning_rate": 7.25163477407348e-05, "loss": 4.9615, "num_input_tokens_seen": 684195840, "step": 5220 }, { "epoch": 0.8571032969061697, "grad_norm": 0.6786993741989136, "learning_rate": 7.24955186873852e-05, "loss": 4.9214, "num_input_tokens_seen": 684589056, "step": 5223 }, { "epoch": 0.8575956020738356, "grad_norm": 0.5903679728507996, "learning_rate": 7.247470757207554e-05, "loss": 4.9546, "num_input_tokens_seen": 684982272, "step": 5226 }, { "epoch": 0.8580879072415013, "grad_norm": 0.6937805414199829, "learning_rate": 7.245391436907346e-05, "loss": 4.9727, "num_input_tokens_seen": 685375488, "step": 5229 }, { "epoch": 0.8585802124091672, "grad_norm": 0.6725658774375916, "learning_rate": 7.243313905269826e-05, "loss": 4.9675, "num_input_tokens_seen": 685768704, "step": 5232 }, { "epoch": 0.8590725175768329, "grad_norm": 0.6803759336471558, "learning_rate": 7.241238159732069e-05, "loss": 4.9197, "num_input_tokens_seen": 686161920, "step": 5235 }, { "epoch": 0.8595648227444987, "grad_norm": 0.7206405401229858, "learning_rate": 7.239164197736292e-05, "loss": 4.9093, "num_input_tokens_seen": 686555136, "step": 5238 }, { "epoch": 0.8600571279121646, "grad_norm": 0.6930558681488037, "learning_rate": 7.237092016729838e-05, "loss": 4.9747, "num_input_tokens_seen": 686948352, "step": 5241 }, { "epoch": 0.8605494330798303, "grad_norm": 0.6751710176467896, "learning_rate": 7.235021614165161e-05, "loss": 4.9482, "num_input_tokens_seen": 687341568, "step": 5244 }, { "epoch": 0.8610417382474962, "grad_norm": 0.5579334497451782, "learning_rate": 7.232952987499815e-05, "loss": 4.9393, "num_input_tokens_seen": 687734784, "step": 5247 }, { "epoch": 0.8615340434151619, "grad_norm": 0.5883084535598755, "learning_rate": 7.230886134196436e-05, "loss": 4.8905, "num_input_tokens_seen": 688128000, "step": 5250 }, { "epoch": 0.8620263485828278, "grad_norm": 0.6635749936103821, "learning_rate": 7.228821051722736e-05, "loss": 4.9485, "num_input_tokens_seen": 688521216, "step": 5253 }, { "epoch": 0.8625186537504936, "grad_norm": 0.6582626104354858, "learning_rate": 7.226757737551486e-05, "loss": 4.9598, "num_input_tokens_seen": 688914432, "step": 5256 }, { "epoch": 0.8630109589181594, "grad_norm": 0.7006992101669312, "learning_rate": 7.224696189160501e-05, "loss": 4.9825, "num_input_tokens_seen": 689307648, "step": 5259 }, { "epoch": 0.8635032640858252, "grad_norm": 0.654933512210846, "learning_rate": 7.222636404032635e-05, "loss": 4.9345, "num_input_tokens_seen": 689700864, "step": 5262 }, { "epoch": 0.863995569253491, "grad_norm": 0.6875039935112, "learning_rate": 7.220578379655756e-05, "loss": 5.0074, "num_input_tokens_seen": 690094080, "step": 5265 }, { "epoch": 0.8644878744211568, "grad_norm": 0.7432628870010376, "learning_rate": 7.218522113522744e-05, "loss": 4.9506, "num_input_tokens_seen": 690487296, "step": 5268 }, { "epoch": 0.8649801795888227, "grad_norm": 0.6537792086601257, "learning_rate": 7.216467603131472e-05, "loss": 4.9174, "num_input_tokens_seen": 690880512, "step": 5271 }, { "epoch": 0.8654724847564884, "grad_norm": 0.6503370404243469, "learning_rate": 7.214414845984798e-05, "loss": 4.9177, "num_input_tokens_seen": 691273728, "step": 5274 }, { "epoch": 0.8659647899241543, "grad_norm": 0.6077583432197571, "learning_rate": 7.212363839590548e-05, "loss": 4.9793, "num_input_tokens_seen": 691666944, "step": 5277 }, { "epoch": 0.86645709509182, "grad_norm": 0.5995897054672241, "learning_rate": 7.210314581461502e-05, "loss": 4.9067, "num_input_tokens_seen": 692060160, "step": 5280 }, { "epoch": 0.8669494002594859, "grad_norm": 0.5458451509475708, "learning_rate": 7.208267069115388e-05, "loss": 4.9198, "num_input_tokens_seen": 692453376, "step": 5283 }, { "epoch": 0.8674417054271517, "grad_norm": 0.6200904250144958, "learning_rate": 7.206221300074863e-05, "loss": 4.9502, "num_input_tokens_seen": 692846592, "step": 5286 }, { "epoch": 0.8679340105948175, "grad_norm": 0.5652337670326233, "learning_rate": 7.204177271867505e-05, "loss": 4.9348, "num_input_tokens_seen": 693239808, "step": 5289 }, { "epoch": 0.8684263157624833, "grad_norm": 0.5828121304512024, "learning_rate": 7.202134982025796e-05, "loss": 5.0021, "num_input_tokens_seen": 693633024, "step": 5292 }, { "epoch": 0.868918620930149, "grad_norm": 0.6214671730995178, "learning_rate": 7.200094428087114e-05, "loss": 4.9013, "num_input_tokens_seen": 694026240, "step": 5295 }, { "epoch": 0.8694109260978149, "grad_norm": 0.6146253347396851, "learning_rate": 7.198055607593714e-05, "loss": 4.8906, "num_input_tokens_seen": 694419456, "step": 5298 }, { "epoch": 0.8699032312654806, "grad_norm": 0.6827002167701721, "learning_rate": 7.196018518092727e-05, "loss": 4.9346, "num_input_tokens_seen": 694812672, "step": 5301 }, { "epoch": 0.8703955364331465, "grad_norm": 0.6271741986274719, "learning_rate": 7.193983157136133e-05, "loss": 4.9341, "num_input_tokens_seen": 695205888, "step": 5304 }, { "epoch": 0.8708878416008123, "grad_norm": 0.6707040667533875, "learning_rate": 7.191949522280763e-05, "loss": 4.9357, "num_input_tokens_seen": 695599104, "step": 5307 }, { "epoch": 0.8713801467684781, "grad_norm": 0.5915789604187012, "learning_rate": 7.189917611088272e-05, "loss": 4.9366, "num_input_tokens_seen": 695992320, "step": 5310 }, { "epoch": 0.8718724519361439, "grad_norm": 0.6474863290786743, "learning_rate": 7.187887421125144e-05, "loss": 4.9588, "num_input_tokens_seen": 696385536, "step": 5313 }, { "epoch": 0.8723647571038097, "grad_norm": 0.5836812853813171, "learning_rate": 7.185858949962659e-05, "loss": 4.9235, "num_input_tokens_seen": 696778752, "step": 5316 }, { "epoch": 0.8728570622714755, "grad_norm": 0.6480825543403625, "learning_rate": 7.183832195176905e-05, "loss": 4.9712, "num_input_tokens_seen": 697171968, "step": 5319 }, { "epoch": 0.8733493674391414, "grad_norm": 0.5924190878868103, "learning_rate": 7.181807154348743e-05, "loss": 4.9862, "num_input_tokens_seen": 697565184, "step": 5322 }, { "epoch": 0.8738416726068071, "grad_norm": 0.5856485962867737, "learning_rate": 7.179783825063807e-05, "loss": 4.907, "num_input_tokens_seen": 697958400, "step": 5325 }, { "epoch": 0.874333977774473, "grad_norm": 0.581123948097229, "learning_rate": 7.177762204912492e-05, "loss": 4.9002, "num_input_tokens_seen": 698351616, "step": 5328 }, { "epoch": 0.8748262829421387, "grad_norm": 0.5788645148277283, "learning_rate": 7.17574229148994e-05, "loss": 4.93, "num_input_tokens_seen": 698744832, "step": 5331 }, { "epoch": 0.8753185881098046, "grad_norm": 0.5507704615592957, "learning_rate": 7.173724082396026e-05, "loss": 4.9437, "num_input_tokens_seen": 699138048, "step": 5334 }, { "epoch": 0.8758108932774704, "grad_norm": 0.5941615104675293, "learning_rate": 7.171707575235344e-05, "loss": 4.9439, "num_input_tokens_seen": 699531264, "step": 5337 }, { "epoch": 0.8763031984451362, "grad_norm": 0.5601658821105957, "learning_rate": 7.169692767617206e-05, "loss": 4.9691, "num_input_tokens_seen": 699924480, "step": 5340 }, { "epoch": 0.876795503612802, "grad_norm": 0.6255508065223694, "learning_rate": 7.167679657155616e-05, "loss": 4.9546, "num_input_tokens_seen": 700317696, "step": 5343 }, { "epoch": 0.8772878087804677, "grad_norm": 0.5593776702880859, "learning_rate": 7.165668241469273e-05, "loss": 4.9272, "num_input_tokens_seen": 700710912, "step": 5346 }, { "epoch": 0.8777801139481336, "grad_norm": 0.5787507891654968, "learning_rate": 7.163658518181542e-05, "loss": 4.937, "num_input_tokens_seen": 701104128, "step": 5349 }, { "epoch": 0.8782724191157995, "grad_norm": 0.589417576789856, "learning_rate": 7.161650484920457e-05, "loss": 4.9483, "num_input_tokens_seen": 701497344, "step": 5352 }, { "epoch": 0.8787647242834652, "grad_norm": 0.5617730021476746, "learning_rate": 7.159644139318704e-05, "loss": 4.9215, "num_input_tokens_seen": 701890560, "step": 5355 }, { "epoch": 0.879257029451131, "grad_norm": 0.6318708658218384, "learning_rate": 7.157639479013606e-05, "loss": 4.9201, "num_input_tokens_seen": 702283776, "step": 5358 }, { "epoch": 0.8797493346187968, "grad_norm": 0.6035711765289307, "learning_rate": 7.155636501647111e-05, "loss": 4.9469, "num_input_tokens_seen": 702676992, "step": 5361 }, { "epoch": 0.8802416397864626, "grad_norm": 0.5588794946670532, "learning_rate": 7.153635204865795e-05, "loss": 4.9162, "num_input_tokens_seen": 703070208, "step": 5364 }, { "epoch": 0.8807339449541285, "grad_norm": 0.6277647614479065, "learning_rate": 7.151635586320828e-05, "loss": 4.917, "num_input_tokens_seen": 703463424, "step": 5367 }, { "epoch": 0.8812262501217942, "grad_norm": 0.5754868984222412, "learning_rate": 7.149637643667977e-05, "loss": 4.9523, "num_input_tokens_seen": 703856640, "step": 5370 }, { "epoch": 0.8817185552894601, "grad_norm": 0.6343851089477539, "learning_rate": 7.147641374567592e-05, "loss": 4.971, "num_input_tokens_seen": 704249856, "step": 5373 }, { "epoch": 0.8822108604571258, "grad_norm": 0.5441321134567261, "learning_rate": 7.145646776684588e-05, "loss": 4.969, "num_input_tokens_seen": 704643072, "step": 5376 }, { "epoch": 0.8827031656247917, "grad_norm": 0.6083698272705078, "learning_rate": 7.143653847688449e-05, "loss": 5.0037, "num_input_tokens_seen": 705036288, "step": 5379 }, { "epoch": 0.8831954707924575, "grad_norm": 0.5505862832069397, "learning_rate": 7.141662585253196e-05, "loss": 4.9558, "num_input_tokens_seen": 705429504, "step": 5382 }, { "epoch": 0.8836877759601233, "grad_norm": 0.5923398733139038, "learning_rate": 7.13967298705739e-05, "loss": 4.9021, "num_input_tokens_seen": 705822720, "step": 5385 }, { "epoch": 0.8841800811277891, "grad_norm": 0.5669833421707153, "learning_rate": 7.137685050784117e-05, "loss": 4.9494, "num_input_tokens_seen": 706215936, "step": 5388 }, { "epoch": 0.8846723862954549, "grad_norm": 0.7324468493461609, "learning_rate": 7.135698774120979e-05, "loss": 4.9386, "num_input_tokens_seen": 706609152, "step": 5391 }, { "epoch": 0.8851646914631207, "grad_norm": 0.6149271130561829, "learning_rate": 7.133714154760073e-05, "loss": 4.963, "num_input_tokens_seen": 707002368, "step": 5394 }, { "epoch": 0.8856569966307866, "grad_norm": 0.6040337681770325, "learning_rate": 7.131731190397995e-05, "loss": 4.9305, "num_input_tokens_seen": 707395584, "step": 5397 }, { "epoch": 0.8861493017984523, "grad_norm": 0.5804337859153748, "learning_rate": 7.129749878735813e-05, "loss": 4.8961, "num_input_tokens_seen": 707788800, "step": 5400 }, { "epoch": 0.8866416069661182, "grad_norm": 0.6081647276878357, "learning_rate": 7.127770217479066e-05, "loss": 4.9379, "num_input_tokens_seen": 708182016, "step": 5403 }, { "epoch": 0.8871339121337839, "grad_norm": 0.5394389033317566, "learning_rate": 7.125792204337751e-05, "loss": 4.9666, "num_input_tokens_seen": 708575232, "step": 5406 }, { "epoch": 0.8876262173014497, "grad_norm": 0.5974504947662354, "learning_rate": 7.123815837026311e-05, "loss": 4.9122, "num_input_tokens_seen": 708968448, "step": 5409 }, { "epoch": 0.8881185224691155, "grad_norm": 0.5746287107467651, "learning_rate": 7.121841113263623e-05, "loss": 4.9027, "num_input_tokens_seen": 709361664, "step": 5412 }, { "epoch": 0.8886108276367813, "grad_norm": 0.6007826328277588, "learning_rate": 7.119868030772991e-05, "loss": 4.9593, "num_input_tokens_seen": 709754880, "step": 5415 }, { "epoch": 0.8891031328044472, "grad_norm": 0.5774713158607483, "learning_rate": 7.117896587282125e-05, "loss": 4.9635, "num_input_tokens_seen": 710148096, "step": 5418 }, { "epoch": 0.8895954379721129, "grad_norm": 0.636423647403717, "learning_rate": 7.115926780523142e-05, "loss": 4.9186, "num_input_tokens_seen": 710541312, "step": 5421 }, { "epoch": 0.8900877431397788, "grad_norm": 0.6212565898895264, "learning_rate": 7.11395860823255e-05, "loss": 4.9337, "num_input_tokens_seen": 710934528, "step": 5424 }, { "epoch": 0.8905800483074445, "grad_norm": 0.6649476885795593, "learning_rate": 7.111992068151236e-05, "loss": 4.9409, "num_input_tokens_seen": 711327744, "step": 5427 }, { "epoch": 0.8910723534751104, "grad_norm": 0.6479114890098572, "learning_rate": 7.110027158024453e-05, "loss": 4.9311, "num_input_tokens_seen": 711720960, "step": 5430 }, { "epoch": 0.8915646586427762, "grad_norm": 0.6290377974510193, "learning_rate": 7.108063875601819e-05, "loss": 4.9444, "num_input_tokens_seen": 712114176, "step": 5433 }, { "epoch": 0.892056963810442, "grad_norm": 0.6628143787384033, "learning_rate": 7.106102218637291e-05, "loss": 4.9459, "num_input_tokens_seen": 712507392, "step": 5436 }, { "epoch": 0.8925492689781078, "grad_norm": 0.6291846632957458, "learning_rate": 7.104142184889171e-05, "loss": 4.8912, "num_input_tokens_seen": 712900608, "step": 5439 }, { "epoch": 0.8930415741457736, "grad_norm": 0.7024665474891663, "learning_rate": 7.10218377212008e-05, "loss": 4.9319, "num_input_tokens_seen": 713293824, "step": 5442 }, { "epoch": 0.8935338793134394, "grad_norm": 0.6190261840820312, "learning_rate": 7.100226978096957e-05, "loss": 4.9276, "num_input_tokens_seen": 713687040, "step": 5445 }, { "epoch": 0.8940261844811053, "grad_norm": 0.6652445197105408, "learning_rate": 7.098271800591048e-05, "loss": 4.9636, "num_input_tokens_seen": 714080256, "step": 5448 }, { "epoch": 0.894518489648771, "grad_norm": 0.5882824659347534, "learning_rate": 7.096318237377886e-05, "loss": 4.9664, "num_input_tokens_seen": 714473472, "step": 5451 }, { "epoch": 0.8950107948164369, "grad_norm": 0.5919190645217896, "learning_rate": 7.094366286237293e-05, "loss": 4.9181, "num_input_tokens_seen": 714866688, "step": 5454 }, { "epoch": 0.8955030999841026, "grad_norm": 0.6125955581665039, "learning_rate": 7.092415944953361e-05, "loss": 4.9809, "num_input_tokens_seen": 715259904, "step": 5457 }, { "epoch": 0.8959954051517685, "grad_norm": 0.5722345113754272, "learning_rate": 7.090467211314446e-05, "loss": 4.9237, "num_input_tokens_seen": 715653120, "step": 5460 }, { "epoch": 0.8964877103194343, "grad_norm": 0.5687103271484375, "learning_rate": 7.088520083113153e-05, "loss": 4.9173, "num_input_tokens_seen": 716046336, "step": 5463 }, { "epoch": 0.8969800154871, "grad_norm": 0.662970244884491, "learning_rate": 7.08657455814633e-05, "loss": 4.9096, "num_input_tokens_seen": 716439552, "step": 5466 }, { "epoch": 0.8974723206547659, "grad_norm": 0.601774275302887, "learning_rate": 7.08463063421505e-05, "loss": 4.9254, "num_input_tokens_seen": 716832768, "step": 5469 }, { "epoch": 0.8979646258224316, "grad_norm": 0.6241858005523682, "learning_rate": 7.082688309124617e-05, "loss": 4.9082, "num_input_tokens_seen": 717225984, "step": 5472 }, { "epoch": 0.8984569309900975, "grad_norm": 0.5964360237121582, "learning_rate": 7.080747580684533e-05, "loss": 4.9056, "num_input_tokens_seen": 717619200, "step": 5475 }, { "epoch": 0.8989492361577633, "grad_norm": 0.6412652730941772, "learning_rate": 7.078808446708505e-05, "loss": 4.8908, "num_input_tokens_seen": 718012416, "step": 5478 }, { "epoch": 0.8994415413254291, "grad_norm": 0.5989963412284851, "learning_rate": 7.076870905014429e-05, "loss": 4.9292, "num_input_tokens_seen": 718405632, "step": 5481 }, { "epoch": 0.8999338464930949, "grad_norm": 0.7762419581413269, "learning_rate": 7.074934953424378e-05, "loss": 4.9499, "num_input_tokens_seen": 718798848, "step": 5484 }, { "epoch": 0.9004261516607607, "grad_norm": 0.5652441382408142, "learning_rate": 7.073000589764593e-05, "loss": 4.898, "num_input_tokens_seen": 719192064, "step": 5487 }, { "epoch": 0.9009184568284265, "grad_norm": 0.6855098009109497, "learning_rate": 7.071067811865475e-05, "loss": 4.9768, "num_input_tokens_seen": 719585280, "step": 5490 }, { "epoch": 0.9014107619960924, "grad_norm": 0.7517238855361938, "learning_rate": 7.069136617561571e-05, "loss": 4.9106, "num_input_tokens_seen": 719978496, "step": 5493 }, { "epoch": 0.9019030671637581, "grad_norm": 0.5679114460945129, "learning_rate": 7.067207004691567e-05, "loss": 4.9333, "num_input_tokens_seen": 720371712, "step": 5496 }, { "epoch": 0.902395372331424, "grad_norm": 0.6112279295921326, "learning_rate": 7.065278971098276e-05, "loss": 4.9465, "num_input_tokens_seen": 720764928, "step": 5499 }, { "epoch": 0.9028876774990897, "grad_norm": 0.5691413879394531, "learning_rate": 7.063352514628629e-05, "loss": 4.8991, "num_input_tokens_seen": 721158144, "step": 5502 }, { "epoch": 0.9033799826667556, "grad_norm": 0.7350879907608032, "learning_rate": 7.06142763313366e-05, "loss": 4.9199, "num_input_tokens_seen": 721551360, "step": 5505 }, { "epoch": 0.9038722878344214, "grad_norm": 0.5895538926124573, "learning_rate": 7.059504324468505e-05, "loss": 4.9424, "num_input_tokens_seen": 721944576, "step": 5508 }, { "epoch": 0.9043645930020872, "grad_norm": 0.6411991119384766, "learning_rate": 7.057582586492387e-05, "loss": 4.9006, "num_input_tokens_seen": 722337792, "step": 5511 }, { "epoch": 0.904856898169753, "grad_norm": 0.5808833837509155, "learning_rate": 7.055662417068605e-05, "loss": 4.9046, "num_input_tokens_seen": 722731008, "step": 5514 }, { "epoch": 0.9053492033374188, "grad_norm": 0.5693103671073914, "learning_rate": 7.05374381406452e-05, "loss": 4.8972, "num_input_tokens_seen": 723124224, "step": 5517 }, { "epoch": 0.9058415085050846, "grad_norm": 0.5854239463806152, "learning_rate": 7.051826775351563e-05, "loss": 4.9348, "num_input_tokens_seen": 723517440, "step": 5520 }, { "epoch": 0.9063338136727503, "grad_norm": 0.7103624939918518, "learning_rate": 7.049911298805197e-05, "loss": 4.9114, "num_input_tokens_seen": 723910656, "step": 5523 }, { "epoch": 0.9068261188404162, "grad_norm": 0.591009259223938, "learning_rate": 7.047997382304934e-05, "loss": 5.0049, "num_input_tokens_seen": 724303872, "step": 5526 }, { "epoch": 0.907318424008082, "grad_norm": 0.6358263492584229, "learning_rate": 7.046085023734305e-05, "loss": 4.9576, "num_input_tokens_seen": 724697088, "step": 5529 }, { "epoch": 0.9078107291757478, "grad_norm": 0.671718180179596, "learning_rate": 7.044174220980871e-05, "loss": 4.8979, "num_input_tokens_seen": 725090304, "step": 5532 }, { "epoch": 0.9083030343434136, "grad_norm": 0.5911664962768555, "learning_rate": 7.042264971936185e-05, "loss": 4.9507, "num_input_tokens_seen": 725483520, "step": 5535 }, { "epoch": 0.9087953395110794, "grad_norm": 0.6500189304351807, "learning_rate": 7.040357274495808e-05, "loss": 4.8758, "num_input_tokens_seen": 725876736, "step": 5538 }, { "epoch": 0.9092876446787452, "grad_norm": 0.5432878732681274, "learning_rate": 7.038451126559289e-05, "loss": 4.9449, "num_input_tokens_seen": 726269952, "step": 5541 }, { "epoch": 0.9097799498464111, "grad_norm": 0.5804717540740967, "learning_rate": 7.036546526030153e-05, "loss": 4.9024, "num_input_tokens_seen": 726663168, "step": 5544 }, { "epoch": 0.9102722550140768, "grad_norm": 0.5860951542854309, "learning_rate": 7.034643470815894e-05, "loss": 4.9393, "num_input_tokens_seen": 727056384, "step": 5547 }, { "epoch": 0.9107645601817427, "grad_norm": 0.6187010407447815, "learning_rate": 7.032741958827968e-05, "loss": 4.8962, "num_input_tokens_seen": 727449600, "step": 5550 }, { "epoch": 0.9112568653494084, "grad_norm": 0.5530468225479126, "learning_rate": 7.030841987981778e-05, "loss": 4.9029, "num_input_tokens_seen": 727842816, "step": 5553 }, { "epoch": 0.9117491705170743, "grad_norm": 0.6535037755966187, "learning_rate": 7.02894355619667e-05, "loss": 4.9293, "num_input_tokens_seen": 728236032, "step": 5556 }, { "epoch": 0.9122414756847401, "grad_norm": 0.5901381969451904, "learning_rate": 7.027046661395916e-05, "loss": 4.8875, "num_input_tokens_seen": 728629248, "step": 5559 }, { "epoch": 0.9127337808524059, "grad_norm": 0.6690804958343506, "learning_rate": 7.025151301506713e-05, "loss": 4.9228, "num_input_tokens_seen": 729022464, "step": 5562 }, { "epoch": 0.9132260860200717, "grad_norm": 0.6213147044181824, "learning_rate": 7.02325747446017e-05, "loss": 4.9315, "num_input_tokens_seen": 729415680, "step": 5565 }, { "epoch": 0.9137183911877375, "grad_norm": 0.6875079870223999, "learning_rate": 7.021365178191292e-05, "loss": 4.9614, "num_input_tokens_seen": 729808896, "step": 5568 }, { "epoch": 0.9142106963554033, "grad_norm": 0.5743534564971924, "learning_rate": 7.019474410638983e-05, "loss": 4.9304, "num_input_tokens_seen": 730202112, "step": 5571 }, { "epoch": 0.9147030015230692, "grad_norm": 0.6975659728050232, "learning_rate": 7.017585169746028e-05, "loss": 4.9321, "num_input_tokens_seen": 730595328, "step": 5574 }, { "epoch": 0.9151953066907349, "grad_norm": 0.6249175071716309, "learning_rate": 7.015697453459085e-05, "loss": 4.8852, "num_input_tokens_seen": 730988544, "step": 5577 }, { "epoch": 0.9156876118584008, "grad_norm": 0.6176156401634216, "learning_rate": 7.013811259728677e-05, "loss": 4.9137, "num_input_tokens_seen": 731381760, "step": 5580 }, { "epoch": 0.9161799170260665, "grad_norm": 0.5664033889770508, "learning_rate": 7.011926586509181e-05, "loss": 4.9301, "num_input_tokens_seen": 731774976, "step": 5583 }, { "epoch": 0.9166722221937323, "grad_norm": 0.5968320369720459, "learning_rate": 7.010043431758822e-05, "loss": 4.9191, "num_input_tokens_seen": 732168192, "step": 5586 }, { "epoch": 0.9171645273613982, "grad_norm": 0.5811692476272583, "learning_rate": 7.008161793439657e-05, "loss": 4.9041, "num_input_tokens_seen": 732561408, "step": 5589 }, { "epoch": 0.9176568325290639, "grad_norm": 0.631001889705658, "learning_rate": 7.006281669517578e-05, "loss": 4.9357, "num_input_tokens_seen": 732954624, "step": 5592 }, { "epoch": 0.9181491376967298, "grad_norm": 0.5815140604972839, "learning_rate": 7.004403057962285e-05, "loss": 4.9433, "num_input_tokens_seen": 733347840, "step": 5595 }, { "epoch": 0.9186414428643955, "grad_norm": 0.5923864841461182, "learning_rate": 7.002525956747294e-05, "loss": 4.9322, "num_input_tokens_seen": 733741056, "step": 5598 }, { "epoch": 0.9189696463095061, "eval_accuracy": 0.23118384627910762, "eval_loss": 5.216609477996826, "eval_runtime": 125.2559, "eval_samples_per_second": 2.395, "eval_steps_per_second": 1.198, "num_input_tokens_seen": 734003200, "step": 5600 }, { "epoch": 0.9191337480320614, "grad_norm": 0.6140096187591553, "learning_rate": 7.000650363849917e-05, "loss": 4.9178, "num_input_tokens_seen": 734134272, "step": 5601 }, { "epoch": 0.9196260531997272, "grad_norm": 0.6851264238357544, "learning_rate": 6.998776277251258e-05, "loss": 4.8845, "num_input_tokens_seen": 734527488, "step": 5604 }, { "epoch": 0.920118358367393, "grad_norm": 0.6549767851829529, "learning_rate": 6.996903694936202e-05, "loss": 4.8942, "num_input_tokens_seen": 734920704, "step": 5607 }, { "epoch": 0.9206106635350588, "grad_norm": 0.6244533061981201, "learning_rate": 6.995032614893404e-05, "loss": 4.9043, "num_input_tokens_seen": 735313920, "step": 5610 }, { "epoch": 0.9211029687027246, "grad_norm": 0.7099424600601196, "learning_rate": 6.993163035115284e-05, "loss": 4.8973, "num_input_tokens_seen": 735707136, "step": 5613 }, { "epoch": 0.9215952738703904, "grad_norm": 0.6875070929527283, "learning_rate": 6.991294953598019e-05, "loss": 4.9278, "num_input_tokens_seen": 736100352, "step": 5616 }, { "epoch": 0.9220875790380562, "grad_norm": 0.5664244890213013, "learning_rate": 6.989428368341524e-05, "loss": 4.9095, "num_input_tokens_seen": 736493568, "step": 5619 }, { "epoch": 0.922579884205722, "grad_norm": 0.7427223920822144, "learning_rate": 6.987563277349452e-05, "loss": 4.9275, "num_input_tokens_seen": 736886784, "step": 5622 }, { "epoch": 0.9230721893733879, "grad_norm": 0.668783962726593, "learning_rate": 6.985699678629191e-05, "loss": 4.9108, "num_input_tokens_seen": 737280000, "step": 5625 }, { "epoch": 0.9235644945410536, "grad_norm": 0.6262427568435669, "learning_rate": 6.983837570191838e-05, "loss": 4.9542, "num_input_tokens_seen": 737673216, "step": 5628 }, { "epoch": 0.9240567997087195, "grad_norm": 0.6482858061790466, "learning_rate": 6.981976950052198e-05, "loss": 4.9115, "num_input_tokens_seen": 738066432, "step": 5631 }, { "epoch": 0.9245491048763852, "grad_norm": 0.7812525629997253, "learning_rate": 6.980117816228785e-05, "loss": 4.9108, "num_input_tokens_seen": 738459648, "step": 5634 }, { "epoch": 0.925041410044051, "grad_norm": 0.5804930329322815, "learning_rate": 6.978260166743796e-05, "loss": 4.9111, "num_input_tokens_seen": 738852864, "step": 5637 }, { "epoch": 0.9255337152117169, "grad_norm": 0.563565194606781, "learning_rate": 6.976403999623119e-05, "loss": 4.9069, "num_input_tokens_seen": 739246080, "step": 5640 }, { "epoch": 0.9260260203793826, "grad_norm": 0.555899441242218, "learning_rate": 6.974549312896306e-05, "loss": 4.8704, "num_input_tokens_seen": 739639296, "step": 5643 }, { "epoch": 0.9265183255470485, "grad_norm": 0.5783566832542419, "learning_rate": 6.972696104596579e-05, "loss": 4.9685, "num_input_tokens_seen": 740032512, "step": 5646 }, { "epoch": 0.9270106307147142, "grad_norm": 0.5310668349266052, "learning_rate": 6.97084437276082e-05, "loss": 4.9193, "num_input_tokens_seen": 740425728, "step": 5649 }, { "epoch": 0.9275029358823801, "grad_norm": 0.5854000449180603, "learning_rate": 6.96899411542955e-05, "loss": 4.9021, "num_input_tokens_seen": 740818944, "step": 5652 }, { "epoch": 0.9279952410500459, "grad_norm": 0.6978147029876709, "learning_rate": 6.967145330646938e-05, "loss": 4.8868, "num_input_tokens_seen": 741212160, "step": 5655 }, { "epoch": 0.9284875462177117, "grad_norm": 0.5259647965431213, "learning_rate": 6.965298016460775e-05, "loss": 4.9378, "num_input_tokens_seen": 741605376, "step": 5658 }, { "epoch": 0.9289798513853775, "grad_norm": 0.8059858679771423, "learning_rate": 6.963452170922476e-05, "loss": 4.9439, "num_input_tokens_seen": 741998592, "step": 5661 }, { "epoch": 0.9294721565530433, "grad_norm": 0.6470625996589661, "learning_rate": 6.961607792087073e-05, "loss": 4.9094, "num_input_tokens_seen": 742391808, "step": 5664 }, { "epoch": 0.9299644617207091, "grad_norm": 0.8249083757400513, "learning_rate": 6.959764878013196e-05, "loss": 4.9275, "num_input_tokens_seen": 742785024, "step": 5667 }, { "epoch": 0.930456766888375, "grad_norm": 0.5849491953849792, "learning_rate": 6.957923426763075e-05, "loss": 4.9589, "num_input_tokens_seen": 743178240, "step": 5670 }, { "epoch": 0.9309490720560407, "grad_norm": 0.7007787227630615, "learning_rate": 6.956083436402524e-05, "loss": 4.9004, "num_input_tokens_seen": 743571456, "step": 5673 }, { "epoch": 0.9314413772237066, "grad_norm": 0.6450519561767578, "learning_rate": 6.954244905000938e-05, "loss": 4.949, "num_input_tokens_seen": 743964672, "step": 5676 }, { "epoch": 0.9319336823913723, "grad_norm": 0.689784049987793, "learning_rate": 6.95240783063128e-05, "loss": 4.9121, "num_input_tokens_seen": 744357888, "step": 5679 }, { "epoch": 0.9324259875590382, "grad_norm": 0.5879255533218384, "learning_rate": 6.950572211370075e-05, "loss": 4.9665, "num_input_tokens_seen": 744751104, "step": 5682 }, { "epoch": 0.932918292726704, "grad_norm": 0.6298468708992004, "learning_rate": 6.948738045297404e-05, "loss": 4.9033, "num_input_tokens_seen": 745144320, "step": 5685 }, { "epoch": 0.9334105978943698, "grad_norm": 0.541263997554779, "learning_rate": 6.946905330496889e-05, "loss": 4.9259, "num_input_tokens_seen": 745537536, "step": 5688 }, { "epoch": 0.9339029030620356, "grad_norm": 0.6083950996398926, "learning_rate": 6.945074065055687e-05, "loss": 4.8963, "num_input_tokens_seen": 745930752, "step": 5691 }, { "epoch": 0.9343952082297013, "grad_norm": 0.60537189245224, "learning_rate": 6.943244247064488e-05, "loss": 4.9197, "num_input_tokens_seen": 746323968, "step": 5694 }, { "epoch": 0.9348875133973672, "grad_norm": 0.6046431064605713, "learning_rate": 6.941415874617496e-05, "loss": 4.9622, "num_input_tokens_seen": 746717184, "step": 5697 }, { "epoch": 0.935379818565033, "grad_norm": 0.5610204339027405, "learning_rate": 6.939588945812431e-05, "loss": 4.9442, "num_input_tokens_seen": 747110400, "step": 5700 }, { "epoch": 0.9358721237326988, "grad_norm": 0.5891165733337402, "learning_rate": 6.937763458750514e-05, "loss": 4.9079, "num_input_tokens_seen": 747503616, "step": 5703 }, { "epoch": 0.9363644289003646, "grad_norm": 0.5680641531944275, "learning_rate": 6.93593941153646e-05, "loss": 4.9396, "num_input_tokens_seen": 747896832, "step": 5706 }, { "epoch": 0.9368567340680304, "grad_norm": 0.6169180870056152, "learning_rate": 6.934116802278468e-05, "loss": 4.9221, "num_input_tokens_seen": 748290048, "step": 5709 }, { "epoch": 0.9373490392356962, "grad_norm": 0.5749015212059021, "learning_rate": 6.932295629088219e-05, "loss": 4.9258, "num_input_tokens_seen": 748683264, "step": 5712 }, { "epoch": 0.9378413444033621, "grad_norm": 0.5365791916847229, "learning_rate": 6.930475890080862e-05, "loss": 4.8815, "num_input_tokens_seen": 749076480, "step": 5715 }, { "epoch": 0.9383336495710278, "grad_norm": 0.5742934942245483, "learning_rate": 6.928657583375008e-05, "loss": 4.8828, "num_input_tokens_seen": 749469696, "step": 5718 }, { "epoch": 0.9388259547386937, "grad_norm": 0.6056268215179443, "learning_rate": 6.92684070709272e-05, "loss": 4.9585, "num_input_tokens_seen": 749862912, "step": 5721 }, { "epoch": 0.9393182599063594, "grad_norm": 0.5517481565475464, "learning_rate": 6.925025259359513e-05, "loss": 4.8756, "num_input_tokens_seen": 750256128, "step": 5724 }, { "epoch": 0.9398105650740253, "grad_norm": 0.5459040999412537, "learning_rate": 6.923211238304328e-05, "loss": 4.9259, "num_input_tokens_seen": 750649344, "step": 5727 }, { "epoch": 0.940302870241691, "grad_norm": 0.640325129032135, "learning_rate": 6.92139864205954e-05, "loss": 4.898, "num_input_tokens_seen": 751042560, "step": 5730 }, { "epoch": 0.9407951754093569, "grad_norm": 0.7371880412101746, "learning_rate": 6.919587468760951e-05, "loss": 4.885, "num_input_tokens_seen": 751435776, "step": 5733 }, { "epoch": 0.9412874805770227, "grad_norm": 0.6491408348083496, "learning_rate": 6.917777716547768e-05, "loss": 4.9081, "num_input_tokens_seen": 751828992, "step": 5736 }, { "epoch": 0.9417797857446885, "grad_norm": 0.5965387225151062, "learning_rate": 6.915969383562604e-05, "loss": 4.9093, "num_input_tokens_seen": 752222208, "step": 5739 }, { "epoch": 0.9422720909123543, "grad_norm": 0.6211044788360596, "learning_rate": 6.914162467951475e-05, "loss": 4.9024, "num_input_tokens_seen": 752615424, "step": 5742 }, { "epoch": 0.94276439608002, "grad_norm": 0.6093422174453735, "learning_rate": 6.912356967863777e-05, "loss": 4.9659, "num_input_tokens_seen": 753008640, "step": 5745 }, { "epoch": 0.9432567012476859, "grad_norm": 0.7026922106742859, "learning_rate": 6.910552881452296e-05, "loss": 4.8806, "num_input_tokens_seen": 753401856, "step": 5748 }, { "epoch": 0.9437490064153518, "grad_norm": 0.5915043950080872, "learning_rate": 6.908750206873184e-05, "loss": 4.8699, "num_input_tokens_seen": 753795072, "step": 5751 }, { "epoch": 0.9442413115830175, "grad_norm": 0.7234703898429871, "learning_rate": 6.90694894228596e-05, "loss": 4.8763, "num_input_tokens_seen": 754188288, "step": 5754 }, { "epoch": 0.9447336167506833, "grad_norm": 0.5900036096572876, "learning_rate": 6.905149085853502e-05, "loss": 4.895, "num_input_tokens_seen": 754581504, "step": 5757 }, { "epoch": 0.9452259219183491, "grad_norm": 0.7614732384681702, "learning_rate": 6.903350635742038e-05, "loss": 4.9233, "num_input_tokens_seen": 754974720, "step": 5760 }, { "epoch": 0.9457182270860149, "grad_norm": 0.6385030746459961, "learning_rate": 6.901553590121132e-05, "loss": 4.8984, "num_input_tokens_seen": 755367936, "step": 5763 }, { "epoch": 0.9462105322536808, "grad_norm": 0.6103296279907227, "learning_rate": 6.899757947163688e-05, "loss": 4.9036, "num_input_tokens_seen": 755761152, "step": 5766 }, { "epoch": 0.9467028374213465, "grad_norm": 0.5251742005348206, "learning_rate": 6.897963705045933e-05, "loss": 4.9414, "num_input_tokens_seen": 756154368, "step": 5769 }, { "epoch": 0.9471951425890124, "grad_norm": 0.6542143821716309, "learning_rate": 6.896170861947415e-05, "loss": 4.9107, "num_input_tokens_seen": 756547584, "step": 5772 }, { "epoch": 0.9476874477566781, "grad_norm": 0.6727720499038696, "learning_rate": 6.894379416050985e-05, "loss": 4.8905, "num_input_tokens_seen": 756940800, "step": 5775 }, { "epoch": 0.948179752924344, "grad_norm": 0.5717095732688904, "learning_rate": 6.892589365542804e-05, "loss": 4.9338, "num_input_tokens_seen": 757334016, "step": 5778 }, { "epoch": 0.9486720580920098, "grad_norm": 0.6129010915756226, "learning_rate": 6.890800708612326e-05, "loss": 4.8975, "num_input_tokens_seen": 757727232, "step": 5781 }, { "epoch": 0.9491643632596756, "grad_norm": 0.6094048619270325, "learning_rate": 6.889013443452292e-05, "loss": 4.9282, "num_input_tokens_seen": 758120448, "step": 5784 }, { "epoch": 0.9496566684273414, "grad_norm": 0.5988612771034241, "learning_rate": 6.887227568258717e-05, "loss": 4.9152, "num_input_tokens_seen": 758513664, "step": 5787 }, { "epoch": 0.9501489735950072, "grad_norm": 0.6123887300491333, "learning_rate": 6.885443081230899e-05, "loss": 4.9025, "num_input_tokens_seen": 758906880, "step": 5790 }, { "epoch": 0.950641278762673, "grad_norm": 0.6150357127189636, "learning_rate": 6.883659980571393e-05, "loss": 4.944, "num_input_tokens_seen": 759300096, "step": 5793 }, { "epoch": 0.9511335839303389, "grad_norm": 0.558800995349884, "learning_rate": 6.881878264486008e-05, "loss": 4.9404, "num_input_tokens_seen": 759693312, "step": 5796 }, { "epoch": 0.9516258890980046, "grad_norm": 0.5992735624313354, "learning_rate": 6.880097931183812e-05, "loss": 4.9464, "num_input_tokens_seen": 760086528, "step": 5799 }, { "epoch": 0.9521181942656705, "grad_norm": 0.597300112247467, "learning_rate": 6.878318978877102e-05, "loss": 4.9496, "num_input_tokens_seen": 760479744, "step": 5802 }, { "epoch": 0.9526104994333362, "grad_norm": 0.5881853103637695, "learning_rate": 6.876541405781422e-05, "loss": 4.8984, "num_input_tokens_seen": 760872960, "step": 5805 }, { "epoch": 0.953102804601002, "grad_norm": 0.5540621280670166, "learning_rate": 6.874765210115533e-05, "loss": 4.8732, "num_input_tokens_seen": 761266176, "step": 5808 }, { "epoch": 0.9535951097686679, "grad_norm": 0.5574669241905212, "learning_rate": 6.872990390101416e-05, "loss": 4.9345, "num_input_tokens_seen": 761659392, "step": 5811 }, { "epoch": 0.9540874149363336, "grad_norm": 0.6479876637458801, "learning_rate": 6.871216943964268e-05, "loss": 4.9443, "num_input_tokens_seen": 762052608, "step": 5814 }, { "epoch": 0.9545797201039995, "grad_norm": 0.586850106716156, "learning_rate": 6.869444869932488e-05, "loss": 4.9271, "num_input_tokens_seen": 762445824, "step": 5817 }, { "epoch": 0.9550720252716652, "grad_norm": 0.736300528049469, "learning_rate": 6.867674166237667e-05, "loss": 4.973, "num_input_tokens_seen": 762839040, "step": 5820 }, { "epoch": 0.9555643304393311, "grad_norm": 0.5725162029266357, "learning_rate": 6.865904831114593e-05, "loss": 4.8662, "num_input_tokens_seen": 763232256, "step": 5823 }, { "epoch": 0.9560566356069969, "grad_norm": 0.6230468153953552, "learning_rate": 6.86413686280123e-05, "loss": 4.8975, "num_input_tokens_seen": 763625472, "step": 5826 }, { "epoch": 0.9565489407746627, "grad_norm": 0.5963659882545471, "learning_rate": 6.862370259538721e-05, "loss": 4.9168, "num_input_tokens_seen": 764018688, "step": 5829 }, { "epoch": 0.9570412459423285, "grad_norm": 0.6411994695663452, "learning_rate": 6.86060501957137e-05, "loss": 4.9349, "num_input_tokens_seen": 764411904, "step": 5832 }, { "epoch": 0.9575335511099943, "grad_norm": 0.6455360651016235, "learning_rate": 6.858841141146649e-05, "loss": 4.9462, "num_input_tokens_seen": 764805120, "step": 5835 }, { "epoch": 0.9580258562776601, "grad_norm": 0.7855854034423828, "learning_rate": 6.857078622515172e-05, "loss": 4.93, "num_input_tokens_seen": 765198336, "step": 5838 }, { "epoch": 0.9585181614453259, "grad_norm": 0.6942725777626038, "learning_rate": 6.855317461930706e-05, "loss": 4.9313, "num_input_tokens_seen": 765591552, "step": 5841 }, { "epoch": 0.9590104666129917, "grad_norm": 0.645770788192749, "learning_rate": 6.853557657650157e-05, "loss": 4.9185, "num_input_tokens_seen": 765984768, "step": 5844 }, { "epoch": 0.9595027717806576, "grad_norm": 0.6724271178245544, "learning_rate": 6.851799207933553e-05, "loss": 4.9469, "num_input_tokens_seen": 766377984, "step": 5847 }, { "epoch": 0.9599950769483233, "grad_norm": 0.6254433393478394, "learning_rate": 6.850042111044057e-05, "loss": 4.9137, "num_input_tokens_seen": 766771200, "step": 5850 }, { "epoch": 0.9604873821159892, "grad_norm": 0.5859279036521912, "learning_rate": 6.848286365247937e-05, "loss": 4.8999, "num_input_tokens_seen": 767164416, "step": 5853 }, { "epoch": 0.9609796872836549, "grad_norm": 0.6462947130203247, "learning_rate": 6.846531968814576e-05, "loss": 4.9484, "num_input_tokens_seen": 767557632, "step": 5856 }, { "epoch": 0.9614719924513208, "grad_norm": 0.5213596820831299, "learning_rate": 6.844778920016459e-05, "loss": 4.9116, "num_input_tokens_seen": 767950848, "step": 5859 }, { "epoch": 0.9619642976189866, "grad_norm": 0.6776494979858398, "learning_rate": 6.843027217129164e-05, "loss": 4.9351, "num_input_tokens_seen": 768344064, "step": 5862 }, { "epoch": 0.9624566027866523, "grad_norm": 0.5639019012451172, "learning_rate": 6.841276858431358e-05, "loss": 4.8733, "num_input_tokens_seen": 768737280, "step": 5865 }, { "epoch": 0.9629489079543182, "grad_norm": 0.6252104640007019, "learning_rate": 6.839527842204787e-05, "loss": 4.8784, "num_input_tokens_seen": 769130496, "step": 5868 }, { "epoch": 0.9634412131219839, "grad_norm": 0.6042758822441101, "learning_rate": 6.837780166734271e-05, "loss": 4.9082, "num_input_tokens_seen": 769523712, "step": 5871 }, { "epoch": 0.9639335182896498, "grad_norm": 0.6575847268104553, "learning_rate": 6.836033830307697e-05, "loss": 4.8967, "num_input_tokens_seen": 769916928, "step": 5874 }, { "epoch": 0.9644258234573156, "grad_norm": 0.728636622428894, "learning_rate": 6.834288831216011e-05, "loss": 4.9266, "num_input_tokens_seen": 770310144, "step": 5877 }, { "epoch": 0.9649181286249814, "grad_norm": 0.5777960419654846, "learning_rate": 6.832545167753211e-05, "loss": 4.9058, "num_input_tokens_seen": 770703360, "step": 5880 }, { "epoch": 0.9654104337926472, "grad_norm": 0.7681834697723389, "learning_rate": 6.830802838216338e-05, "loss": 4.9101, "num_input_tokens_seen": 771096576, "step": 5883 }, { "epoch": 0.965902738960313, "grad_norm": 0.5604771971702576, "learning_rate": 6.829061840905477e-05, "loss": 4.9314, "num_input_tokens_seen": 771489792, "step": 5886 }, { "epoch": 0.9663950441279788, "grad_norm": 0.5763574838638306, "learning_rate": 6.82732217412374e-05, "loss": 4.8776, "num_input_tokens_seen": 771883008, "step": 5889 }, { "epoch": 0.9668873492956447, "grad_norm": 0.580155611038208, "learning_rate": 6.825583836177263e-05, "loss": 4.9113, "num_input_tokens_seen": 772276224, "step": 5892 }, { "epoch": 0.9673796544633104, "grad_norm": 0.6300919651985168, "learning_rate": 6.823846825375201e-05, "loss": 4.8782, "num_input_tokens_seen": 772669440, "step": 5895 }, { "epoch": 0.9678719596309763, "grad_norm": 0.5544500350952148, "learning_rate": 6.822111140029719e-05, "loss": 4.8604, "num_input_tokens_seen": 773062656, "step": 5898 }, { "epoch": 0.968364264798642, "grad_norm": 0.6302488446235657, "learning_rate": 6.820376778455987e-05, "loss": 4.9071, "num_input_tokens_seen": 773455872, "step": 5901 }, { "epoch": 0.9688565699663079, "grad_norm": 0.6206812858581543, "learning_rate": 6.81864373897217e-05, "loss": 4.9005, "num_input_tokens_seen": 773849088, "step": 5904 }, { "epoch": 0.9693488751339737, "grad_norm": 0.638460636138916, "learning_rate": 6.816912019899426e-05, "loss": 4.9282, "num_input_tokens_seen": 774242304, "step": 5907 }, { "epoch": 0.9698411803016395, "grad_norm": 0.6319560408592224, "learning_rate": 6.815181619561888e-05, "loss": 4.8894, "num_input_tokens_seen": 774635520, "step": 5910 }, { "epoch": 0.9703334854693053, "grad_norm": 0.5597271919250488, "learning_rate": 6.813452536286677e-05, "loss": 4.9043, "num_input_tokens_seen": 775028736, "step": 5913 }, { "epoch": 0.970825790636971, "grad_norm": 0.8066889643669128, "learning_rate": 6.811724768403874e-05, "loss": 4.9336, "num_input_tokens_seen": 775421952, "step": 5916 }, { "epoch": 0.9713180958046369, "grad_norm": 0.6452884078025818, "learning_rate": 6.809998314246527e-05, "loss": 4.9215, "num_input_tokens_seen": 775815168, "step": 5919 }, { "epoch": 0.9718104009723028, "grad_norm": 0.5996139645576477, "learning_rate": 6.80827317215064e-05, "loss": 4.9279, "num_input_tokens_seen": 776208384, "step": 5922 }, { "epoch": 0.9723027061399685, "grad_norm": 0.7165923118591309, "learning_rate": 6.806549340455164e-05, "loss": 4.9125, "num_input_tokens_seen": 776601600, "step": 5925 }, { "epoch": 0.9727950113076343, "grad_norm": 0.6175607442855835, "learning_rate": 6.804826817501996e-05, "loss": 4.8999, "num_input_tokens_seen": 776994816, "step": 5928 }, { "epoch": 0.9732873164753001, "grad_norm": 0.6167675256729126, "learning_rate": 6.803105601635961e-05, "loss": 4.9054, "num_input_tokens_seen": 777388032, "step": 5931 }, { "epoch": 0.9737796216429659, "grad_norm": 0.6145119667053223, "learning_rate": 6.801385691204829e-05, "loss": 4.878, "num_input_tokens_seen": 777781248, "step": 5934 }, { "epoch": 0.9742719268106318, "grad_norm": 0.5707881450653076, "learning_rate": 6.799667084559273e-05, "loss": 4.8818, "num_input_tokens_seen": 778174464, "step": 5937 }, { "epoch": 0.9747642319782975, "grad_norm": 0.5859269499778748, "learning_rate": 6.797949780052896e-05, "loss": 4.8894, "num_input_tokens_seen": 778567680, "step": 5940 }, { "epoch": 0.9752565371459634, "grad_norm": 0.560761570930481, "learning_rate": 6.796233776042202e-05, "loss": 4.9154, "num_input_tokens_seen": 778960896, "step": 5943 }, { "epoch": 0.9757488423136291, "grad_norm": 0.5537312626838684, "learning_rate": 6.794519070886606e-05, "loss": 4.8822, "num_input_tokens_seen": 779354112, "step": 5946 }, { "epoch": 0.976241147481295, "grad_norm": 0.6321630477905273, "learning_rate": 6.792805662948407e-05, "loss": 4.9161, "num_input_tokens_seen": 779747328, "step": 5949 }, { "epoch": 0.9767334526489607, "grad_norm": 0.6216323375701904, "learning_rate": 6.791093550592807e-05, "loss": 4.912, "num_input_tokens_seen": 780140544, "step": 5952 }, { "epoch": 0.9772257578166266, "grad_norm": 0.5987245440483093, "learning_rate": 6.789382732187882e-05, "loss": 4.9255, "num_input_tokens_seen": 780533760, "step": 5955 }, { "epoch": 0.9777180629842924, "grad_norm": 0.6332117319107056, "learning_rate": 6.787673206104584e-05, "loss": 4.8713, "num_input_tokens_seen": 780926976, "step": 5958 }, { "epoch": 0.9782103681519582, "grad_norm": 0.5283944010734558, "learning_rate": 6.785964970716741e-05, "loss": 4.9124, "num_input_tokens_seen": 781320192, "step": 5961 }, { "epoch": 0.978702673319624, "grad_norm": 0.5525709390640259, "learning_rate": 6.784258024401038e-05, "loss": 4.9321, "num_input_tokens_seen": 781713408, "step": 5964 }, { "epoch": 0.9791949784872898, "grad_norm": 0.5929332971572876, "learning_rate": 6.782552365537023e-05, "loss": 4.9264, "num_input_tokens_seen": 782106624, "step": 5967 }, { "epoch": 0.9796872836549556, "grad_norm": 0.6105230450630188, "learning_rate": 6.780847992507089e-05, "loss": 4.9408, "num_input_tokens_seen": 782499840, "step": 5970 }, { "epoch": 0.9801795888226215, "grad_norm": 0.6432685256004333, "learning_rate": 6.779144903696476e-05, "loss": 4.8833, "num_input_tokens_seen": 782893056, "step": 5973 }, { "epoch": 0.9806718939902872, "grad_norm": 0.6585260033607483, "learning_rate": 6.77744309749326e-05, "loss": 4.915, "num_input_tokens_seen": 783286272, "step": 5976 }, { "epoch": 0.981164199157953, "grad_norm": 0.6275220513343811, "learning_rate": 6.775742572288348e-05, "loss": 4.9577, "num_input_tokens_seen": 783679488, "step": 5979 }, { "epoch": 0.9816565043256188, "grad_norm": 0.6095502972602844, "learning_rate": 6.774043326475473e-05, "loss": 4.8891, "num_input_tokens_seen": 784072704, "step": 5982 }, { "epoch": 0.9821488094932846, "grad_norm": 0.632768988609314, "learning_rate": 6.772345358451186e-05, "loss": 4.9223, "num_input_tokens_seen": 784465920, "step": 5985 }, { "epoch": 0.9826411146609505, "grad_norm": 0.6620305180549622, "learning_rate": 6.770648666614851e-05, "loss": 4.8904, "num_input_tokens_seen": 784859136, "step": 5988 }, { "epoch": 0.9831334198286162, "grad_norm": 0.6766555309295654, "learning_rate": 6.768953249368636e-05, "loss": 4.8817, "num_input_tokens_seen": 785252352, "step": 5991 }, { "epoch": 0.9836257249962821, "grad_norm": 0.5768389105796814, "learning_rate": 6.767259105117506e-05, "loss": 4.8491, "num_input_tokens_seen": 785645568, "step": 5994 }, { "epoch": 0.9841180301639478, "grad_norm": 0.6032988429069519, "learning_rate": 6.765566232269226e-05, "loss": 4.9208, "num_input_tokens_seen": 786038784, "step": 5997 }, { "epoch": 0.9846103353316137, "grad_norm": 0.6204380393028259, "learning_rate": 6.763874629234341e-05, "loss": 4.8818, "num_input_tokens_seen": 786432000, "step": 6000 }, { "epoch": 0.9846103353316137, "eval_accuracy": 0.2314671877544374, "eval_loss": 5.198113918304443, "eval_runtime": 127.6077, "eval_samples_per_second": 2.351, "eval_steps_per_second": 1.175, "num_input_tokens_seen": 786432000, "step": 6000 }, { "epoch": 0.9851026404992795, "grad_norm": 0.5865297317504883, "learning_rate": 6.762184294426182e-05, "loss": 4.8824, "num_input_tokens_seen": 786825216, "step": 6003 }, { "epoch": 0.9855949456669453, "grad_norm": 0.6484345197677612, "learning_rate": 6.760495226260847e-05, "loss": 4.8941, "num_input_tokens_seen": 787218432, "step": 6006 }, { "epoch": 0.9860872508346111, "grad_norm": 0.59868985414505, "learning_rate": 6.75880742315721e-05, "loss": 4.9025, "num_input_tokens_seen": 787611648, "step": 6009 }, { "epoch": 0.9865795560022769, "grad_norm": 0.5721721053123474, "learning_rate": 6.757120883536902e-05, "loss": 4.8878, "num_input_tokens_seen": 788004864, "step": 6012 }, { "epoch": 0.9870718611699427, "grad_norm": 0.647430956363678, "learning_rate": 6.755435605824312e-05, "loss": 4.9164, "num_input_tokens_seen": 788398080, "step": 6015 }, { "epoch": 0.9875641663376086, "grad_norm": 0.6465507745742798, "learning_rate": 6.753751588446576e-05, "loss": 4.9158, "num_input_tokens_seen": 788791296, "step": 6018 }, { "epoch": 0.9880564715052743, "grad_norm": 0.6825433969497681, "learning_rate": 6.752068829833576e-05, "loss": 4.9292, "num_input_tokens_seen": 789184512, "step": 6021 }, { "epoch": 0.9885487766729402, "grad_norm": 0.5541806221008301, "learning_rate": 6.750387328417927e-05, "loss": 4.9168, "num_input_tokens_seen": 789577728, "step": 6024 }, { "epoch": 0.9890410818406059, "grad_norm": 0.6401199698448181, "learning_rate": 6.748707082634982e-05, "loss": 4.8989, "num_input_tokens_seen": 789970944, "step": 6027 }, { "epoch": 0.9895333870082718, "grad_norm": 0.5784100294113159, "learning_rate": 6.747028090922809e-05, "loss": 4.8809, "num_input_tokens_seen": 790364160, "step": 6030 }, { "epoch": 0.9900256921759376, "grad_norm": 0.6119440197944641, "learning_rate": 6.745350351722202e-05, "loss": 4.8772, "num_input_tokens_seen": 790757376, "step": 6033 }, { "epoch": 0.9905179973436034, "grad_norm": 0.5223164558410645, "learning_rate": 6.743673863476671e-05, "loss": 4.9168, "num_input_tokens_seen": 791150592, "step": 6036 }, { "epoch": 0.9910103025112692, "grad_norm": 0.6039283275604248, "learning_rate": 6.74199862463242e-05, "loss": 4.8858, "num_input_tokens_seen": 791543808, "step": 6039 }, { "epoch": 0.9915026076789349, "grad_norm": 0.5842203497886658, "learning_rate": 6.740324633638366e-05, "loss": 4.9642, "num_input_tokens_seen": 791937024, "step": 6042 }, { "epoch": 0.9919949128466008, "grad_norm": 0.5527199506759644, "learning_rate": 6.738651888946112e-05, "loss": 4.8472, "num_input_tokens_seen": 792330240, "step": 6045 }, { "epoch": 0.9924872180142666, "grad_norm": 0.7194064259529114, "learning_rate": 6.736980389009957e-05, "loss": 4.9052, "num_input_tokens_seen": 792723456, "step": 6048 }, { "epoch": 0.9929795231819324, "grad_norm": 0.5239654779434204, "learning_rate": 6.735310132286876e-05, "loss": 4.8818, "num_input_tokens_seen": 793116672, "step": 6051 }, { "epoch": 0.9934718283495982, "grad_norm": 0.5916581153869629, "learning_rate": 6.733641117236525e-05, "loss": 4.9061, "num_input_tokens_seen": 793509888, "step": 6054 }, { "epoch": 0.993964133517264, "grad_norm": 0.5269445776939392, "learning_rate": 6.731973342321227e-05, "loss": 4.8932, "num_input_tokens_seen": 793903104, "step": 6057 }, { "epoch": 0.9944564386849298, "grad_norm": 0.5389668941497803, "learning_rate": 6.73030680600597e-05, "loss": 4.8693, "num_input_tokens_seen": 794296320, "step": 6060 }, { "epoch": 0.9949487438525956, "grad_norm": 0.5677322745323181, "learning_rate": 6.728641506758407e-05, "loss": 4.9268, "num_input_tokens_seen": 794689536, "step": 6063 }, { "epoch": 0.9954410490202614, "grad_norm": 0.5324602723121643, "learning_rate": 6.726977443048832e-05, "loss": 4.9348, "num_input_tokens_seen": 795082752, "step": 6066 }, { "epoch": 0.9959333541879273, "grad_norm": 0.5456458330154419, "learning_rate": 6.725314613350202e-05, "loss": 4.8845, "num_input_tokens_seen": 795475968, "step": 6069 }, { "epoch": 0.996425659355593, "grad_norm": 0.5815337896347046, "learning_rate": 6.723653016138096e-05, "loss": 4.8419, "num_input_tokens_seen": 795869184, "step": 6072 }, { "epoch": 0.9969179645232589, "grad_norm": 0.6590341329574585, "learning_rate": 6.721992649890743e-05, "loss": 4.9176, "num_input_tokens_seen": 796262400, "step": 6075 }, { "epoch": 0.9974102696909246, "grad_norm": 0.5114787817001343, "learning_rate": 6.720333513088994e-05, "loss": 4.9158, "num_input_tokens_seen": 796655616, "step": 6078 }, { "epoch": 0.9979025748585905, "grad_norm": 0.6558582186698914, "learning_rate": 6.71867560421633e-05, "loss": 4.8964, "num_input_tokens_seen": 797048832, "step": 6081 }, { "epoch": 0.9983948800262563, "grad_norm": 0.5851151347160339, "learning_rate": 6.717018921758838e-05, "loss": 4.9178, "num_input_tokens_seen": 797442048, "step": 6084 }, { "epoch": 0.9988871851939221, "grad_norm": 0.6128942370414734, "learning_rate": 6.715363464205227e-05, "loss": 4.9023, "num_input_tokens_seen": 797835264, "step": 6087 }, { "epoch": 0.9993794903615879, "grad_norm": 0.6216326355934143, "learning_rate": 6.713709230046812e-05, "loss": 4.8119, "num_input_tokens_seen": 798228480, "step": 6090 }, { "epoch": 0.9998717955292536, "grad_norm": 0.656932532787323, "learning_rate": 6.712056217777502e-05, "loss": 4.9169, "num_input_tokens_seen": 798621696, "step": 6093 }, { "epoch": 0.9998717955292536, "num_input_tokens_seen": 798621696, "step": 6093, "total_flos": 4.855554488590664e+17, "train_loss": 5.304058988399935, "train_runtime": 127910.3478, "train_samples_per_second": 3.049, "train_steps_per_second": 0.048 } ], "logging_steps": 3, "max_steps": 6093, "num_input_tokens_seen": 798621696, "num_train_epochs": 1, "save_steps": 100, "total_flos": 4.855554488590664e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }