{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999786256278722, "eval_steps": 400, "global_step": 5848, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005129849310676499, "grad_norm": 2.118435859680176, "learning_rate": 3.0716723549488053e-06, "loss": 7.7542, "num_input_tokens_seen": 393216, "step": 3 }, { "epoch": 0.0010259698621352998, "grad_norm": 1.6133321523666382, "learning_rate": 6.1433447098976105e-06, "loss": 7.78, "num_input_tokens_seen": 786432, "step": 6 }, { "epoch": 0.0015389547932029496, "grad_norm": 1.2574195861816406, "learning_rate": 9.215017064846415e-06, "loss": 7.7144, "num_input_tokens_seen": 1179648, "step": 9 }, { "epoch": 0.0020519397242705997, "grad_norm": 1.3165267705917358, "learning_rate": 1.2286689419795221e-05, "loss": 7.656, "num_input_tokens_seen": 1572864, "step": 12 }, { "epoch": 0.0025649246553382495, "grad_norm": 1.064561367034912, "learning_rate": 1.5358361774744027e-05, "loss": 7.662, "num_input_tokens_seen": 1966080, "step": 15 }, { "epoch": 0.0030779095864058993, "grad_norm": 1.0571919679641724, "learning_rate": 1.843003412969283e-05, "loss": 7.6047, "num_input_tokens_seen": 2359296, "step": 18 }, { "epoch": 0.003590894517473549, "grad_norm": 0.8261664509773254, "learning_rate": 2.1501706484641635e-05, "loss": 7.5477, "num_input_tokens_seen": 2752512, "step": 21 }, { "epoch": 0.004103879448541199, "grad_norm": 0.777800977230072, "learning_rate": 2.4573378839590442e-05, "loss": 7.5543, "num_input_tokens_seen": 3145728, "step": 24 }, { "epoch": 0.004616864379608849, "grad_norm": 0.6865202188491821, "learning_rate": 2.7645051194539246e-05, "loss": 7.4878, "num_input_tokens_seen": 3538944, "step": 27 }, { "epoch": 0.005129849310676499, "grad_norm": 0.694611132144928, "learning_rate": 3.0716723549488054e-05, "loss": 7.4608, "num_input_tokens_seen": 3932160, "step": 30 }, { "epoch": 0.005642834241744148, "grad_norm": 0.6728237271308899, "learning_rate": 3.3788395904436854e-05, "loss": 7.4121, "num_input_tokens_seen": 4325376, "step": 33 }, { "epoch": 0.006155819172811799, "grad_norm": 0.6782873868942261, "learning_rate": 3.686006825938566e-05, "loss": 7.4209, "num_input_tokens_seen": 4718592, "step": 36 }, { "epoch": 0.006668804103879449, "grad_norm": 0.6442039012908936, "learning_rate": 3.993174061433447e-05, "loss": 7.3416, "num_input_tokens_seen": 5111808, "step": 39 }, { "epoch": 0.007181789034947098, "grad_norm": 0.6313900947570801, "learning_rate": 4.300341296928327e-05, "loss": 7.3897, "num_input_tokens_seen": 5505024, "step": 42 }, { "epoch": 0.0076947739660147485, "grad_norm": 0.6429070830345154, "learning_rate": 4.6075085324232084e-05, "loss": 7.3358, "num_input_tokens_seen": 5898240, "step": 45 }, { "epoch": 0.008207758897082399, "grad_norm": 0.59568852186203, "learning_rate": 4.9146757679180884e-05, "loss": 7.2775, "num_input_tokens_seen": 6291456, "step": 48 }, { "epoch": 0.008720743828150048, "grad_norm": 0.6876631379127502, "learning_rate": 5.221843003412969e-05, "loss": 7.261, "num_input_tokens_seen": 6684672, "step": 51 }, { "epoch": 0.009233728759217697, "grad_norm": 0.6824859380722046, "learning_rate": 5.529010238907849e-05, "loss": 7.2941, "num_input_tokens_seen": 7077888, "step": 54 }, { "epoch": 0.009746713690285349, "grad_norm": 0.7125466465950012, "learning_rate": 5.83617747440273e-05, "loss": 7.23, "num_input_tokens_seen": 7471104, "step": 57 }, { "epoch": 0.010259698621352998, "grad_norm": 0.7616642713546753, "learning_rate": 
6.143344709897611e-05, "loss": 7.1531, "num_input_tokens_seen": 7864320, "step": 60 }, { "epoch": 0.010772683552420647, "grad_norm": 0.7782993912696838, "learning_rate": 6.450511945392491e-05, "loss": 7.158, "num_input_tokens_seen": 8257536, "step": 63 }, { "epoch": 0.011285668483488297, "grad_norm": 0.7161626815795898, "learning_rate": 6.757679180887371e-05, "loss": 7.1733, "num_input_tokens_seen": 8650752, "step": 66 }, { "epoch": 0.011798653414555948, "grad_norm": 0.6889225244522095, "learning_rate": 7.064846416382252e-05, "loss": 7.1141, "num_input_tokens_seen": 9043968, "step": 69 }, { "epoch": 0.012311638345623597, "grad_norm": 0.7553271055221558, "learning_rate": 7.372013651877132e-05, "loss": 7.1584, "num_input_tokens_seen": 9437184, "step": 72 }, { "epoch": 0.012824623276691247, "grad_norm": 0.6760936975479126, "learning_rate": 7.679180887372012e-05, "loss": 7.0536, "num_input_tokens_seen": 9830400, "step": 75 }, { "epoch": 0.013337608207758898, "grad_norm": 0.6408318281173706, "learning_rate": 7.986348122866894e-05, "loss": 7.0168, "num_input_tokens_seen": 10223616, "step": 78 }, { "epoch": 0.013850593138826547, "grad_norm": 0.679551899433136, "learning_rate": 8.293515358361774e-05, "loss": 7.0783, "num_input_tokens_seen": 10616832, "step": 81 }, { "epoch": 0.014363578069894196, "grad_norm": 0.7019697427749634, "learning_rate": 8.600682593856654e-05, "loss": 7.1219, "num_input_tokens_seen": 11010048, "step": 84 }, { "epoch": 0.014876563000961848, "grad_norm": 0.7888438701629639, "learning_rate": 8.907849829351534e-05, "loss": 7.0021, "num_input_tokens_seen": 11403264, "step": 87 }, { "epoch": 0.015389547932029497, "grad_norm": 0.8103983998298645, "learning_rate": 9.215017064846417e-05, "loss": 7.0218, "num_input_tokens_seen": 11796480, "step": 90 }, { "epoch": 0.015902532863097146, "grad_norm": 0.8136906027793884, "learning_rate": 9.522184300341297e-05, "loss": 6.9954, "num_input_tokens_seen": 12189696, "step": 93 }, { "epoch": 0.016415517794164797, "grad_norm": 0.8006098866462708, "learning_rate": 9.829351535836177e-05, "loss": 7.0089, "num_input_tokens_seen": 12582912, "step": 96 }, { "epoch": 0.016928502725232445, "grad_norm": 0.7752631902694702, "learning_rate": 0.00010136518771331057, "loss": 7.0756, "num_input_tokens_seen": 12976128, "step": 99 }, { "epoch": 0.017441487656300096, "grad_norm": 0.8363745212554932, "learning_rate": 0.00010443686006825938, "loss": 6.9849, "num_input_tokens_seen": 13369344, "step": 102 }, { "epoch": 0.017954472587367747, "grad_norm": 0.9236883521080017, "learning_rate": 0.00010750853242320818, "loss": 6.9253, "num_input_tokens_seen": 13762560, "step": 105 }, { "epoch": 0.018467457518435395, "grad_norm": 0.993817925453186, "learning_rate": 0.00011058020477815698, "loss": 7.0273, "num_input_tokens_seen": 14155776, "step": 108 }, { "epoch": 0.018980442449503046, "grad_norm": 0.9351868033409119, "learning_rate": 0.00011365187713310579, "loss": 6.9702, "num_input_tokens_seen": 14548992, "step": 111 }, { "epoch": 0.019493427380570697, "grad_norm": 0.8399161696434021, "learning_rate": 0.0001167235494880546, "loss": 6.9723, "num_input_tokens_seen": 14942208, "step": 114 }, { "epoch": 0.020006412311638345, "grad_norm": 0.9522429704666138, "learning_rate": 0.0001197952218430034, "loss": 6.9254, "num_input_tokens_seen": 15335424, "step": 117 }, { "epoch": 0.020519397242705996, "grad_norm": 1.0010826587677002, "learning_rate": 0.00012286689419795221, "loss": 6.8872, "num_input_tokens_seen": 15728640, "step": 120 }, { "epoch": 0.021032382173773647, 
"grad_norm": 0.8947874903678894, "learning_rate": 0.000125938566552901, "loss": 6.9628, "num_input_tokens_seen": 16121856, "step": 123 }, { "epoch": 0.021545367104841295, "grad_norm": 1.0230917930603027, "learning_rate": 0.00012901023890784982, "loss": 6.8417, "num_input_tokens_seen": 16515072, "step": 126 }, { "epoch": 0.022058352035908946, "grad_norm": 0.9666141271591187, "learning_rate": 0.00013208191126279863, "loss": 6.9335, "num_input_tokens_seen": 16908288, "step": 129 }, { "epoch": 0.022571336966976593, "grad_norm": 1.1200824975967407, "learning_rate": 0.00013515358361774742, "loss": 6.8832, "num_input_tokens_seen": 17301504, "step": 132 }, { "epoch": 0.023084321898044245, "grad_norm": 0.9717016220092773, "learning_rate": 0.00013822525597269623, "loss": 6.8967, "num_input_tokens_seen": 17694720, "step": 135 }, { "epoch": 0.023597306829111896, "grad_norm": 1.024543285369873, "learning_rate": 0.00014129692832764505, "loss": 6.8902, "num_input_tokens_seen": 18087936, "step": 138 }, { "epoch": 0.024110291760179543, "grad_norm": 0.9085791707038879, "learning_rate": 0.00014436860068259383, "loss": 6.8873, "num_input_tokens_seen": 18481152, "step": 141 }, { "epoch": 0.024623276691247194, "grad_norm": 0.8841372132301331, "learning_rate": 0.00014744027303754265, "loss": 6.8524, "num_input_tokens_seen": 18874368, "step": 144 }, { "epoch": 0.025136261622314846, "grad_norm": 1.0399245023727417, "learning_rate": 0.00015051194539249146, "loss": 6.8402, "num_input_tokens_seen": 19267584, "step": 147 }, { "epoch": 0.025649246553382493, "grad_norm": 1.4088637828826904, "learning_rate": 0.00015358361774744025, "loss": 6.8282, "num_input_tokens_seen": 19660800, "step": 150 }, { "epoch": 0.026162231484450144, "grad_norm": 1.1549632549285889, "learning_rate": 0.00015665529010238906, "loss": 6.8356, "num_input_tokens_seen": 20054016, "step": 153 }, { "epoch": 0.026675216415517795, "grad_norm": 1.6987155675888062, "learning_rate": 0.00015972696245733788, "loss": 6.787, "num_input_tokens_seen": 20447232, "step": 156 }, { "epoch": 0.027188201346585443, "grad_norm": 1.1561607122421265, "learning_rate": 0.00016279863481228666, "loss": 6.7794, "num_input_tokens_seen": 20840448, "step": 159 }, { "epoch": 0.027701186277653094, "grad_norm": 1.8383941650390625, "learning_rate": 0.00016587030716723548, "loss": 6.7788, "num_input_tokens_seen": 21233664, "step": 162 }, { "epoch": 0.028214171208720745, "grad_norm": 1.4902769327163696, "learning_rate": 0.0001689419795221843, "loss": 6.7697, "num_input_tokens_seen": 21626880, "step": 165 }, { "epoch": 0.028727156139788393, "grad_norm": 1.2899839878082275, "learning_rate": 0.00017201365187713308, "loss": 6.8425, "num_input_tokens_seen": 22020096, "step": 168 }, { "epoch": 0.029240141070856044, "grad_norm": 1.4876312017440796, "learning_rate": 0.0001750853242320819, "loss": 6.776, "num_input_tokens_seen": 22413312, "step": 171 }, { "epoch": 0.029753126001923695, "grad_norm": 1.0720467567443848, "learning_rate": 0.00017815699658703068, "loss": 6.7536, "num_input_tokens_seen": 22806528, "step": 174 }, { "epoch": 0.030266110932991343, "grad_norm": 1.0955810546875, "learning_rate": 0.0001812286689419795, "loss": 6.7726, "num_input_tokens_seen": 23199744, "step": 177 }, { "epoch": 0.030779095864058994, "grad_norm": 1.5811485052108765, "learning_rate": 0.00018430034129692833, "loss": 6.7334, "num_input_tokens_seen": 23592960, "step": 180 }, { "epoch": 0.031292080795126645, "grad_norm": 1.4859919548034668, "learning_rate": 0.0001873720136518771, "loss": 6.7376, 
"num_input_tokens_seen": 23986176, "step": 183 }, { "epoch": 0.03180506572619429, "grad_norm": 1.0954172611236572, "learning_rate": 0.00019044368600682594, "loss": 6.7563, "num_input_tokens_seen": 24379392, "step": 186 }, { "epoch": 0.03231805065726194, "grad_norm": 1.2160760164260864, "learning_rate": 0.00019351535836177475, "loss": 6.7449, "num_input_tokens_seen": 24772608, "step": 189 }, { "epoch": 0.032831035588329595, "grad_norm": 1.6859344244003296, "learning_rate": 0.00019658703071672354, "loss": 6.7386, "num_input_tokens_seen": 25165824, "step": 192 }, { "epoch": 0.03334402051939724, "grad_norm": 1.3483397960662842, "learning_rate": 0.00019965870307167235, "loss": 6.7087, "num_input_tokens_seen": 25559040, "step": 195 }, { "epoch": 0.03385700545046489, "grad_norm": 1.5949305295944214, "learning_rate": 0.00020273037542662114, "loss": 6.6816, "num_input_tokens_seen": 25952256, "step": 198 }, { "epoch": 0.034369990381532545, "grad_norm": 1.3635272979736328, "learning_rate": 0.00020580204778156995, "loss": 6.6855, "num_input_tokens_seen": 26345472, "step": 201 }, { "epoch": 0.03488297531260019, "grad_norm": 1.3199516534805298, "learning_rate": 0.00020887372013651877, "loss": 6.7078, "num_input_tokens_seen": 26738688, "step": 204 }, { "epoch": 0.03539596024366784, "grad_norm": 1.3045519590377808, "learning_rate": 0.00021194539249146755, "loss": 6.6587, "num_input_tokens_seen": 27131904, "step": 207 }, { "epoch": 0.035908945174735495, "grad_norm": 1.1506019830703735, "learning_rate": 0.00021501706484641637, "loss": 6.7181, "num_input_tokens_seen": 27525120, "step": 210 }, { "epoch": 0.03642193010580314, "grad_norm": 1.068679690361023, "learning_rate": 0.00021808873720136518, "loss": 6.6458, "num_input_tokens_seen": 27918336, "step": 213 }, { "epoch": 0.03693491503687079, "grad_norm": 1.2845734357833862, "learning_rate": 0.00022116040955631397, "loss": 6.6676, "num_input_tokens_seen": 28311552, "step": 216 }, { "epoch": 0.037447899967938444, "grad_norm": 1.4203448295593262, "learning_rate": 0.00022423208191126278, "loss": 6.6261, "num_input_tokens_seen": 28704768, "step": 219 }, { "epoch": 0.03796088489900609, "grad_norm": 1.3917193412780762, "learning_rate": 0.00022730375426621157, "loss": 6.6926, "num_input_tokens_seen": 29097984, "step": 222 }, { "epoch": 0.03847386983007374, "grad_norm": 1.2271850109100342, "learning_rate": 0.00023037542662116038, "loss": 6.6867, "num_input_tokens_seen": 29491200, "step": 225 }, { "epoch": 0.038986854761141394, "grad_norm": 1.958269715309143, "learning_rate": 0.0002334470989761092, "loss": 6.6826, "num_input_tokens_seen": 29884416, "step": 228 }, { "epoch": 0.03949983969220904, "grad_norm": 1.7181731462478638, "learning_rate": 0.00023651877133105799, "loss": 6.6605, "num_input_tokens_seen": 30277632, "step": 231 }, { "epoch": 0.04001282462327669, "grad_norm": 1.3246721029281616, "learning_rate": 0.0002395904436860068, "loss": 6.6036, "num_input_tokens_seen": 30670848, "step": 234 }, { "epoch": 0.040525809554344344, "grad_norm": 1.2964049577713013, "learning_rate": 0.00024266211604095561, "loss": 6.6086, "num_input_tokens_seen": 31064064, "step": 237 }, { "epoch": 0.04103879448541199, "grad_norm": 1.2255417108535767, "learning_rate": 0.00024573378839590443, "loss": 6.6079, "num_input_tokens_seen": 31457280, "step": 240 }, { "epoch": 0.04155177941647964, "grad_norm": 1.6006697416305542, "learning_rate": 0.0002488054607508532, "loss": 6.5652, "num_input_tokens_seen": 31850496, "step": 243 }, { "epoch": 0.042064764347547294, "grad_norm": 
1.1786364316940308, "learning_rate": 0.000251877133105802, "loss": 6.6156, "num_input_tokens_seen": 32243712, "step": 246 }, { "epoch": 0.04257774927861494, "grad_norm": 1.554391622543335, "learning_rate": 0.00025494880546075084, "loss": 6.6044, "num_input_tokens_seen": 32636928, "step": 249 }, { "epoch": 0.04309073420968259, "grad_norm": 1.8817625045776367, "learning_rate": 0.00025802047781569963, "loss": 6.5893, "num_input_tokens_seen": 33030144, "step": 252 }, { "epoch": 0.043603719140750244, "grad_norm": 1.1505640745162964, "learning_rate": 0.0002610921501706484, "loss": 6.578, "num_input_tokens_seen": 33423360, "step": 255 }, { "epoch": 0.04411670407181789, "grad_norm": 2.1356940269470215, "learning_rate": 0.00026416382252559726, "loss": 6.6012, "num_input_tokens_seen": 33816576, "step": 258 }, { "epoch": 0.04462968900288554, "grad_norm": 1.7814842462539673, "learning_rate": 0.00026723549488054605, "loss": 6.5964, "num_input_tokens_seen": 34209792, "step": 261 }, { "epoch": 0.04514267393395319, "grad_norm": 2.086648941040039, "learning_rate": 0.00027030716723549483, "loss": 6.6411, "num_input_tokens_seen": 34603008, "step": 264 }, { "epoch": 0.04565565886502084, "grad_norm": 1.9855871200561523, "learning_rate": 0.0002733788395904437, "loss": 6.5595, "num_input_tokens_seen": 34996224, "step": 267 }, { "epoch": 0.04616864379608849, "grad_norm": 1.3594361543655396, "learning_rate": 0.00027645051194539246, "loss": 6.6081, "num_input_tokens_seen": 35389440, "step": 270 }, { "epoch": 0.04668162872715614, "grad_norm": 1.9229851961135864, "learning_rate": 0.00027952218430034125, "loss": 6.5241, "num_input_tokens_seen": 35782656, "step": 273 }, { "epoch": 0.04719461365822379, "grad_norm": 1.7178096771240234, "learning_rate": 0.0002825938566552901, "loss": 6.5881, "num_input_tokens_seen": 36175872, "step": 276 }, { "epoch": 0.04770759858929144, "grad_norm": 1.7998623847961426, "learning_rate": 0.0002856655290102389, "loss": 6.5847, "num_input_tokens_seen": 36569088, "step": 279 }, { "epoch": 0.04822058352035909, "grad_norm": 2.0001308917999268, "learning_rate": 0.00028873720136518766, "loss": 6.5402, "num_input_tokens_seen": 36962304, "step": 282 }, { "epoch": 0.04873356845142674, "grad_norm": 1.416505217552185, "learning_rate": 0.0002918088737201365, "loss": 6.5869, "num_input_tokens_seen": 37355520, "step": 285 }, { "epoch": 0.04924655338249439, "grad_norm": 1.462956190109253, "learning_rate": 0.0002948805460750853, "loss": 6.5351, "num_input_tokens_seen": 37748736, "step": 288 }, { "epoch": 0.049759538313562036, "grad_norm": 2.0115163326263428, "learning_rate": 0.0002979522184300341, "loss": 6.5625, "num_input_tokens_seen": 38141952, "step": 291 }, { "epoch": 0.05027252324462969, "grad_norm": 1.7845978736877441, "learning_rate": 0.00029948936133195183, "loss": 6.6064, "num_input_tokens_seen": 38535168, "step": 294 }, { "epoch": 0.05078550817569734, "grad_norm": 1.81064772605896, "learning_rate": 0.0002979729497586631, "loss": 6.5544, "num_input_tokens_seen": 38928384, "step": 297 }, { "epoch": 0.051298493106764986, "grad_norm": 1.8805071115493774, "learning_rate": 0.00029647934160747185, "loss": 6.5701, "num_input_tokens_seen": 39321600, "step": 300 }, { "epoch": 0.05181147803783264, "grad_norm": 2.1542809009552, "learning_rate": 0.0002950079710284063, "loss": 6.5521, "num_input_tokens_seen": 39714816, "step": 303 }, { "epoch": 0.05232446296890029, "grad_norm": 1.9905306100845337, "learning_rate": 0.0002935582916359803, "loss": 6.5324, "num_input_tokens_seen": 40108032, "step": 306 }, { 
"epoch": 0.052837447899967936, "grad_norm": 1.9549764394760132, "learning_rate": 0.00029212977565671515, "loss": 6.5147, "num_input_tokens_seen": 40501248, "step": 309 }, { "epoch": 0.05335043283103559, "grad_norm": 1.7094388008117676, "learning_rate": 0.0002907219131218538, "loss": 6.5199, "num_input_tokens_seen": 40894464, "step": 312 }, { "epoch": 0.05386341776210324, "grad_norm": 1.6284552812576294, "learning_rate": 0.00028933421110246486, "loss": 6.5146, "num_input_tokens_seen": 41287680, "step": 315 }, { "epoch": 0.054376402693170886, "grad_norm": 1.983896017074585, "learning_rate": 0.0002879661929843272, "loss": 6.5427, "num_input_tokens_seen": 41680896, "step": 318 }, { "epoch": 0.05488938762423854, "grad_norm": 1.9521673917770386, "learning_rate": 0.00028661739778017726, "loss": 6.5047, "num_input_tokens_seen": 42074112, "step": 321 }, { "epoch": 0.05540237255530619, "grad_norm": 2.61079740524292, "learning_rate": 0.0002852873794770615, "loss": 6.5187, "num_input_tokens_seen": 42467328, "step": 324 }, { "epoch": 0.055915357486373836, "grad_norm": 2.143825054168701, "learning_rate": 0.00028397570641669755, "loss": 6.5335, "num_input_tokens_seen": 42860544, "step": 327 }, { "epoch": 0.05642834241744149, "grad_norm": 1.2572081089019775, "learning_rate": 0.00028268196070688857, "loss": 6.492, "num_input_tokens_seen": 43253760, "step": 330 }, { "epoch": 0.05694132734850914, "grad_norm": 1.3765265941619873, "learning_rate": 0.0002814057376621684, "loss": 6.5031, "num_input_tokens_seen": 43646976, "step": 333 }, { "epoch": 0.057454312279576786, "grad_norm": 1.4847021102905273, "learning_rate": 0.00028014664527197685, "loss": 6.5166, "num_input_tokens_seen": 44040192, "step": 336 }, { "epoch": 0.05796729721064444, "grad_norm": 1.3950855731964111, "learning_rate": 0.0002789043036947781, "loss": 6.5069, "num_input_tokens_seen": 44433408, "step": 339 }, { "epoch": 0.05848028214171209, "grad_norm": 1.9534938335418701, "learning_rate": 0.0002776783447766403, "loss": 6.4242, "num_input_tokens_seen": 44826624, "step": 342 }, { "epoch": 0.058993267072779736, "grad_norm": 1.5846917629241943, "learning_rate": 0.00027646841159289, "loss": 6.4923, "num_input_tokens_seen": 45219840, "step": 345 }, { "epoch": 0.05950625200384739, "grad_norm": 1.5529935359954834, "learning_rate": 0.00027527415801154584, "loss": 6.5015, "num_input_tokens_seen": 45613056, "step": 348 }, { "epoch": 0.06001923693491504, "grad_norm": 2.177635908126831, "learning_rate": 0.000274095248277319, "loss": 6.4769, "num_input_tokens_seen": 46006272, "step": 351 }, { "epoch": 0.060532221865982686, "grad_norm": 1.630927324295044, "learning_rate": 0.0002729313566150449, "loss": 6.4383, "num_input_tokens_seen": 46399488, "step": 354 }, { "epoch": 0.06104520679705034, "grad_norm": 1.6595220565795898, "learning_rate": 0.0002717821668514831, "loss": 6.4398, "num_input_tokens_seen": 46792704, "step": 357 }, { "epoch": 0.06155819172811799, "grad_norm": 2.464336395263672, "learning_rate": 0.0002706473720544871, "loss": 6.4676, "num_input_tokens_seen": 47185920, "step": 360 }, { "epoch": 0.062071176659185635, "grad_norm": 1.580959439277649, "learning_rate": 0.0002695266741886108, "loss": 6.4727, "num_input_tokens_seen": 47579136, "step": 363 }, { "epoch": 0.06258416159025329, "grad_norm": 1.2866970300674438, "learning_rate": 0.0002684197837862717, "loss": 6.4078, "num_input_tokens_seen": 47972352, "step": 366 }, { "epoch": 0.06309714652132094, "grad_norm": 1.84207022190094, "learning_rate": 0.00026732641963364995, "loss": 6.3775, 
"num_input_tokens_seen": 48365568, "step": 369 }, { "epoch": 0.06361013145238859, "grad_norm": 1.3852074146270752, "learning_rate": 0.0002662463084705468, "loss": 6.4169, "num_input_tokens_seen": 48758784, "step": 372 }, { "epoch": 0.06412311638345623, "grad_norm": 1.644254207611084, "learning_rate": 0.00026517918470347554, "loss": 6.4822, "num_input_tokens_seen": 49152000, "step": 375 }, { "epoch": 0.06463610131452388, "grad_norm": 1.1741936206817627, "learning_rate": 0.0002641247901313028, "loss": 6.4092, "num_input_tokens_seen": 49545216, "step": 378 }, { "epoch": 0.06514908624559154, "grad_norm": 2.0758259296417236, "learning_rate": 0.0002630828736827938, "loss": 6.4429, "num_input_tokens_seen": 49938432, "step": 381 }, { "epoch": 0.06566207117665919, "grad_norm": 1.7567379474639893, "learning_rate": 0.00026205319116545786, "loss": 6.4423, "num_input_tokens_seen": 50331648, "step": 384 }, { "epoch": 0.06617505610772684, "grad_norm": 1.2850079536437988, "learning_rate": 0.0002610355050251228, "loss": 6.4584, "num_input_tokens_seen": 50724864, "step": 387 }, { "epoch": 0.06668804103879448, "grad_norm": 1.6754025220870972, "learning_rate": 0.00026002958411570134, "loss": 6.461, "num_input_tokens_seen": 51118080, "step": 390 }, { "epoch": 0.06720102596986213, "grad_norm": 1.3733254671096802, "learning_rate": 0.0002590352034786418, "loss": 6.4098, "num_input_tokens_seen": 51511296, "step": 393 }, { "epoch": 0.06771401090092978, "grad_norm": 1.1443592309951782, "learning_rate": 0.0002580521441315865, "loss": 6.34, "num_input_tokens_seen": 51904512, "step": 396 }, { "epoch": 0.06822699583199744, "grad_norm": 1.9371726512908936, "learning_rate": 0.0002570801928657861, "loss": 6.4019, "num_input_tokens_seen": 52297728, "step": 399 }, { "epoch": 0.06839799080901998, "eval_accuracy": 0.1277788633773001, "eval_loss": 6.768958568572998, "eval_runtime": 112.6432, "eval_samples_per_second": 2.663, "eval_steps_per_second": 1.332, "num_input_tokens_seen": 52428800, "step": 400 }, { "epoch": 0.06873998076306509, "grad_norm": 1.4626785516738892, "learning_rate": 0.0002561191420518449, "loss": 6.3792, "num_input_tokens_seen": 52690944, "step": 402 }, { "epoch": 0.06925296569413274, "grad_norm": 1.1538268327713013, "learning_rate": 0.0002551687894533952, "loss": 6.3429, "num_input_tokens_seen": 53084160, "step": 405 }, { "epoch": 0.06976595062520038, "grad_norm": 1.803175926208496, "learning_rate": 0.00025422893804831985, "loss": 6.3593, "num_input_tokens_seen": 53477376, "step": 408 }, { "epoch": 0.07027893555626803, "grad_norm": 1.3619705438613892, "learning_rate": 0.0002532993958571671, "loss": 6.3489, "num_input_tokens_seen": 53870592, "step": 411 }, { "epoch": 0.07079192048733568, "grad_norm": 1.1999845504760742, "learning_rate": 0.0002523799757784144, "loss": 6.3924, "num_input_tokens_seen": 54263808, "step": 414 }, { "epoch": 0.07130490541840333, "grad_norm": 1.1951202154159546, "learning_rate": 0.000251470495430264, "loss": 6.356, "num_input_tokens_seen": 54657024, "step": 417 }, { "epoch": 0.07181789034947099, "grad_norm": 1.117151141166687, "learning_rate": 0.0002505707769986641, "loss": 6.2931, "num_input_tokens_seen": 55050240, "step": 420 }, { "epoch": 0.07233087528053864, "grad_norm": 1.0864099264144897, "learning_rate": 0.00024968064709126914, "loss": 6.3978, "num_input_tokens_seen": 55443456, "step": 423 }, { "epoch": 0.07284386021160628, "grad_norm": 1.4912980794906616, "learning_rate": 0.0002487999365970663, "loss": 6.3532, "num_input_tokens_seen": 55836672, "step": 426 }, { "epoch": 
0.07335684514267393, "grad_norm": 1.2617565393447876, "learning_rate": 0.0002479284805514112, "loss": 6.3529, "num_input_tokens_seen": 56229888, "step": 429 }, { "epoch": 0.07386983007374158, "grad_norm": 1.585567831993103, "learning_rate": 0.00024706611800622653, "loss": 6.2868, "num_input_tokens_seen": 56623104, "step": 432 }, { "epoch": 0.07438281500480923, "grad_norm": 1.342955231666565, "learning_rate": 0.00024621269190513393, "loss": 6.3653, "num_input_tokens_seen": 57016320, "step": 435 }, { "epoch": 0.07489579993587689, "grad_norm": 1.0312269926071167, "learning_rate": 0.00024536804896329673, "loss": 6.3066, "num_input_tokens_seen": 57409536, "step": 438 }, { "epoch": 0.07540878486694454, "grad_norm": 1.047814130783081, "learning_rate": 0.00024453203955176697, "loss": 6.2996, "num_input_tokens_seen": 57802752, "step": 441 }, { "epoch": 0.07592176979801218, "grad_norm": 2.0140538215637207, "learning_rate": 0.00024370451758613725, "loss": 6.3169, "num_input_tokens_seen": 58195968, "step": 444 }, { "epoch": 0.07643475472907983, "grad_norm": 1.7778139114379883, "learning_rate": 0.00024288534041930982, "loss": 6.4133, "num_input_tokens_seen": 58589184, "step": 447 }, { "epoch": 0.07694773966014748, "grad_norm": 1.4507876634597778, "learning_rate": 0.0002420743687382041, "loss": 6.3126, "num_input_tokens_seen": 58982400, "step": 450 }, { "epoch": 0.07746072459121513, "grad_norm": 1.4134477376937866, "learning_rate": 0.0002412714664642326, "loss": 6.2931, "num_input_tokens_seen": 59375616, "step": 453 }, { "epoch": 0.07797370952228279, "grad_norm": 0.8874240517616272, "learning_rate": 0.00024047650065738447, "loss": 6.3292, "num_input_tokens_seen": 59768832, "step": 456 }, { "epoch": 0.07848669445335044, "grad_norm": 0.8925871253013611, "learning_rate": 0.00023968934142376222, "loss": 6.3195, "num_input_tokens_seen": 60162048, "step": 459 }, { "epoch": 0.07899967938441808, "grad_norm": 1.0737918615341187, "learning_rate": 0.00023890986182642624, "loss": 6.3276, "num_input_tokens_seen": 60555264, "step": 462 }, { "epoch": 0.07951266431548573, "grad_norm": 1.271182894706726, "learning_rate": 0.00023813793779940825, "loss": 6.271, "num_input_tokens_seen": 60948480, "step": 465 }, { "epoch": 0.08002564924655338, "grad_norm": 1.3476979732513428, "learning_rate": 0.0002373734480647611, "loss": 6.2534, "num_input_tokens_seen": 61341696, "step": 468 }, { "epoch": 0.08053863417762103, "grad_norm": 1.1347360610961914, "learning_rate": 0.00023661627405251905, "loss": 6.322, "num_input_tokens_seen": 61734912, "step": 471 }, { "epoch": 0.08105161910868869, "grad_norm": 1.2227282524108887, "learning_rate": 0.00023586629982344883, "loss": 6.2605, "num_input_tokens_seen": 62128128, "step": 474 }, { "epoch": 0.08156460403975634, "grad_norm": 1.0676379203796387, "learning_rate": 0.0002351234119944769, "loss": 6.2544, "num_input_tokens_seen": 62521344, "step": 477 }, { "epoch": 0.08207758897082398, "grad_norm": 1.1656867265701294, "learning_rate": 0.00023438749966668443, "loss": 6.2735, "num_input_tokens_seen": 62914560, "step": 480 }, { "epoch": 0.08259057390189163, "grad_norm": 0.9971214532852173, "learning_rate": 0.00023365845435576572, "loss": 6.2611, "num_input_tokens_seen": 63307776, "step": 483 }, { "epoch": 0.08310355883295928, "grad_norm": 1.160860300064087, "learning_rate": 0.0002329361699248514, "loss": 6.2435, "num_input_tokens_seen": 63700992, "step": 486 }, { "epoch": 0.08361654376402693, "grad_norm": 0.8638364672660828, "learning_rate": 0.00023222054251960172, "loss": 6.2793, 
"num_input_tokens_seen": 64094208, "step": 489 }, { "epoch": 0.08412952869509459, "grad_norm": 0.9435164928436279, "learning_rate": 0.00023151147050548, "loss": 6.278, "num_input_tokens_seen": 64487424, "step": 492 }, { "epoch": 0.08464251362616224, "grad_norm": 0.7893585562705994, "learning_rate": 0.00023080885440712032, "loss": 6.2768, "num_input_tokens_seen": 64880640, "step": 495 }, { "epoch": 0.08515549855722988, "grad_norm": 1.150215983390808, "learning_rate": 0.00023011259684970676, "loss": 6.2422, "num_input_tokens_seen": 65273856, "step": 498 }, { "epoch": 0.08566848348829753, "grad_norm": 1.1255900859832764, "learning_rate": 0.00022942260250228647, "loss": 6.2772, "num_input_tokens_seen": 65667072, "step": 501 }, { "epoch": 0.08618146841936518, "grad_norm": 1.0153459310531616, "learning_rate": 0.00022873877802294162, "loss": 6.2121, "num_input_tokens_seen": 66060288, "step": 504 }, { "epoch": 0.08669445335043283, "grad_norm": 0.8988346457481384, "learning_rate": 0.0002280610320057476, "loss": 6.2168, "num_input_tokens_seen": 66453504, "step": 507 }, { "epoch": 0.08720743828150049, "grad_norm": 1.5252900123596191, "learning_rate": 0.00022738927492945034, "loss": 6.269, "num_input_tokens_seen": 66846720, "step": 510 }, { "epoch": 0.08772042321256814, "grad_norm": 1.5005996227264404, "learning_rate": 0.00022672341910779707, "loss": 6.2683, "num_input_tokens_seen": 67239936, "step": 513 }, { "epoch": 0.08823340814363578, "grad_norm": 1.3754152059555054, "learning_rate": 0.00022606337864145685, "loss": 6.2707, "num_input_tokens_seen": 67633152, "step": 516 }, { "epoch": 0.08874639307470343, "grad_norm": 1.0265483856201172, "learning_rate": 0.0002254090693714725, "loss": 6.2469, "num_input_tokens_seen": 68026368, "step": 519 }, { "epoch": 0.08925937800577108, "grad_norm": 1.0772042274475098, "learning_rate": 0.00022476040883418578, "loss": 6.2262, "num_input_tokens_seen": 68419584, "step": 522 }, { "epoch": 0.08977236293683873, "grad_norm": 1.050948977470398, "learning_rate": 0.00022411731621758152, "loss": 6.2141, "num_input_tokens_seen": 68812800, "step": 525 }, { "epoch": 0.09028534786790637, "grad_norm": 1.455889344215393, "learning_rate": 0.00022347971231899736, "loss": 6.2429, "num_input_tokens_seen": 69206016, "step": 528 }, { "epoch": 0.09079833279897404, "grad_norm": 1.0375335216522217, "learning_rate": 0.00022284751950415004, "loss": 6.2112, "num_input_tokens_seen": 69599232, "step": 531 }, { "epoch": 0.09131131773004168, "grad_norm": 0.9029484987258911, "learning_rate": 0.00022222066166742937, "loss": 6.212, "num_input_tokens_seen": 69992448, "step": 534 }, { "epoch": 0.09182430266110933, "grad_norm": 2.0414113998413086, "learning_rate": 0.0002215990641934136, "loss": 6.2324, "num_input_tokens_seen": 70385664, "step": 537 }, { "epoch": 0.09233728759217698, "grad_norm": 1.7731704711914062, "learning_rate": 0.00022098265391956294, "loss": 6.2663, "num_input_tokens_seen": 70778880, "step": 540 }, { "epoch": 0.09285027252324463, "grad_norm": 1.9474400281906128, "learning_rate": 0.00022037135910004776, "loss": 6.2006, "num_input_tokens_seen": 71172096, "step": 543 }, { "epoch": 0.09336325745431227, "grad_norm": 1.139887809753418, "learning_rate": 0.00021976510937067167, "loss": 6.1835, "num_input_tokens_seen": 71565312, "step": 546 }, { "epoch": 0.09387624238537993, "grad_norm": 1.4211649894714355, "learning_rate": 0.0002191638357148503, "loss": 6.2367, "num_input_tokens_seen": 71958528, "step": 549 }, { "epoch": 0.09438922731644758, "grad_norm": 1.0333606004714966, 
"learning_rate": 0.00021856747043060817, "loss": 6.1811, "num_input_tokens_seen": 72351744, "step": 552 }, { "epoch": 0.09490221224751523, "grad_norm": 1.1093227863311768, "learning_rate": 0.00021797594709855838, "loss": 6.1716, "num_input_tokens_seen": 72744960, "step": 555 }, { "epoch": 0.09541519717858288, "grad_norm": 1.3098039627075195, "learning_rate": 0.00021738920055083008, "loss": 6.2282, "num_input_tokens_seen": 73138176, "step": 558 }, { "epoch": 0.09592818210965053, "grad_norm": 0.9683685898780823, "learning_rate": 0.00021680716684091162, "loss": 6.1501, "num_input_tokens_seen": 73531392, "step": 561 }, { "epoch": 0.09644116704071817, "grad_norm": 0.9972584247589111, "learning_rate": 0.0002162297832143763, "loss": 6.1839, "num_input_tokens_seen": 73924608, "step": 564 }, { "epoch": 0.09695415197178583, "grad_norm": 0.8736885190010071, "learning_rate": 0.00021565698808046193, "loss": 6.1591, "num_input_tokens_seen": 74317824, "step": 567 }, { "epoch": 0.09746713690285348, "grad_norm": 1.025628924369812, "learning_rate": 0.0002150887209844738, "loss": 6.2475, "num_input_tokens_seen": 74711040, "step": 570 }, { "epoch": 0.09798012183392113, "grad_norm": 1.068459153175354, "learning_rate": 0.00021452492258098351, "loss": 6.1618, "num_input_tokens_seen": 75104256, "step": 573 }, { "epoch": 0.09849310676498878, "grad_norm": 0.8722976446151733, "learning_rate": 0.0002139655346077961, "loss": 6.1769, "num_input_tokens_seen": 75497472, "step": 576 }, { "epoch": 0.09900609169605643, "grad_norm": 0.99879390001297, "learning_rate": 0.00021341049986066098, "loss": 6.1479, "num_input_tokens_seen": 75890688, "step": 579 }, { "epoch": 0.09951907662712407, "grad_norm": 1.2947535514831543, "learning_rate": 0.00021285976216869982, "loss": 6.1689, "num_input_tokens_seen": 76283904, "step": 582 }, { "epoch": 0.10003206155819173, "grad_norm": 1.198506236076355, "learning_rate": 0.00021231326637052871, "loss": 6.176, "num_input_tokens_seen": 76677120, "step": 585 }, { "epoch": 0.10054504648925938, "grad_norm": 1.9186298847198486, "learning_rate": 0.00021177095829105132, "loss": 6.1542, "num_input_tokens_seen": 77070336, "step": 588 }, { "epoch": 0.10105803142032703, "grad_norm": 1.6377959251403809, "learning_rate": 0.00021123278471890086, "loss": 6.1783, "num_input_tokens_seen": 77463552, "step": 591 }, { "epoch": 0.10157101635139468, "grad_norm": 1.5491153001785278, "learning_rate": 0.00021069869338450912, "loss": 6.192, "num_input_tokens_seen": 77856768, "step": 594 }, { "epoch": 0.10208400128246232, "grad_norm": 1.4104523658752441, "learning_rate": 0.0002101686329387827, "loss": 6.2015, "num_input_tokens_seen": 78249984, "step": 597 }, { "epoch": 0.10259698621352997, "grad_norm": 1.230622410774231, "learning_rate": 0.00020964255293236627, "loss": 6.1604, "num_input_tokens_seen": 78643200, "step": 600 }, { "epoch": 0.10310997114459763, "grad_norm": 0.9526540040969849, "learning_rate": 0.00020912040379547395, "loss": 6.1395, "num_input_tokens_seen": 79036416, "step": 603 }, { "epoch": 0.10362295607566528, "grad_norm": 1.131076455116272, "learning_rate": 0.00020860213681827064, "loss": 6.1538, "num_input_tokens_seen": 79429632, "step": 606 }, { "epoch": 0.10413594100673293, "grad_norm": 1.2227041721343994, "learning_rate": 0.00020808770413178535, "loss": 6.1001, "num_input_tokens_seen": 79822848, "step": 609 }, { "epoch": 0.10464892593780058, "grad_norm": 1.2456010580062866, "learning_rate": 0.00020757705868933984, "loss": 6.153, "num_input_tokens_seen": 80216064, "step": 612 }, { "epoch": 
0.10516191086886822, "grad_norm": 1.1259610652923584, "learning_rate": 0.00020707015424847639, "loss": 6.1156, "num_input_tokens_seen": 80609280, "step": 615 }, { "epoch": 0.10567489579993587, "grad_norm": 1.1174367666244507, "learning_rate": 0.00020656694535336808, "loss": 6.1389, "num_input_tokens_seen": 81002496, "step": 618 }, { "epoch": 0.10618788073100352, "grad_norm": 1.1163129806518555, "learning_rate": 0.00020606738731769765, "loss": 6.1592, "num_input_tokens_seen": 81395712, "step": 621 }, { "epoch": 0.10670086566207118, "grad_norm": 1.0666199922561646, "learning_rate": 0.0002055714362079892, "loss": 6.1443, "num_input_tokens_seen": 81788928, "step": 624 }, { "epoch": 0.10721385059313883, "grad_norm": 1.0415575504302979, "learning_rate": 0.00020507904882737917, "loss": 6.1783, "num_input_tokens_seen": 82182144, "step": 627 }, { "epoch": 0.10772683552420648, "grad_norm": 1.8121472597122192, "learning_rate": 0.00020459018269981298, "loss": 6.1281, "num_input_tokens_seen": 82575360, "step": 630 }, { "epoch": 0.10823982045527412, "grad_norm": 1.4155458211898804, "learning_rate": 0.00020410479605465385, "loss": 6.1657, "num_input_tokens_seen": 82968576, "step": 633 }, { "epoch": 0.10875280538634177, "grad_norm": 1.4348151683807373, "learning_rate": 0.00020362284781169176, "loss": 6.1204, "num_input_tokens_seen": 83361792, "step": 636 }, { "epoch": 0.10926579031740942, "grad_norm": 1.1229435205459595, "learning_rate": 0.00020314429756653965, "loss": 6.1741, "num_input_tokens_seen": 83755008, "step": 639 }, { "epoch": 0.10977877524847708, "grad_norm": 1.871185064315796, "learning_rate": 0.00020266910557640547, "loss": 6.1443, "num_input_tokens_seen": 84148224, "step": 642 }, { "epoch": 0.11029176017954473, "grad_norm": 1.2954288721084595, "learning_rate": 0.00020219723274622864, "loss": 6.0994, "num_input_tokens_seen": 84541440, "step": 645 }, { "epoch": 0.11080474511061238, "grad_norm": 1.6798019409179688, "learning_rate": 0.00020172864061517005, "loss": 6.105, "num_input_tokens_seen": 84934656, "step": 648 }, { "epoch": 0.11131773004168002, "grad_norm": 1.2881441116333008, "learning_rate": 0.00020126329134344468, "loss": 6.0997, "num_input_tokens_seen": 85327872, "step": 651 }, { "epoch": 0.11183071497274767, "grad_norm": 1.2833970785140991, "learning_rate": 0.000200801147699487, "loss": 6.084, "num_input_tokens_seen": 85721088, "step": 654 }, { "epoch": 0.11234369990381532, "grad_norm": 1.1184037923812866, "learning_rate": 0.00020034217304743868, "loss": 6.0939, "num_input_tokens_seen": 86114304, "step": 657 }, { "epoch": 0.11285668483488298, "grad_norm": 1.2673637866973877, "learning_rate": 0.00019988633133495007, "loss": 6.058, "num_input_tokens_seen": 86507520, "step": 660 }, { "epoch": 0.11336966976595063, "grad_norm": 1.0385109186172485, "learning_rate": 0.00019943358708128528, "loss": 6.0954, "num_input_tokens_seen": 86900736, "step": 663 }, { "epoch": 0.11388265469701828, "grad_norm": 1.233398675918579, "learning_rate": 0.00019898390536572197, "loss": 6.1543, "num_input_tokens_seen": 87293952, "step": 666 }, { "epoch": 0.11439563962808592, "grad_norm": 0.970950186252594, "learning_rate": 0.00019853725181623823, "loss": 6.1156, "num_input_tokens_seen": 87687168, "step": 669 }, { "epoch": 0.11490862455915357, "grad_norm": 1.1220591068267822, "learning_rate": 0.00019809359259847711, "loss": 6.1181, "num_input_tokens_seen": 88080384, "step": 672 }, { "epoch": 0.11542160949022122, "grad_norm": 1.175992488861084, "learning_rate": 0.00019765289440498121, "loss": 6.1052, 
"num_input_tokens_seen": 88473600, "step": 675 }, { "epoch": 0.11593459442128888, "grad_norm": 1.138728141784668, "learning_rate": 0.00019721512444468987, "loss": 6.059, "num_input_tokens_seen": 88866816, "step": 678 }, { "epoch": 0.11644757935235653, "grad_norm": 1.0036921501159668, "learning_rate": 0.00019678025043269053, "loss": 6.065, "num_input_tokens_seen": 89260032, "step": 681 }, { "epoch": 0.11696056428342418, "grad_norm": 0.8844671249389648, "learning_rate": 0.00019634824058021848, "loss": 6.1027, "num_input_tokens_seen": 89653248, "step": 684 }, { "epoch": 0.11747354921449182, "grad_norm": 1.0989590883255005, "learning_rate": 0.000195919063584896, "loss": 6.1392, "num_input_tokens_seen": 90046464, "step": 687 }, { "epoch": 0.11798653414555947, "grad_norm": 0.9273776412010193, "learning_rate": 0.00019549268862120603, "loss": 6.097, "num_input_tokens_seen": 90439680, "step": 690 }, { "epoch": 0.11849951907662712, "grad_norm": 0.8641604781150818, "learning_rate": 0.00019506908533119244, "loss": 6.08, "num_input_tokens_seen": 90832896, "step": 693 }, { "epoch": 0.11901250400769478, "grad_norm": 0.8257124423980713, "learning_rate": 0.00019464822381538125, "loss": 6.0782, "num_input_tokens_seen": 91226112, "step": 696 }, { "epoch": 0.11952548893876243, "grad_norm": 1.1454071998596191, "learning_rate": 0.00019423007462391608, "loss": 6.0874, "num_input_tokens_seen": 91619328, "step": 699 }, { "epoch": 0.12003847386983008, "grad_norm": 1.0243558883666992, "learning_rate": 0.0001938146087479026, "loss": 6.0659, "num_input_tokens_seen": 92012544, "step": 702 }, { "epoch": 0.12055145880089772, "grad_norm": 1.1402223110198975, "learning_rate": 0.0001934017976109553, "loss": 6.0855, "num_input_tokens_seen": 92405760, "step": 705 }, { "epoch": 0.12106444373196537, "grad_norm": 1.1279404163360596, "learning_rate": 0.00019299161306094212, "loss": 6.0553, "num_input_tokens_seen": 92798976, "step": 708 }, { "epoch": 0.12157742866303302, "grad_norm": 1.2544893026351929, "learning_rate": 0.00019258402736191987, "loss": 6.0772, "num_input_tokens_seen": 93192192, "step": 711 }, { "epoch": 0.12209041359410068, "grad_norm": 1.333325743675232, "learning_rate": 0.00019217901318625737, "loss": 6.0254, "num_input_tokens_seen": 93585408, "step": 714 }, { "epoch": 0.12260339852516833, "grad_norm": 1.4925867319107056, "learning_rate": 0.00019177654360693922, "loss": 6.1594, "num_input_tokens_seen": 93978624, "step": 717 }, { "epoch": 0.12311638345623598, "grad_norm": 1.4974321126937866, "learning_rate": 0.00019137659209004636, "loss": 6.0507, "num_input_tokens_seen": 94371840, "step": 720 }, { "epoch": 0.12362936838730362, "grad_norm": 1.6772838830947876, "learning_rate": 0.00019097913248740852, "loss": 6.1063, "num_input_tokens_seen": 94765056, "step": 723 }, { "epoch": 0.12414235331837127, "grad_norm": 1.1892170906066895, "learning_rate": 0.00019058413902942387, "loss": 6.0525, "num_input_tokens_seen": 95158272, "step": 726 }, { "epoch": 0.12465533824943892, "grad_norm": 1.5597805976867676, "learning_rate": 0.00019019158631804098, "loss": 6.0913, "num_input_tokens_seen": 95551488, "step": 729 }, { "epoch": 0.12516832318050658, "grad_norm": 1.638031005859375, "learning_rate": 0.0001898014493198996, "loss": 6.1035, "num_input_tokens_seen": 95944704, "step": 732 }, { "epoch": 0.1256813081115742, "grad_norm": 1.4688829183578491, "learning_rate": 0.00018941370335962538, "loss": 6.0601, "num_input_tokens_seen": 96337920, "step": 735 }, { "epoch": 0.12619429304264188, "grad_norm": 0.9651637077331543, 
"learning_rate": 0.00018902832411327452, "loss": 6.068, "num_input_tokens_seen": 96731136, "step": 738 }, { "epoch": 0.1267072779737095, "grad_norm": 1.2612296342849731, "learning_rate": 0.00018864528760192487, "loss": 6.0142, "num_input_tokens_seen": 97124352, "step": 741 }, { "epoch": 0.12722026290477717, "grad_norm": 1.0166645050048828, "learning_rate": 0.00018826457018540895, "loss": 6.0268, "num_input_tokens_seen": 97517568, "step": 744 }, { "epoch": 0.12773324783584483, "grad_norm": 1.159142255783081, "learning_rate": 0.00018788614855618575, "loss": 6.0269, "num_input_tokens_seen": 97910784, "step": 747 }, { "epoch": 0.12824623276691247, "grad_norm": 0.9123517870903015, "learning_rate": 0.00018750999973334755, "loss": 6.0037, "num_input_tokens_seen": 98304000, "step": 750 }, { "epoch": 0.12875921769798013, "grad_norm": 0.9361982345581055, "learning_rate": 0.00018713610105675787, "loss": 6.0246, "num_input_tokens_seen": 98697216, "step": 753 }, { "epoch": 0.12927220262904776, "grad_norm": 0.8802709579467773, "learning_rate": 0.00018676443018131788, "loss": 6.0012, "num_input_tokens_seen": 99090432, "step": 756 }, { "epoch": 0.12978518756011542, "grad_norm": 1.0109879970550537, "learning_rate": 0.00018639496507135743, "loss": 6.0781, "num_input_tokens_seen": 99483648, "step": 759 }, { "epoch": 0.13029817249118308, "grad_norm": 0.9797167778015137, "learning_rate": 0.00018602768399514743, "loss": 6.0611, "num_input_tokens_seen": 99876864, "step": 762 }, { "epoch": 0.13081115742225072, "grad_norm": 1.554618000984192, "learning_rate": 0.0001856625655195309, "loss": 6.0584, "num_input_tokens_seen": 100270080, "step": 765 }, { "epoch": 0.13132414235331838, "grad_norm": 1.2623248100280762, "learning_rate": 0.00018529958850466993, "loss": 6.0177, "num_input_tokens_seen": 100663296, "step": 768 }, { "epoch": 0.131837127284386, "grad_norm": 1.2730941772460938, "learning_rate": 0.000184938732098904, "loss": 6.0187, "num_input_tokens_seen": 101056512, "step": 771 }, { "epoch": 0.13235011221545367, "grad_norm": 1.1981124877929688, "learning_rate": 0.00018457997573371942, "loss": 6.0086, "num_input_tokens_seen": 101449728, "step": 774 }, { "epoch": 0.1328630971465213, "grad_norm": 1.0194454193115234, "learning_rate": 0.00018422329911882464, "loss": 6.0152, "num_input_tokens_seen": 101842944, "step": 777 }, { "epoch": 0.13337608207758897, "grad_norm": 1.068668246269226, "learning_rate": 0.0001838686822373302, "loss": 6.0552, "num_input_tokens_seen": 102236160, "step": 780 }, { "epoch": 0.13388906700865663, "grad_norm": 1.023908019065857, "learning_rate": 0.00018351610534103057, "loss": 6.0448, "num_input_tokens_seen": 102629376, "step": 783 }, { "epoch": 0.13440205193972427, "grad_norm": 0.8301390409469604, "learning_rate": 0.0001831655489457848, "loss": 5.979, "num_input_tokens_seen": 103022592, "step": 786 }, { "epoch": 0.13491503687079193, "grad_norm": 1.0532444715499878, "learning_rate": 0.00018281699382699399, "loss": 6.0356, "num_input_tokens_seen": 103415808, "step": 789 }, { "epoch": 0.13542802180185956, "grad_norm": 0.9144531488418579, "learning_rate": 0.00018247042101517312, "loss": 5.9964, "num_input_tokens_seen": 103809024, "step": 792 }, { "epoch": 0.13594100673292722, "grad_norm": 0.9134213328361511, "learning_rate": 0.00018212581179161483, "loss": 6.0034, "num_input_tokens_seen": 104202240, "step": 795 }, { "epoch": 0.13645399166399488, "grad_norm": 0.9252693057060242, "learning_rate": 0.0001817831476841428, "loss": 6.0547, "num_input_tokens_seen": 104595456, "step": 798 }, { "epoch": 
0.13679598161803996, "eval_accuracy": 0.14596971177332682, "eval_loss": 6.421415328979492, "eval_runtime": 112.8503, "eval_samples_per_second": 2.658, "eval_steps_per_second": 1.329, "num_input_tokens_seen": 104857600, "step": 800 }, { "epoch": 0.13696697659506252, "grad_norm": 0.7578924298286438, "learning_rate": 0.00018144241046295307, "loss": 6.0183, "num_input_tokens_seen": 104988672, "step": 801 }, { "epoch": 0.13747996152613018, "grad_norm": 0.8038005232810974, "learning_rate": 0.0001811035821365402, "loss": 6.0242, "num_input_tokens_seen": 105381888, "step": 804 }, { "epoch": 0.1379929464571978, "grad_norm": 0.8382763266563416, "learning_rate": 0.0001807666449477075, "loss": 6.0535, "num_input_tokens_seen": 105775104, "step": 807 }, { "epoch": 0.13850593138826547, "grad_norm": 0.8043891787528992, "learning_rate": 0.0001804315813696581, "loss": 6.0307, "num_input_tokens_seen": 106168320, "step": 810 }, { "epoch": 0.1390189163193331, "grad_norm": 1.0049474239349365, "learning_rate": 0.00018009837410216546, "loss": 5.9799, "num_input_tokens_seen": 106561536, "step": 813 }, { "epoch": 0.13953190125040077, "grad_norm": 1.1410833597183228, "learning_rate": 0.00017976700606782165, "loss": 5.9542, "num_input_tokens_seen": 106954752, "step": 816 }, { "epoch": 0.14004488618146843, "grad_norm": 1.1920111179351807, "learning_rate": 0.0001794374604083612, "loss": 6.0421, "num_input_tokens_seen": 107347968, "step": 819 }, { "epoch": 0.14055787111253606, "grad_norm": 1.410753607749939, "learning_rate": 0.00017910972048105852, "loss": 6.0533, "num_input_tokens_seen": 107741184, "step": 822 }, { "epoch": 0.14107085604360373, "grad_norm": 1.0810803174972534, "learning_rate": 0.00017878376985519786, "loss": 6.0042, "num_input_tokens_seen": 108134400, "step": 825 }, { "epoch": 0.14158384097467136, "grad_norm": 0.937443196773529, "learning_rate": 0.00017845959230861343, "loss": 5.9796, "num_input_tokens_seen": 108527616, "step": 828 }, { "epoch": 0.14209682590573902, "grad_norm": 0.9939092397689819, "learning_rate": 0.00017813717182429826, "loss": 5.9832, "num_input_tokens_seen": 108920832, "step": 831 }, { "epoch": 0.14260981083680666, "grad_norm": 0.9864884614944458, "learning_rate": 0.00017781649258708038, "loss": 5.9771, "num_input_tokens_seen": 109314048, "step": 834 }, { "epoch": 0.14312279576787432, "grad_norm": 1.1324708461761475, "learning_rate": 0.0001774975389803645, "loss": 6.0005, "num_input_tokens_seen": 109707264, "step": 837 }, { "epoch": 0.14363578069894198, "grad_norm": 1.1927917003631592, "learning_rate": 0.00017718029558293758, "loss": 6.0208, "num_input_tokens_seen": 110100480, "step": 840 }, { "epoch": 0.1441487656300096, "grad_norm": 1.1395940780639648, "learning_rate": 0.00017686474716583739, "loss": 5.9824, "num_input_tokens_seen": 110493696, "step": 843 }, { "epoch": 0.14466175056107727, "grad_norm": 1.562849998474121, "learning_rate": 0.00017655087868928166, "loss": 5.9618, "num_input_tokens_seen": 110886912, "step": 846 }, { "epoch": 0.1451747354921449, "grad_norm": 1.2808341979980469, "learning_rate": 0.00017623867529965745, "loss": 6.0031, "num_input_tokens_seen": 111280128, "step": 849 }, { "epoch": 0.14568772042321257, "grad_norm": 1.1808559894561768, "learning_rate": 0.00017592812232656866, "loss": 5.9783, "num_input_tokens_seen": 111673344, "step": 852 }, { "epoch": 0.14620070535428023, "grad_norm": 1.1873037815093994, "learning_rate": 0.00017561920527994052, "loss": 5.9943, "num_input_tokens_seen": 112066560, "step": 855 }, { "epoch": 0.14671369028534786, "grad_norm": 
1.1178746223449707, "learning_rate": 0.00017531190984717987, "loss": 5.938, "num_input_tokens_seen": 112459776, "step": 858 }, { "epoch": 0.14722667521641553, "grad_norm": 1.8096652030944824, "learning_rate": 0.00017500622189039, "loss": 6.0226, "num_input_tokens_seen": 112852992, "step": 861 }, { "epoch": 0.14773966014748316, "grad_norm": 1.264701247215271, "learning_rate": 0.00017470212744363856, "loss": 6.0132, "num_input_tokens_seen": 113246208, "step": 864 }, { "epoch": 0.14825264507855082, "grad_norm": 1.6313904523849487, "learning_rate": 0.00017439961271027758, "loss": 5.9518, "num_input_tokens_seen": 113639424, "step": 867 }, { "epoch": 0.14876563000961845, "grad_norm": 1.2754333019256592, "learning_rate": 0.00017409866406031439, "loss": 5.9424, "num_input_tokens_seen": 114032640, "step": 870 }, { "epoch": 0.14927861494068612, "grad_norm": 1.5135891437530518, "learning_rate": 0.00017379926802783236, "loss": 5.9789, "num_input_tokens_seen": 114425856, "step": 873 }, { "epoch": 0.14979159987175378, "grad_norm": 0.951026439666748, "learning_rate": 0.00017350141130845995, "loss": 5.9601, "num_input_tokens_seen": 114819072, "step": 876 }, { "epoch": 0.1503045848028214, "grad_norm": 1.3585782051086426, "learning_rate": 0.00017320508075688773, "loss": 5.9518, "num_input_tokens_seen": 115212288, "step": 879 }, { "epoch": 0.15081756973388907, "grad_norm": 1.0978291034698486, "learning_rate": 0.0001729102633844315, "loss": 5.9492, "num_input_tokens_seen": 115605504, "step": 882 }, { "epoch": 0.1513305546649567, "grad_norm": 1.389070987701416, "learning_rate": 0.0001726169463566411, "loss": 5.9284, "num_input_tokens_seen": 115998720, "step": 885 }, { "epoch": 0.15184353959602437, "grad_norm": 0.956652045249939, "learning_rate": 0.00017232511699095387, "loss": 6.0113, "num_input_tokens_seen": 116391936, "step": 888 }, { "epoch": 0.15235652452709203, "grad_norm": 1.1337711811065674, "learning_rate": 0.00017203476275439095, "loss": 5.904, "num_input_tokens_seen": 116785152, "step": 891 }, { "epoch": 0.15286950945815966, "grad_norm": 0.9648370742797852, "learning_rate": 0.00017174587126129703, "loss": 5.9447, "num_input_tokens_seen": 117178368, "step": 894 }, { "epoch": 0.15338249438922733, "grad_norm": 0.9501051902770996, "learning_rate": 0.00017145843027112077, "loss": 5.9404, "num_input_tokens_seen": 117571584, "step": 897 }, { "epoch": 0.15389547932029496, "grad_norm": 0.9637885093688965, "learning_rate": 0.00017117242768623688, "loss": 5.9458, "num_input_tokens_seen": 117964800, "step": 900 }, { "epoch": 0.15440846425136262, "grad_norm": 1.0075721740722656, "learning_rate": 0.00017088785154980728, "loss": 5.9596, "num_input_tokens_seen": 118358016, "step": 903 }, { "epoch": 0.15492144918243025, "grad_norm": 1.1554243564605713, "learning_rate": 0.00017060469004368157, "loss": 5.9451, "num_input_tokens_seen": 118751232, "step": 906 }, { "epoch": 0.15543443411349792, "grad_norm": 0.8994986414909363, "learning_rate": 0.0001703229314863357, "loss": 5.9276, "num_input_tokens_seen": 119144448, "step": 909 }, { "epoch": 0.15594741904456558, "grad_norm": 0.9552657604217529, "learning_rate": 0.0001700425643308478, "loss": 5.9344, "num_input_tokens_seen": 119537664, "step": 912 }, { "epoch": 0.1564604039756332, "grad_norm": 1.1688953638076782, "learning_rate": 0.00016976357716291072, "loss": 5.9453, "num_input_tokens_seen": 119930880, "step": 915 }, { "epoch": 0.15697338890670087, "grad_norm": 0.9850606918334961, "learning_rate": 0.0001694859586988799, "loss": 5.9218, "num_input_tokens_seen": 
120324096, "step": 918 }, { "epoch": 0.1574863738377685, "grad_norm": 0.9341318607330322, "learning_rate": 0.00016920969778385703, "loss": 5.8967, "num_input_tokens_seen": 120717312, "step": 921 }, { "epoch": 0.15799935876883617, "grad_norm": 1.1161161661148071, "learning_rate": 0.00016893478338980708, "loss": 5.9717, "num_input_tokens_seen": 121110528, "step": 924 }, { "epoch": 0.1585123436999038, "grad_norm": 1.0169016122817993, "learning_rate": 0.00016866120461370946, "loss": 5.9749, "num_input_tokens_seen": 121503744, "step": 927 }, { "epoch": 0.15902532863097146, "grad_norm": 0.8988534212112427, "learning_rate": 0.00016838895067574185, "loss": 5.9448, "num_input_tokens_seen": 121896960, "step": 930 }, { "epoch": 0.15953831356203912, "grad_norm": 0.7732037305831909, "learning_rate": 0.00016811801091749597, "loss": 5.928, "num_input_tokens_seen": 122290176, "step": 933 }, { "epoch": 0.16005129849310676, "grad_norm": 0.7758464217185974, "learning_rate": 0.00016784837480022532, "loss": 5.9203, "num_input_tokens_seen": 122683392, "step": 936 }, { "epoch": 0.16056428342417442, "grad_norm": 0.688848614692688, "learning_rate": 0.0001675800319031231, "loss": 5.9253, "num_input_tokens_seen": 123076608, "step": 939 }, { "epoch": 0.16107726835524205, "grad_norm": 0.8796403408050537, "learning_rate": 0.00016731297192163077, "loss": 5.9361, "num_input_tokens_seen": 123469824, "step": 942 }, { "epoch": 0.16159025328630972, "grad_norm": 1.0328707695007324, "learning_rate": 0.00016704718466577608, "loss": 5.94, "num_input_tokens_seen": 123863040, "step": 945 }, { "epoch": 0.16210323821737738, "grad_norm": 1.015504240989685, "learning_rate": 0.00016678266005854003, "loss": 5.9536, "num_input_tokens_seen": 124256256, "step": 948 }, { "epoch": 0.162616223148445, "grad_norm": 0.8219988346099854, "learning_rate": 0.00016651938813425204, "loss": 5.9519, "num_input_tokens_seen": 124649472, "step": 951 }, { "epoch": 0.16312920807951267, "grad_norm": 1.0020133256912231, "learning_rate": 0.00016625735903701302, "loss": 5.9622, "num_input_tokens_seen": 125042688, "step": 954 }, { "epoch": 0.1636421930105803, "grad_norm": 0.8336507678031921, "learning_rate": 0.00016599656301914591, "loss": 5.9003, "num_input_tokens_seen": 125435904, "step": 957 }, { "epoch": 0.16415517794164797, "grad_norm": 0.9200095534324646, "learning_rate": 0.0001657369904396722, "loss": 5.8612, "num_input_tokens_seen": 125829120, "step": 960 }, { "epoch": 0.1646681628727156, "grad_norm": 0.858650267124176, "learning_rate": 0.0001654786317628154, "loss": 5.9106, "num_input_tokens_seen": 126222336, "step": 963 }, { "epoch": 0.16518114780378326, "grad_norm": 0.8560724258422852, "learning_rate": 0.00016522147755652932, "loss": 5.9117, "num_input_tokens_seen": 126615552, "step": 966 }, { "epoch": 0.16569413273485092, "grad_norm": 0.8329317569732666, "learning_rate": 0.00016496551849105217, "loss": 5.9295, "num_input_tokens_seen": 127008768, "step": 969 }, { "epoch": 0.16620711766591856, "grad_norm": 0.9380521774291992, "learning_rate": 0.00016471074533748437, "loss": 5.9553, "num_input_tokens_seen": 127401984, "step": 972 }, { "epoch": 0.16672010259698622, "grad_norm": 1.0690613985061646, "learning_rate": 0.00016445714896639137, "loss": 5.9229, "num_input_tokens_seen": 127795200, "step": 975 }, { "epoch": 0.16723308752805385, "grad_norm": 1.3576298952102661, "learning_rate": 0.00016420472034642939, "loss": 5.9454, "num_input_tokens_seen": 128188416, "step": 978 }, { "epoch": 0.16774607245912151, "grad_norm": 1.040351390838623, "learning_rate": 
0.00016395345054299445, "loss": 5.9541, "num_input_tokens_seen": 128581632, "step": 981 }, { "epoch": 0.16825905739018918, "grad_norm": 0.8309887647628784, "learning_rate": 0.0001637033307168943, "loss": 5.8765, "num_input_tokens_seen": 128974848, "step": 984 }, { "epoch": 0.1687720423212568, "grad_norm": 0.7443415522575378, "learning_rate": 0.00016345435212304236, "loss": 5.9289, "num_input_tokens_seen": 129368064, "step": 987 }, { "epoch": 0.16928502725232447, "grad_norm": 0.8244521617889404, "learning_rate": 0.00016320650610917334, "loss": 5.9387, "num_input_tokens_seen": 129761280, "step": 990 }, { "epoch": 0.1697980121833921, "grad_norm": 0.8613632321357727, "learning_rate": 0.0001629597841145805, "loss": 5.9035, "num_input_tokens_seen": 130154496, "step": 993 }, { "epoch": 0.17031099711445977, "grad_norm": 0.9145336151123047, "learning_rate": 0.00016271417766887378, "loss": 5.9495, "num_input_tokens_seen": 130547712, "step": 996 }, { "epoch": 0.1708239820455274, "grad_norm": 1.0651459693908691, "learning_rate": 0.00016246967839075817, "loss": 5.9386, "num_input_tokens_seen": 130940928, "step": 999 }, { "epoch": 0.17133696697659506, "grad_norm": 1.1779698133468628, "learning_rate": 0.00016222627798683257, "loss": 5.8951, "num_input_tokens_seen": 131334144, "step": 1002 }, { "epoch": 0.17184995190766272, "grad_norm": 0.9768991470336914, "learning_rate": 0.00016198396825040817, "loss": 5.9006, "num_input_tokens_seen": 131727360, "step": 1005 }, { "epoch": 0.17236293683873036, "grad_norm": 0.8397189974784851, "learning_rate": 0.00016174274106034645, "loss": 5.9255, "num_input_tokens_seen": 132120576, "step": 1008 }, { "epoch": 0.17287592176979802, "grad_norm": 0.7937173843383789, "learning_rate": 0.00016150258837991562, "loss": 5.8806, "num_input_tokens_seen": 132513792, "step": 1011 }, { "epoch": 0.17338890670086565, "grad_norm": 0.8085054159164429, "learning_rate": 0.00016126350225566634, "loss": 5.893, "num_input_tokens_seen": 132907008, "step": 1014 }, { "epoch": 0.17390189163193331, "grad_norm": 0.836296021938324, "learning_rate": 0.0001610254748163253, "loss": 5.8943, "num_input_tokens_seen": 133300224, "step": 1017 }, { "epoch": 0.17441487656300098, "grad_norm": 0.8128154873847961, "learning_rate": 0.0001607884982717066, "loss": 5.9156, "num_input_tokens_seen": 133693440, "step": 1020 }, { "epoch": 0.1749278614940686, "grad_norm": 1.1624642610549927, "learning_rate": 0.00016055256491164112, "loss": 5.8715, "num_input_tokens_seen": 134086656, "step": 1023 }, { "epoch": 0.17544084642513627, "grad_norm": 0.7400571703910828, "learning_rate": 0.000160317667104923, "loss": 5.9113, "num_input_tokens_seen": 134479872, "step": 1026 }, { "epoch": 0.1759538313562039, "grad_norm": 0.864396333694458, "learning_rate": 0.0001600837972982725, "loss": 5.8986, "num_input_tokens_seen": 134873088, "step": 1029 }, { "epoch": 0.17646681628727157, "grad_norm": 0.8100318312644958, "learning_rate": 0.00015985094801531627, "loss": 5.9568, "num_input_tokens_seen": 135266304, "step": 1032 }, { "epoch": 0.1769798012183392, "grad_norm": 0.739395797252655, "learning_rate": 0.0001596191118555833, "loss": 5.9094, "num_input_tokens_seen": 135659520, "step": 1035 }, { "epoch": 0.17749278614940686, "grad_norm": 0.8485215306282043, "learning_rate": 0.0001593882814935171, "loss": 5.9509, "num_input_tokens_seen": 136052736, "step": 1038 }, { "epoch": 0.17800577108047452, "grad_norm": 0.8100500106811523, "learning_rate": 0.00015915844967750344, "loss": 5.8577, "num_input_tokens_seen": 136445952, "step": 1041 }, { 
"epoch": 0.17851875601154216, "grad_norm": 0.9649944305419922, "learning_rate": 0.00015892960922891358, "loss": 5.901, "num_input_tokens_seen": 136839168, "step": 1044 }, { "epoch": 0.17903174094260982, "grad_norm": 1.0253026485443115, "learning_rate": 0.00015870175304116244, "loss": 5.8591, "num_input_tokens_seen": 137232384, "step": 1047 }, { "epoch": 0.17954472587367745, "grad_norm": 1.2840728759765625, "learning_rate": 0.00015847487407878166, "loss": 5.9175, "num_input_tokens_seen": 137625600, "step": 1050 }, { "epoch": 0.1800577108047451, "grad_norm": 0.9364380240440369, "learning_rate": 0.0001582489653765074, "loss": 5.8954, "num_input_tokens_seen": 138018816, "step": 1053 }, { "epoch": 0.18057069573581275, "grad_norm": 0.835299551486969, "learning_rate": 0.0001580240200383818, "loss": 5.866, "num_input_tokens_seen": 138412032, "step": 1056 }, { "epoch": 0.1810836806668804, "grad_norm": 1.4820502996444702, "learning_rate": 0.0001578000312368693, "loss": 5.8628, "num_input_tokens_seen": 138805248, "step": 1059 }, { "epoch": 0.18159666559794807, "grad_norm": 1.0213690996170044, "learning_rate": 0.0001575769922119859, "loss": 5.8842, "num_input_tokens_seen": 139198464, "step": 1062 }, { "epoch": 0.1821096505290157, "grad_norm": 1.2635438442230225, "learning_rate": 0.0001573548962704424, "loss": 5.8898, "num_input_tokens_seen": 139591680, "step": 1065 }, { "epoch": 0.18262263546008337, "grad_norm": 0.9599955677986145, "learning_rate": 0.00015713373678480076, "loss": 5.9028, "num_input_tokens_seen": 139984896, "step": 1068 }, { "epoch": 0.183135620391151, "grad_norm": 1.3038582801818848, "learning_rate": 0.00015691350719264352, "loss": 5.898, "num_input_tokens_seen": 140378112, "step": 1071 }, { "epoch": 0.18364860532221866, "grad_norm": 1.0840892791748047, "learning_rate": 0.00015669420099575582, "loss": 5.8597, "num_input_tokens_seen": 140771328, "step": 1074 }, { "epoch": 0.18416159025328632, "grad_norm": 1.0119037628173828, "learning_rate": 0.00015647581175932002, "loss": 5.8969, "num_input_tokens_seen": 141164544, "step": 1077 }, { "epoch": 0.18467457518435396, "grad_norm": 0.8829667568206787, "learning_rate": 0.00015625833311112293, "loss": 5.8546, "num_input_tokens_seen": 141557760, "step": 1080 }, { "epoch": 0.18518756011542162, "grad_norm": 0.8768784403800964, "learning_rate": 0.00015604175874077463, "loss": 5.8718, "num_input_tokens_seen": 141950976, "step": 1083 }, { "epoch": 0.18570054504648925, "grad_norm": 0.9819164276123047, "learning_rate": 0.00015582608239893955, "loss": 5.8585, "num_input_tokens_seen": 142344192, "step": 1086 }, { "epoch": 0.1862135299775569, "grad_norm": 0.8090091943740845, "learning_rate": 0.00015561129789657898, "loss": 5.8592, "num_input_tokens_seen": 142737408, "step": 1089 }, { "epoch": 0.18672651490862455, "grad_norm": 0.878285825252533, "learning_rate": 0.0001553973991042052, "loss": 5.9156, "num_input_tokens_seen": 143130624, "step": 1092 }, { "epoch": 0.1872394998396922, "grad_norm": 0.8004051446914673, "learning_rate": 0.00015518437995114688, "loss": 5.8415, "num_input_tokens_seen": 143523840, "step": 1095 }, { "epoch": 0.18775248477075987, "grad_norm": 0.8821293711662292, "learning_rate": 0.0001549722344248251, "loss": 5.8736, "num_input_tokens_seen": 143917056, "step": 1098 }, { "epoch": 0.1882654697018275, "grad_norm": 0.8214676380157471, "learning_rate": 0.00015476095657004097, "loss": 5.8634, "num_input_tokens_seen": 144310272, "step": 1101 }, { "epoch": 0.18877845463289517, "grad_norm": 0.9714133739471436, "learning_rate": 
0.00015455054048827327, "loss": 5.8468, "num_input_tokens_seen": 144703488, "step": 1104 }, { "epoch": 0.1892914395639628, "grad_norm": 0.9200727939605713, "learning_rate": 0.00015434098033698665, "loss": 5.8887, "num_input_tokens_seen": 145096704, "step": 1107 }, { "epoch": 0.18980442449503046, "grad_norm": 0.8920320868492126, "learning_rate": 0.00015413227032895076, "loss": 5.8708, "num_input_tokens_seen": 145489920, "step": 1110 }, { "epoch": 0.19031740942609812, "grad_norm": 0.7470078468322754, "learning_rate": 0.00015392440473156833, "loss": 5.8795, "num_input_tokens_seen": 145883136, "step": 1113 }, { "epoch": 0.19083039435716576, "grad_norm": 0.8388041853904724, "learning_rate": 0.0001537173778662143, "loss": 5.823, "num_input_tokens_seen": 146276352, "step": 1116 }, { "epoch": 0.19134337928823342, "grad_norm": 0.767766535282135, "learning_rate": 0.00015351118410758416, "loss": 5.8513, "num_input_tokens_seen": 146669568, "step": 1119 }, { "epoch": 0.19185636421930105, "grad_norm": 0.8732740879058838, "learning_rate": 0.00015330581788305177, "loss": 5.8557, "num_input_tokens_seen": 147062784, "step": 1122 }, { "epoch": 0.1923693491503687, "grad_norm": 0.8404717445373535, "learning_rate": 0.0001531012736720371, "loss": 5.8324, "num_input_tokens_seen": 147456000, "step": 1125 }, { "epoch": 0.19288233408143635, "grad_norm": 0.9904493093490601, "learning_rate": 0.0001528975460053826, "loss": 5.8828, "num_input_tokens_seen": 147849216, "step": 1128 }, { "epoch": 0.193395319012504, "grad_norm": 1.0272847414016724, "learning_rate": 0.00015269462946473922, "loss": 5.8644, "num_input_tokens_seen": 148242432, "step": 1131 }, { "epoch": 0.19390830394357167, "grad_norm": 0.9573217630386353, "learning_rate": 0.00015249251868196107, "loss": 5.8459, "num_input_tokens_seen": 148635648, "step": 1134 }, { "epoch": 0.1944212888746393, "grad_norm": 0.9274708032608032, "learning_rate": 0.00015229120833850902, "loss": 5.8528, "num_input_tokens_seen": 149028864, "step": 1137 }, { "epoch": 0.19493427380570696, "grad_norm": 0.8022924661636353, "learning_rate": 0.0001520906931648627, "loss": 5.8369, "num_input_tokens_seen": 149422080, "step": 1140 }, { "epoch": 0.1954472587367746, "grad_norm": 1.0030078887939453, "learning_rate": 0.00015189096793994132, "loss": 5.8514, "num_input_tokens_seen": 149815296, "step": 1143 }, { "epoch": 0.19596024366784226, "grad_norm": 1.0330564975738525, "learning_rate": 0.00015169202749053254, "loss": 5.877, "num_input_tokens_seen": 150208512, "step": 1146 }, { "epoch": 0.1964732285989099, "grad_norm": 0.8017846345901489, "learning_rate": 0.00015149386669072978, "loss": 5.9143, "num_input_tokens_seen": 150601728, "step": 1149 }, { "epoch": 0.19698621352997756, "grad_norm": 0.9474872946739197, "learning_rate": 0.00015129648046137753, "loss": 5.8327, "num_input_tokens_seen": 150994944, "step": 1152 }, { "epoch": 0.19749919846104522, "grad_norm": 0.8487737774848938, "learning_rate": 0.0001510998637695244, "loss": 5.9011, "num_input_tokens_seen": 151388160, "step": 1155 }, { "epoch": 0.19801218339211285, "grad_norm": 0.8053280711174011, "learning_rate": 0.00015090401162788414, "loss": 5.8002, "num_input_tokens_seen": 151781376, "step": 1158 }, { "epoch": 0.1985251683231805, "grad_norm": 0.7677531838417053, "learning_rate": 0.00015070891909430456, "loss": 5.8301, "num_input_tokens_seen": 152174592, "step": 1161 }, { "epoch": 0.19903815325424815, "grad_norm": 0.8054178357124329, "learning_rate": 0.0001505145812712434, "loss": 5.8065, "num_input_tokens_seen": 152567808, "step": 1164 }, 
{ "epoch": 0.1995511381853158, "grad_norm": 1.0484933853149414, "learning_rate": 0.00015032099330525203, "loss": 5.8322, "num_input_tokens_seen": 152961024, "step": 1167 }, { "epoch": 0.20006412311638347, "grad_norm": 0.8616706132888794, "learning_rate": 0.0001501281503864666, "loss": 5.8494, "num_input_tokens_seen": 153354240, "step": 1170 }, { "epoch": 0.2005771080474511, "grad_norm": 1.0345689058303833, "learning_rate": 0.00014993604774810574, "loss": 5.8326, "num_input_tokens_seen": 153747456, "step": 1173 }, { "epoch": 0.20109009297851876, "grad_norm": 0.9733481407165527, "learning_rate": 0.00014974468066597592, "loss": 5.8334, "num_input_tokens_seen": 154140672, "step": 1176 }, { "epoch": 0.2016030779095864, "grad_norm": 0.9011761546134949, "learning_rate": 0.0001495540444579833, "loss": 5.7905, "num_input_tokens_seen": 154533888, "step": 1179 }, { "epoch": 0.20211606284065406, "grad_norm": 0.8033789992332458, "learning_rate": 0.00014936413448365292, "loss": 5.8216, "num_input_tokens_seen": 154927104, "step": 1182 }, { "epoch": 0.2026290477717217, "grad_norm": 0.8766458630561829, "learning_rate": 0.00014917494614365384, "loss": 5.8183, "num_input_tokens_seen": 155320320, "step": 1185 }, { "epoch": 0.20314203270278935, "grad_norm": 0.8766114711761475, "learning_rate": 0.00014898647487933156, "loss": 5.8486, "num_input_tokens_seen": 155713536, "step": 1188 }, { "epoch": 0.20365501763385702, "grad_norm": 0.988873302936554, "learning_rate": 0.00014879871617224662, "loss": 5.8093, "num_input_tokens_seen": 156106752, "step": 1191 }, { "epoch": 0.20416800256492465, "grad_norm": 0.9486966729164124, "learning_rate": 0.00014861166554371963, "loss": 5.7705, "num_input_tokens_seen": 156499968, "step": 1194 }, { "epoch": 0.2046809874959923, "grad_norm": 0.8582146763801575, "learning_rate": 0.00014842531855438251, "loss": 5.7997, "num_input_tokens_seen": 156893184, "step": 1197 }, { "epoch": 0.20519397242705995, "grad_norm": 0.7633207440376282, "learning_rate": 0.00014823967080373592, "loss": 5.8133, "num_input_tokens_seen": 157286400, "step": 1200 }, { "epoch": 0.20519397242705995, "eval_accuracy": 0.15496498941540465, "eval_loss": 6.256631851196289, "eval_runtime": 110.7502, "eval_samples_per_second": 2.709, "eval_steps_per_second": 1.354, "num_input_tokens_seen": 157286400, "step": 1200 }, { "epoch": 0.2057069573581276, "grad_norm": 0.7945148944854736, "learning_rate": 0.000148054717929713, "loss": 5.8051, "num_input_tokens_seen": 157679616, "step": 1203 }, { "epoch": 0.20621994228919527, "grad_norm": 0.6455596089363098, "learning_rate": 0.00014787045560824864, "loss": 5.7968, "num_input_tokens_seen": 158072832, "step": 1206 }, { "epoch": 0.2067329272202629, "grad_norm": 0.7505893111228943, "learning_rate": 0.00014768687955285517, "loss": 5.7999, "num_input_tokens_seen": 158466048, "step": 1209 }, { "epoch": 0.20724591215133056, "grad_norm": 0.7592454552650452, "learning_rate": 0.00014750398551420315, "loss": 5.8526, "num_input_tokens_seen": 158859264, "step": 1212 }, { "epoch": 0.2077588970823982, "grad_norm": 0.8120488524436951, "learning_rate": 0.00014732176927970863, "loss": 5.8122, "num_input_tokens_seen": 159252480, "step": 1215 }, { "epoch": 0.20827188201346586, "grad_norm": 0.9561596512794495, "learning_rate": 0.0001471402266731254, "loss": 5.7933, "num_input_tokens_seen": 159645696, "step": 1218 }, { "epoch": 0.2087848669445335, "grad_norm": 1.0503617525100708, "learning_rate": 0.00014695935355414297, "loss": 5.8015, "num_input_tokens_seen": 160038912, "step": 1221 }, { "epoch": 
0.20929785187560115, "grad_norm": 0.7630879282951355, "learning_rate": 0.00014677914581799015, "loss": 5.8202, "num_input_tokens_seen": 160432128, "step": 1224 }, { "epoch": 0.20981083680666882, "grad_norm": 0.8692222237586975, "learning_rate": 0.00014659959939504366, "loss": 5.8312, "num_input_tokens_seen": 160825344, "step": 1227 }, { "epoch": 0.21032382173773645, "grad_norm": 0.9576478004455566, "learning_rate": 0.00014642071025044203, "loss": 5.8261, "num_input_tokens_seen": 161218560, "step": 1230 }, { "epoch": 0.2108368066688041, "grad_norm": 1.0278856754302979, "learning_rate": 0.000146242474383705, "loss": 5.8203, "num_input_tokens_seen": 161611776, "step": 1233 }, { "epoch": 0.21134979159987174, "grad_norm": 1.0111125707626343, "learning_rate": 0.00014606488782835757, "loss": 5.78, "num_input_tokens_seen": 162004992, "step": 1236 }, { "epoch": 0.2118627765309394, "grad_norm": 1.093246340751648, "learning_rate": 0.00014588794665155937, "loss": 5.8341, "num_input_tokens_seen": 162398208, "step": 1239 }, { "epoch": 0.21237576146200704, "grad_norm": 0.9989133477210999, "learning_rate": 0.0001457116469537388, "loss": 5.8199, "num_input_tokens_seen": 162791424, "step": 1242 }, { "epoch": 0.2128887463930747, "grad_norm": 0.9265642762184143, "learning_rate": 0.00014553598486823202, "loss": 5.833, "num_input_tokens_seen": 163184640, "step": 1245 }, { "epoch": 0.21340173132414236, "grad_norm": 0.9529325366020203, "learning_rate": 0.0001453609565609269, "loss": 5.7129, "num_input_tokens_seen": 163577856, "step": 1248 }, { "epoch": 0.21391471625521, "grad_norm": 0.8420143723487854, "learning_rate": 0.00014518655822991146, "loss": 5.7953, "num_input_tokens_seen": 163971072, "step": 1251 }, { "epoch": 0.21442770118627766, "grad_norm": 0.977813720703125, "learning_rate": 0.0001450127861051269, "loss": 5.8014, "num_input_tokens_seen": 164364288, "step": 1254 }, { "epoch": 0.2149406861173453, "grad_norm": 0.8765429854393005, "learning_rate": 0.00014483963644802545, "loss": 5.8276, "num_input_tokens_seen": 164757504, "step": 1257 }, { "epoch": 0.21545367104841295, "grad_norm": 0.8605163097381592, "learning_rate": 0.00014466710555123243, "loss": 5.726, "num_input_tokens_seen": 165150720, "step": 1260 }, { "epoch": 0.21596665597948062, "grad_norm": 1.005022644996643, "learning_rate": 0.000144495189738213, "loss": 5.8138, "num_input_tokens_seen": 165543936, "step": 1263 }, { "epoch": 0.21647964091054825, "grad_norm": 0.8421231508255005, "learning_rate": 0.00014432388536294303, "loss": 5.8112, "num_input_tokens_seen": 165937152, "step": 1266 }, { "epoch": 0.2169926258416159, "grad_norm": 0.8746516108512878, "learning_rate": 0.00014415318880958418, "loss": 5.8362, "num_input_tokens_seen": 166330368, "step": 1269 }, { "epoch": 0.21750561077268354, "grad_norm": 0.9044854044914246, "learning_rate": 0.0001439830964921636, "loss": 5.8002, "num_input_tokens_seen": 166723584, "step": 1272 }, { "epoch": 0.2180185957037512, "grad_norm": 1.2117860317230225, "learning_rate": 0.00014381360485425735, "loss": 5.8231, "num_input_tokens_seen": 167116800, "step": 1275 }, { "epoch": 0.21853158063481884, "grad_norm": 1.057684302330017, "learning_rate": 0.00014364471036867806, "loss": 5.8179, "num_input_tokens_seen": 167510016, "step": 1278 }, { "epoch": 0.2190445655658865, "grad_norm": 0.8717806339263916, "learning_rate": 0.00014347640953716679, "loss": 5.7661, "num_input_tokens_seen": 167903232, "step": 1281 }, { "epoch": 0.21955755049695416, "grad_norm": 0.7429178953170776, "learning_rate": 0.00014330869889008863, 
"loss": 5.7949, "num_input_tokens_seen": 168296448, "step": 1284 }, { "epoch": 0.2200705354280218, "grad_norm": 0.7271013855934143, "learning_rate": 0.00014314157498613212, "loss": 5.8544, "num_input_tokens_seen": 168689664, "step": 1287 }, { "epoch": 0.22058352035908946, "grad_norm": 0.8151692748069763, "learning_rate": 0.0001429750344120129, "loss": 5.808, "num_input_tokens_seen": 169082880, "step": 1290 }, { "epoch": 0.2210965052901571, "grad_norm": 0.7940250039100647, "learning_rate": 0.00014280907378218079, "loss": 5.8021, "num_input_tokens_seen": 169476096, "step": 1293 }, { "epoch": 0.22160949022122475, "grad_norm": 0.7792456150054932, "learning_rate": 0.00014264368973853074, "loss": 5.8054, "num_input_tokens_seen": 169869312, "step": 1296 }, { "epoch": 0.22212247515229241, "grad_norm": 0.7758190035820007, "learning_rate": 0.00014247887895011744, "loss": 5.7821, "num_input_tokens_seen": 170262528, "step": 1299 }, { "epoch": 0.22263546008336005, "grad_norm": 1.006454348564148, "learning_rate": 0.00014231463811287352, "loss": 5.8103, "num_input_tokens_seen": 170655744, "step": 1302 }, { "epoch": 0.2231484450144277, "grad_norm": 0.9445181488990784, "learning_rate": 0.00014215096394933147, "loss": 5.7801, "num_input_tokens_seen": 171048960, "step": 1305 }, { "epoch": 0.22366142994549534, "grad_norm": 1.0699506998062134, "learning_rate": 0.00014198785320834877, "loss": 5.7746, "num_input_tokens_seen": 171442176, "step": 1308 }, { "epoch": 0.224174414876563, "grad_norm": 0.8202515244483948, "learning_rate": 0.0001418253026648367, "loss": 5.7614, "num_input_tokens_seen": 171835392, "step": 1311 }, { "epoch": 0.22468739980763064, "grad_norm": 0.8101188540458679, "learning_rate": 0.00014166330911949266, "loss": 5.772, "num_input_tokens_seen": 172228608, "step": 1314 }, { "epoch": 0.2252003847386983, "grad_norm": 0.8872966766357422, "learning_rate": 0.00014150186939853544, "loss": 5.7827, "num_input_tokens_seen": 172621824, "step": 1317 }, { "epoch": 0.22571336966976596, "grad_norm": 0.9942976236343384, "learning_rate": 0.00014134098035344428, "loss": 5.7677, "num_input_tokens_seen": 173015040, "step": 1320 }, { "epoch": 0.2262263546008336, "grad_norm": 0.8756197094917297, "learning_rate": 0.00014118063886070086, "loss": 5.7569, "num_input_tokens_seen": 173408256, "step": 1323 }, { "epoch": 0.22673933953190126, "grad_norm": 0.9162293672561646, "learning_rate": 0.00014102084182153463, "loss": 5.7365, "num_input_tokens_seen": 173801472, "step": 1326 }, { "epoch": 0.2272523244629689, "grad_norm": 0.7763002514839172, "learning_rate": 0.00014086158616167125, "loss": 5.7591, "num_input_tokens_seen": 174194688, "step": 1329 }, { "epoch": 0.22776530939403655, "grad_norm": 0.7513299584388733, "learning_rate": 0.0001407028688310842, "loss": 5.7568, "num_input_tokens_seen": 174587904, "step": 1332 }, { "epoch": 0.2282782943251042, "grad_norm": 0.8344148397445679, "learning_rate": 0.0001405446868037495, "loss": 5.7722, "num_input_tokens_seen": 174981120, "step": 1335 }, { "epoch": 0.22879127925617185, "grad_norm": 0.9340312480926514, "learning_rate": 0.00014038703707740325, "loss": 5.774, "num_input_tokens_seen": 175374336, "step": 1338 }, { "epoch": 0.2293042641872395, "grad_norm": 0.8781185746192932, "learning_rate": 0.0001402299166733024, "loss": 5.7994, "num_input_tokens_seen": 175767552, "step": 1341 }, { "epoch": 0.22981724911830714, "grad_norm": 0.7272279262542725, "learning_rate": 0.00014007332263598843, "loss": 5.8141, "num_input_tokens_seen": 176160768, "step": 1344 }, { "epoch": 
0.2303302340493748, "grad_norm": 0.8324047327041626, "learning_rate": 0.0001399172520330537, "loss": 5.7908, "num_input_tokens_seen": 176553984, "step": 1347 }, { "epoch": 0.23084321898044244, "grad_norm": 1.018019199371338, "learning_rate": 0.00013976170195491086, "loss": 5.7865, "num_input_tokens_seen": 176947200, "step": 1350 }, { "epoch": 0.2313562039115101, "grad_norm": 0.9174796342849731, "learning_rate": 0.00013960666951456512, "loss": 5.7007, "num_input_tokens_seen": 177340416, "step": 1353 }, { "epoch": 0.23186918884257776, "grad_norm": 0.9878000617027283, "learning_rate": 0.00013945215184738905, "loss": 5.7603, "num_input_tokens_seen": 177733632, "step": 1356 }, { "epoch": 0.2323821737736454, "grad_norm": 0.9942120909690857, "learning_rate": 0.00013929814611090044, "loss": 5.7836, "num_input_tokens_seen": 178126848, "step": 1359 }, { "epoch": 0.23289515870471306, "grad_norm": 1.0599764585494995, "learning_rate": 0.00013914464948454254, "loss": 5.7588, "num_input_tokens_seen": 178520064, "step": 1362 }, { "epoch": 0.2334081436357807, "grad_norm": 0.9592947959899902, "learning_rate": 0.00013899165916946712, "loss": 5.7805, "num_input_tokens_seen": 178913280, "step": 1365 }, { "epoch": 0.23392112856684835, "grad_norm": 0.8882340788841248, "learning_rate": 0.00013883917238832015, "loss": 5.722, "num_input_tokens_seen": 179306496, "step": 1368 }, { "epoch": 0.23443411349791599, "grad_norm": 1.0210182666778564, "learning_rate": 0.00013868718638503002, "loss": 5.7775, "num_input_tokens_seen": 179699712, "step": 1371 }, { "epoch": 0.23494709842898365, "grad_norm": 1.022925853729248, "learning_rate": 0.00013853569842459833, "loss": 5.7809, "num_input_tokens_seen": 180092928, "step": 1374 }, { "epoch": 0.2354600833600513, "grad_norm": 0.926238477230072, "learning_rate": 0.00013838470579289325, "loss": 5.7873, "num_input_tokens_seen": 180486144, "step": 1377 }, { "epoch": 0.23597306829111894, "grad_norm": 0.9598406553268433, "learning_rate": 0.000138234205796445, "loss": 5.7696, "num_input_tokens_seen": 180879360, "step": 1380 }, { "epoch": 0.2364860532221866, "grad_norm": 0.8754788637161255, "learning_rate": 0.00013808419576224448, "loss": 5.7407, "num_input_tokens_seen": 181272576, "step": 1383 }, { "epoch": 0.23699903815325424, "grad_norm": 0.8794489502906799, "learning_rate": 0.0001379346730375435, "loss": 5.7445, "num_input_tokens_seen": 181665792, "step": 1386 }, { "epoch": 0.2375120230843219, "grad_norm": 0.7969871759414673, "learning_rate": 0.0001377856349896579, "loss": 5.7955, "num_input_tokens_seen": 182059008, "step": 1389 }, { "epoch": 0.23802500801538956, "grad_norm": 0.9116011261940002, "learning_rate": 0.00013763707900577292, "loss": 5.7498, "num_input_tokens_seen": 182452224, "step": 1392 }, { "epoch": 0.2385379929464572, "grad_norm": 1.0278836488723755, "learning_rate": 0.0001374890024927507, "loss": 5.7964, "num_input_tokens_seen": 182845440, "step": 1395 }, { "epoch": 0.23905097787752486, "grad_norm": 0.8830828070640564, "learning_rate": 0.00013734140287694022, "loss": 5.7192, "num_input_tokens_seen": 183238656, "step": 1398 }, { "epoch": 0.2395639628085925, "grad_norm": 0.9770954847335815, "learning_rate": 0.0001371942776039894, "loss": 5.7229, "num_input_tokens_seen": 183631872, "step": 1401 }, { "epoch": 0.24007694773966015, "grad_norm": 0.7094408273696899, "learning_rate": 0.0001370476241386595, "loss": 5.7209, "num_input_tokens_seen": 184025088, "step": 1404 }, { "epoch": 0.24058993267072779, "grad_norm": 0.9037938117980957, "learning_rate": 0.00013690143996464142, 
"loss": 5.8047, "num_input_tokens_seen": 184418304, "step": 1407 }, { "epoch": 0.24110291760179545, "grad_norm": 0.7936687469482422, "learning_rate": 0.00013675572258437476, "loss": 5.7976, "num_input_tokens_seen": 184811520, "step": 1410 }, { "epoch": 0.2416159025328631, "grad_norm": 0.7403805255889893, "learning_rate": 0.00013661046951886816, "loss": 5.7839, "num_input_tokens_seen": 185204736, "step": 1413 }, { "epoch": 0.24212888746393074, "grad_norm": 0.7771823406219482, "learning_rate": 0.00013646567830752246, "loss": 5.7907, "num_input_tokens_seen": 185597952, "step": 1416 }, { "epoch": 0.2426418723949984, "grad_norm": 0.6545729637145996, "learning_rate": 0.0001363213465079555, "loss": 5.6881, "num_input_tokens_seen": 185991168, "step": 1419 }, { "epoch": 0.24315485732606604, "grad_norm": 0.8105820417404175, "learning_rate": 0.00013617747169582915, "loss": 5.7786, "num_input_tokens_seen": 186384384, "step": 1422 }, { "epoch": 0.2436678422571337, "grad_norm": 0.9515424370765686, "learning_rate": 0.00013603405146467827, "loss": 5.7501, "num_input_tokens_seen": 186777600, "step": 1425 }, { "epoch": 0.24418082718820136, "grad_norm": 0.8837321400642395, "learning_rate": 0.00013589108342574154, "loss": 5.7674, "num_input_tokens_seen": 187170816, "step": 1428 }, { "epoch": 0.244693812119269, "grad_norm": 0.7665640711784363, "learning_rate": 0.0001357485652077945, "loss": 5.7271, "num_input_tokens_seen": 187564032, "step": 1431 }, { "epoch": 0.24520679705033666, "grad_norm": 0.7923889756202698, "learning_rate": 0.00013560649445698437, "loss": 5.7707, "num_input_tokens_seen": 187957248, "step": 1434 }, { "epoch": 0.2457197819814043, "grad_norm": 0.806024968624115, "learning_rate": 0.00013546486883666656, "loss": 5.7383, "num_input_tokens_seen": 188350464, "step": 1437 }, { "epoch": 0.24623276691247195, "grad_norm": 0.8087600469589233, "learning_rate": 0.00013532368602724355, "loss": 5.746, "num_input_tokens_seen": 188743680, "step": 1440 }, { "epoch": 0.24674575184353958, "grad_norm": 0.7250015735626221, "learning_rate": 0.00013518294372600513, "loss": 5.7275, "num_input_tokens_seen": 189136896, "step": 1443 }, { "epoch": 0.24725873677460725, "grad_norm": 0.8613927364349365, "learning_rate": 0.00013504263964697066, "loss": 5.7787, "num_input_tokens_seen": 189530112, "step": 1446 }, { "epoch": 0.2477717217056749, "grad_norm": 0.839963972568512, "learning_rate": 0.00013490277152073324, "loss": 5.7404, "num_input_tokens_seen": 189923328, "step": 1449 }, { "epoch": 0.24828470663674254, "grad_norm": 1.002913236618042, "learning_rate": 0.0001347633370943054, "loss": 5.7409, "num_input_tokens_seen": 190316544, "step": 1452 }, { "epoch": 0.2487976915678102, "grad_norm": 0.9510200619697571, "learning_rate": 0.00013462433413096678, "loss": 5.7916, "num_input_tokens_seen": 190709760, "step": 1455 }, { "epoch": 0.24931067649887784, "grad_norm": 0.8563526272773743, "learning_rate": 0.00013448576041011335, "loss": 5.692, "num_input_tokens_seen": 191102976, "step": 1458 }, { "epoch": 0.2498236614299455, "grad_norm": 1.1031644344329834, "learning_rate": 0.0001343476137271086, "loss": 5.7583, "num_input_tokens_seen": 191496192, "step": 1461 }, { "epoch": 0.25033664636101316, "grad_norm": 1.085344672203064, "learning_rate": 0.00013420989189313586, "loss": 5.7663, "num_input_tokens_seen": 191889408, "step": 1464 }, { "epoch": 0.2508496312920808, "grad_norm": 0.8237776160240173, "learning_rate": 0.00013407259273505302, "loss": 5.7291, "num_input_tokens_seen": 192282624, "step": 1467 }, { "epoch": 
0.2513626162231484, "grad_norm": 0.7341859340667725, "learning_rate": 0.00013393571409524825, "loss": 5.7751, "num_input_tokens_seen": 192675840, "step": 1470 }, { "epoch": 0.2518756011542161, "grad_norm": 0.7785446047782898, "learning_rate": 0.0001337992538314978, "loss": 5.705, "num_input_tokens_seen": 193069056, "step": 1473 }, { "epoch": 0.25238858608528375, "grad_norm": 0.9106130003929138, "learning_rate": 0.00013366320981682498, "loss": 5.7583, "num_input_tokens_seen": 193462272, "step": 1476 }, { "epoch": 0.2529015710163514, "grad_norm": 0.8381192684173584, "learning_rate": 0.0001335275799393611, "loss": 5.725, "num_input_tokens_seen": 193855488, "step": 1479 }, { "epoch": 0.253414555947419, "grad_norm": 0.8275421261787415, "learning_rate": 0.00013339236210220762, "loss": 5.727, "num_input_tokens_seen": 194248704, "step": 1482 }, { "epoch": 0.2539275408784867, "grad_norm": 0.9162700176239014, "learning_rate": 0.00013325755422330005, "loss": 5.7105, "num_input_tokens_seen": 194641920, "step": 1485 }, { "epoch": 0.25444052580955434, "grad_norm": 0.7449747323989868, "learning_rate": 0.0001331231542352734, "loss": 5.7681, "num_input_tokens_seen": 195035136, "step": 1488 }, { "epoch": 0.254953510740622, "grad_norm": 0.7723684310913086, "learning_rate": 0.00013298916008532878, "loss": 5.7201, "num_input_tokens_seen": 195428352, "step": 1491 }, { "epoch": 0.25546649567168966, "grad_norm": 0.7799750566482544, "learning_rate": 0.000132855569735102, "loss": 5.7213, "num_input_tokens_seen": 195821568, "step": 1494 }, { "epoch": 0.25597948060275727, "grad_norm": 1.017113447189331, "learning_rate": 0.00013272238116053312, "loss": 5.7102, "num_input_tokens_seen": 196214784, "step": 1497 }, { "epoch": 0.25649246553382493, "grad_norm": 0.9866935014724731, "learning_rate": 0.00013258959235173777, "loss": 5.7064, "num_input_tokens_seen": 196608000, "step": 1500 }, { "epoch": 0.2570054504648926, "grad_norm": 1.0269203186035156, "learning_rate": 0.0001324572013128796, "loss": 5.7444, "num_input_tokens_seen": 197001216, "step": 1503 }, { "epoch": 0.25751843539596025, "grad_norm": 1.189927101135254, "learning_rate": 0.00013232520606204452, "loss": 5.6448, "num_input_tokens_seen": 197394432, "step": 1506 }, { "epoch": 0.2580314203270279, "grad_norm": 1.0342806577682495, "learning_rate": 0.0001321936046311159, "loss": 5.6614, "num_input_tokens_seen": 197787648, "step": 1509 }, { "epoch": 0.2585444052580955, "grad_norm": 0.8518815040588379, "learning_rate": 0.0001320623950656514, "loss": 5.7448, "num_input_tokens_seen": 198180864, "step": 1512 }, { "epoch": 0.2590573901891632, "grad_norm": 0.7017694711685181, "learning_rate": 0.00013193157542476102, "loss": 5.7127, "num_input_tokens_seen": 198574080, "step": 1515 }, { "epoch": 0.25957037512023085, "grad_norm": 0.7911322712898254, "learning_rate": 0.00013180114378098651, "loss": 5.6725, "num_input_tokens_seen": 198967296, "step": 1518 }, { "epoch": 0.2600833600512985, "grad_norm": 0.7968602776527405, "learning_rate": 0.0001316710982201822, "loss": 5.7116, "num_input_tokens_seen": 199360512, "step": 1521 }, { "epoch": 0.26059634498236617, "grad_norm": 0.8894298672676086, "learning_rate": 0.0001315414368413969, "loss": 5.7022, "num_input_tokens_seen": 199753728, "step": 1524 }, { "epoch": 0.2611093299134338, "grad_norm": 0.9683092832565308, "learning_rate": 0.00013141215775675717, "loss": 5.7418, "num_input_tokens_seen": 200146944, "step": 1527 }, { "epoch": 0.26162231484450144, "grad_norm": 0.947652280330658, "learning_rate": 0.000131283259091352, "loss": 
5.7165, "num_input_tokens_seen": 200540160, "step": 1530 }, { "epoch": 0.2621352997755691, "grad_norm": 0.7747431993484497, "learning_rate": 0.00013115473898311848, "loss": 5.7537, "num_input_tokens_seen": 200933376, "step": 1533 }, { "epoch": 0.26264828470663676, "grad_norm": 0.719679594039917, "learning_rate": 0.00013102659558272893, "loss": 5.7737, "num_input_tokens_seen": 201326592, "step": 1536 }, { "epoch": 0.26316126963770436, "grad_norm": 0.7812837958335876, "learning_rate": 0.000130898827053479, "loss": 5.7438, "num_input_tokens_seen": 201719808, "step": 1539 }, { "epoch": 0.263674254568772, "grad_norm": 0.8203752040863037, "learning_rate": 0.00013077143157117724, "loss": 5.7158, "num_input_tokens_seen": 202113024, "step": 1542 }, { "epoch": 0.2641872394998397, "grad_norm": 0.9190983772277832, "learning_rate": 0.00013064440732403566, "loss": 5.6373, "num_input_tokens_seen": 202506240, "step": 1545 }, { "epoch": 0.26470022443090735, "grad_norm": 0.894138753414154, "learning_rate": 0.0001305177525125614, "loss": 5.7164, "num_input_tokens_seen": 202899456, "step": 1548 }, { "epoch": 0.265213209361975, "grad_norm": 1.0378000736236572, "learning_rate": 0.00013039146534944986, "loss": 5.7008, "num_input_tokens_seen": 203292672, "step": 1551 }, { "epoch": 0.2657261942930426, "grad_norm": 0.9447348117828369, "learning_rate": 0.00013026554405947864, "loss": 5.7429, "num_input_tokens_seen": 203685888, "step": 1554 }, { "epoch": 0.2662391792241103, "grad_norm": 0.8675848841667175, "learning_rate": 0.00013013998687940264, "loss": 5.7207, "num_input_tokens_seen": 204079104, "step": 1557 }, { "epoch": 0.26675216415517794, "grad_norm": 1.0198581218719482, "learning_rate": 0.00013001479205785067, "loss": 5.7049, "num_input_tokens_seen": 204472320, "step": 1560 }, { "epoch": 0.2672651490862456, "grad_norm": 0.863447368144989, "learning_rate": 0.0001298899578552225, "loss": 5.753, "num_input_tokens_seen": 204865536, "step": 1563 }, { "epoch": 0.26777813401731326, "grad_norm": 0.9340473413467407, "learning_rate": 0.0001297654825435875, "loss": 5.6951, "num_input_tokens_seen": 205258752, "step": 1566 }, { "epoch": 0.26829111894838087, "grad_norm": 0.826303243637085, "learning_rate": 0.0001296413644065842, "loss": 5.6856, "num_input_tokens_seen": 205651968, "step": 1569 }, { "epoch": 0.26880410387944853, "grad_norm": 0.8205902576446533, "learning_rate": 0.0001295176017393209, "loss": 5.7166, "num_input_tokens_seen": 206045184, "step": 1572 }, { "epoch": 0.2693170888105162, "grad_norm": 0.8139010667800903, "learning_rate": 0.00012939419284827716, "loss": 5.7012, "num_input_tokens_seen": 206438400, "step": 1575 }, { "epoch": 0.26983007374158385, "grad_norm": 0.8484136462211609, "learning_rate": 0.00012927113605120665, "loss": 5.7306, "num_input_tokens_seen": 206831616, "step": 1578 }, { "epoch": 0.2703430586726515, "grad_norm": 0.8395585417747498, "learning_rate": 0.00012914842967704074, "loss": 5.7253, "num_input_tokens_seen": 207224832, "step": 1581 }, { "epoch": 0.2708560436037191, "grad_norm": 0.8231976628303528, "learning_rate": 0.00012902607206579324, "loss": 5.6968, "num_input_tokens_seen": 207618048, "step": 1584 }, { "epoch": 0.2713690285347868, "grad_norm": 0.9288123250007629, "learning_rate": 0.000128904061568466, "loss": 5.6838, "num_input_tokens_seen": 208011264, "step": 1587 }, { "epoch": 0.27188201346585444, "grad_norm": 1.3254705667495728, "learning_rate": 0.00012878239654695573, "loss": 5.7356, "num_input_tokens_seen": 208404480, "step": 1590 }, { "epoch": 0.2723949983969221, 
"grad_norm": 0.9656926393508911, "learning_rate": 0.0001286610753739614, "loss": 5.7178, "num_input_tokens_seen": 208797696, "step": 1593 }, { "epoch": 0.27290798332798977, "grad_norm": 0.7304670214653015, "learning_rate": 0.00012854009643289304, "loss": 5.7276, "num_input_tokens_seen": 209190912, "step": 1596 }, { "epoch": 0.2734209682590574, "grad_norm": 0.7497060298919678, "learning_rate": 0.0001284194581177811, "loss": 5.7212, "num_input_tokens_seen": 209584128, "step": 1599 }, { "epoch": 0.2735919632360799, "eval_accuracy": 0.16197361993160722, "eval_loss": 6.1410651206970215, "eval_runtime": 110.7292, "eval_samples_per_second": 2.709, "eval_steps_per_second": 1.355, "num_input_tokens_seen": 209715200, "step": 1600 }, { "epoch": 0.27393395319012503, "grad_norm": 0.8801946640014648, "learning_rate": 0.0001282991588331871, "loss": 5.6859, "num_input_tokens_seen": 209977344, "step": 1602 }, { "epoch": 0.2744469381211927, "grad_norm": 0.8806388974189758, "learning_rate": 0.00012817919699411473, "loss": 5.6518, "num_input_tokens_seen": 210370560, "step": 1605 }, { "epoch": 0.27495992305226036, "grad_norm": 0.8406085968017578, "learning_rate": 0.00012805957102592246, "loss": 5.6947, "num_input_tokens_seen": 210763776, "step": 1608 }, { "epoch": 0.27547290798332796, "grad_norm": 0.985683798789978, "learning_rate": 0.0001279402793642365, "loss": 5.6736, "num_input_tokens_seen": 211156992, "step": 1611 }, { "epoch": 0.2759858929143956, "grad_norm": 1.0117186307907104, "learning_rate": 0.00012782132045486498, "loss": 5.688, "num_input_tokens_seen": 211550208, "step": 1614 }, { "epoch": 0.2764988778454633, "grad_norm": 0.7577993869781494, "learning_rate": 0.00012770269275371276, "loss": 5.6798, "num_input_tokens_seen": 211943424, "step": 1617 }, { "epoch": 0.27701186277653095, "grad_norm": 1.0011804103851318, "learning_rate": 0.0001275843947266976, "loss": 5.6961, "num_input_tokens_seen": 212336640, "step": 1620 }, { "epoch": 0.2775248477075986, "grad_norm": 0.7885441184043884, "learning_rate": 0.00012746642484966631, "loss": 5.7085, "num_input_tokens_seen": 212729856, "step": 1623 }, { "epoch": 0.2780378326386662, "grad_norm": 0.7686509490013123, "learning_rate": 0.00012734878160831288, "loss": 5.7337, "num_input_tokens_seen": 213123072, "step": 1626 }, { "epoch": 0.2785508175697339, "grad_norm": 0.7514293789863586, "learning_rate": 0.00012723146349809627, "loss": 5.6868, "num_input_tokens_seen": 213516288, "step": 1629 }, { "epoch": 0.27906380250080154, "grad_norm": 0.7466459274291992, "learning_rate": 0.00012711446902415993, "loss": 5.6678, "num_input_tokens_seen": 213909504, "step": 1632 }, { "epoch": 0.2795767874318692, "grad_norm": 0.7304571866989136, "learning_rate": 0.00012699779670125177, "loss": 5.677, "num_input_tokens_seen": 214302720, "step": 1635 }, { "epoch": 0.28008977236293686, "grad_norm": 0.7830135226249695, "learning_rate": 0.00012688144505364484, "loss": 5.6716, "num_input_tokens_seen": 214695936, "step": 1638 }, { "epoch": 0.28060275729400447, "grad_norm": 0.8072492480278015, "learning_rate": 0.00012676541261505907, "loss": 5.7114, "num_input_tokens_seen": 215089152, "step": 1641 }, { "epoch": 0.28111574222507213, "grad_norm": 0.828734815120697, "learning_rate": 0.00012664969792858355, "loss": 5.6624, "num_input_tokens_seen": 215482368, "step": 1644 }, { "epoch": 0.2816287271561398, "grad_norm": 1.1204993724822998, "learning_rate": 0.00012653429954659974, "loss": 5.6849, "num_input_tokens_seen": 215875584, "step": 1647 }, { "epoch": 0.28214171208720745, "grad_norm": 
0.8877683877944946, "learning_rate": 0.00012641921603070546, "loss": 5.7019, "num_input_tokens_seen": 216268800, "step": 1650 }, { "epoch": 0.2826546970182751, "grad_norm": 0.9409082531929016, "learning_rate": 0.00012630444595163954, "loss": 5.6977, "num_input_tokens_seen": 216662016, "step": 1653 }, { "epoch": 0.2831676819493427, "grad_norm": 1.058484435081482, "learning_rate": 0.0001261899878892072, "loss": 5.716, "num_input_tokens_seen": 217055232, "step": 1656 }, { "epoch": 0.2836806668804104, "grad_norm": 0.8193328380584717, "learning_rate": 0.00012607584043220635, "loss": 5.6651, "num_input_tokens_seen": 217448448, "step": 1659 }, { "epoch": 0.28419365181147804, "grad_norm": 0.8166428208351135, "learning_rate": 0.00012596200217835447, "loss": 5.6376, "num_input_tokens_seen": 217841664, "step": 1662 }, { "epoch": 0.2847066367425457, "grad_norm": 0.9603435397148132, "learning_rate": 0.00012584847173421627, "loss": 5.7189, "num_input_tokens_seen": 218234880, "step": 1665 }, { "epoch": 0.2852196216736133, "grad_norm": 0.8467148542404175, "learning_rate": 0.000125735247715132, "loss": 5.6642, "num_input_tokens_seen": 218628096, "step": 1668 }, { "epoch": 0.28573260660468097, "grad_norm": 0.8301926255226135, "learning_rate": 0.00012562232874514657, "loss": 5.6971, "num_input_tokens_seen": 219021312, "step": 1671 }, { "epoch": 0.28624559153574863, "grad_norm": 0.8587663769721985, "learning_rate": 0.0001255097134569393, "loss": 5.6961, "num_input_tokens_seen": 219414528, "step": 1674 }, { "epoch": 0.2867585764668163, "grad_norm": 0.9333747625350952, "learning_rate": 0.00012539740049175436, "loss": 5.6821, "num_input_tokens_seen": 219807744, "step": 1677 }, { "epoch": 0.28727156139788396, "grad_norm": 0.7336766123771667, "learning_rate": 0.00012528538849933206, "loss": 5.678, "num_input_tokens_seen": 220200960, "step": 1680 }, { "epoch": 0.28778454632895156, "grad_norm": 0.7425720691680908, "learning_rate": 0.00012517367613784042, "loss": 5.6576, "num_input_tokens_seen": 220594176, "step": 1683 }, { "epoch": 0.2882975312600192, "grad_norm": 0.7801720499992371, "learning_rate": 0.00012506226207380784, "loss": 5.6991, "num_input_tokens_seen": 220987392, "step": 1686 }, { "epoch": 0.2888105161910869, "grad_norm": 0.7652679681777954, "learning_rate": 0.00012495114498205616, "loss": 5.6489, "num_input_tokens_seen": 221380608, "step": 1689 }, { "epoch": 0.28932350112215455, "grad_norm": 0.8375071287155151, "learning_rate": 0.00012484032354563457, "loss": 5.6568, "num_input_tokens_seen": 221773824, "step": 1692 }, { "epoch": 0.2898364860532222, "grad_norm": 0.7536932826042175, "learning_rate": 0.0001247297964557539, "loss": 5.6788, "num_input_tokens_seen": 222167040, "step": 1695 }, { "epoch": 0.2903494709842898, "grad_norm": 0.807601273059845, "learning_rate": 0.0001246195624117219, "loss": 5.6461, "num_input_tokens_seen": 222560256, "step": 1698 }, { "epoch": 0.2908624559153575, "grad_norm": 0.700380802154541, "learning_rate": 0.0001245096201208786, "loss": 5.6886, "num_input_tokens_seen": 222953472, "step": 1701 }, { "epoch": 0.29137544084642514, "grad_norm": 0.7354556322097778, "learning_rate": 0.00012439996829853315, "loss": 5.6543, "num_input_tokens_seen": 223346688, "step": 1704 }, { "epoch": 0.2918884257774928, "grad_norm": 0.8027917146682739, "learning_rate": 0.00012429060566790032, "loss": 5.7035, "num_input_tokens_seen": 223739904, "step": 1707 }, { "epoch": 0.29240141070856046, "grad_norm": 0.9573730826377869, "learning_rate": 0.0001241815309600383, "loss": 5.6513, 
"num_input_tokens_seen": 224133120, "step": 1710 }, { "epoch": 0.29291439563962807, "grad_norm": 1.0752445459365845, "learning_rate": 0.00012407274291378672, "loss": 5.6466, "num_input_tokens_seen": 224526336, "step": 1713 }, { "epoch": 0.29342738057069573, "grad_norm": 1.0931577682495117, "learning_rate": 0.0001239642402757056, "loss": 5.6842, "num_input_tokens_seen": 224919552, "step": 1716 }, { "epoch": 0.2939403655017634, "grad_norm": 0.7436386346817017, "learning_rate": 0.00012385602180001445, "loss": 5.645, "num_input_tokens_seen": 225312768, "step": 1719 }, { "epoch": 0.29445335043283105, "grad_norm": 0.7766621708869934, "learning_rate": 0.0001237480862485324, "loss": 5.6611, "num_input_tokens_seen": 225705984, "step": 1722 }, { "epoch": 0.2949663353638987, "grad_norm": 0.8995407223701477, "learning_rate": 0.0001236404323906186, "loss": 5.6912, "num_input_tokens_seen": 226099200, "step": 1725 }, { "epoch": 0.2954793202949663, "grad_norm": 0.9217162728309631, "learning_rate": 0.00012353305900311327, "loss": 5.6695, "num_input_tokens_seen": 226492416, "step": 1728 }, { "epoch": 0.295992305226034, "grad_norm": 1.1063520908355713, "learning_rate": 0.00012342596487027938, "loss": 5.6392, "num_input_tokens_seen": 226885632, "step": 1731 }, { "epoch": 0.29650529015710164, "grad_norm": 1.064606785774231, "learning_rate": 0.00012331914878374486, "loss": 5.6703, "num_input_tokens_seen": 227278848, "step": 1734 }, { "epoch": 0.2970182750881693, "grad_norm": 0.8207647800445557, "learning_rate": 0.00012321260954244523, "loss": 5.6174, "num_input_tokens_seen": 227672064, "step": 1737 }, { "epoch": 0.2975312600192369, "grad_norm": 0.9838363528251648, "learning_rate": 0.00012310634595256696, "loss": 5.6604, "num_input_tokens_seen": 228065280, "step": 1740 }, { "epoch": 0.29804424495030457, "grad_norm": 1.094573974609375, "learning_rate": 0.0001230003568274913, "loss": 5.6628, "num_input_tokens_seen": 228458496, "step": 1743 }, { "epoch": 0.29855722988137223, "grad_norm": 0.8992403745651245, "learning_rate": 0.00012289464098773857, "loss": 5.6452, "num_input_tokens_seen": 228851712, "step": 1746 }, { "epoch": 0.2990702148124399, "grad_norm": 0.9246178269386292, "learning_rate": 0.00012278919726091303, "loss": 5.6766, "num_input_tokens_seen": 229244928, "step": 1749 }, { "epoch": 0.29958319974350756, "grad_norm": 0.8067333102226257, "learning_rate": 0.00012268402448164836, "loss": 5.6916, "num_input_tokens_seen": 229638144, "step": 1752 }, { "epoch": 0.30009618467457516, "grad_norm": 0.8054561614990234, "learning_rate": 0.00012257912149155346, "loss": 5.6736, "num_input_tokens_seen": 230031360, "step": 1755 }, { "epoch": 0.3006091696056428, "grad_norm": 0.8408488631248474, "learning_rate": 0.00012247448713915892, "loss": 5.7006, "num_input_tokens_seen": 230424576, "step": 1758 }, { "epoch": 0.3011221545367105, "grad_norm": 0.8754010200500488, "learning_rate": 0.00012237012027986385, "loss": 5.6319, "num_input_tokens_seen": 230817792, "step": 1761 }, { "epoch": 0.30163513946777815, "grad_norm": 0.9594668745994568, "learning_rate": 0.00012226601977588348, "loss": 5.6849, "num_input_tokens_seen": 231211008, "step": 1764 }, { "epoch": 0.3021481243988458, "grad_norm": 0.7682183384895325, "learning_rate": 0.0001221621844961969, "loss": 5.6842, "num_input_tokens_seen": 231604224, "step": 1767 }, { "epoch": 0.3026611093299134, "grad_norm": 0.8172413110733032, "learning_rate": 0.00012205861331649545, "loss": 5.7111, "num_input_tokens_seen": 231997440, "step": 1770 }, { "epoch": 0.3031740942609811, 
"grad_norm": 0.893372118473053, "learning_rate": 0.0001219553051191317, "loss": 5.6743, "num_input_tokens_seen": 232390656, "step": 1773 }, { "epoch": 0.30368707919204874, "grad_norm": 0.8585564494132996, "learning_rate": 0.00012185225879306862, "loss": 5.6044, "num_input_tokens_seen": 232783872, "step": 1776 }, { "epoch": 0.3042000641231164, "grad_norm": 0.7689789533615112, "learning_rate": 0.00012174947323382965, "loss": 5.6363, "num_input_tokens_seen": 233177088, "step": 1779 }, { "epoch": 0.30471304905418406, "grad_norm": 0.7540669441223145, "learning_rate": 0.00012164694734344876, "loss": 5.6607, "num_input_tokens_seen": 233570304, "step": 1782 }, { "epoch": 0.30522603398525167, "grad_norm": 0.9404392242431641, "learning_rate": 0.00012154468003042123, "loss": 5.6585, "num_input_tokens_seen": 233963520, "step": 1785 }, { "epoch": 0.3057390189163193, "grad_norm": 1.020546793937683, "learning_rate": 0.00012144267020965491, "loss": 5.629, "num_input_tokens_seen": 234356736, "step": 1788 }, { "epoch": 0.306252003847387, "grad_norm": 0.9944835305213928, "learning_rate": 0.00012134091680242182, "loss": 5.6689, "num_input_tokens_seen": 234749952, "step": 1791 }, { "epoch": 0.30676498877845465, "grad_norm": 0.9175474047660828, "learning_rate": 0.00012123941873631032, "loss": 5.652, "num_input_tokens_seen": 235143168, "step": 1794 }, { "epoch": 0.30727797370952226, "grad_norm": 0.8481835722923279, "learning_rate": 0.00012113817494517742, "loss": 5.6716, "num_input_tokens_seen": 235536384, "step": 1797 }, { "epoch": 0.3077909586405899, "grad_norm": 0.8672162294387817, "learning_rate": 0.00012103718436910204, "loss": 5.6861, "num_input_tokens_seen": 235929600, "step": 1800 }, { "epoch": 0.3083039435716576, "grad_norm": 0.730492353439331, "learning_rate": 0.00012093644595433816, "loss": 5.6866, "num_input_tokens_seen": 236322816, "step": 1803 }, { "epoch": 0.30881692850272524, "grad_norm": 0.7994592189788818, "learning_rate": 0.00012083595865326879, "loss": 5.6567, "num_input_tokens_seen": 236716032, "step": 1806 }, { "epoch": 0.3093299134337929, "grad_norm": 0.8021286129951477, "learning_rate": 0.00012073572142436013, "loss": 5.6251, "num_input_tokens_seen": 237109248, "step": 1809 }, { "epoch": 0.3098428983648605, "grad_norm": 0.721517026424408, "learning_rate": 0.0001206357332321163, "loss": 5.6599, "num_input_tokens_seen": 237502464, "step": 1812 }, { "epoch": 0.31035588329592817, "grad_norm": 0.9358505606651306, "learning_rate": 0.00012053599304703434, "loss": 5.6717, "num_input_tokens_seen": 237895680, "step": 1815 }, { "epoch": 0.31086886822699583, "grad_norm": 1.005147099494934, "learning_rate": 0.0001204364998455597, "loss": 5.6534, "num_input_tokens_seen": 238288896, "step": 1818 }, { "epoch": 0.3113818531580635, "grad_norm": 0.7538228034973145, "learning_rate": 0.00012033725261004223, "loss": 5.6279, "num_input_tokens_seen": 238682112, "step": 1821 }, { "epoch": 0.31189483808913115, "grad_norm": 0.7190991640090942, "learning_rate": 0.00012023825032869223, "loss": 5.6295, "num_input_tokens_seen": 239075328, "step": 1824 }, { "epoch": 0.31240782302019876, "grad_norm": 0.8383211493492126, "learning_rate": 0.00012013949199553745, "loss": 5.7043, "num_input_tokens_seen": 239468544, "step": 1827 }, { "epoch": 0.3129208079512664, "grad_norm": 0.8222533464431763, "learning_rate": 0.00012004097661037986, "loss": 5.6339, "num_input_tokens_seen": 239861760, "step": 1830 }, { "epoch": 0.3134337928823341, "grad_norm": 0.7332392930984497, "learning_rate": 0.00011994270317875327, "loss": 5.6096, 
"num_input_tokens_seen": 240254976, "step": 1833 }, { "epoch": 0.31394677781340175, "grad_norm": 0.7699775099754333, "learning_rate": 0.00011984467071188111, "loss": 5.6614, "num_input_tokens_seen": 240648192, "step": 1836 }, { "epoch": 0.3144597627444694, "grad_norm": 0.6810494065284729, "learning_rate": 0.00011974687822663465, "loss": 5.6252, "num_input_tokens_seen": 241041408, "step": 1839 }, { "epoch": 0.314972747675537, "grad_norm": 0.7561641931533813, "learning_rate": 0.00011964932474549163, "loss": 5.5747, "num_input_tokens_seen": 241434624, "step": 1842 }, { "epoch": 0.3154857326066047, "grad_norm": 0.9112014770507812, "learning_rate": 0.00011955200929649517, "loss": 5.6209, "num_input_tokens_seen": 241827840, "step": 1845 }, { "epoch": 0.31599871753767234, "grad_norm": 0.8621751666069031, "learning_rate": 0.00011945493091321312, "loss": 5.6557, "num_input_tokens_seen": 242221056, "step": 1848 }, { "epoch": 0.31651170246874, "grad_norm": 0.7570284605026245, "learning_rate": 0.00011935808863469773, "loss": 5.6446, "num_input_tokens_seen": 242614272, "step": 1851 }, { "epoch": 0.3170246873998076, "grad_norm": 0.8017822504043579, "learning_rate": 0.00011926148150544575, "loss": 5.6767, "num_input_tokens_seen": 243007488, "step": 1854 }, { "epoch": 0.31753767233087526, "grad_norm": 0.7850795984268188, "learning_rate": 0.00011916510857535883, "loss": 5.62, "num_input_tokens_seen": 243400704, "step": 1857 }, { "epoch": 0.3180506572619429, "grad_norm": 0.774463951587677, "learning_rate": 0.00011906896889970413, "loss": 5.6407, "num_input_tokens_seen": 243793920, "step": 1860 }, { "epoch": 0.3185636421930106, "grad_norm": 0.8820813894271851, "learning_rate": 0.00011897306153907562, "loss": 5.6399, "num_input_tokens_seen": 244187136, "step": 1863 }, { "epoch": 0.31907662712407825, "grad_norm": 0.8753612041473389, "learning_rate": 0.00011887738555935545, "loss": 5.6036, "num_input_tokens_seen": 244580352, "step": 1866 }, { "epoch": 0.31958961205514586, "grad_norm": 0.8607076406478882, "learning_rate": 0.00011878194003167571, "loss": 5.6037, "num_input_tokens_seen": 244973568, "step": 1869 }, { "epoch": 0.3201025969862135, "grad_norm": 0.9494906663894653, "learning_rate": 0.00011868672403238055, "loss": 5.5947, "num_input_tokens_seen": 245366784, "step": 1872 }, { "epoch": 0.3206155819172812, "grad_norm": 0.9436231851577759, "learning_rate": 0.00011859173664298873, "loss": 5.6752, "num_input_tokens_seen": 245760000, "step": 1875 }, { "epoch": 0.32112856684834884, "grad_norm": 0.8782811760902405, "learning_rate": 0.00011849697695015632, "loss": 5.6304, "num_input_tokens_seen": 246153216, "step": 1878 }, { "epoch": 0.3216415517794165, "grad_norm": 0.8508527278900146, "learning_rate": 0.00011840244404563977, "loss": 5.6076, "num_input_tokens_seen": 246546432, "step": 1881 }, { "epoch": 0.3221545367104841, "grad_norm": 0.7841192483901978, "learning_rate": 0.00011830813702625953, "loss": 5.568, "num_input_tokens_seen": 246939648, "step": 1884 }, { "epoch": 0.32266752164155177, "grad_norm": 0.8864216804504395, "learning_rate": 0.0001182140549938636, "loss": 5.6292, "num_input_tokens_seen": 247332864, "step": 1887 }, { "epoch": 0.32318050657261943, "grad_norm": 0.7558512091636658, "learning_rate": 0.00011812019705529174, "loss": 5.6347, "num_input_tokens_seen": 247726080, "step": 1890 }, { "epoch": 0.3236934915036871, "grad_norm": 0.8658297657966614, "learning_rate": 0.00011802656232233979, "loss": 5.6167, "num_input_tokens_seen": 248119296, "step": 1893 }, { "epoch": 0.32420647643475475, 
"grad_norm": 0.7362368702888489, "learning_rate": 0.00011793314991172442, "loss": 5.5635, "num_input_tokens_seen": 248512512, "step": 1896 }, { "epoch": 0.32471946136582236, "grad_norm": 0.7577558755874634, "learning_rate": 0.00011783995894504806, "loss": 5.6168, "num_input_tokens_seen": 248905728, "step": 1899 }, { "epoch": 0.32523244629689, "grad_norm": 0.7319400906562805, "learning_rate": 0.00011774698854876431, "loss": 5.6247, "num_input_tokens_seen": 249298944, "step": 1902 }, { "epoch": 0.3257454312279577, "grad_norm": 0.7369369864463806, "learning_rate": 0.00011765423785414348, "loss": 5.6446, "num_input_tokens_seen": 249692160, "step": 1905 }, { "epoch": 0.32625841615902534, "grad_norm": 0.7148075103759766, "learning_rate": 0.00011756170599723845, "loss": 5.5845, "num_input_tokens_seen": 250085376, "step": 1908 }, { "epoch": 0.326771401090093, "grad_norm": 0.8362352252006531, "learning_rate": 0.00011746939211885098, "loss": 5.6083, "num_input_tokens_seen": 250478592, "step": 1911 }, { "epoch": 0.3272843860211606, "grad_norm": 0.7224807739257812, "learning_rate": 0.00011737729536449814, "loss": 5.5792, "num_input_tokens_seen": 250871808, "step": 1914 }, { "epoch": 0.3277973709522283, "grad_norm": 0.8332315683364868, "learning_rate": 0.00011728541488437912, "loss": 5.6376, "num_input_tokens_seen": 251265024, "step": 1917 }, { "epoch": 0.32831035588329593, "grad_norm": 0.89626544713974, "learning_rate": 0.00011719374983334221, "loss": 5.6722, "num_input_tokens_seen": 251658240, "step": 1920 }, { "epoch": 0.3288233408143636, "grad_norm": 0.6044060587882996, "learning_rate": 0.0001171022993708523, "loss": 5.6473, "num_input_tokens_seen": 252051456, "step": 1923 }, { "epoch": 0.3293363257454312, "grad_norm": 0.6592041850090027, "learning_rate": 0.00011701106266095837, "loss": 5.624, "num_input_tokens_seen": 252444672, "step": 1926 }, { "epoch": 0.32984931067649886, "grad_norm": 0.7988864183425903, "learning_rate": 0.00011692003887226147, "loss": 5.604, "num_input_tokens_seen": 252837888, "step": 1929 }, { "epoch": 0.3303622956075665, "grad_norm": 0.9502041935920715, "learning_rate": 0.00011682922717788286, "loss": 5.659, "num_input_tokens_seen": 253231104, "step": 1932 }, { "epoch": 0.3308752805386342, "grad_norm": 1.0182008743286133, "learning_rate": 0.0001167386267554325, "loss": 5.6019, "num_input_tokens_seen": 253624320, "step": 1935 }, { "epoch": 0.33138826546970185, "grad_norm": 0.9730847477912903, "learning_rate": 0.00011664823678697777, "loss": 5.6701, "num_input_tokens_seen": 254017536, "step": 1938 }, { "epoch": 0.33190125040076945, "grad_norm": 0.7735222578048706, "learning_rate": 0.00011655805645901238, "loss": 5.5851, "num_input_tokens_seen": 254410752, "step": 1941 }, { "epoch": 0.3324142353318371, "grad_norm": 0.7792080044746399, "learning_rate": 0.0001164680849624257, "loss": 5.6015, "num_input_tokens_seen": 254803968, "step": 1944 }, { "epoch": 0.3329272202629048, "grad_norm": 0.7463663816452026, "learning_rate": 0.0001163783214924723, "loss": 5.6114, "num_input_tokens_seen": 255197184, "step": 1947 }, { "epoch": 0.33344020519397244, "grad_norm": 0.6915552020072937, "learning_rate": 0.00011628876524874155, "loss": 5.6049, "num_input_tokens_seen": 255590400, "step": 1950 }, { "epoch": 0.3339531901250401, "grad_norm": 0.7225996851921082, "learning_rate": 0.00011619941543512788, "loss": 5.6132, "num_input_tokens_seen": 255983616, "step": 1953 }, { "epoch": 0.3344661750561077, "grad_norm": 0.7305698990821838, "learning_rate": 0.00011611027125980086, "loss": 5.6121, 
"num_input_tokens_seen": 256376832, "step": 1956 }, { "epoch": 0.33497915998717537, "grad_norm": 0.699140191078186, "learning_rate": 0.00011602133193517582, "loss": 5.5685, "num_input_tokens_seen": 256770048, "step": 1959 }, { "epoch": 0.33549214491824303, "grad_norm": 0.7200695872306824, "learning_rate": 0.00011593259667788463, "loss": 5.639, "num_input_tokens_seen": 257163264, "step": 1962 }, { "epoch": 0.3360051298493107, "grad_norm": 0.84376060962677, "learning_rate": 0.0001158440647087466, "loss": 5.6694, "num_input_tokens_seen": 257556480, "step": 1965 }, { "epoch": 0.33651811478037835, "grad_norm": 0.8291401863098145, "learning_rate": 0.00011575573525274, "loss": 5.5855, "num_input_tokens_seen": 257949696, "step": 1968 }, { "epoch": 0.33703109971144596, "grad_norm": 0.743291437625885, "learning_rate": 0.0001156676075389733, "loss": 5.5808, "num_input_tokens_seen": 258342912, "step": 1971 }, { "epoch": 0.3375440846425136, "grad_norm": 0.9974462389945984, "learning_rate": 0.000115579680800657, "loss": 5.5863, "num_input_tokens_seen": 258736128, "step": 1974 }, { "epoch": 0.3380570695735813, "grad_norm": 1.0641543865203857, "learning_rate": 0.00011549195427507569, "loss": 5.6162, "num_input_tokens_seen": 259129344, "step": 1977 }, { "epoch": 0.33857005450464894, "grad_norm": 1.3743962049484253, "learning_rate": 0.00011540442720356016, "loss": 5.6052, "num_input_tokens_seen": 259522560, "step": 1980 }, { "epoch": 0.33908303943571655, "grad_norm": 0.8301076889038086, "learning_rate": 0.0001153170988314599, "loss": 5.5922, "num_input_tokens_seen": 259915776, "step": 1983 }, { "epoch": 0.3395960243667842, "grad_norm": 0.7537018656730652, "learning_rate": 0.00011522996840811572, "loss": 5.5989, "num_input_tokens_seen": 260308992, "step": 1986 }, { "epoch": 0.34010900929785187, "grad_norm": 0.9778670072555542, "learning_rate": 0.00011514303518683271, "loss": 5.614, "num_input_tokens_seen": 260702208, "step": 1989 }, { "epoch": 0.34062199422891953, "grad_norm": 0.9663039445877075, "learning_rate": 0.00011505629842485338, "loss": 5.6108, "num_input_tokens_seen": 261095424, "step": 1992 }, { "epoch": 0.3411349791599872, "grad_norm": 0.9101848006248474, "learning_rate": 0.00011496975738333083, "loss": 5.5891, "num_input_tokens_seen": 261488640, "step": 1995 }, { "epoch": 0.3416479640910548, "grad_norm": 0.8312708735466003, "learning_rate": 0.00011488341132730259, "loss": 5.6175, "num_input_tokens_seen": 261881856, "step": 1998 }, { "epoch": 0.3419899540450999, "eval_accuracy": 0.16692558215274386, "eval_loss": 6.050157070159912, "eval_runtime": 112.2283, "eval_samples_per_second": 2.673, "eval_steps_per_second": 1.337, "num_input_tokens_seen": 262144000, "step": 2000 }, { "epoch": 0.34216094902212246, "grad_norm": 0.836723268032074, "learning_rate": 0.00011479725952566419, "loss": 5.5988, "num_input_tokens_seen": 262275072, "step": 2001 }, { "epoch": 0.3426739339531901, "grad_norm": 0.7804883122444153, "learning_rate": 0.00011471130125114323, "loss": 5.6296, "num_input_tokens_seen": 262668288, "step": 2004 }, { "epoch": 0.3431869188842578, "grad_norm": 0.9573701620101929, "learning_rate": 0.00011462553578027366, "loss": 5.5844, "num_input_tokens_seen": 263061504, "step": 2007 }, { "epoch": 0.34369990381532545, "grad_norm": 0.9088414311408997, "learning_rate": 0.00011453996239337006, "loss": 5.5551, "num_input_tokens_seen": 263454720, "step": 2010 }, { "epoch": 0.34421288874639305, "grad_norm": 0.8941265940666199, "learning_rate": 0.00011445458037450239, "loss": 5.5586, "num_input_tokens_seen": 
263847936, "step": 2013 }, { "epoch": 0.3447258736774607, "grad_norm": 0.7578518390655518, "learning_rate": 0.00011436938901147081, "loss": 5.6165, "num_input_tokens_seen": 264241152, "step": 2016 }, { "epoch": 0.3452388586085284, "grad_norm": 0.7264053821563721, "learning_rate": 0.00011428438759578074, "loss": 5.5951, "num_input_tokens_seen": 264634368, "step": 2019 }, { "epoch": 0.34575184353959604, "grad_norm": 0.7484355568885803, "learning_rate": 0.00011419957542261805, "loss": 5.6023, "num_input_tokens_seen": 265027584, "step": 2022 }, { "epoch": 0.3462648284706637, "grad_norm": 0.7840693593025208, "learning_rate": 0.0001141149517908246, "loss": 5.5978, "num_input_tokens_seen": 265420800, "step": 2025 }, { "epoch": 0.3467778134017313, "grad_norm": 0.7896338701248169, "learning_rate": 0.0001140305160028738, "loss": 5.6215, "num_input_tokens_seen": 265814016, "step": 2028 }, { "epoch": 0.34729079833279897, "grad_norm": 0.8777353763580322, "learning_rate": 0.00011394626736484653, "loss": 5.5965, "num_input_tokens_seen": 266207232, "step": 2031 }, { "epoch": 0.34780378326386663, "grad_norm": 0.8751804232597351, "learning_rate": 0.00011386220518640724, "loss": 5.6445, "num_input_tokens_seen": 266600448, "step": 2034 }, { "epoch": 0.3483167681949343, "grad_norm": 0.8036639094352722, "learning_rate": 0.00011377832878078, "loss": 5.5957, "num_input_tokens_seen": 266993664, "step": 2037 }, { "epoch": 0.34882975312600195, "grad_norm": 0.8183003664016724, "learning_rate": 0.00011369463746472517, "loss": 5.6243, "num_input_tokens_seen": 267386880, "step": 2040 }, { "epoch": 0.34934273805706956, "grad_norm": 0.8002908825874329, "learning_rate": 0.00011361113055851587, "loss": 5.5953, "num_input_tokens_seen": 267780096, "step": 2043 }, { "epoch": 0.3498557229881372, "grad_norm": 0.6631841063499451, "learning_rate": 0.00011352780738591478, "loss": 5.6013, "num_input_tokens_seen": 268173312, "step": 2046 }, { "epoch": 0.3503687079192049, "grad_norm": 0.7779785394668579, "learning_rate": 0.00011344466727415132, "loss": 5.6058, "num_input_tokens_seen": 268566528, "step": 2049 }, { "epoch": 0.35088169285027254, "grad_norm": 0.722675621509552, "learning_rate": 0.00011336170955389853, "loss": 5.6014, "num_input_tokens_seen": 268959744, "step": 2052 }, { "epoch": 0.35139467778134015, "grad_norm": 0.725713312625885, "learning_rate": 0.00011327893355925084, "loss": 5.6318, "num_input_tokens_seen": 269352960, "step": 2055 }, { "epoch": 0.3519076627124078, "grad_norm": 0.7272054553031921, "learning_rate": 0.0001131963386277012, "loss": 5.5913, "num_input_tokens_seen": 269746176, "step": 2058 }, { "epoch": 0.35242064764347547, "grad_norm": 0.7865688800811768, "learning_rate": 0.00011311392410011913, "loss": 5.5727, "num_input_tokens_seen": 270139392, "step": 2061 }, { "epoch": 0.35293363257454313, "grad_norm": 0.754695475101471, "learning_rate": 0.00011303168932072842, "loss": 5.5762, "num_input_tokens_seen": 270532608, "step": 2064 }, { "epoch": 0.3534466175056108, "grad_norm": 0.7348251342773438, "learning_rate": 0.00011294963363708538, "loss": 5.5913, "num_input_tokens_seen": 270925824, "step": 2067 }, { "epoch": 0.3539596024366784, "grad_norm": 0.8406401872634888, "learning_rate": 0.00011286775640005698, "loss": 5.5496, "num_input_tokens_seen": 271319040, "step": 2070 }, { "epoch": 0.35447258736774606, "grad_norm": 0.8418242335319519, "learning_rate": 0.00011278605696379935, "loss": 5.5903, "num_input_tokens_seen": 271712256, "step": 2073 }, { "epoch": 0.3549855722988137, "grad_norm": 0.6441095471382141, 
"learning_rate": 0.00011270453468573625, "loss": 5.5503, "num_input_tokens_seen": 272105472, "step": 2076 }, { "epoch": 0.3554985572298814, "grad_norm": 0.7358053922653198, "learning_rate": 0.00011262318892653804, "loss": 5.5992, "num_input_tokens_seen": 272498688, "step": 2079 }, { "epoch": 0.35601154216094905, "grad_norm": 0.7976645231246948, "learning_rate": 0.00011254201905010056, "loss": 5.594, "num_input_tokens_seen": 272891904, "step": 2082 }, { "epoch": 0.35652452709201665, "grad_norm": 0.9197404980659485, "learning_rate": 0.00011246102442352411, "loss": 5.5648, "num_input_tokens_seen": 273285120, "step": 2085 }, { "epoch": 0.3570375120230843, "grad_norm": 0.9233546853065491, "learning_rate": 0.00011238020441709289, "loss": 5.5126, "num_input_tokens_seen": 273678336, "step": 2088 }, { "epoch": 0.357550496954152, "grad_norm": 0.8510675430297852, "learning_rate": 0.00011229955840425433, "loss": 5.5816, "num_input_tokens_seen": 274071552, "step": 2091 }, { "epoch": 0.35806348188521964, "grad_norm": 0.7524028420448303, "learning_rate": 0.00011221908576159871, "loss": 5.5925, "num_input_tokens_seen": 274464768, "step": 2094 }, { "epoch": 0.3585764668162873, "grad_norm": 0.9298664927482605, "learning_rate": 0.00011213878586883904, "loss": 5.5632, "num_input_tokens_seen": 274857984, "step": 2097 }, { "epoch": 0.3590894517473549, "grad_norm": 0.8467714190483093, "learning_rate": 0.00011205865810879076, "loss": 5.5589, "num_input_tokens_seen": 275251200, "step": 2100 }, { "epoch": 0.35960243667842257, "grad_norm": 0.7781933546066284, "learning_rate": 0.00011197870186735193, "loss": 5.5408, "num_input_tokens_seen": 275644416, "step": 2103 }, { "epoch": 0.3601154216094902, "grad_norm": 0.8665353655815125, "learning_rate": 0.00011189891653348355, "loss": 5.6069, "num_input_tokens_seen": 276037632, "step": 2106 }, { "epoch": 0.3606284065405579, "grad_norm": 0.8551245927810669, "learning_rate": 0.00011181930149918981, "loss": 5.5846, "num_input_tokens_seen": 276430848, "step": 2109 }, { "epoch": 0.3611413914716255, "grad_norm": 0.7167636752128601, "learning_rate": 0.00011173985615949868, "loss": 5.5516, "num_input_tokens_seen": 276824064, "step": 2112 }, { "epoch": 0.36165437640269316, "grad_norm": 0.6893343329429626, "learning_rate": 0.00011166057991244258, "loss": 5.5724, "num_input_tokens_seen": 277217280, "step": 2115 }, { "epoch": 0.3621673613337608, "grad_norm": 0.9303981065750122, "learning_rate": 0.00011158147215903933, "loss": 5.5756, "num_input_tokens_seen": 277610496, "step": 2118 }, { "epoch": 0.3626803462648285, "grad_norm": 0.7787050008773804, "learning_rate": 0.00011150253230327296, "loss": 5.5545, "num_input_tokens_seen": 278003712, "step": 2121 }, { "epoch": 0.36319333119589614, "grad_norm": 0.7556629180908203, "learning_rate": 0.00011142375975207502, "loss": 5.5149, "num_input_tokens_seen": 278396928, "step": 2124 }, { "epoch": 0.36370631612696375, "grad_norm": 0.8528605103492737, "learning_rate": 0.00011134515391530575, "loss": 5.5498, "num_input_tokens_seen": 278790144, "step": 2127 }, { "epoch": 0.3642193010580314, "grad_norm": 0.8373255729675293, "learning_rate": 0.00011126671420573558, "loss": 5.5908, "num_input_tokens_seen": 279183360, "step": 2130 }, { "epoch": 0.36473228598909907, "grad_norm": 0.778972327709198, "learning_rate": 0.0001111884400390267, "loss": 5.579, "num_input_tokens_seen": 279576576, "step": 2133 }, { "epoch": 0.36524527092016673, "grad_norm": 0.7942299246788025, "learning_rate": 0.00011111033083371468, "loss": 5.5897, "num_input_tokens_seen": 
279969792, "step": 2136 }, { "epoch": 0.3657582558512344, "grad_norm": 0.7748274207115173, "learning_rate": 0.00011103238601119048, "loss": 5.5885, "num_input_tokens_seen": 280363008, "step": 2139 }, { "epoch": 0.366271240782302, "grad_norm": 0.7882058024406433, "learning_rate": 0.00011095460499568234, "loss": 5.6277, "num_input_tokens_seen": 280756224, "step": 2142 }, { "epoch": 0.36678422571336966, "grad_norm": 0.7407231330871582, "learning_rate": 0.00011087698721423798, "loss": 5.562, "num_input_tokens_seen": 281149440, "step": 2145 }, { "epoch": 0.3672972106444373, "grad_norm": 0.8401018381118774, "learning_rate": 0.0001107995320967068, "loss": 5.5526, "num_input_tokens_seen": 281542656, "step": 2148 }, { "epoch": 0.367810195575505, "grad_norm": 0.7504671812057495, "learning_rate": 0.00011072223907572236, "loss": 5.5194, "num_input_tokens_seen": 281935872, "step": 2151 }, { "epoch": 0.36832318050657264, "grad_norm": 0.8152016401290894, "learning_rate": 0.0001106451075866849, "loss": 5.5838, "num_input_tokens_seen": 282329088, "step": 2154 }, { "epoch": 0.36883616543764025, "grad_norm": 0.764788031578064, "learning_rate": 0.00011056813706774403, "loss": 5.5741, "num_input_tokens_seen": 282722304, "step": 2157 }, { "epoch": 0.3693491503687079, "grad_norm": 0.7509755492210388, "learning_rate": 0.00011049132695978147, "loss": 5.5572, "num_input_tokens_seen": 283115520, "step": 2160 }, { "epoch": 0.3698621352997756, "grad_norm": 1.0009026527404785, "learning_rate": 0.0001104146767063941, "loss": 5.5376, "num_input_tokens_seen": 283508736, "step": 2163 }, { "epoch": 0.37037512023084324, "grad_norm": 0.7605016231536865, "learning_rate": 0.00011033818575387697, "loss": 5.5529, "num_input_tokens_seen": 283901952, "step": 2166 }, { "epoch": 0.37088810516191084, "grad_norm": 0.7910396456718445, "learning_rate": 0.00011026185355120653, "loss": 5.5987, "num_input_tokens_seen": 284295168, "step": 2169 }, { "epoch": 0.3714010900929785, "grad_norm": 0.7264419794082642, "learning_rate": 0.00011018567955002388, "loss": 5.5723, "num_input_tokens_seen": 284688384, "step": 2172 }, { "epoch": 0.37191407502404616, "grad_norm": 0.6918433308601379, "learning_rate": 0.00011010966320461834, "loss": 5.5759, "num_input_tokens_seen": 285081600, "step": 2175 }, { "epoch": 0.3724270599551138, "grad_norm": 0.7263005375862122, "learning_rate": 0.00011003380397191095, "loss": 5.5918, "num_input_tokens_seen": 285474816, "step": 2178 }, { "epoch": 0.3729400448861815, "grad_norm": 0.8244202733039856, "learning_rate": 0.00010995810131143818, "loss": 5.5039, "num_input_tokens_seen": 285868032, "step": 2181 }, { "epoch": 0.3734530298172491, "grad_norm": 0.8637891411781311, "learning_rate": 0.00010988255468533583, "loss": 5.5602, "num_input_tokens_seen": 286261248, "step": 2184 }, { "epoch": 0.37396601474831676, "grad_norm": 0.7446593046188354, "learning_rate": 0.0001098071635583229, "loss": 5.573, "num_input_tokens_seen": 286654464, "step": 2187 }, { "epoch": 0.3744789996793844, "grad_norm": 0.8517831563949585, "learning_rate": 0.00010973192739768566, "loss": 5.5137, "num_input_tokens_seen": 287047680, "step": 2190 }, { "epoch": 0.3749919846104521, "grad_norm": 0.8291754722595215, "learning_rate": 0.00010965684567326188, "loss": 5.5965, "num_input_tokens_seen": 287440896, "step": 2193 }, { "epoch": 0.37550496954151974, "grad_norm": 0.7934954762458801, "learning_rate": 0.00010958191785742515, "loss": 5.5761, "num_input_tokens_seen": 287834112, "step": 2196 }, { "epoch": 0.37601795447258735, "grad_norm": 1.1249563694000244, 
"learning_rate": 0.00010950714342506926, "loss": 5.5857, "num_input_tokens_seen": 288227328, "step": 2199 }, { "epoch": 0.376530939403655, "grad_norm": 0.9391211867332458, "learning_rate": 0.00010943252185359275, "loss": 5.5403, "num_input_tokens_seen": 288620544, "step": 2202 }, { "epoch": 0.37704392433472267, "grad_norm": 0.8083456754684448, "learning_rate": 0.00010935805262288362, "loss": 5.5593, "num_input_tokens_seen": 289013760, "step": 2205 }, { "epoch": 0.37755690926579033, "grad_norm": 0.8864873051643372, "learning_rate": 0.00010928373521530409, "loss": 5.5901, "num_input_tokens_seen": 289406976, "step": 2208 }, { "epoch": 0.378069894196858, "grad_norm": 0.7634648084640503, "learning_rate": 0.00010920956911567537, "loss": 5.5755, "num_input_tokens_seen": 289800192, "step": 2211 }, { "epoch": 0.3785828791279256, "grad_norm": 0.9811239242553711, "learning_rate": 0.00010913555381126287, "loss": 5.5405, "num_input_tokens_seen": 290193408, "step": 2214 }, { "epoch": 0.37909586405899326, "grad_norm": 0.8969237208366394, "learning_rate": 0.00010906168879176115, "loss": 5.6022, "num_input_tokens_seen": 290586624, "step": 2217 }, { "epoch": 0.3796088489900609, "grad_norm": 0.8845155835151672, "learning_rate": 0.00010898797354927919, "loss": 5.554, "num_input_tokens_seen": 290979840, "step": 2220 }, { "epoch": 0.3801218339211286, "grad_norm": 0.7560102343559265, "learning_rate": 0.0001089144075783257, "loss": 5.6002, "num_input_tokens_seen": 291373056, "step": 2223 }, { "epoch": 0.38063481885219624, "grad_norm": 0.7519661784172058, "learning_rate": 0.00010884099037579465, "loss": 5.5744, "num_input_tokens_seen": 291766272, "step": 2226 }, { "epoch": 0.38114780378326385, "grad_norm": 0.7482137084007263, "learning_rate": 0.00010876772144095075, "loss": 5.4982, "num_input_tokens_seen": 292159488, "step": 2229 }, { "epoch": 0.3816607887143315, "grad_norm": 0.7390425205230713, "learning_rate": 0.00010869460027541504, "loss": 5.5839, "num_input_tokens_seen": 292552704, "step": 2232 }, { "epoch": 0.3821737736453992, "grad_norm": 0.7970213890075684, "learning_rate": 0.00010862162638315081, "loss": 5.5299, "num_input_tokens_seen": 292945920, "step": 2235 }, { "epoch": 0.38268675857646683, "grad_norm": 1.0196114778518677, "learning_rate": 0.00010854879927044931, "loss": 5.5759, "num_input_tokens_seen": 293339136, "step": 2238 }, { "epoch": 0.38319974350753444, "grad_norm": 0.8929862380027771, "learning_rate": 0.00010847611844591587, "loss": 5.5529, "num_input_tokens_seen": 293732352, "step": 2241 }, { "epoch": 0.3837127284386021, "grad_norm": 0.7914404273033142, "learning_rate": 0.00010840358342045581, "loss": 5.5529, "num_input_tokens_seen": 294125568, "step": 2244 }, { "epoch": 0.38422571336966976, "grad_norm": 0.8195559978485107, "learning_rate": 0.00010833119370726075, "loss": 5.5227, "num_input_tokens_seen": 294518784, "step": 2247 }, { "epoch": 0.3847386983007374, "grad_norm": 0.947847306728363, "learning_rate": 0.00010825894882179485, "loss": 5.5733, "num_input_tokens_seen": 294912000, "step": 2250 }, { "epoch": 0.3852516832318051, "grad_norm": 0.9748887419700623, "learning_rate": 0.00010818684828178117, "loss": 5.5793, "num_input_tokens_seen": 295305216, "step": 2253 }, { "epoch": 0.3857646681628727, "grad_norm": 0.8942933678627014, "learning_rate": 0.00010811489160718815, "loss": 5.5403, "num_input_tokens_seen": 295698432, "step": 2256 }, { "epoch": 0.38627765309394035, "grad_norm": 0.7008263468742371, "learning_rate": 0.00010804307832021618, "loss": 5.5767, "num_input_tokens_seen": 
296091648, "step": 2259 }, { "epoch": 0.386790638025008, "grad_norm": 0.7309878468513489, "learning_rate": 0.0001079714079452843, "loss": 5.5122, "num_input_tokens_seen": 296484864, "step": 2262 }, { "epoch": 0.3873036229560757, "grad_norm": 0.9144716858863831, "learning_rate": 0.000107899880009017, "loss": 5.57, "num_input_tokens_seen": 296878080, "step": 2265 }, { "epoch": 0.38781660788714334, "grad_norm": 0.961901068687439, "learning_rate": 0.00010782849404023096, "loss": 5.5156, "num_input_tokens_seen": 297271296, "step": 2268 }, { "epoch": 0.38832959281821094, "grad_norm": 0.7054316997528076, "learning_rate": 0.00010775724956992224, "loss": 5.5626, "num_input_tokens_seen": 297664512, "step": 2271 }, { "epoch": 0.3888425777492786, "grad_norm": 0.8600339889526367, "learning_rate": 0.00010768614613125303, "loss": 5.5521, "num_input_tokens_seen": 298057728, "step": 2274 }, { "epoch": 0.38935556268034627, "grad_norm": 0.7894279956817627, "learning_rate": 0.0001076151832595391, "loss": 5.5412, "num_input_tokens_seen": 298450944, "step": 2277 }, { "epoch": 0.38986854761141393, "grad_norm": 0.7673327922821045, "learning_rate": 0.0001075443604922369, "loss": 5.584, "num_input_tokens_seen": 298844160, "step": 2280 }, { "epoch": 0.3903815325424816, "grad_norm": 0.7792801856994629, "learning_rate": 0.00010747367736893089, "loss": 5.5592, "num_input_tokens_seen": 299237376, "step": 2283 }, { "epoch": 0.3908945174735492, "grad_norm": 0.8032937049865723, "learning_rate": 0.00010740313343132098, "loss": 5.5543, "num_input_tokens_seen": 299630592, "step": 2286 }, { "epoch": 0.39140750240461686, "grad_norm": 0.731970489025116, "learning_rate": 0.00010733272822321011, "loss": 5.5259, "num_input_tokens_seen": 300023808, "step": 2289 }, { "epoch": 0.3919204873356845, "grad_norm": 0.7217367887496948, "learning_rate": 0.00010726246129049176, "loss": 5.5442, "num_input_tokens_seen": 300417024, "step": 2292 }, { "epoch": 0.3924334722667522, "grad_norm": 0.7392825484275818, "learning_rate": 0.00010719233218113771, "loss": 5.5274, "num_input_tokens_seen": 300810240, "step": 2295 }, { "epoch": 0.3929464571978198, "grad_norm": 0.7724013924598694, "learning_rate": 0.00010712234044518587, "loss": 5.5069, "num_input_tokens_seen": 301203456, "step": 2298 }, { "epoch": 0.39345944212888745, "grad_norm": 0.709718644618988, "learning_rate": 0.00010705248563472809, "loss": 5.5211, "num_input_tokens_seen": 301596672, "step": 2301 }, { "epoch": 0.3939724270599551, "grad_norm": 0.7409883141517639, "learning_rate": 0.00010698276730389805, "loss": 5.5102, "num_input_tokens_seen": 301989888, "step": 2304 }, { "epoch": 0.39448541199102277, "grad_norm": 0.8129496574401855, "learning_rate": 0.0001069131850088595, "loss": 5.5545, "num_input_tokens_seen": 302383104, "step": 2307 }, { "epoch": 0.39499839692209043, "grad_norm": 1.0027920007705688, "learning_rate": 0.00010684373830779422, "loss": 5.5445, "num_input_tokens_seen": 302776320, "step": 2310 }, { "epoch": 0.39551138185315804, "grad_norm": 0.8895756006240845, "learning_rate": 0.0001067744267608903, "loss": 5.5625, "num_input_tokens_seen": 303169536, "step": 2313 }, { "epoch": 0.3960243667842257, "grad_norm": 0.8327840566635132, "learning_rate": 0.00010670524993033049, "loss": 5.5472, "num_input_tokens_seen": 303562752, "step": 2316 }, { "epoch": 0.39653735171529336, "grad_norm": 1.1410460472106934, "learning_rate": 0.00010663620738028051, "loss": 5.5659, "num_input_tokens_seen": 303955968, "step": 2319 }, { "epoch": 0.397050336646361, "grad_norm": 0.889072835445404, 
"learning_rate": 0.0001065672986768775, "loss": 5.5594, "num_input_tokens_seen": 304349184, "step": 2322 }, { "epoch": 0.3975633215774287, "grad_norm": 0.8344824910163879, "learning_rate": 0.0001064985233882187, "loss": 5.553, "num_input_tokens_seen": 304742400, "step": 2325 }, { "epoch": 0.3980763065084963, "grad_norm": 0.7969459891319275, "learning_rate": 0.00010642988108434991, "loss": 5.5389, "num_input_tokens_seen": 305135616, "step": 2328 }, { "epoch": 0.39858929143956395, "grad_norm": 0.9471580386161804, "learning_rate": 0.00010636137133725434, "loss": 5.5615, "num_input_tokens_seen": 305528832, "step": 2331 }, { "epoch": 0.3991022763706316, "grad_norm": 0.8734350204467773, "learning_rate": 0.00010629299372084134, "loss": 5.5455, "num_input_tokens_seen": 305922048, "step": 2334 }, { "epoch": 0.3996152613016993, "grad_norm": 0.8351041078567505, "learning_rate": 0.00010622474781093524, "loss": 5.5332, "num_input_tokens_seen": 306315264, "step": 2337 }, { "epoch": 0.40012824623276694, "grad_norm": 0.8178178668022156, "learning_rate": 0.00010615663318526436, "loss": 5.5456, "num_input_tokens_seen": 306708480, "step": 2340 }, { "epoch": 0.40064123116383454, "grad_norm": 0.8616231679916382, "learning_rate": 0.00010608864942345, "loss": 5.5559, "num_input_tokens_seen": 307101696, "step": 2343 }, { "epoch": 0.4011542160949022, "grad_norm": 0.9160477519035339, "learning_rate": 0.00010602079610699554, "loss": 5.5369, "num_input_tokens_seen": 307494912, "step": 2346 }, { "epoch": 0.40166720102596987, "grad_norm": 0.8481185436248779, "learning_rate": 0.00010595307281927571, "loss": 5.5697, "num_input_tokens_seen": 307888128, "step": 2349 }, { "epoch": 0.40218018595703753, "grad_norm": 0.7999601364135742, "learning_rate": 0.00010588547914552566, "loss": 5.5475, "num_input_tokens_seen": 308281344, "step": 2352 }, { "epoch": 0.40269317088810513, "grad_norm": 0.8336549997329712, "learning_rate": 0.00010581801467283045, "loss": 5.5177, "num_input_tokens_seen": 308674560, "step": 2355 }, { "epoch": 0.4032061558191728, "grad_norm": 0.9885947108268738, "learning_rate": 0.00010575067899011441, "loss": 5.5241, "num_input_tokens_seen": 309067776, "step": 2358 }, { "epoch": 0.40371914075024046, "grad_norm": 0.811789870262146, "learning_rate": 0.00010568347168813064, "loss": 5.5635, "num_input_tokens_seen": 309460992, "step": 2361 }, { "epoch": 0.4042321256813081, "grad_norm": 0.7453095316886902, "learning_rate": 0.00010561639235945043, "loss": 5.5208, "num_input_tokens_seen": 309854208, "step": 2364 }, { "epoch": 0.4047451106123758, "grad_norm": 0.7175402045249939, "learning_rate": 0.00010554944059845314, "loss": 5.5112, "num_input_tokens_seen": 310247424, "step": 2367 }, { "epoch": 0.4052580955434434, "grad_norm": 0.7702206373214722, "learning_rate": 0.00010548261600131565, "loss": 5.5175, "num_input_tokens_seen": 310640640, "step": 2370 }, { "epoch": 0.40577108047451105, "grad_norm": 0.7227572798728943, "learning_rate": 0.00010541591816600227, "loss": 5.5596, "num_input_tokens_seen": 311033856, "step": 2373 }, { "epoch": 0.4062840654055787, "grad_norm": 0.8014532327651978, "learning_rate": 0.00010534934669225456, "loss": 5.4984, "num_input_tokens_seen": 311427072, "step": 2376 }, { "epoch": 0.40679705033664637, "grad_norm": 0.867141604423523, "learning_rate": 0.0001052829011815812, "loss": 5.5651, "num_input_tokens_seen": 311820288, "step": 2379 }, { "epoch": 0.40731003526771403, "grad_norm": 0.6916822791099548, "learning_rate": 0.00010521658123724799, "loss": 5.5142, "num_input_tokens_seen": 312213504, 
"step": 2382 }, { "epoch": 0.40782302019878164, "grad_norm": 0.7513076066970825, "learning_rate": 0.00010515038646426796, "loss": 5.5373, "num_input_tokens_seen": 312606720, "step": 2385 }, { "epoch": 0.4083360051298493, "grad_norm": 0.7861223220825195, "learning_rate": 0.00010508431646939135, "loss": 5.5649, "num_input_tokens_seen": 312999936, "step": 2388 }, { "epoch": 0.40884899006091696, "grad_norm": 0.7609034180641174, "learning_rate": 0.00010501837086109599, "loss": 5.5171, "num_input_tokens_seen": 313393152, "step": 2391 }, { "epoch": 0.4093619749919846, "grad_norm": 0.6941331624984741, "learning_rate": 0.00010495254924957736, "loss": 5.5279, "num_input_tokens_seen": 313786368, "step": 2394 }, { "epoch": 0.4098749599230523, "grad_norm": 0.6960221529006958, "learning_rate": 0.00010488685124673906, "loss": 5.53, "num_input_tokens_seen": 314179584, "step": 2397 }, { "epoch": 0.4103879448541199, "grad_norm": 0.7227168083190918, "learning_rate": 0.00010482127646618314, "loss": 5.5014, "num_input_tokens_seen": 314572800, "step": 2400 }, { "epoch": 0.4103879448541199, "eval_accuracy": 0.16866471258752647, "eval_loss": 5.9827094078063965, "eval_runtime": 115.7853, "eval_samples_per_second": 2.591, "eval_steps_per_second": 1.296, "num_input_tokens_seen": 314572800, "step": 2400 }, { "epoch": 0.41090092978518755, "grad_norm": 0.7502657771110535, "learning_rate": 0.00010475582452320052, "loss": 5.5249, "num_input_tokens_seen": 314966016, "step": 2403 }, { "epoch": 0.4114139147162552, "grad_norm": 0.709670901298523, "learning_rate": 0.00010469049503476158, "loss": 5.5021, "num_input_tokens_seen": 315359232, "step": 2406 }, { "epoch": 0.4119268996473229, "grad_norm": 0.8333126902580261, "learning_rate": 0.00010462528761950672, "loss": 5.5293, "num_input_tokens_seen": 315752448, "step": 2409 }, { "epoch": 0.41243988457839054, "grad_norm": 0.7035155296325684, "learning_rate": 0.00010456020189773697, "loss": 5.5508, "num_input_tokens_seen": 316145664, "step": 2412 }, { "epoch": 0.41295286950945814, "grad_norm": 0.7859997749328613, "learning_rate": 0.00010449523749140482, "loss": 5.5175, "num_input_tokens_seen": 316538880, "step": 2415 }, { "epoch": 0.4134658544405258, "grad_norm": 0.7992687821388245, "learning_rate": 0.00010443039402410475, "loss": 5.5136, "num_input_tokens_seen": 316932096, "step": 2418 }, { "epoch": 0.41397883937159347, "grad_norm": 0.8525195717811584, "learning_rate": 0.00010436567112106444, "loss": 5.54, "num_input_tokens_seen": 317325312, "step": 2421 }, { "epoch": 0.4144918243026611, "grad_norm": 0.6531423330307007, "learning_rate": 0.00010430106840913532, "loss": 5.4994, "num_input_tokens_seen": 317718528, "step": 2424 }, { "epoch": 0.41500480923372873, "grad_norm": 0.6764413714408875, "learning_rate": 0.00010423658551678376, "loss": 5.496, "num_input_tokens_seen": 318111744, "step": 2427 }, { "epoch": 0.4155177941647964, "grad_norm": 0.6776644587516785, "learning_rate": 0.00010417222207408196, "loss": 5.5749, "num_input_tokens_seen": 318504960, "step": 2430 }, { "epoch": 0.41603077909586406, "grad_norm": 0.790438711643219, "learning_rate": 0.00010410797771269917, "loss": 5.5339, "num_input_tokens_seen": 318898176, "step": 2433 }, { "epoch": 0.4165437640269317, "grad_norm": 0.8480133414268494, "learning_rate": 0.00010404385206589268, "loss": 5.5411, "num_input_tokens_seen": 319291392, "step": 2436 }, { "epoch": 0.4170567489579994, "grad_norm": 0.832699716091156, "learning_rate": 0.00010397984476849915, "loss": 5.5524, "num_input_tokens_seen": 319684608, "step": 2439 }, { 
"epoch": 0.417569733889067, "grad_norm": 0.8338193297386169, "learning_rate": 0.00010391595545692583, "loss": 5.4927, "num_input_tokens_seen": 320077824, "step": 2442 }, { "epoch": 0.41808271882013465, "grad_norm": 0.7665431499481201, "learning_rate": 0.00010385218376914195, "loss": 5.5396, "num_input_tokens_seen": 320471040, "step": 2445 }, { "epoch": 0.4185957037512023, "grad_norm": 0.719890832901001, "learning_rate": 0.00010378852934466992, "loss": 5.5215, "num_input_tokens_seen": 320864256, "step": 2448 }, { "epoch": 0.41910868868226997, "grad_norm": 0.7127965688705444, "learning_rate": 0.000103724991824577, "loss": 5.517, "num_input_tokens_seen": 321257472, "step": 2451 }, { "epoch": 0.41962167361333763, "grad_norm": 0.841174840927124, "learning_rate": 0.00010366157085146666, "loss": 5.5549, "num_input_tokens_seen": 321650688, "step": 2454 }, { "epoch": 0.42013465854440524, "grad_norm": 0.9073885083198547, "learning_rate": 0.00010359826606947015, "loss": 5.5391, "num_input_tokens_seen": 322043904, "step": 2457 }, { "epoch": 0.4206476434754729, "grad_norm": 0.7966265082359314, "learning_rate": 0.00010353507712423819, "loss": 5.5094, "num_input_tokens_seen": 322437120, "step": 2460 }, { "epoch": 0.42116062840654056, "grad_norm": 0.7716068029403687, "learning_rate": 0.00010347200366293252, "loss": 5.5039, "num_input_tokens_seen": 322830336, "step": 2463 }, { "epoch": 0.4216736133376082, "grad_norm": 1.1247206926345825, "learning_rate": 0.00010340904533421777, "loss": 5.5764, "num_input_tokens_seen": 323223552, "step": 2466 }, { "epoch": 0.4221865982686759, "grad_norm": 1.0969719886779785, "learning_rate": 0.00010334620178825307, "loss": 5.5091, "num_input_tokens_seen": 323616768, "step": 2469 }, { "epoch": 0.4226995831997435, "grad_norm": 0.7282251119613647, "learning_rate": 0.00010328347267668404, "loss": 5.5254, "num_input_tokens_seen": 324009984, "step": 2472 }, { "epoch": 0.42321256813081115, "grad_norm": 0.7151831984519958, "learning_rate": 0.0001032208576526346, "loss": 5.5016, "num_input_tokens_seen": 324403200, "step": 2475 }, { "epoch": 0.4237255530618788, "grad_norm": 0.7762022018432617, "learning_rate": 0.0001031583563706989, "loss": 5.4836, "num_input_tokens_seen": 324796416, "step": 2478 }, { "epoch": 0.4242385379929465, "grad_norm": 0.7001504898071289, "learning_rate": 0.00010309596848693339, "loss": 5.5204, "num_input_tokens_seen": 325189632, "step": 2481 }, { "epoch": 0.4247515229240141, "grad_norm": 0.8730801343917847, "learning_rate": 0.00010303369365884883, "loss": 5.5267, "num_input_tokens_seen": 325582848, "step": 2484 }, { "epoch": 0.42526450785508174, "grad_norm": 0.9579359889030457, "learning_rate": 0.00010297153154540234, "loss": 5.4667, "num_input_tokens_seen": 325976064, "step": 2487 }, { "epoch": 0.4257774927861494, "grad_norm": 0.7636502385139465, "learning_rate": 0.00010290948180698962, "loss": 5.5169, "num_input_tokens_seen": 326369280, "step": 2490 }, { "epoch": 0.42629047771721706, "grad_norm": 0.7500234246253967, "learning_rate": 0.00010284754410543722, "loss": 5.4919, "num_input_tokens_seen": 326762496, "step": 2493 }, { "epoch": 0.4268034626482847, "grad_norm": 0.9112027883529663, "learning_rate": 0.0001027857181039946, "loss": 5.5179, "num_input_tokens_seen": 327155712, "step": 2496 }, { "epoch": 0.42731644757935233, "grad_norm": 0.820213794708252, "learning_rate": 0.00010272400346732667, "loss": 5.5183, "num_input_tokens_seen": 327548928, "step": 2499 }, { "epoch": 0.42782943251042, "grad_norm": 0.8203981518745422, "learning_rate": 
0.00010266239986150597, "loss": 5.532, "num_input_tokens_seen": 327942144, "step": 2502 }, { "epoch": 0.42834241744148766, "grad_norm": 0.7337089776992798, "learning_rate": 0.00010260090695400518, "loss": 5.4943, "num_input_tokens_seen": 328335360, "step": 2505 }, { "epoch": 0.4288554023725553, "grad_norm": 0.7428235411643982, "learning_rate": 0.00010253952441368959, "loss": 5.4861, "num_input_tokens_seen": 328728576, "step": 2508 }, { "epoch": 0.429368387303623, "grad_norm": 0.7361696362495422, "learning_rate": 0.00010247825191080954, "loss": 5.5521, "num_input_tokens_seen": 329121792, "step": 2511 }, { "epoch": 0.4298813722346906, "grad_norm": 0.6974912285804749, "learning_rate": 0.00010241708911699302, "loss": 5.456, "num_input_tokens_seen": 329515008, "step": 2514 }, { "epoch": 0.43039435716575825, "grad_norm": 0.8302775621414185, "learning_rate": 0.00010235603570523828, "loss": 5.507, "num_input_tokens_seen": 329908224, "step": 2517 }, { "epoch": 0.4309073420968259, "grad_norm": 0.8034529089927673, "learning_rate": 0.00010229509134990649, "loss": 5.4999, "num_input_tokens_seen": 330301440, "step": 2520 }, { "epoch": 0.43142032702789357, "grad_norm": 0.812213122844696, "learning_rate": 0.00010223425572671442, "loss": 5.5129, "num_input_tokens_seen": 330694656, "step": 2523 }, { "epoch": 0.43193331195896123, "grad_norm": 0.9187823534011841, "learning_rate": 0.00010217352851272726, "loss": 5.5116, "num_input_tokens_seen": 331087872, "step": 2526 }, { "epoch": 0.43244629689002884, "grad_norm": 0.9143493175506592, "learning_rate": 0.00010211290938635132, "loss": 5.4893, "num_input_tokens_seen": 331481088, "step": 2529 }, { "epoch": 0.4329592818210965, "grad_norm": 0.6335490942001343, "learning_rate": 0.00010205239802732692, "loss": 5.4349, "num_input_tokens_seen": 331874304, "step": 2532 }, { "epoch": 0.43347226675216416, "grad_norm": 0.7708688974380493, "learning_rate": 0.00010199199411672136, "loss": 5.4729, "num_input_tokens_seen": 332267520, "step": 2535 }, { "epoch": 0.4339852516832318, "grad_norm": 0.7687310576438904, "learning_rate": 0.00010193169733692172, "loss": 5.5084, "num_input_tokens_seen": 332660736, "step": 2538 }, { "epoch": 0.4344982366142995, "grad_norm": 0.8705409169197083, "learning_rate": 0.00010187150737162795, "loss": 5.5312, "num_input_tokens_seen": 333053952, "step": 2541 }, { "epoch": 0.4350112215453671, "grad_norm": 0.7544601559638977, "learning_rate": 0.00010181142390584588, "loss": 5.5007, "num_input_tokens_seen": 333447168, "step": 2544 }, { "epoch": 0.43552420647643475, "grad_norm": 0.7180626392364502, "learning_rate": 0.00010175144662588028, "loss": 5.5181, "num_input_tokens_seen": 333840384, "step": 2547 }, { "epoch": 0.4360371914075024, "grad_norm": 0.8776599764823914, "learning_rate": 0.00010169157521932794, "loss": 5.5253, "num_input_tokens_seen": 334233600, "step": 2550 }, { "epoch": 0.4365501763385701, "grad_norm": 0.8420502543449402, "learning_rate": 0.00010163180937507096, "loss": 5.4906, "num_input_tokens_seen": 334626816, "step": 2553 }, { "epoch": 0.4370631612696377, "grad_norm": 0.7820854783058167, "learning_rate": 0.00010157214878326983, "loss": 5.4878, "num_input_tokens_seen": 335020032, "step": 2556 }, { "epoch": 0.43757614620070534, "grad_norm": 0.7041083574295044, "learning_rate": 0.00010151259313535675, "loss": 5.5047, "num_input_tokens_seen": 335413248, "step": 2559 }, { "epoch": 0.438089131131773, "grad_norm": 0.7176365852355957, "learning_rate": 0.00010145314212402889, "loss": 5.4753, "num_input_tokens_seen": 335806464, "step": 2562 }, 
{ "epoch": 0.43860211606284066, "grad_norm": 0.6593925356864929, "learning_rate": 0.00010139379544324182, "loss": 5.5398, "num_input_tokens_seen": 336199680, "step": 2565 }, { "epoch": 0.4391151009939083, "grad_norm": 0.8498782515525818, "learning_rate": 0.00010133455278820273, "loss": 5.5204, "num_input_tokens_seen": 336592896, "step": 2568 }, { "epoch": 0.43962808592497593, "grad_norm": 0.7824472188949585, "learning_rate": 0.00010127541385536402, "loss": 5.4865, "num_input_tokens_seen": 336986112, "step": 2571 }, { "epoch": 0.4401410708560436, "grad_norm": 0.7467549443244934, "learning_rate": 0.00010121637834241672, "loss": 5.4581, "num_input_tokens_seen": 337379328, "step": 2574 }, { "epoch": 0.44065405578711125, "grad_norm": 0.8907204866409302, "learning_rate": 0.00010115744594828388, "loss": 5.5488, "num_input_tokens_seen": 337772544, "step": 2577 }, { "epoch": 0.4411670407181789, "grad_norm": 0.9740023016929626, "learning_rate": 0.00010109861637311432, "loss": 5.5207, "num_input_tokens_seen": 338165760, "step": 2580 }, { "epoch": 0.4416800256492466, "grad_norm": 0.9154981970787048, "learning_rate": 0.00010103988931827606, "loss": 5.4704, "num_input_tokens_seen": 338558976, "step": 2583 }, { "epoch": 0.4421930105803142, "grad_norm": 0.8180415630340576, "learning_rate": 0.00010098126448635004, "loss": 5.5134, "num_input_tokens_seen": 338952192, "step": 2586 }, { "epoch": 0.44270599551138184, "grad_norm": 0.8424299359321594, "learning_rate": 0.00010092274158112377, "loss": 5.5021, "num_input_tokens_seen": 339345408, "step": 2589 }, { "epoch": 0.4432189804424495, "grad_norm": 0.7481082081794739, "learning_rate": 0.00010086432030758502, "loss": 5.4675, "num_input_tokens_seen": 339738624, "step": 2592 }, { "epoch": 0.44373196537351717, "grad_norm": 0.9558732509613037, "learning_rate": 0.00010080600037191566, "loss": 5.4996, "num_input_tokens_seen": 340131840, "step": 2595 }, { "epoch": 0.44424495030458483, "grad_norm": 0.9134954810142517, "learning_rate": 0.00010074778148148528, "loss": 5.5097, "num_input_tokens_seen": 340525056, "step": 2598 }, { "epoch": 0.44475793523565244, "grad_norm": 0.7287417054176331, "learning_rate": 0.00010068966334484521, "loss": 5.5043, "num_input_tokens_seen": 340918272, "step": 2601 }, { "epoch": 0.4452709201667201, "grad_norm": 0.9164344668388367, "learning_rate": 0.00010063164567172234, "loss": 5.5109, "num_input_tokens_seen": 341311488, "step": 2604 }, { "epoch": 0.44578390509778776, "grad_norm": 0.7782894968986511, "learning_rate": 0.00010057372817301295, "loss": 5.5104, "num_input_tokens_seen": 341704704, "step": 2607 }, { "epoch": 0.4462968900288554, "grad_norm": 0.8201740980148315, "learning_rate": 0.00010051591056077674, "loss": 5.5767, "num_input_tokens_seen": 342097920, "step": 2610 }, { "epoch": 0.446809874959923, "grad_norm": 0.8876894116401672, "learning_rate": 0.00010045819254823074, "loss": 5.4695, "num_input_tokens_seen": 342491136, "step": 2613 }, { "epoch": 0.4473228598909907, "grad_norm": 0.708038866519928, "learning_rate": 0.0001004005738497435, "loss": 5.4691, "num_input_tokens_seen": 342884352, "step": 2616 }, { "epoch": 0.44783584482205835, "grad_norm": 0.79575514793396, "learning_rate": 0.0001003430541808289, "loss": 5.4651, "num_input_tokens_seen": 343277568, "step": 2619 }, { "epoch": 0.448348829753126, "grad_norm": 0.8515266180038452, "learning_rate": 0.00010028563325814057, "loss": 5.4751, "num_input_tokens_seen": 343670784, "step": 2622 }, { "epoch": 0.44886181468419367, "grad_norm": 0.8329154253005981, "learning_rate": 
0.00010022831079946566, "loss": 5.5093, "num_input_tokens_seen": 344064000, "step": 2625 }, { "epoch": 0.4493747996152613, "grad_norm": 0.871041476726532, "learning_rate": 0.00010017108652371934, "loss": 5.4697, "num_input_tokens_seen": 344457216, "step": 2628 }, { "epoch": 0.44988778454632894, "grad_norm": 0.649675190448761, "learning_rate": 0.0001001139601509388, "loss": 5.4813, "num_input_tokens_seen": 344850432, "step": 2631 }, { "epoch": 0.4504007694773966, "grad_norm": 0.7647591829299927, "learning_rate": 0.00010005693140227763, "loss": 5.4817, "num_input_tokens_seen": 345243648, "step": 2634 }, { "epoch": 0.45091375440846426, "grad_norm": 0.8665961027145386, "learning_rate": 9.999999999999999e-05, "loss": 5.4681, "num_input_tokens_seen": 345636864, "step": 2637 }, { "epoch": 0.4514267393395319, "grad_norm": 0.7397317290306091, "learning_rate": 9.994316566747503e-05, "loss": 5.4471, "num_input_tokens_seen": 346030080, "step": 2640 }, { "epoch": 0.45193972427059953, "grad_norm": 0.7339237928390503, "learning_rate": 9.988642812917122e-05, "loss": 5.4694, "num_input_tokens_seen": 346423296, "step": 2643 }, { "epoch": 0.4524527092016672, "grad_norm": 0.7354670166969299, "learning_rate": 9.98297871106506e-05, "loss": 5.4715, "num_input_tokens_seen": 346816512, "step": 2646 }, { "epoch": 0.45296569413273485, "grad_norm": 0.7800298929214478, "learning_rate": 9.977324233856346e-05, "loss": 5.5178, "num_input_tokens_seen": 347209728, "step": 2649 }, { "epoch": 0.4534786790638025, "grad_norm": 0.7750239968299866, "learning_rate": 9.971679354064264e-05, "loss": 5.4624, "num_input_tokens_seen": 347602944, "step": 2652 }, { "epoch": 0.4539916639948702, "grad_norm": 0.8229271173477173, "learning_rate": 9.966044044569793e-05, "loss": 5.5072, "num_input_tokens_seen": 347996160, "step": 2655 }, { "epoch": 0.4545046489259378, "grad_norm": 0.7565569877624512, "learning_rate": 9.960418278361088e-05, "loss": 5.5323, "num_input_tokens_seen": 348389376, "step": 2658 }, { "epoch": 0.45501763385700544, "grad_norm": 0.7968035936355591, "learning_rate": 9.954802028532911e-05, "loss": 5.4457, "num_input_tokens_seen": 348782592, "step": 2661 }, { "epoch": 0.4555306187880731, "grad_norm": 1.092264175415039, "learning_rate": 9.949195268286099e-05, "loss": 5.5038, "num_input_tokens_seen": 349175808, "step": 2664 }, { "epoch": 0.45604360371914077, "grad_norm": 0.9976704120635986, "learning_rate": 9.943597970927025e-05, "loss": 5.4596, "num_input_tokens_seen": 349569024, "step": 2667 }, { "epoch": 0.4565565886502084, "grad_norm": 0.8629297018051147, "learning_rate": 9.938010109867075e-05, "loss": 5.4183, "num_input_tokens_seen": 349962240, "step": 2670 }, { "epoch": 0.45706957358127603, "grad_norm": 0.9318130016326904, "learning_rate": 9.932431658622104e-05, "loss": 5.5005, "num_input_tokens_seen": 350355456, "step": 2673 }, { "epoch": 0.4575825585123437, "grad_norm": 0.7205513715744019, "learning_rate": 9.926862590811912e-05, "loss": 5.4921, "num_input_tokens_seen": 350748672, "step": 2676 }, { "epoch": 0.45809554344341136, "grad_norm": 0.8188350200653076, "learning_rate": 9.921302880159722e-05, "loss": 5.4676, "num_input_tokens_seen": 351141888, "step": 2679 }, { "epoch": 0.458608528374479, "grad_norm": 0.7740142941474915, "learning_rate": 9.915752500491666e-05, "loss": 5.4431, "num_input_tokens_seen": 351535104, "step": 2682 }, { "epoch": 0.4591215133055466, "grad_norm": 0.7324191331863403, "learning_rate": 9.910211425736248e-05, "loss": 5.4386, "num_input_tokens_seen": 351928320, "step": 2685 }, { "epoch": 
0.4596344982366143, "grad_norm": 0.7066339254379272, "learning_rate": 9.904679629923856e-05, "loss": 5.5035, "num_input_tokens_seen": 352321536, "step": 2688 }, { "epoch": 0.46014748316768195, "grad_norm": 0.7234504222869873, "learning_rate": 9.899157087186225e-05, "loss": 5.4922, "num_input_tokens_seen": 352714752, "step": 2691 }, { "epoch": 0.4606604680987496, "grad_norm": 0.7445735335350037, "learning_rate": 9.893643771755952e-05, "loss": 5.4954, "num_input_tokens_seen": 353107968, "step": 2694 }, { "epoch": 0.46117345302981727, "grad_norm": 0.7736021876335144, "learning_rate": 9.88813965796597e-05, "loss": 5.4861, "num_input_tokens_seen": 353501184, "step": 2697 }, { "epoch": 0.4616864379608849, "grad_norm": 0.8421680927276611, "learning_rate": 9.882644720249061e-05, "loss": 5.4398, "num_input_tokens_seen": 353894400, "step": 2700 }, { "epoch": 0.46219942289195254, "grad_norm": 0.8024502992630005, "learning_rate": 9.877158933137354e-05, "loss": 5.4792, "num_input_tokens_seen": 354287616, "step": 2703 }, { "epoch": 0.4627124078230202, "grad_norm": 0.6904874444007874, "learning_rate": 9.871682271261825e-05, "loss": 5.4698, "num_input_tokens_seen": 354680832, "step": 2706 }, { "epoch": 0.46322539275408786, "grad_norm": 0.7565279603004456, "learning_rate": 9.866214709351803e-05, "loss": 5.4867, "num_input_tokens_seen": 355074048, "step": 2709 }, { "epoch": 0.4637383776851555, "grad_norm": 0.7363823056221008, "learning_rate": 9.860756222234493e-05, "loss": 5.5109, "num_input_tokens_seen": 355467264, "step": 2712 }, { "epoch": 0.46425136261622313, "grad_norm": 0.74873948097229, "learning_rate": 9.855306784834474e-05, "loss": 5.4505, "num_input_tokens_seen": 355860480, "step": 2715 }, { "epoch": 0.4647643475472908, "grad_norm": 0.7366782426834106, "learning_rate": 9.849866372173222e-05, "loss": 5.5185, "num_input_tokens_seen": 356253696, "step": 2718 }, { "epoch": 0.46527733247835845, "grad_norm": 0.8568825125694275, "learning_rate": 9.84443495936863e-05, "loss": 5.4669, "num_input_tokens_seen": 356646912, "step": 2721 }, { "epoch": 0.4657903174094261, "grad_norm": 0.7414649724960327, "learning_rate": 9.839012521634527e-05, "loss": 5.4915, "num_input_tokens_seen": 357040128, "step": 2724 }, { "epoch": 0.4663033023404938, "grad_norm": 0.8652457594871521, "learning_rate": 9.83359903428021e-05, "loss": 5.4666, "num_input_tokens_seen": 357433344, "step": 2727 }, { "epoch": 0.4668162872715614, "grad_norm": 0.6931141018867493, "learning_rate": 9.828194472709959e-05, "loss": 5.4974, "num_input_tokens_seen": 357826560, "step": 2730 }, { "epoch": 0.46732927220262904, "grad_norm": 0.8068703413009644, "learning_rate": 9.822798812422577e-05, "loss": 5.4778, "num_input_tokens_seen": 358219776, "step": 2733 }, { "epoch": 0.4678422571336967, "grad_norm": 0.7154794931411743, "learning_rate": 9.817412029010924e-05, "loss": 5.4866, "num_input_tokens_seen": 358612992, "step": 2736 }, { "epoch": 0.46835524206476437, "grad_norm": 0.6981579065322876, "learning_rate": 9.81203409816145e-05, "loss": 5.4718, "num_input_tokens_seen": 359006208, "step": 2739 }, { "epoch": 0.46886822699583197, "grad_norm": 0.8242044448852539, "learning_rate": 9.806664995653737e-05, "loss": 5.4839, "num_input_tokens_seen": 359399424, "step": 2742 }, { "epoch": 0.46938121192689963, "grad_norm": 0.7493621706962585, "learning_rate": 9.80130469736003e-05, "loss": 5.4704, "num_input_tokens_seen": 359792640, "step": 2745 }, { "epoch": 0.4698941968579673, "grad_norm": 0.7475427389144897, "learning_rate": 9.7959531792448e-05, "loss": 5.4021, 
"num_input_tokens_seen": 360185856, "step": 2748 }, { "epoch": 0.47040718178903496, "grad_norm": 0.7679263353347778, "learning_rate": 9.79061041736428e-05, "loss": 5.5059, "num_input_tokens_seen": 360579072, "step": 2751 }, { "epoch": 0.4709201667201026, "grad_norm": 0.7734596729278564, "learning_rate": 9.785276387866011e-05, "loss": 5.4497, "num_input_tokens_seen": 360972288, "step": 2754 }, { "epoch": 0.4714331516511702, "grad_norm": 0.7543107867240906, "learning_rate": 9.779951066988407e-05, "loss": 5.4706, "num_input_tokens_seen": 361365504, "step": 2757 }, { "epoch": 0.4719461365822379, "grad_norm": 0.6790075898170471, "learning_rate": 9.774634431060301e-05, "loss": 5.4785, "num_input_tokens_seen": 361758720, "step": 2760 }, { "epoch": 0.47245912151330555, "grad_norm": 0.7988852262496948, "learning_rate": 9.769326456500506e-05, "loss": 5.4941, "num_input_tokens_seen": 362151936, "step": 2763 }, { "epoch": 0.4729721064443732, "grad_norm": 0.6978922486305237, "learning_rate": 9.76402711981738e-05, "loss": 5.4991, "num_input_tokens_seen": 362545152, "step": 2766 }, { "epoch": 0.47348509137544087, "grad_norm": 0.7577718496322632, "learning_rate": 9.758736397608374e-05, "loss": 5.4926, "num_input_tokens_seen": 362938368, "step": 2769 }, { "epoch": 0.4739980763065085, "grad_norm": 0.6399986147880554, "learning_rate": 9.753454266559622e-05, "loss": 5.4783, "num_input_tokens_seen": 363331584, "step": 2772 }, { "epoch": 0.47451106123757614, "grad_norm": 0.7283456921577454, "learning_rate": 9.74818070344549e-05, "loss": 5.5014, "num_input_tokens_seen": 363724800, "step": 2775 }, { "epoch": 0.4750240461686438, "grad_norm": 0.8536427021026611, "learning_rate": 9.742915685128152e-05, "loss": 5.5269, "num_input_tokens_seen": 364118016, "step": 2778 }, { "epoch": 0.47553703109971146, "grad_norm": 0.6984924077987671, "learning_rate": 9.737659188557171e-05, "loss": 5.467, "num_input_tokens_seen": 364511232, "step": 2781 }, { "epoch": 0.4760500160307791, "grad_norm": 0.8646853566169739, "learning_rate": 9.732411190769063e-05, "loss": 5.4627, "num_input_tokens_seen": 364904448, "step": 2784 }, { "epoch": 0.47656300096184673, "grad_norm": 0.7577709555625916, "learning_rate": 9.727171668886887e-05, "loss": 5.4473, "num_input_tokens_seen": 365297664, "step": 2787 }, { "epoch": 0.4770759858929144, "grad_norm": 0.7367388010025024, "learning_rate": 9.721940600119815e-05, "loss": 5.4691, "num_input_tokens_seen": 365690880, "step": 2790 }, { "epoch": 0.47758897082398205, "grad_norm": 0.7202689051628113, "learning_rate": 9.716717961762733e-05, "loss": 5.5151, "num_input_tokens_seen": 366084096, "step": 2793 }, { "epoch": 0.4781019557550497, "grad_norm": 0.7965312004089355, "learning_rate": 9.711503731195804e-05, "loss": 5.4734, "num_input_tokens_seen": 366477312, "step": 2796 }, { "epoch": 0.4786149406861173, "grad_norm": 0.7794646620750427, "learning_rate": 9.706297885884074e-05, "loss": 5.4882, "num_input_tokens_seen": 366870528, "step": 2799 }, { "epoch": 0.47878593566313987, "eval_accuracy": 0.17308907344080768, "eval_loss": 5.920318603515625, "eval_runtime": 109.4183, "eval_samples_per_second": 2.742, "eval_steps_per_second": 1.371, "num_input_tokens_seen": 367001600, "step": 2800 }, { "epoch": 0.479127925617185, "grad_norm": 0.6703746914863586, "learning_rate": 9.701100403377059e-05, "loss": 5.5227, "num_input_tokens_seen": 367263744, "step": 2802 }, { "epoch": 0.47964091054825264, "grad_norm": 0.649358868598938, "learning_rate": 9.695911261308335e-05, "loss": 5.5274, "num_input_tokens_seen": 367656960, 
"step": 2805 }, { "epoch": 0.4801538954793203, "grad_norm": 0.7234277725219727, "learning_rate": 9.69073043739513e-05, "loss": 5.4809, "num_input_tokens_seen": 368050176, "step": 2808 }, { "epoch": 0.48066688041038796, "grad_norm": 0.8192933797836304, "learning_rate": 9.685557909437936e-05, "loss": 5.4696, "num_input_tokens_seen": 368443392, "step": 2811 }, { "epoch": 0.48117986534145557, "grad_norm": 0.8990679383277893, "learning_rate": 9.680393655320099e-05, "loss": 5.4669, "num_input_tokens_seen": 368836608, "step": 2814 }, { "epoch": 0.48169285027252323, "grad_norm": 0.8200316429138184, "learning_rate": 9.67523765300742e-05, "loss": 5.4802, "num_input_tokens_seen": 369229824, "step": 2817 }, { "epoch": 0.4822058352035909, "grad_norm": 0.9789479970932007, "learning_rate": 9.670089880547766e-05, "loss": 5.4415, "num_input_tokens_seen": 369623040, "step": 2820 }, { "epoch": 0.48271882013465855, "grad_norm": 0.7308698892593384, "learning_rate": 9.664950316070681e-05, "loss": 5.5066, "num_input_tokens_seen": 370016256, "step": 2823 }, { "epoch": 0.4832318050657262, "grad_norm": 0.6770713925361633, "learning_rate": 9.659818937786982e-05, "loss": 5.4506, "num_input_tokens_seen": 370409472, "step": 2826 }, { "epoch": 0.4837447899967938, "grad_norm": 0.6626781225204468, "learning_rate": 9.654695723988381e-05, "loss": 5.4453, "num_input_tokens_seen": 370802688, "step": 2829 }, { "epoch": 0.4842577749278615, "grad_norm": 0.741698682308197, "learning_rate": 9.649580653047106e-05, "loss": 5.4754, "num_input_tokens_seen": 371195904, "step": 2832 }, { "epoch": 0.48477075985892915, "grad_norm": 0.7719034552574158, "learning_rate": 9.644473703415494e-05, "loss": 5.4889, "num_input_tokens_seen": 371589120, "step": 2835 }, { "epoch": 0.4852837447899968, "grad_norm": 0.7475199103355408, "learning_rate": 9.63937485362564e-05, "loss": 5.4272, "num_input_tokens_seen": 371982336, "step": 2838 }, { "epoch": 0.48579672972106447, "grad_norm": 0.7785484790802002, "learning_rate": 9.634284082288993e-05, "loss": 5.4695, "num_input_tokens_seen": 372375552, "step": 2841 }, { "epoch": 0.4863097146521321, "grad_norm": 0.6962818503379822, "learning_rate": 9.629201368095994e-05, "loss": 5.4975, "num_input_tokens_seen": 372768768, "step": 2844 }, { "epoch": 0.48682269958319974, "grad_norm": 0.7307711839675903, "learning_rate": 9.624126689815691e-05, "loss": 5.4963, "num_input_tokens_seen": 373161984, "step": 2847 }, { "epoch": 0.4873356845142674, "grad_norm": 0.9694541692733765, "learning_rate": 9.61906002629538e-05, "loss": 5.4076, "num_input_tokens_seen": 373555200, "step": 2850 }, { "epoch": 0.48784866944533506, "grad_norm": 0.7529440522193909, "learning_rate": 9.614001356460217e-05, "loss": 5.4576, "num_input_tokens_seen": 373948416, "step": 2853 }, { "epoch": 0.4883616543764027, "grad_norm": 0.8865497708320618, "learning_rate": 9.608950659312869e-05, "loss": 5.4596, "num_input_tokens_seen": 374341632, "step": 2856 }, { "epoch": 0.4888746393074703, "grad_norm": 0.8430432677268982, "learning_rate": 9.603907913933133e-05, "loss": 5.4806, "num_input_tokens_seen": 374734848, "step": 2859 }, { "epoch": 0.489387624238538, "grad_norm": 0.6669235229492188, "learning_rate": 9.598873099477574e-05, "loss": 5.4507, "num_input_tokens_seen": 375128064, "step": 2862 }, { "epoch": 0.48990060916960565, "grad_norm": 0.8070666790008545, "learning_rate": 9.593846195179174e-05, "loss": 5.454, "num_input_tokens_seen": 375521280, "step": 2865 }, { "epoch": 0.4904135941006733, "grad_norm": 0.8225948214530945, "learning_rate": 
9.588827180346961e-05, "loss": 5.4561, "num_input_tokens_seen": 375914496, "step": 2868 }, { "epoch": 0.4909265790317409, "grad_norm": 0.7412498593330383, "learning_rate": 9.583816034365655e-05, "loss": 5.4657, "num_input_tokens_seen": 376307712, "step": 2871 }, { "epoch": 0.4914395639628086, "grad_norm": 0.7847759127616882, "learning_rate": 9.578812736695315e-05, "loss": 5.4268, "num_input_tokens_seen": 376700928, "step": 2874 }, { "epoch": 0.49195254889387624, "grad_norm": 0.7222384214401245, "learning_rate": 9.573817266870979e-05, "loss": 5.4318, "num_input_tokens_seen": 377094144, "step": 2877 }, { "epoch": 0.4924655338249439, "grad_norm": 0.6922466158866882, "learning_rate": 9.568829604502318e-05, "loss": 5.5184, "num_input_tokens_seen": 377487360, "step": 2880 }, { "epoch": 0.49297851875601156, "grad_norm": 0.7505002021789551, "learning_rate": 9.563849729273287e-05, "loss": 5.4798, "num_input_tokens_seen": 377880576, "step": 2883 }, { "epoch": 0.49349150368707917, "grad_norm": 0.705833911895752, "learning_rate": 9.558877620941768e-05, "loss": 5.4977, "num_input_tokens_seen": 378273792, "step": 2886 }, { "epoch": 0.49400448861814683, "grad_norm": 0.7315141558647156, "learning_rate": 9.553913259339242e-05, "loss": 5.428, "num_input_tokens_seen": 378667008, "step": 2889 }, { "epoch": 0.4945174735492145, "grad_norm": 0.7071065902709961, "learning_rate": 9.548956624370426e-05, "loss": 5.4213, "num_input_tokens_seen": 379060224, "step": 2892 }, { "epoch": 0.49503045848028215, "grad_norm": 0.7307475209236145, "learning_rate": 9.54400769601295e-05, "loss": 5.4333, "num_input_tokens_seen": 379453440, "step": 2895 }, { "epoch": 0.4955434434113498, "grad_norm": 0.7351901531219482, "learning_rate": 9.539066454316994e-05, "loss": 5.4426, "num_input_tokens_seen": 379846656, "step": 2898 }, { "epoch": 0.4960564283424174, "grad_norm": 0.7794121503829956, "learning_rate": 9.534132879404975e-05, "loss": 5.4382, "num_input_tokens_seen": 380239872, "step": 2901 }, { "epoch": 0.4965694132734851, "grad_norm": 0.7838775515556335, "learning_rate": 9.529206951471193e-05, "loss": 5.4391, "num_input_tokens_seen": 380633088, "step": 2904 }, { "epoch": 0.49708239820455274, "grad_norm": 0.729987382888794, "learning_rate": 9.524288650781515e-05, "loss": 5.4505, "num_input_tokens_seen": 381026304, "step": 2907 }, { "epoch": 0.4975953831356204, "grad_norm": 0.7023864984512329, "learning_rate": 9.519377957673018e-05, "loss": 5.4664, "num_input_tokens_seen": 381419520, "step": 2910 }, { "epoch": 0.49810836806668807, "grad_norm": 0.7854782342910767, "learning_rate": 9.51447485255368e-05, "loss": 5.4174, "num_input_tokens_seen": 381812736, "step": 2913 }, { "epoch": 0.4986213529977557, "grad_norm": 0.8409479856491089, "learning_rate": 9.509579315902049e-05, "loss": 5.413, "num_input_tokens_seen": 382205952, "step": 2916 }, { "epoch": 0.49913433792882334, "grad_norm": 0.9801114201545715, "learning_rate": 9.504691328266901e-05, "loss": 5.4627, "num_input_tokens_seen": 382599168, "step": 2919 }, { "epoch": 0.499647322859891, "grad_norm": 0.699877917766571, "learning_rate": 9.499810870266937e-05, "loss": 5.4361, "num_input_tokens_seen": 382992384, "step": 2922 }, { "epoch": 0.5001603077909587, "grad_norm": 0.7811382412910461, "learning_rate": 9.494937922590444e-05, "loss": 5.4357, "num_input_tokens_seen": 383385600, "step": 2925 }, { "epoch": 0.5006732927220263, "grad_norm": 0.6606900095939636, "learning_rate": 9.49007246599498e-05, "loss": 5.4534, "num_input_tokens_seen": 383778816, "step": 2928 }, { "epoch": 
0.501186277653094, "grad_norm": 0.7077214121818542, "learning_rate": 9.485214481307057e-05, "loss": 5.4578, "num_input_tokens_seen": 384172032, "step": 2931 }, { "epoch": 0.5016992625841616, "grad_norm": 0.7163876891136169, "learning_rate": 9.480363949421822e-05, "loss": 5.4555, "num_input_tokens_seen": 384565248, "step": 2934 }, { "epoch": 0.5022122475152292, "grad_norm": 0.7486274838447571, "learning_rate": 9.475520851302736e-05, "loss": 5.4676, "num_input_tokens_seen": 384958464, "step": 2937 }, { "epoch": 0.5027252324462969, "grad_norm": 0.8384826183319092, "learning_rate": 9.470685167981269e-05, "loss": 5.4334, "num_input_tokens_seen": 385351680, "step": 2940 }, { "epoch": 0.5032382173773645, "grad_norm": 0.7711573243141174, "learning_rate": 9.465856880556584e-05, "loss": 5.3987, "num_input_tokens_seen": 385744896, "step": 2943 }, { "epoch": 0.5037512023084322, "grad_norm": 0.7309294939041138, "learning_rate": 9.461035970195224e-05, "loss": 5.4298, "num_input_tokens_seen": 386138112, "step": 2946 }, { "epoch": 0.5042641872394998, "grad_norm": 0.7861335873603821, "learning_rate": 9.45622241813081e-05, "loss": 5.4336, "num_input_tokens_seen": 386531328, "step": 2949 }, { "epoch": 0.5047771721705675, "grad_norm": 0.6272249221801758, "learning_rate": 9.451416205663726e-05, "loss": 5.4306, "num_input_tokens_seen": 386924544, "step": 2952 }, { "epoch": 0.5052901571016352, "grad_norm": 0.805742084980011, "learning_rate": 9.446617314160821e-05, "loss": 5.4855, "num_input_tokens_seen": 387317760, "step": 2955 }, { "epoch": 0.5058031420327028, "grad_norm": 0.7901838421821594, "learning_rate": 9.441825725055105e-05, "loss": 5.4566, "num_input_tokens_seen": 387710976, "step": 2958 }, { "epoch": 0.5063161269637705, "grad_norm": 0.7398175597190857, "learning_rate": 9.437041419845438e-05, "loss": 5.4069, "num_input_tokens_seen": 388104192, "step": 2961 }, { "epoch": 0.506829111894838, "grad_norm": 0.6904522776603699, "learning_rate": 9.432264380096243e-05, "loss": 5.45, "num_input_tokens_seen": 388497408, "step": 2964 }, { "epoch": 0.5073420968259057, "grad_norm": 0.8031904697418213, "learning_rate": 9.4274945874372e-05, "loss": 5.413, "num_input_tokens_seen": 388890624, "step": 2967 }, { "epoch": 0.5078550817569734, "grad_norm": 0.892519474029541, "learning_rate": 9.422732023562952e-05, "loss": 5.4264, "num_input_tokens_seen": 389283840, "step": 2970 }, { "epoch": 0.508368066688041, "grad_norm": 0.8635051846504211, "learning_rate": 9.417976670232808e-05, "loss": 5.4243, "num_input_tokens_seen": 389677056, "step": 2973 }, { "epoch": 0.5088810516191087, "grad_norm": 0.785637617111206, "learning_rate": 9.413228509270448e-05, "loss": 5.4366, "num_input_tokens_seen": 390070272, "step": 2976 }, { "epoch": 0.5093940365501763, "grad_norm": 0.6426643133163452, "learning_rate": 9.408487522563637e-05, "loss": 5.4627, "num_input_tokens_seen": 390463488, "step": 2979 }, { "epoch": 0.509907021481244, "grad_norm": 0.7258966565132141, "learning_rate": 9.403753692063932e-05, "loss": 5.4237, "num_input_tokens_seen": 390856704, "step": 2982 }, { "epoch": 0.5104200064123117, "grad_norm": 0.7019416093826294, "learning_rate": 9.39902699978639e-05, "loss": 5.3984, "num_input_tokens_seen": 391249920, "step": 2985 }, { "epoch": 0.5109329913433793, "grad_norm": 0.6688271760940552, "learning_rate": 9.394307427809288e-05, "loss": 5.4452, "num_input_tokens_seen": 391643136, "step": 2988 }, { "epoch": 0.511445976274447, "grad_norm": 0.6243648529052734, "learning_rate": 9.389594958273828e-05, "loss": 5.4079, 
"num_input_tokens_seen": 392036352, "step": 2991 }, { "epoch": 0.5119589612055145, "grad_norm": 0.7106574177742004, "learning_rate": 9.384889573383865e-05, "loss": 5.4911, "num_input_tokens_seen": 392429568, "step": 2994 }, { "epoch": 0.5124719461365822, "grad_norm": 0.8549032211303711, "learning_rate": 9.380191255405614e-05, "loss": 5.4275, "num_input_tokens_seen": 392822784, "step": 2997 }, { "epoch": 0.5129849310676499, "grad_norm": 0.7520581483840942, "learning_rate": 9.375499986667377e-05, "loss": 5.4381, "num_input_tokens_seen": 393216000, "step": 3000 }, { "epoch": 0.5134979159987175, "grad_norm": 1.052897572517395, "learning_rate": 9.370815749559257e-05, "loss": 5.4058, "num_input_tokens_seen": 393609216, "step": 3003 }, { "epoch": 0.5140109009297852, "grad_norm": 0.8729445338249207, "learning_rate": 9.366138526532885e-05, "loss": 5.4444, "num_input_tokens_seen": 394002432, "step": 3006 }, { "epoch": 0.5145238858608528, "grad_norm": 0.7415926456451416, "learning_rate": 9.361468300101144e-05, "loss": 5.4457, "num_input_tokens_seen": 394395648, "step": 3009 }, { "epoch": 0.5150368707919205, "grad_norm": 0.7771437764167786, "learning_rate": 9.356805052837894e-05, "loss": 5.4288, "num_input_tokens_seen": 394788864, "step": 3012 }, { "epoch": 0.5155498557229882, "grad_norm": 0.8604034781455994, "learning_rate": 9.352148767377697e-05, "loss": 5.4217, "num_input_tokens_seen": 395182080, "step": 3015 }, { "epoch": 0.5160628406540558, "grad_norm": 0.9626191854476929, "learning_rate": 9.347499426415546e-05, "loss": 5.4037, "num_input_tokens_seen": 395575296, "step": 3018 }, { "epoch": 0.5165758255851234, "grad_norm": 0.8346033096313477, "learning_rate": 9.342857012706596e-05, "loss": 5.414, "num_input_tokens_seen": 395968512, "step": 3021 }, { "epoch": 0.517088810516191, "grad_norm": 0.7749696969985962, "learning_rate": 9.338221509065894e-05, "loss": 5.4262, "num_input_tokens_seen": 396361728, "step": 3024 }, { "epoch": 0.5176017954472587, "grad_norm": 0.844420850276947, "learning_rate": 9.333592898368119e-05, "loss": 5.4352, "num_input_tokens_seen": 396754944, "step": 3027 }, { "epoch": 0.5181147803783264, "grad_norm": 0.7230442762374878, "learning_rate": 9.328971163547297e-05, "loss": 5.5282, "num_input_tokens_seen": 397148160, "step": 3030 }, { "epoch": 0.518627765309394, "grad_norm": 0.9192338585853577, "learning_rate": 9.324356287596562e-05, "loss": 5.4224, "num_input_tokens_seen": 397541376, "step": 3033 }, { "epoch": 0.5191407502404617, "grad_norm": 0.9499441385269165, "learning_rate": 9.319748253567871e-05, "loss": 5.4896, "num_input_tokens_seen": 397934592, "step": 3036 }, { "epoch": 0.5196537351715294, "grad_norm": 0.7503839135169983, "learning_rate": 9.315147044571765e-05, "loss": 5.4228, "num_input_tokens_seen": 398327808, "step": 3039 }, { "epoch": 0.520166720102597, "grad_norm": 0.754388153553009, "learning_rate": 9.310552643777079e-05, "loss": 5.4824, "num_input_tokens_seen": 398721024, "step": 3042 }, { "epoch": 0.5206797050336647, "grad_norm": 0.8488169312477112, "learning_rate": 9.305965034410718e-05, "loss": 5.4188, "num_input_tokens_seen": 399114240, "step": 3045 }, { "epoch": 0.5211926899647323, "grad_norm": 0.8866889476776123, "learning_rate": 9.301384199757371e-05, "loss": 5.4401, "num_input_tokens_seen": 399507456, "step": 3048 }, { "epoch": 0.5217056748957999, "grad_norm": 0.8220815062522888, "learning_rate": 9.296810123159271e-05, "loss": 5.459, "num_input_tokens_seen": 399900672, "step": 3051 }, { "epoch": 0.5222186598268675, "grad_norm": 0.7505759000778198, 
"learning_rate": 9.292242788015935e-05, "loss": 5.4215, "num_input_tokens_seen": 400293888, "step": 3054 }, { "epoch": 0.5227316447579352, "grad_norm": 0.767932116985321, "learning_rate": 9.287682177783917e-05, "loss": 5.4263, "num_input_tokens_seen": 400687104, "step": 3057 }, { "epoch": 0.5232446296890029, "grad_norm": 0.7310931086540222, "learning_rate": 9.283128275976545e-05, "loss": 5.4805, "num_input_tokens_seen": 401080320, "step": 3060 }, { "epoch": 0.5237576146200705, "grad_norm": 0.7426648736000061, "learning_rate": 9.278581066163683e-05, "loss": 5.4506, "num_input_tokens_seen": 401473536, "step": 3063 }, { "epoch": 0.5242705995511382, "grad_norm": 0.8278624415397644, "learning_rate": 9.27404053197147e-05, "loss": 5.4258, "num_input_tokens_seen": 401866752, "step": 3066 }, { "epoch": 0.5247835844822059, "grad_norm": 0.7115198373794556, "learning_rate": 9.269506657082087e-05, "loss": 5.3972, "num_input_tokens_seen": 402259968, "step": 3069 }, { "epoch": 0.5252965694132735, "grad_norm": 0.8112291693687439, "learning_rate": 9.264979425233496e-05, "loss": 5.4267, "num_input_tokens_seen": 402653184, "step": 3072 }, { "epoch": 0.5258095543443412, "grad_norm": 0.7363867163658142, "learning_rate": 9.260458820219201e-05, "loss": 5.4101, "num_input_tokens_seen": 403046400, "step": 3075 }, { "epoch": 0.5263225392754087, "grad_norm": 0.8078411221504211, "learning_rate": 9.25594482588801e-05, "loss": 5.4623, "num_input_tokens_seen": 403439616, "step": 3078 }, { "epoch": 0.5268355242064764, "grad_norm": 0.8161293268203735, "learning_rate": 9.251437426143784e-05, "loss": 5.4603, "num_input_tokens_seen": 403832832, "step": 3081 }, { "epoch": 0.527348509137544, "grad_norm": 0.778689444065094, "learning_rate": 9.2469366049452e-05, "loss": 5.4116, "num_input_tokens_seen": 404226048, "step": 3084 }, { "epoch": 0.5278614940686117, "grad_norm": 0.9100328683853149, "learning_rate": 9.24244234630551e-05, "loss": 5.4603, "num_input_tokens_seen": 404619264, "step": 3087 }, { "epoch": 0.5283744789996794, "grad_norm": 0.8067951798439026, "learning_rate": 9.237954634292307e-05, "loss": 5.4426, "num_input_tokens_seen": 405012480, "step": 3090 }, { "epoch": 0.528887463930747, "grad_norm": 0.7273784279823303, "learning_rate": 9.233473453027276e-05, "loss": 5.3976, "num_input_tokens_seen": 405405696, "step": 3093 }, { "epoch": 0.5294004488618147, "grad_norm": 0.9762039184570312, "learning_rate": 9.228998786685971e-05, "loss": 5.4157, "num_input_tokens_seen": 405798912, "step": 3096 }, { "epoch": 0.5299134337928824, "grad_norm": 0.7714497447013855, "learning_rate": 9.22453061949758e-05, "loss": 5.4444, "num_input_tokens_seen": 406192128, "step": 3099 }, { "epoch": 0.53042641872395, "grad_norm": 0.7574513554573059, "learning_rate": 9.220068935744674e-05, "loss": 5.4365, "num_input_tokens_seen": 406585344, "step": 3102 }, { "epoch": 0.5309394036550177, "grad_norm": 0.8316619396209717, "learning_rate": 9.215613719763e-05, "loss": 5.4588, "num_input_tokens_seen": 406978560, "step": 3105 }, { "epoch": 0.5314523885860852, "grad_norm": 0.819129228591919, "learning_rate": 9.211164955941232e-05, "loss": 5.4262, "num_input_tokens_seen": 407371776, "step": 3108 }, { "epoch": 0.5319653735171529, "grad_norm": 0.7421338558197021, "learning_rate": 9.206722628720746e-05, "loss": 5.4259, "num_input_tokens_seen": 407764992, "step": 3111 }, { "epoch": 0.5324783584482206, "grad_norm": 0.7415031790733337, "learning_rate": 9.202286722595394e-05, "loss": 5.3804, "num_input_tokens_seen": 408158208, "step": 3114 }, { "epoch": 
0.5329913433792882, "grad_norm": 0.7613891959190369, "learning_rate": 9.197857222111274e-05, "loss": 5.4539, "num_input_tokens_seen": 408551424, "step": 3117 }, { "epoch": 0.5335043283103559, "grad_norm": 0.7640434503555298, "learning_rate": 9.19343411186651e-05, "loss": 5.4272, "num_input_tokens_seen": 408944640, "step": 3120 }, { "epoch": 0.5340173132414235, "grad_norm": 0.7618759870529175, "learning_rate": 9.189017376511012e-05, "loss": 5.4546, "num_input_tokens_seen": 409337856, "step": 3123 }, { "epoch": 0.5345302981724912, "grad_norm": 0.7451426386833191, "learning_rate": 9.184607000746269e-05, "loss": 5.4063, "num_input_tokens_seen": 409731072, "step": 3126 }, { "epoch": 0.5350432831035589, "grad_norm": 0.8144820928573608, "learning_rate": 9.18020296932512e-05, "loss": 5.3909, "num_input_tokens_seen": 410124288, "step": 3129 }, { "epoch": 0.5355562680346265, "grad_norm": 0.7408854365348816, "learning_rate": 9.175805267051529e-05, "loss": 5.4057, "num_input_tokens_seen": 410517504, "step": 3132 }, { "epoch": 0.5360692529656942, "grad_norm": 0.6958907246589661, "learning_rate": 9.171413878780367e-05, "loss": 5.4055, "num_input_tokens_seen": 410910720, "step": 3135 }, { "epoch": 0.5365822378967617, "grad_norm": 0.948639452457428, "learning_rate": 9.167028789417202e-05, "loss": 5.4399, "num_input_tokens_seen": 411303936, "step": 3138 }, { "epoch": 0.5370952228278294, "grad_norm": 0.9219982028007507, "learning_rate": 9.162649983918063e-05, "loss": 5.4244, "num_input_tokens_seen": 411697152, "step": 3141 }, { "epoch": 0.5376082077588971, "grad_norm": 0.8096942901611328, "learning_rate": 9.15827744728924e-05, "loss": 5.4215, "num_input_tokens_seen": 412090368, "step": 3144 }, { "epoch": 0.5381211926899647, "grad_norm": 0.7412171363830566, "learning_rate": 9.153911164587056e-05, "loss": 5.3988, "num_input_tokens_seen": 412483584, "step": 3147 }, { "epoch": 0.5386341776210324, "grad_norm": 0.878857433795929, "learning_rate": 9.149551120917665e-05, "loss": 5.4578, "num_input_tokens_seen": 412876800, "step": 3150 }, { "epoch": 0.5391471625521, "grad_norm": 0.6796370148658752, "learning_rate": 9.145197301436826e-05, "loss": 5.4093, "num_input_tokens_seen": 413270016, "step": 3153 }, { "epoch": 0.5396601474831677, "grad_norm": 0.7488420009613037, "learning_rate": 9.140849691349699e-05, "loss": 5.415, "num_input_tokens_seen": 413663232, "step": 3156 }, { "epoch": 0.5401731324142354, "grad_norm": 0.7959953546524048, "learning_rate": 9.136508275910631e-05, "loss": 5.4424, "num_input_tokens_seen": 414056448, "step": 3159 }, { "epoch": 0.540686117345303, "grad_norm": 0.7761291265487671, "learning_rate": 9.132173040422948e-05, "loss": 5.3982, "num_input_tokens_seen": 414449664, "step": 3162 }, { "epoch": 0.5411991022763706, "grad_norm": 0.7197316884994507, "learning_rate": 9.127843970238739e-05, "loss": 5.4369, "num_input_tokens_seen": 414842880, "step": 3165 }, { "epoch": 0.5417120872074382, "grad_norm": 0.8164528012275696, "learning_rate": 9.123521050758656e-05, "loss": 5.3976, "num_input_tokens_seen": 415236096, "step": 3168 }, { "epoch": 0.5422250721385059, "grad_norm": 0.7773632407188416, "learning_rate": 9.119204267431711e-05, "loss": 5.4227, "num_input_tokens_seen": 415629312, "step": 3171 }, { "epoch": 0.5427380570695736, "grad_norm": 0.7797026634216309, "learning_rate": 9.114893605755055e-05, "loss": 5.4028, "num_input_tokens_seen": 416022528, "step": 3174 }, { "epoch": 0.5432510420006412, "grad_norm": 0.8309365510940552, "learning_rate": 9.110589051273787e-05, "loss": 5.38, 
"num_input_tokens_seen": 416415744, "step": 3177 }, { "epoch": 0.5437640269317089, "grad_norm": 0.7560862302780151, "learning_rate": 9.106290589580741e-05, "loss": 5.4071, "num_input_tokens_seen": 416808960, "step": 3180 }, { "epoch": 0.5442770118627765, "grad_norm": 0.9633191227912903, "learning_rate": 9.101998206316296e-05, "loss": 5.454, "num_input_tokens_seen": 417202176, "step": 3183 }, { "epoch": 0.5447899967938442, "grad_norm": 0.9686263203620911, "learning_rate": 9.097711887168163e-05, "loss": 5.4424, "num_input_tokens_seen": 417595392, "step": 3186 }, { "epoch": 0.5453029817249119, "grad_norm": 0.7610862255096436, "learning_rate": 9.093431617871184e-05, "loss": 5.4298, "num_input_tokens_seen": 417988608, "step": 3189 }, { "epoch": 0.5458159666559795, "grad_norm": 0.8192333579063416, "learning_rate": 9.08915738420714e-05, "loss": 5.3675, "num_input_tokens_seen": 418381824, "step": 3192 }, { "epoch": 0.5463289515870471, "grad_norm": 0.8405448198318481, "learning_rate": 9.084889172004556e-05, "loss": 5.4132, "num_input_tokens_seen": 418775040, "step": 3195 }, { "epoch": 0.5468419365181147, "grad_norm": 0.8114674687385559, "learning_rate": 9.080626967138484e-05, "loss": 5.3972, "num_input_tokens_seen": 419168256, "step": 3198 }, { "epoch": 0.5471839264721599, "eval_accuracy": 0.17817944960104218, "eval_loss": 5.861401081085205, "eval_runtime": 114.3884, "eval_samples_per_second": 2.623, "eval_steps_per_second": 1.311, "num_input_tokens_seen": 419430400, "step": 3200 }, { "epoch": 0.5473549214491824, "grad_norm": 0.7660786509513855, "learning_rate": 9.076370755530334e-05, "loss": 5.4147, "num_input_tokens_seen": 419561472, "step": 3201 }, { "epoch": 0.5478679063802501, "grad_norm": 0.8808948397636414, "learning_rate": 9.072120523147654e-05, "loss": 5.4063, "num_input_tokens_seen": 419954688, "step": 3204 }, { "epoch": 0.5483808913113177, "grad_norm": 0.751136302947998, "learning_rate": 9.067876256003947e-05, "loss": 5.393, "num_input_tokens_seen": 420347904, "step": 3207 }, { "epoch": 0.5488938762423854, "grad_norm": 0.7583978772163391, "learning_rate": 9.063637940158486e-05, "loss": 5.441, "num_input_tokens_seen": 420741120, "step": 3210 }, { "epoch": 0.549406861173453, "grad_norm": 0.685180127620697, "learning_rate": 9.059405561716102e-05, "loss": 5.3506, "num_input_tokens_seen": 421134336, "step": 3213 }, { "epoch": 0.5499198461045207, "grad_norm": 0.7538748979568481, "learning_rate": 9.05517910682701e-05, "loss": 5.4059, "num_input_tokens_seen": 421527552, "step": 3216 }, { "epoch": 0.5504328310355884, "grad_norm": 0.7428572773933411, "learning_rate": 9.050958561686607e-05, "loss": 5.4318, "num_input_tokens_seen": 421920768, "step": 3219 }, { "epoch": 0.5509458159666559, "grad_norm": 0.6747097373008728, "learning_rate": 9.046743912535294e-05, "loss": 5.412, "num_input_tokens_seen": 422313984, "step": 3222 }, { "epoch": 0.5514588008977236, "grad_norm": 0.7077423930168152, "learning_rate": 9.042535145658275e-05, "loss": 5.4399, "num_input_tokens_seen": 422707200, "step": 3225 }, { "epoch": 0.5519717858287913, "grad_norm": 0.7870452404022217, "learning_rate": 9.038332247385375e-05, "loss": 5.4146, "num_input_tokens_seen": 423100416, "step": 3228 }, { "epoch": 0.5524847707598589, "grad_norm": 0.788215160369873, "learning_rate": 9.034135204090863e-05, "loss": 5.3864, "num_input_tokens_seen": 423493632, "step": 3231 }, { "epoch": 0.5529977556909266, "grad_norm": 0.8093016743659973, "learning_rate": 9.029944002193249e-05, "loss": 5.4134, "num_input_tokens_seen": 423886848, "step": 3234 }, 
{ "epoch": 0.5535107406219942, "grad_norm": 0.8293936252593994, "learning_rate": 9.025758628155108e-05, "loss": 5.403, "num_input_tokens_seen": 424280064, "step": 3237 }, { "epoch": 0.5540237255530619, "grad_norm": 0.8486737012863159, "learning_rate": 9.021579068482906e-05, "loss": 5.4037, "num_input_tokens_seen": 424673280, "step": 3240 }, { "epoch": 0.5545367104841296, "grad_norm": 0.7581062316894531, "learning_rate": 9.017405309726795e-05, "loss": 5.4085, "num_input_tokens_seen": 425066496, "step": 3243 }, { "epoch": 0.5550496954151972, "grad_norm": 0.7119733691215515, "learning_rate": 9.013237338480452e-05, "loss": 5.412, "num_input_tokens_seen": 425459712, "step": 3246 }, { "epoch": 0.5555626803462649, "grad_norm": 0.6932626962661743, "learning_rate": 9.009075141380889e-05, "loss": 5.3921, "num_input_tokens_seen": 425852928, "step": 3249 }, { "epoch": 0.5560756652773324, "grad_norm": 0.7145546078681946, "learning_rate": 9.004918705108273e-05, "loss": 5.4198, "num_input_tokens_seen": 426246144, "step": 3252 }, { "epoch": 0.5565886502084001, "grad_norm": 0.7427055239677429, "learning_rate": 9.000768016385747e-05, "loss": 5.4181, "num_input_tokens_seen": 426639360, "step": 3255 }, { "epoch": 0.5571016351394678, "grad_norm": 0.794049859046936, "learning_rate": 8.996623061979255e-05, "loss": 5.4207, "num_input_tokens_seen": 427032576, "step": 3258 }, { "epoch": 0.5576146200705354, "grad_norm": 0.7688232064247131, "learning_rate": 8.992483828697364e-05, "loss": 5.3995, "num_input_tokens_seen": 427425792, "step": 3261 }, { "epoch": 0.5581276050016031, "grad_norm": 0.78509521484375, "learning_rate": 8.988350303391082e-05, "loss": 5.3776, "num_input_tokens_seen": 427819008, "step": 3264 }, { "epoch": 0.5586405899326707, "grad_norm": 0.7266300320625305, "learning_rate": 8.984222472953694e-05, "loss": 5.3997, "num_input_tokens_seen": 428212224, "step": 3267 }, { "epoch": 0.5591535748637384, "grad_norm": 0.7247098684310913, "learning_rate": 8.980100324320567e-05, "loss": 5.3709, "num_input_tokens_seen": 428605440, "step": 3270 }, { "epoch": 0.5596665597948061, "grad_norm": 0.7840573191642761, "learning_rate": 8.975983844469008e-05, "loss": 5.3795, "num_input_tokens_seen": 428998656, "step": 3273 }, { "epoch": 0.5601795447258737, "grad_norm": 0.766008734703064, "learning_rate": 8.97187302041806e-05, "loss": 5.392, "num_input_tokens_seen": 429391872, "step": 3276 }, { "epoch": 0.5606925296569413, "grad_norm": 0.7803478837013245, "learning_rate": 8.967767839228347e-05, "loss": 5.3733, "num_input_tokens_seen": 429785088, "step": 3279 }, { "epoch": 0.5612055145880089, "grad_norm": 0.7566540837287903, "learning_rate": 8.963668288001898e-05, "loss": 5.4469, "num_input_tokens_seen": 430178304, "step": 3282 }, { "epoch": 0.5617184995190766, "grad_norm": 0.7852625250816345, "learning_rate": 8.959574353881981e-05, "loss": 5.4397, "num_input_tokens_seen": 430571520, "step": 3285 }, { "epoch": 0.5622314844501443, "grad_norm": 0.7693859934806824, "learning_rate": 8.955486024052926e-05, "loss": 5.4515, "num_input_tokens_seen": 430964736, "step": 3288 }, { "epoch": 0.5627444693812119, "grad_norm": 0.7919835448265076, "learning_rate": 8.951403285739966e-05, "loss": 5.3651, "num_input_tokens_seen": 431357952, "step": 3291 }, { "epoch": 0.5632574543122796, "grad_norm": 0.7930036187171936, "learning_rate": 8.947326126209056e-05, "loss": 5.3632, "num_input_tokens_seen": 431751168, "step": 3294 }, { "epoch": 0.5637704392433472, "grad_norm": 0.8255593776702881, "learning_rate": 8.943254532766725e-05, "loss": 5.4164, 
"num_input_tokens_seen": 432144384, "step": 3297 }, { "epoch": 0.5642834241744149, "grad_norm": 0.7580281496047974, "learning_rate": 8.939188492759893e-05, "loss": 5.4217, "num_input_tokens_seen": 432537600, "step": 3300 }, { "epoch": 0.5647964091054826, "grad_norm": 0.8095049858093262, "learning_rate": 8.935127993575714e-05, "loss": 5.4079, "num_input_tokens_seen": 432930816, "step": 3303 }, { "epoch": 0.5653093940365502, "grad_norm": 0.8694627285003662, "learning_rate": 8.93107302264141e-05, "loss": 5.4076, "num_input_tokens_seen": 433324032, "step": 3306 }, { "epoch": 0.5658223789676178, "grad_norm": 0.7595628499984741, "learning_rate": 8.927023567424106e-05, "loss": 5.3759, "num_input_tokens_seen": 433717248, "step": 3309 }, { "epoch": 0.5663353638986854, "grad_norm": 0.7479428648948669, "learning_rate": 8.922979615430672e-05, "loss": 5.4004, "num_input_tokens_seen": 434110464, "step": 3312 }, { "epoch": 0.5668483488297531, "grad_norm": 0.7877030372619629, "learning_rate": 8.918941154207554e-05, "loss": 5.3521, "num_input_tokens_seen": 434503680, "step": 3315 }, { "epoch": 0.5673613337608208, "grad_norm": 0.711147129535675, "learning_rate": 8.914908171340622e-05, "loss": 5.3964, "num_input_tokens_seen": 434896896, "step": 3318 }, { "epoch": 0.5678743186918884, "grad_norm": 0.8448489904403687, "learning_rate": 8.910880654455001e-05, "loss": 5.3609, "num_input_tokens_seen": 435290112, "step": 3321 }, { "epoch": 0.5683873036229561, "grad_norm": 1.0078827142715454, "learning_rate": 8.906858591214913e-05, "loss": 5.3788, "num_input_tokens_seen": 435683328, "step": 3324 }, { "epoch": 0.5689002885540237, "grad_norm": 0.9142970442771912, "learning_rate": 8.902841969323526e-05, "loss": 5.4028, "num_input_tokens_seen": 436076544, "step": 3327 }, { "epoch": 0.5694132734850914, "grad_norm": 0.7522501945495605, "learning_rate": 8.898830776522789e-05, "loss": 5.371, "num_input_tokens_seen": 436469760, "step": 3330 }, { "epoch": 0.5699262584161591, "grad_norm": 0.7678289413452148, "learning_rate": 8.894825000593272e-05, "loss": 5.4068, "num_input_tokens_seen": 436862976, "step": 3333 }, { "epoch": 0.5704392433472266, "grad_norm": 0.8633313179016113, "learning_rate": 8.890824629354019e-05, "loss": 5.4327, "num_input_tokens_seen": 437256192, "step": 3336 }, { "epoch": 0.5709522282782943, "grad_norm": 0.7590151429176331, "learning_rate": 8.886829650662388e-05, "loss": 5.428, "num_input_tokens_seen": 437649408, "step": 3339 }, { "epoch": 0.5714652132093619, "grad_norm": 0.847149133682251, "learning_rate": 8.882840052413889e-05, "loss": 5.3417, "num_input_tokens_seen": 438042624, "step": 3342 }, { "epoch": 0.5719781981404296, "grad_norm": 0.898617684841156, "learning_rate": 8.878855822542044e-05, "loss": 5.4506, "num_input_tokens_seen": 438435840, "step": 3345 }, { "epoch": 0.5724911830714973, "grad_norm": 0.763965368270874, "learning_rate": 8.874876949018225e-05, "loss": 5.3538, "num_input_tokens_seen": 438829056, "step": 3348 }, { "epoch": 0.5730041680025649, "grad_norm": 0.6895188093185425, "learning_rate": 8.8709034198515e-05, "loss": 5.3798, "num_input_tokens_seen": 439222272, "step": 3351 }, { "epoch": 0.5735171529336326, "grad_norm": 0.7596139907836914, "learning_rate": 8.866935223088484e-05, "loss": 5.4335, "num_input_tokens_seen": 439615488, "step": 3354 }, { "epoch": 0.5740301378647003, "grad_norm": 0.7017662525177002, "learning_rate": 8.86297234681319e-05, "loss": 5.3408, "num_input_tokens_seen": 440008704, "step": 3357 }, { "epoch": 0.5745431227957679, "grad_norm": 0.68537437915802, 
"learning_rate": 8.859014779146879e-05, "loss": 5.389, "num_input_tokens_seen": 440401920, "step": 3360 }, { "epoch": 0.5750561077268356, "grad_norm": 0.7110214233398438, "learning_rate": 8.855062508247906e-05, "loss": 5.388, "num_input_tokens_seen": 440795136, "step": 3363 }, { "epoch": 0.5755690926579031, "grad_norm": 0.7052029371261597, "learning_rate": 8.851115522311569e-05, "loss": 5.369, "num_input_tokens_seen": 441188352, "step": 3366 }, { "epoch": 0.5760820775889708, "grad_norm": 0.7604022026062012, "learning_rate": 8.847173809569973e-05, "loss": 5.4126, "num_input_tokens_seen": 441581568, "step": 3369 }, { "epoch": 0.5765950625200384, "grad_norm": 0.9355179071426392, "learning_rate": 8.843237358291869e-05, "loss": 5.4177, "num_input_tokens_seen": 441974784, "step": 3372 }, { "epoch": 0.5771080474511061, "grad_norm": 0.7173782587051392, "learning_rate": 8.839306156782517e-05, "loss": 5.3681, "num_input_tokens_seen": 442368000, "step": 3375 }, { "epoch": 0.5776210323821738, "grad_norm": 0.6853963136672974, "learning_rate": 8.835380193383536e-05, "loss": 5.4111, "num_input_tokens_seen": 442761216, "step": 3378 }, { "epoch": 0.5781340173132414, "grad_norm": 0.8973987102508545, "learning_rate": 8.831459456472757e-05, "loss": 5.3506, "num_input_tokens_seen": 443154432, "step": 3381 }, { "epoch": 0.5786470022443091, "grad_norm": 0.9155653119087219, "learning_rate": 8.827543934464083e-05, "loss": 5.4238, "num_input_tokens_seen": 443547648, "step": 3384 }, { "epoch": 0.5791599871753768, "grad_norm": 0.8291952013969421, "learning_rate": 8.823633615807338e-05, "loss": 5.3837, "num_input_tokens_seen": 443940864, "step": 3387 }, { "epoch": 0.5796729721064444, "grad_norm": 0.8799310922622681, "learning_rate": 8.81972848898814e-05, "loss": 5.4034, "num_input_tokens_seen": 444334080, "step": 3390 }, { "epoch": 0.580185957037512, "grad_norm": 0.8494542837142944, "learning_rate": 8.815828542527734e-05, "loss": 5.3643, "num_input_tokens_seen": 444727296, "step": 3393 }, { "epoch": 0.5806989419685796, "grad_norm": 0.8374956250190735, "learning_rate": 8.811933764982872e-05, "loss": 5.3538, "num_input_tokens_seen": 445120512, "step": 3396 }, { "epoch": 0.5812119268996473, "grad_norm": 0.7371034026145935, "learning_rate": 8.80804414494566e-05, "loss": 5.3759, "num_input_tokens_seen": 445513728, "step": 3399 }, { "epoch": 0.581724911830715, "grad_norm": 0.905910074710846, "learning_rate": 8.804159671043426e-05, "loss": 5.3869, "num_input_tokens_seen": 445906944, "step": 3402 }, { "epoch": 0.5822378967617826, "grad_norm": 0.7442747950553894, "learning_rate": 8.80028033193857e-05, "loss": 5.3816, "num_input_tokens_seen": 446300160, "step": 3405 }, { "epoch": 0.5827508816928503, "grad_norm": 0.7881158590316772, "learning_rate": 8.796406116328433e-05, "loss": 5.4244, "num_input_tokens_seen": 446693376, "step": 3408 }, { "epoch": 0.5832638666239179, "grad_norm": 0.7259749174118042, "learning_rate": 8.792537012945155e-05, "loss": 5.3949, "num_input_tokens_seen": 447086592, "step": 3411 }, { "epoch": 0.5837768515549856, "grad_norm": 0.7151450514793396, "learning_rate": 8.788673010555546e-05, "loss": 5.3901, "num_input_tokens_seen": 447479808, "step": 3414 }, { "epoch": 0.5842898364860533, "grad_norm": 0.7341894507408142, "learning_rate": 8.784814097960928e-05, "loss": 5.3559, "num_input_tokens_seen": 447873024, "step": 3417 }, { "epoch": 0.5848028214171209, "grad_norm": 0.805620014667511, "learning_rate": 8.780960263997026e-05, "loss": 5.3723, "num_input_tokens_seen": 448266240, "step": 3420 }, { "epoch": 
0.5853158063481885, "grad_norm": 0.6728326082229614, "learning_rate": 8.777111497533811e-05, "loss": 5.4222, "num_input_tokens_seen": 448659456, "step": 3423 }, { "epoch": 0.5858287912792561, "grad_norm": 0.7800388336181641, "learning_rate": 8.773267787475375e-05, "loss": 5.4013, "num_input_tokens_seen": 449052672, "step": 3426 }, { "epoch": 0.5863417762103238, "grad_norm": 0.7759156823158264, "learning_rate": 8.769429122759794e-05, "loss": 5.3962, "num_input_tokens_seen": 449445888, "step": 3429 }, { "epoch": 0.5868547611413915, "grad_norm": 0.7210240364074707, "learning_rate": 8.765595492358994e-05, "loss": 5.4314, "num_input_tokens_seen": 449839104, "step": 3432 }, { "epoch": 0.5873677460724591, "grad_norm": 0.7143703699111938, "learning_rate": 8.761766885278622e-05, "loss": 5.4162, "num_input_tokens_seen": 450232320, "step": 3435 }, { "epoch": 0.5878807310035268, "grad_norm": 0.8158110976219177, "learning_rate": 8.757943290557907e-05, "loss": 5.4427, "num_input_tokens_seen": 450625536, "step": 3438 }, { "epoch": 0.5883937159345944, "grad_norm": 1.0660381317138672, "learning_rate": 8.754124697269531e-05, "loss": 5.3696, "num_input_tokens_seen": 451018752, "step": 3441 }, { "epoch": 0.5889067008656621, "grad_norm": 0.7566826939582825, "learning_rate": 8.7503110945195e-05, "loss": 5.3177, "num_input_tokens_seen": 451411968, "step": 3444 }, { "epoch": 0.5894196857967298, "grad_norm": 1.0346297025680542, "learning_rate": 8.746502471447013e-05, "loss": 5.3814, "num_input_tokens_seen": 451805184, "step": 3447 }, { "epoch": 0.5899326707277974, "grad_norm": 0.7817288637161255, "learning_rate": 8.742698817224326e-05, "loss": 5.3748, "num_input_tokens_seen": 452198400, "step": 3450 }, { "epoch": 0.590445655658865, "grad_norm": 0.6449154019355774, "learning_rate": 8.738900121056633e-05, "loss": 5.3848, "num_input_tokens_seen": 452591616, "step": 3453 }, { "epoch": 0.5909586405899326, "grad_norm": 0.789215624332428, "learning_rate": 8.735106372181928e-05, "loss": 5.3952, "num_input_tokens_seen": 452984832, "step": 3456 }, { "epoch": 0.5914716255210003, "grad_norm": 0.7104623317718506, "learning_rate": 8.731317559870881e-05, "loss": 5.3851, "num_input_tokens_seen": 453378048, "step": 3459 }, { "epoch": 0.591984610452068, "grad_norm": 0.6851987838745117, "learning_rate": 8.727533673426715e-05, "loss": 5.3495, "num_input_tokens_seen": 453771264, "step": 3462 }, { "epoch": 0.5924975953831356, "grad_norm": 0.661577582359314, "learning_rate": 8.723754702185069e-05, "loss": 5.4102, "num_input_tokens_seen": 454164480, "step": 3465 }, { "epoch": 0.5930105803142033, "grad_norm": 0.7876046895980835, "learning_rate": 8.719980635513879e-05, "loss": 5.3651, "num_input_tokens_seen": 454557696, "step": 3468 }, { "epoch": 0.593523565245271, "grad_norm": 0.6847128868103027, "learning_rate": 8.716211462813248e-05, "loss": 5.374, "num_input_tokens_seen": 454950912, "step": 3471 }, { "epoch": 0.5940365501763386, "grad_norm": 0.7161657214164734, "learning_rate": 8.712447173515334e-05, "loss": 5.3707, "num_input_tokens_seen": 455344128, "step": 3474 }, { "epoch": 0.5945495351074063, "grad_norm": 0.7502399682998657, "learning_rate": 8.708687757084202e-05, "loss": 5.3479, "num_input_tokens_seen": 455737344, "step": 3477 }, { "epoch": 0.5950625200384738, "grad_norm": 0.7003999948501587, "learning_rate": 8.704933203015719e-05, "loss": 5.3245, "num_input_tokens_seen": 456130560, "step": 3480 }, { "epoch": 0.5955755049695415, "grad_norm": 0.7405338287353516, "learning_rate": 8.701183500837426e-05, "loss": 5.3814, 
"num_input_tokens_seen": 456523776, "step": 3483 }, { "epoch": 0.5960884899006091, "grad_norm": 0.7183709144592285, "learning_rate": 8.697438640108417e-05, "loss": 5.3444, "num_input_tokens_seen": 456916992, "step": 3486 }, { "epoch": 0.5966014748316768, "grad_norm": 0.7054752707481384, "learning_rate": 8.693698610419203e-05, "loss": 5.4152, "num_input_tokens_seen": 457310208, "step": 3489 }, { "epoch": 0.5971144597627445, "grad_norm": 0.7215176820755005, "learning_rate": 8.689963401391618e-05, "loss": 5.3406, "num_input_tokens_seen": 457703424, "step": 3492 }, { "epoch": 0.5976274446938121, "grad_norm": 0.7162594795227051, "learning_rate": 8.686233002678664e-05, "loss": 5.4583, "num_input_tokens_seen": 458096640, "step": 3495 }, { "epoch": 0.5981404296248798, "grad_norm": 0.7248669862747192, "learning_rate": 8.682507403964426e-05, "loss": 5.339, "num_input_tokens_seen": 458489856, "step": 3498 }, { "epoch": 0.5986534145559474, "grad_norm": 0.7000369429588318, "learning_rate": 8.67878659496392e-05, "loss": 5.3227, "num_input_tokens_seen": 458883072, "step": 3501 }, { "epoch": 0.5991663994870151, "grad_norm": 0.7467644810676575, "learning_rate": 8.675070565422998e-05, "loss": 5.421, "num_input_tokens_seen": 459276288, "step": 3504 }, { "epoch": 0.5996793844180828, "grad_norm": 0.7262928485870361, "learning_rate": 8.671359305118213e-05, "loss": 5.3765, "num_input_tokens_seen": 459669504, "step": 3507 }, { "epoch": 0.6001923693491503, "grad_norm": 0.721593976020813, "learning_rate": 8.667652803856712e-05, "loss": 5.415, "num_input_tokens_seen": 460062720, "step": 3510 }, { "epoch": 0.600705354280218, "grad_norm": 0.712061882019043, "learning_rate": 8.663951051476112e-05, "loss": 5.4037, "num_input_tokens_seen": 460455936, "step": 3513 }, { "epoch": 0.6012183392112856, "grad_norm": 0.6668177247047424, "learning_rate": 8.660254037844386e-05, "loss": 5.3052, "num_input_tokens_seen": 460849152, "step": 3516 }, { "epoch": 0.6017313241423533, "grad_norm": 0.6952577233314514, "learning_rate": 8.656561752859744e-05, "loss": 5.3895, "num_input_tokens_seen": 461242368, "step": 3519 }, { "epoch": 0.602244309073421, "grad_norm": 0.8409635424613953, "learning_rate": 8.652874186450518e-05, "loss": 5.3989, "num_input_tokens_seen": 461635584, "step": 3522 }, { "epoch": 0.6027572940044886, "grad_norm": 0.7468051314353943, "learning_rate": 8.64919132857505e-05, "loss": 5.3822, "num_input_tokens_seen": 462028800, "step": 3525 }, { "epoch": 0.6032702789355563, "grad_norm": 0.6477757692337036, "learning_rate": 8.645513169221575e-05, "loss": 5.3758, "num_input_tokens_seen": 462422016, "step": 3528 }, { "epoch": 0.603783263866624, "grad_norm": 0.7509175539016724, "learning_rate": 8.641839698408096e-05, "loss": 5.3906, "num_input_tokens_seen": 462815232, "step": 3531 }, { "epoch": 0.6042962487976916, "grad_norm": 0.7417723536491394, "learning_rate": 8.638170906182295e-05, "loss": 5.4307, "num_input_tokens_seen": 463208448, "step": 3534 }, { "epoch": 0.6048092337287592, "grad_norm": 0.9790907502174377, "learning_rate": 8.634506782621394e-05, "loss": 5.387, "num_input_tokens_seen": 463601664, "step": 3537 }, { "epoch": 0.6053222186598268, "grad_norm": 0.7534716129302979, "learning_rate": 8.630847317832056e-05, "loss": 5.3362, "num_input_tokens_seen": 463994880, "step": 3540 }, { "epoch": 0.6058352035908945, "grad_norm": 0.8518982529640198, "learning_rate": 8.627192501950274e-05, "loss": 5.4182, "num_input_tokens_seen": 464388096, "step": 3543 }, { "epoch": 0.6063481885219622, "grad_norm": 0.9109683632850647, 
"learning_rate": 8.623542325141249e-05, "loss": 5.3571, "num_input_tokens_seen": 464781312, "step": 3546 }, { "epoch": 0.6068611734530298, "grad_norm": 0.7174829840660095, "learning_rate": 8.619896777599289e-05, "loss": 5.4502, "num_input_tokens_seen": 465174528, "step": 3549 }, { "epoch": 0.6073741583840975, "grad_norm": 0.796428918838501, "learning_rate": 8.616255849547694e-05, "loss": 5.3815, "num_input_tokens_seen": 465567744, "step": 3552 }, { "epoch": 0.6078871433151651, "grad_norm": 0.9209778308868408, "learning_rate": 8.612619531238647e-05, "loss": 5.4294, "num_input_tokens_seen": 465960960, "step": 3555 }, { "epoch": 0.6084001282462328, "grad_norm": 0.9266733527183533, "learning_rate": 8.6089878129531e-05, "loss": 5.3943, "num_input_tokens_seen": 466354176, "step": 3558 }, { "epoch": 0.6089131131773005, "grad_norm": 0.7748308181762695, "learning_rate": 8.60536068500068e-05, "loss": 5.3581, "num_input_tokens_seen": 466747392, "step": 3561 }, { "epoch": 0.6094260981083681, "grad_norm": 0.8532904386520386, "learning_rate": 8.601738137719548e-05, "loss": 5.3198, "num_input_tokens_seen": 467140608, "step": 3564 }, { "epoch": 0.6099390830394357, "grad_norm": 0.8617231845855713, "learning_rate": 8.598120161476338e-05, "loss": 5.4047, "num_input_tokens_seen": 467533824, "step": 3567 }, { "epoch": 0.6104520679705033, "grad_norm": 0.7476005554199219, "learning_rate": 8.594506746665999e-05, "loss": 5.3728, "num_input_tokens_seen": 467927040, "step": 3570 }, { "epoch": 0.610965052901571, "grad_norm": 0.7166175842285156, "learning_rate": 8.590897883711732e-05, "loss": 5.3925, "num_input_tokens_seen": 468320256, "step": 3573 }, { "epoch": 0.6114780378326387, "grad_norm": 0.6815547347068787, "learning_rate": 8.587293563064851e-05, "loss": 5.3407, "num_input_tokens_seen": 468713472, "step": 3576 }, { "epoch": 0.6119910227637063, "grad_norm": 0.7247835397720337, "learning_rate": 8.583693775204695e-05, "loss": 5.3833, "num_input_tokens_seen": 469106688, "step": 3579 }, { "epoch": 0.612504007694774, "grad_norm": 0.715282142162323, "learning_rate": 8.580098510638516e-05, "loss": 5.3863, "num_input_tokens_seen": 469499904, "step": 3582 }, { "epoch": 0.6130169926258416, "grad_norm": 0.8423024415969849, "learning_rate": 8.576507759901377e-05, "loss": 5.4177, "num_input_tokens_seen": 469893120, "step": 3585 }, { "epoch": 0.6135299775569093, "grad_norm": 0.7851782441139221, "learning_rate": 8.572921513556039e-05, "loss": 5.375, "num_input_tokens_seen": 470286336, "step": 3588 }, { "epoch": 0.614042962487977, "grad_norm": 0.9765253067016602, "learning_rate": 8.569339762192868e-05, "loss": 5.3584, "num_input_tokens_seen": 470679552, "step": 3591 }, { "epoch": 0.6145559474190445, "grad_norm": 0.799020528793335, "learning_rate": 8.565762496429728e-05, "loss": 5.3448, "num_input_tokens_seen": 471072768, "step": 3594 }, { "epoch": 0.6150689323501122, "grad_norm": 0.711087703704834, "learning_rate": 8.562189706911872e-05, "loss": 5.3743, "num_input_tokens_seen": 471465984, "step": 3597 }, { "epoch": 0.6155819172811798, "grad_norm": 0.8044856190681458, "learning_rate": 8.558621384311844e-05, "loss": 5.3983, "num_input_tokens_seen": 471859200, "step": 3600 }, { "epoch": 0.6155819172811798, "eval_accuracy": 0.17727405959941378, "eval_loss": 5.83395528793335, "eval_runtime": 110.1105, "eval_samples_per_second": 2.725, "eval_steps_per_second": 1.362, "num_input_tokens_seen": 471859200, "step": 3600 }, { "epoch": 0.6160949022122475, "grad_norm": 0.6745046973228455, "learning_rate": 8.555057519329377e-05, "loss": 
5.3712, "num_input_tokens_seen": 472252416, "step": 3603 }, { "epoch": 0.6166078871433152, "grad_norm": 0.7151539921760559, "learning_rate": 8.551498102691291e-05, "loss": 5.3079, "num_input_tokens_seen": 472645632, "step": 3606 }, { "epoch": 0.6171208720743828, "grad_norm": 0.7192103266716003, "learning_rate": 8.547943125151391e-05, "loss": 5.3905, "num_input_tokens_seen": 473038848, "step": 3609 }, { "epoch": 0.6176338570054505, "grad_norm": 0.734131932258606, "learning_rate": 8.544392577490364e-05, "loss": 5.3244, "num_input_tokens_seen": 473432064, "step": 3612 }, { "epoch": 0.6181468419365181, "grad_norm": 0.6585795283317566, "learning_rate": 8.54084645051568e-05, "loss": 5.3767, "num_input_tokens_seen": 473825280, "step": 3615 }, { "epoch": 0.6186598268675858, "grad_norm": 0.7334539294242859, "learning_rate": 8.537304735061498e-05, "loss": 5.3683, "num_input_tokens_seen": 474218496, "step": 3618 }, { "epoch": 0.6191728117986535, "grad_norm": 0.7433605790138245, "learning_rate": 8.533767421988556e-05, "loss": 5.3546, "num_input_tokens_seen": 474611712, "step": 3621 }, { "epoch": 0.619685796729721, "grad_norm": 0.7147760987281799, "learning_rate": 8.530234502184079e-05, "loss": 5.3851, "num_input_tokens_seen": 475004928, "step": 3624 }, { "epoch": 0.6201987816607887, "grad_norm": 0.7629379034042358, "learning_rate": 8.526705966561678e-05, "loss": 5.3321, "num_input_tokens_seen": 475398144, "step": 3627 }, { "epoch": 0.6207117665918563, "grad_norm": 0.7201533317565918, "learning_rate": 8.523181806061252e-05, "loss": 5.3973, "num_input_tokens_seen": 475791360, "step": 3630 }, { "epoch": 0.621224751522924, "grad_norm": 0.7145413756370544, "learning_rate": 8.519662011648894e-05, "loss": 5.357, "num_input_tokens_seen": 476184576, "step": 3633 }, { "epoch": 0.6217377364539917, "grad_norm": 0.6908425092697144, "learning_rate": 8.516146574316785e-05, "loss": 5.3227, "num_input_tokens_seen": 476577792, "step": 3636 }, { "epoch": 0.6222507213850593, "grad_norm": 0.7059429883956909, "learning_rate": 8.512635485083106e-05, "loss": 5.361, "num_input_tokens_seen": 476971008, "step": 3639 }, { "epoch": 0.622763706316127, "grad_norm": 0.7348899841308594, "learning_rate": 8.509128734991941e-05, "loss": 5.3768, "num_input_tokens_seen": 477364224, "step": 3642 }, { "epoch": 0.6232766912471946, "grad_norm": 0.8156319260597229, "learning_rate": 8.505626315113171e-05, "loss": 5.3989, "num_input_tokens_seen": 477757440, "step": 3645 }, { "epoch": 0.6237896761782623, "grad_norm": 0.776240885257721, "learning_rate": 8.50212821654239e-05, "loss": 5.3562, "num_input_tokens_seen": 478150656, "step": 3648 }, { "epoch": 0.6243026611093299, "grad_norm": 0.7777855396270752, "learning_rate": 8.498634430400809e-05, "loss": 5.3648, "num_input_tokens_seen": 478543872, "step": 3651 }, { "epoch": 0.6248156460403975, "grad_norm": 0.8572577834129333, "learning_rate": 8.495144947835149e-05, "loss": 5.3517, "num_input_tokens_seen": 478937088, "step": 3654 }, { "epoch": 0.6253286309714652, "grad_norm": 0.9028589725494385, "learning_rate": 8.491659760017563e-05, "loss": 5.3633, "num_input_tokens_seen": 479330304, "step": 3657 }, { "epoch": 0.6258416159025328, "grad_norm": 0.8123112320899963, "learning_rate": 8.488178858145536e-05, "loss": 5.3626, "num_input_tokens_seen": 479723520, "step": 3660 }, { "epoch": 0.6263546008336005, "grad_norm": 0.6874297261238098, "learning_rate": 8.484702233441784e-05, "loss": 5.3445, "num_input_tokens_seen": 480116736, "step": 3663 }, { "epoch": 0.6268675857646682, "grad_norm": 0.7794182300567627, 
"learning_rate": 8.481229877154171e-05, "loss": 5.4051, "num_input_tokens_seen": 480509952, "step": 3666 }, { "epoch": 0.6273805706957358, "grad_norm": 0.9119608402252197, "learning_rate": 8.477761780555616e-05, "loss": 5.3604, "num_input_tokens_seen": 480903168, "step": 3669 }, { "epoch": 0.6278935556268035, "grad_norm": 0.8943549394607544, "learning_rate": 8.474297934943995e-05, "loss": 5.3822, "num_input_tokens_seen": 481296384, "step": 3672 }, { "epoch": 0.6284065405578712, "grad_norm": 0.7565066814422607, "learning_rate": 8.470838331642053e-05, "loss": 5.3687, "num_input_tokens_seen": 481689600, "step": 3675 }, { "epoch": 0.6289195254889388, "grad_norm": 0.8296188116073608, "learning_rate": 8.46738296199731e-05, "loss": 5.4102, "num_input_tokens_seen": 482082816, "step": 3678 }, { "epoch": 0.6294325104200064, "grad_norm": 0.7473737597465515, "learning_rate": 8.463931817381974e-05, "loss": 5.3368, "num_input_tokens_seen": 482476032, "step": 3681 }, { "epoch": 0.629945495351074, "grad_norm": 0.7469894289970398, "learning_rate": 8.460484889192852e-05, "loss": 5.347, "num_input_tokens_seen": 482869248, "step": 3684 }, { "epoch": 0.6304584802821417, "grad_norm": 0.7461103796958923, "learning_rate": 8.457042168851248e-05, "loss": 5.376, "num_input_tokens_seen": 483262464, "step": 3687 }, { "epoch": 0.6309714652132093, "grad_norm": 0.7393492460250854, "learning_rate": 8.45360364780289e-05, "loss": 5.3428, "num_input_tokens_seen": 483655680, "step": 3690 }, { "epoch": 0.631484450144277, "grad_norm": 0.7223976850509644, "learning_rate": 8.450169317517828e-05, "loss": 5.3517, "num_input_tokens_seen": 484048896, "step": 3693 }, { "epoch": 0.6319974350753447, "grad_norm": 0.7259080410003662, "learning_rate": 8.446739169490354e-05, "loss": 5.3331, "num_input_tokens_seen": 484442112, "step": 3696 }, { "epoch": 0.6325104200064123, "grad_norm": 0.7238535284996033, "learning_rate": 8.443313195238902e-05, "loss": 5.3805, "num_input_tokens_seen": 484835328, "step": 3699 }, { "epoch": 0.63302340493748, "grad_norm": 0.7401497960090637, "learning_rate": 8.439891386305977e-05, "loss": 5.399, "num_input_tokens_seen": 485228544, "step": 3702 }, { "epoch": 0.6335363898685477, "grad_norm": 0.8473367094993591, "learning_rate": 8.436473734258046e-05, "loss": 5.3433, "num_input_tokens_seen": 485621760, "step": 3705 }, { "epoch": 0.6340493747996152, "grad_norm": 0.8806385397911072, "learning_rate": 8.433060230685473e-05, "loss": 5.4077, "num_input_tokens_seen": 486014976, "step": 3708 }, { "epoch": 0.6345623597306829, "grad_norm": 0.6987698078155518, "learning_rate": 8.429650867202415e-05, "loss": 5.356, "num_input_tokens_seen": 486408192, "step": 3711 }, { "epoch": 0.6350753446617505, "grad_norm": 0.74212247133255, "learning_rate": 8.426245635446741e-05, "loss": 5.3557, "num_input_tokens_seen": 486801408, "step": 3714 }, { "epoch": 0.6355883295928182, "grad_norm": 0.7430636882781982, "learning_rate": 8.422844527079955e-05, "loss": 5.3656, "num_input_tokens_seen": 487194624, "step": 3717 }, { "epoch": 0.6361013145238859, "grad_norm": 0.7818143963813782, "learning_rate": 8.419447533787093e-05, "loss": 5.3528, "num_input_tokens_seen": 487587840, "step": 3720 }, { "epoch": 0.6366142994549535, "grad_norm": 0.7322751879692078, "learning_rate": 8.416054647276643e-05, "loss": 5.35, "num_input_tokens_seen": 487981056, "step": 3723 }, { "epoch": 0.6371272843860212, "grad_norm": 0.7790704369544983, "learning_rate": 8.41266585928048e-05, "loss": 5.3617, "num_input_tokens_seen": 488374272, "step": 3726 }, { "epoch": 
0.6376402693170888, "grad_norm": 0.8650009632110596, "learning_rate": 8.409281161553747e-05, "loss": 5.3689, "num_input_tokens_seen": 488767488, "step": 3729 }, { "epoch": 0.6381532542481565, "grad_norm": 0.8796480298042297, "learning_rate": 8.405900545874799e-05, "loss": 5.362, "num_input_tokens_seen": 489160704, "step": 3732 }, { "epoch": 0.6386662391792242, "grad_norm": 0.7147157788276672, "learning_rate": 8.402524004045107e-05, "loss": 5.3659, "num_input_tokens_seen": 489553920, "step": 3735 }, { "epoch": 0.6391792241102917, "grad_norm": 0.7321900725364685, "learning_rate": 8.399151527889171e-05, "loss": 5.4014, "num_input_tokens_seen": 489947136, "step": 3738 }, { "epoch": 0.6396922090413594, "grad_norm": 0.7102051973342896, "learning_rate": 8.39578310925445e-05, "loss": 5.3454, "num_input_tokens_seen": 490340352, "step": 3741 }, { "epoch": 0.640205193972427, "grad_norm": 0.7498131990432739, "learning_rate": 8.392418740011266e-05, "loss": 5.3246, "num_input_tokens_seen": 490733568, "step": 3744 }, { "epoch": 0.6407181789034947, "grad_norm": 0.7187573313713074, "learning_rate": 8.389058412052728e-05, "loss": 5.2761, "num_input_tokens_seen": 491126784, "step": 3747 }, { "epoch": 0.6412311638345624, "grad_norm": 0.6994777917861938, "learning_rate": 8.385702117294651e-05, "loss": 5.329, "num_input_tokens_seen": 491520000, "step": 3750 }, { "epoch": 0.64174414876563, "grad_norm": 0.7428282499313354, "learning_rate": 8.382349847675467e-05, "loss": 5.3071, "num_input_tokens_seen": 491913216, "step": 3753 }, { "epoch": 0.6422571336966977, "grad_norm": 0.7503423094749451, "learning_rate": 8.379001595156155e-05, "loss": 5.3232, "num_input_tokens_seen": 492306432, "step": 3756 }, { "epoch": 0.6427701186277653, "grad_norm": 0.7488569617271423, "learning_rate": 8.375657351720148e-05, "loss": 5.3871, "num_input_tokens_seen": 492699648, "step": 3759 }, { "epoch": 0.643283103558833, "grad_norm": 0.8150780200958252, "learning_rate": 8.372317109373264e-05, "loss": 5.3916, "num_input_tokens_seen": 493092864, "step": 3762 }, { "epoch": 0.6437960884899007, "grad_norm": 0.8473458290100098, "learning_rate": 8.368980860143615e-05, "loss": 5.3534, "num_input_tokens_seen": 493486080, "step": 3765 }, { "epoch": 0.6443090734209682, "grad_norm": 0.6835038661956787, "learning_rate": 8.365648596081538e-05, "loss": 5.3142, "num_input_tokens_seen": 493879296, "step": 3768 }, { "epoch": 0.6448220583520359, "grad_norm": 0.7488033771514893, "learning_rate": 8.362320309259501e-05, "loss": 5.3631, "num_input_tokens_seen": 494272512, "step": 3771 }, { "epoch": 0.6453350432831035, "grad_norm": 0.8691450357437134, "learning_rate": 8.35899599177204e-05, "loss": 5.3643, "num_input_tokens_seen": 494665728, "step": 3774 }, { "epoch": 0.6458480282141712, "grad_norm": 0.7763018012046814, "learning_rate": 8.355675635735668e-05, "loss": 5.2909, "num_input_tokens_seen": 495058944, "step": 3777 }, { "epoch": 0.6463610131452389, "grad_norm": 0.8163045644760132, "learning_rate": 8.352359233288804e-05, "loss": 5.3112, "num_input_tokens_seen": 495452160, "step": 3780 }, { "epoch": 0.6468739980763065, "grad_norm": 0.7238712906837463, "learning_rate": 8.349046776591689e-05, "loss": 5.325, "num_input_tokens_seen": 495845376, "step": 3783 }, { "epoch": 0.6473869830073742, "grad_norm": 0.6693491339683533, "learning_rate": 8.345738257826312e-05, "loss": 5.3193, "num_input_tokens_seen": 496238592, "step": 3786 }, { "epoch": 0.6478999679384418, "grad_norm": 0.6386433243751526, "learning_rate": 8.342433669196334e-05, "loss": 5.3531, 
"num_input_tokens_seen": 496631808, "step": 3789 }, { "epoch": 0.6484129528695095, "grad_norm": 0.6837417483329773, "learning_rate": 8.339133002927001e-05, "loss": 5.309, "num_input_tokens_seen": 497025024, "step": 3792 }, { "epoch": 0.6489259378005771, "grad_norm": 0.7823799252510071, "learning_rate": 8.335836251265084e-05, "loss": 5.3361, "num_input_tokens_seen": 497418240, "step": 3795 }, { "epoch": 0.6494389227316447, "grad_norm": 0.7094940543174744, "learning_rate": 8.332543406478784e-05, "loss": 5.3467, "num_input_tokens_seen": 497811456, "step": 3798 }, { "epoch": 0.6499519076627124, "grad_norm": 0.6330589056015015, "learning_rate": 8.329254460857673e-05, "loss": 5.2885, "num_input_tokens_seen": 498204672, "step": 3801 }, { "epoch": 0.65046489259378, "grad_norm": 0.716102123260498, "learning_rate": 8.325969406712602e-05, "loss": 5.3826, "num_input_tokens_seen": 498597888, "step": 3804 }, { "epoch": 0.6509778775248477, "grad_norm": 0.7873062491416931, "learning_rate": 8.322688236375638e-05, "loss": 5.3422, "num_input_tokens_seen": 498991104, "step": 3807 }, { "epoch": 0.6514908624559154, "grad_norm": 0.8517410159111023, "learning_rate": 8.319410942199984e-05, "loss": 5.3181, "num_input_tokens_seen": 499384320, "step": 3810 }, { "epoch": 0.652003847386983, "grad_norm": 0.7410987615585327, "learning_rate": 8.316137516559907e-05, "loss": 5.3124, "num_input_tokens_seen": 499777536, "step": 3813 }, { "epoch": 0.6525168323180507, "grad_norm": 0.7573698163032532, "learning_rate": 8.312867951850651e-05, "loss": 5.3404, "num_input_tokens_seen": 500170752, "step": 3816 }, { "epoch": 0.6530298172491183, "grad_norm": 0.876664936542511, "learning_rate": 8.309602240488386e-05, "loss": 5.3598, "num_input_tokens_seen": 500563968, "step": 3819 }, { "epoch": 0.653542802180186, "grad_norm": 0.7487987875938416, "learning_rate": 8.306340374910112e-05, "loss": 5.3531, "num_input_tokens_seen": 500957184, "step": 3822 }, { "epoch": 0.6540557871112536, "grad_norm": 0.865337610244751, "learning_rate": 8.303082347573595e-05, "loss": 5.3075, "num_input_tokens_seen": 501350400, "step": 3825 }, { "epoch": 0.6545687720423212, "grad_norm": 0.7958502769470215, "learning_rate": 8.299828150957296e-05, "loss": 5.3461, "num_input_tokens_seen": 501743616, "step": 3828 }, { "epoch": 0.6550817569733889, "grad_norm": 0.7690322399139404, "learning_rate": 8.29657777756029e-05, "loss": 5.2989, "num_input_tokens_seen": 502136832, "step": 3831 }, { "epoch": 0.6555947419044565, "grad_norm": 0.7034088969230652, "learning_rate": 8.29333121990221e-05, "loss": 5.3167, "num_input_tokens_seen": 502530048, "step": 3834 }, { "epoch": 0.6561077268355242, "grad_norm": 0.7316693067550659, "learning_rate": 8.29008847052315e-05, "loss": 5.397, "num_input_tokens_seen": 502923264, "step": 3837 }, { "epoch": 0.6566207117665919, "grad_norm": 0.7567901611328125, "learning_rate": 8.28684952198361e-05, "loss": 5.31, "num_input_tokens_seen": 503316480, "step": 3840 }, { "epoch": 0.6571336966976595, "grad_norm": 0.7767483592033386, "learning_rate": 8.283614366864425e-05, "loss": 5.3663, "num_input_tokens_seen": 503709696, "step": 3843 }, { "epoch": 0.6576466816287272, "grad_norm": 0.7799834609031677, "learning_rate": 8.280382997766685e-05, "loss": 5.3231, "num_input_tokens_seen": 504102912, "step": 3846 }, { "epoch": 0.6581596665597949, "grad_norm": 0.810688316822052, "learning_rate": 8.277155407311666e-05, "loss": 5.3251, "num_input_tokens_seen": 504496128, "step": 3849 }, { "epoch": 0.6586726514908624, "grad_norm": 0.7437490820884705, 
"learning_rate": 8.27393158814077e-05, "loss": 5.3464, "num_input_tokens_seen": 504889344, "step": 3852 }, { "epoch": 0.6591856364219301, "grad_norm": 0.7459340691566467, "learning_rate": 8.270711532915435e-05, "loss": 5.3044, "num_input_tokens_seen": 505282560, "step": 3855 }, { "epoch": 0.6596986213529977, "grad_norm": 0.8228518962860107, "learning_rate": 8.267495234317081e-05, "loss": 5.3375, "num_input_tokens_seen": 505675776, "step": 3858 }, { "epoch": 0.6602116062840654, "grad_norm": 0.6991084218025208, "learning_rate": 8.264282685047038e-05, "loss": 5.3671, "num_input_tokens_seen": 506068992, "step": 3861 }, { "epoch": 0.660724591215133, "grad_norm": 0.7868938446044922, "learning_rate": 8.261073877826466e-05, "loss": 5.3483, "num_input_tokens_seen": 506462208, "step": 3864 }, { "epoch": 0.6612375761462007, "grad_norm": 0.7679697275161743, "learning_rate": 8.2578688053963e-05, "loss": 5.3693, "num_input_tokens_seen": 506855424, "step": 3867 }, { "epoch": 0.6617505610772684, "grad_norm": 0.7753176689147949, "learning_rate": 8.254667460517166e-05, "loss": 5.3721, "num_input_tokens_seen": 507248640, "step": 3870 }, { "epoch": 0.662263546008336, "grad_norm": 0.8868235945701599, "learning_rate": 8.251469835969328e-05, "loss": 5.3506, "num_input_tokens_seen": 507641856, "step": 3873 }, { "epoch": 0.6627765309394037, "grad_norm": 0.9391675591468811, "learning_rate": 8.248275924552608e-05, "loss": 5.2984, "num_input_tokens_seen": 508035072, "step": 3876 }, { "epoch": 0.6632895158704714, "grad_norm": 0.757840633392334, "learning_rate": 8.245085719086321e-05, "loss": 5.3594, "num_input_tokens_seen": 508428288, "step": 3879 }, { "epoch": 0.6638025008015389, "grad_norm": 0.7130749821662903, "learning_rate": 8.24189921240921e-05, "loss": 5.338, "num_input_tokens_seen": 508821504, "step": 3882 }, { "epoch": 0.6643154857326066, "grad_norm": 0.8119063973426819, "learning_rate": 8.238716397379381e-05, "loss": 5.3518, "num_input_tokens_seen": 509214720, "step": 3885 }, { "epoch": 0.6648284706636742, "grad_norm": 0.7790501117706299, "learning_rate": 8.235537266874219e-05, "loss": 5.3012, "num_input_tokens_seen": 509607936, "step": 3888 }, { "epoch": 0.6653414555947419, "grad_norm": 0.7022935152053833, "learning_rate": 8.232361813790342e-05, "loss": 5.2907, "num_input_tokens_seen": 510001152, "step": 3891 }, { "epoch": 0.6658544405258096, "grad_norm": 0.8035649657249451, "learning_rate": 8.229190031043528e-05, "loss": 5.2965, "num_input_tokens_seen": 510394368, "step": 3894 }, { "epoch": 0.6663674254568772, "grad_norm": 0.6912944316864014, "learning_rate": 8.22602191156864e-05, "loss": 5.3651, "num_input_tokens_seen": 510787584, "step": 3897 }, { "epoch": 0.6668804103879449, "grad_norm": 0.6894614100456238, "learning_rate": 8.222857448319569e-05, "loss": 5.2651, "num_input_tokens_seen": 511180800, "step": 3900 }, { "epoch": 0.6673933953190125, "grad_norm": 0.7291402220726013, "learning_rate": 8.219696634269164e-05, "loss": 5.3479, "num_input_tokens_seen": 511574016, "step": 3903 }, { "epoch": 0.6679063802500802, "grad_norm": 0.7843152284622192, "learning_rate": 8.21653946240917e-05, "loss": 5.2789, "num_input_tokens_seen": 511967232, "step": 3906 }, { "epoch": 0.6684193651811478, "grad_norm": 0.6679741144180298, "learning_rate": 8.21338592575016e-05, "loss": 5.2875, "num_input_tokens_seen": 512360448, "step": 3909 }, { "epoch": 0.6689323501122154, "grad_norm": 0.709000289440155, "learning_rate": 8.210236017321469e-05, "loss": 5.3695, "num_input_tokens_seen": 512753664, "step": 3912 }, { "epoch": 
0.6694453350432831, "grad_norm": 0.701045036315918, "learning_rate": 8.207089730171132e-05, "loss": 5.3274, "num_input_tokens_seen": 513146880, "step": 3915 }, { "epoch": 0.6699583199743507, "grad_norm": 0.741085946559906, "learning_rate": 8.203947057365817e-05, "loss": 5.3338, "num_input_tokens_seen": 513540096, "step": 3918 }, { "epoch": 0.6704713049054184, "grad_norm": 0.7441378235816956, "learning_rate": 8.200807991990765e-05, "loss": 5.3587, "num_input_tokens_seen": 513933312, "step": 3921 }, { "epoch": 0.6709842898364861, "grad_norm": 0.6489686965942383, "learning_rate": 8.197672527149723e-05, "loss": 5.3287, "num_input_tokens_seen": 514326528, "step": 3924 }, { "epoch": 0.6714972747675537, "grad_norm": 0.6942331194877625, "learning_rate": 8.194540655964876e-05, "loss": 5.3292, "num_input_tokens_seen": 514719744, "step": 3927 }, { "epoch": 0.6720102596986214, "grad_norm": 0.8609412312507629, "learning_rate": 8.191412371576794e-05, "loss": 5.3108, "num_input_tokens_seen": 515112960, "step": 3930 }, { "epoch": 0.672523244629689, "grad_norm": 0.866399347782135, "learning_rate": 8.188287667144362e-05, "loss": 5.3481, "num_input_tokens_seen": 515506176, "step": 3933 }, { "epoch": 0.6730362295607567, "grad_norm": 0.6309357285499573, "learning_rate": 8.185166535844714e-05, "loss": 5.3815, "num_input_tokens_seen": 515899392, "step": 3936 }, { "epoch": 0.6735492144918243, "grad_norm": 0.8205288052558899, "learning_rate": 8.182048970873184e-05, "loss": 5.359, "num_input_tokens_seen": 516292608, "step": 3939 }, { "epoch": 0.6740621994228919, "grad_norm": 0.8117387294769287, "learning_rate": 8.178934965443225e-05, "loss": 5.2779, "num_input_tokens_seen": 516685824, "step": 3942 }, { "epoch": 0.6745751843539596, "grad_norm": 0.7551442980766296, "learning_rate": 8.175824512786359e-05, "loss": 5.363, "num_input_tokens_seen": 517079040, "step": 3945 }, { "epoch": 0.6750881692850272, "grad_norm": 0.7865480184555054, "learning_rate": 8.172717606152118e-05, "loss": 5.3067, "num_input_tokens_seen": 517472256, "step": 3948 }, { "epoch": 0.6756011542160949, "grad_norm": 0.7011180520057678, "learning_rate": 8.16961423880797e-05, "loss": 5.2977, "num_input_tokens_seen": 517865472, "step": 3951 }, { "epoch": 0.6761141391471626, "grad_norm": 0.7099843621253967, "learning_rate": 8.166514404039269e-05, "loss": 5.3637, "num_input_tokens_seen": 518258688, "step": 3954 }, { "epoch": 0.6766271240782302, "grad_norm": 0.760845422744751, "learning_rate": 8.16341809514919e-05, "loss": 5.307, "num_input_tokens_seen": 518651904, "step": 3957 }, { "epoch": 0.6771401090092979, "grad_norm": 0.8513478636741638, "learning_rate": 8.160325305458667e-05, "loss": 5.3293, "num_input_tokens_seen": 519045120, "step": 3960 }, { "epoch": 0.6776530939403655, "grad_norm": 0.804071307182312, "learning_rate": 8.157236028306331e-05, "loss": 5.3303, "num_input_tokens_seen": 519438336, "step": 3963 }, { "epoch": 0.6781660788714331, "grad_norm": 0.7225372791290283, "learning_rate": 8.154150257048457e-05, "loss": 5.3751, "num_input_tokens_seen": 519831552, "step": 3966 }, { "epoch": 0.6786790638025008, "grad_norm": 0.7206907272338867, "learning_rate": 8.151067985058899e-05, "loss": 5.3409, "num_input_tokens_seen": 520224768, "step": 3969 }, { "epoch": 0.6791920487335684, "grad_norm": 0.7497385740280151, "learning_rate": 8.147989205729026e-05, "loss": 5.3236, "num_input_tokens_seen": 520617984, "step": 3972 }, { "epoch": 0.6797050336646361, "grad_norm": 0.7918210029602051, "learning_rate": 8.14491391246767e-05, "loss": 5.3549, 
"num_input_tokens_seen": 521011200, "step": 3975 }, { "epoch": 0.6802180185957037, "grad_norm": 0.8181982636451721, "learning_rate": 8.141842098701066e-05, "loss": 5.3461, "num_input_tokens_seen": 521404416, "step": 3978 }, { "epoch": 0.6807310035267714, "grad_norm": 0.7581628561019897, "learning_rate": 8.138773757872787e-05, "loss": 5.3411, "num_input_tokens_seen": 521797632, "step": 3981 }, { "epoch": 0.6812439884578391, "grad_norm": 0.7180889844894409, "learning_rate": 8.135708883443689e-05, "loss": 5.3157, "num_input_tokens_seen": 522190848, "step": 3984 }, { "epoch": 0.6817569733889067, "grad_norm": 0.7033380270004272, "learning_rate": 8.132647468891857e-05, "loss": 5.3305, "num_input_tokens_seen": 522584064, "step": 3987 }, { "epoch": 0.6822699583199744, "grad_norm": 0.7060420513153076, "learning_rate": 8.129589507712537e-05, "loss": 5.3815, "num_input_tokens_seen": 522977280, "step": 3990 }, { "epoch": 0.682782943251042, "grad_norm": 0.7359711527824402, "learning_rate": 8.126534993418085e-05, "loss": 5.3349, "num_input_tokens_seen": 523370496, "step": 3993 }, { "epoch": 0.6832959281821096, "grad_norm": 0.8105011582374573, "learning_rate": 8.123483919537908e-05, "loss": 5.3192, "num_input_tokens_seen": 523763712, "step": 3996 }, { "epoch": 0.6838089131131773, "grad_norm": 0.7450350522994995, "learning_rate": 8.120436279618406e-05, "loss": 5.3175, "num_input_tokens_seen": 524156928, "step": 3999 }, { "epoch": 0.6839799080901998, "eval_accuracy": 0.18136948379742712, "eval_loss": 5.791625022888184, "eval_runtime": 111.6249, "eval_samples_per_second": 2.688, "eval_steps_per_second": 1.344, "num_input_tokens_seen": 524288000, "step": 4000 }, { "epoch": 0.6843218980442449, "grad_norm": 0.742009162902832, "learning_rate": 8.117392067222913e-05, "loss": 5.2929, "num_input_tokens_seen": 524550144, "step": 4002 }, { "epoch": 0.6848348829753126, "grad_norm": 0.8065789937973022, "learning_rate": 8.114351275931643e-05, "loss": 5.3041, "num_input_tokens_seen": 524943360, "step": 4005 }, { "epoch": 0.6853478679063802, "grad_norm": 0.8113951086997986, "learning_rate": 8.111313899341628e-05, "loss": 5.3378, "num_input_tokens_seen": 525336576, "step": 4008 }, { "epoch": 0.6858608528374479, "grad_norm": 0.7892742156982422, "learning_rate": 8.10827993106667e-05, "loss": 5.3148, "num_input_tokens_seen": 525729792, "step": 4011 }, { "epoch": 0.6863738377685156, "grad_norm": 0.8820670247077942, "learning_rate": 8.105249364737273e-05, "loss": 5.2681, "num_input_tokens_seen": 526123008, "step": 4014 }, { "epoch": 0.6868868226995832, "grad_norm": 0.7888779640197754, "learning_rate": 8.102222194000602e-05, "loss": 5.3129, "num_input_tokens_seen": 526516224, "step": 4017 }, { "epoch": 0.6873998076306509, "grad_norm": 0.8002054691314697, "learning_rate": 8.099198412520408e-05, "loss": 5.2667, "num_input_tokens_seen": 526909440, "step": 4020 }, { "epoch": 0.6879127925617184, "grad_norm": 0.8794166445732117, "learning_rate": 8.096178013976995e-05, "loss": 5.3393, "num_input_tokens_seen": 527302656, "step": 4023 }, { "epoch": 0.6884257774927861, "grad_norm": 0.8726845979690552, "learning_rate": 8.093160992067137e-05, "loss": 5.3492, "num_input_tokens_seen": 527695872, "step": 4026 }, { "epoch": 0.6889387624238538, "grad_norm": 0.7601503133773804, "learning_rate": 8.09014734050405e-05, "loss": 5.3517, "num_input_tokens_seen": 528089088, "step": 4029 }, { "epoch": 0.6894517473549214, "grad_norm": 0.8088439106941223, "learning_rate": 8.087137053017323e-05, "loss": 5.3387, "num_input_tokens_seen": 528482304, "step": 
4032 }, { "epoch": 0.6899647322859891, "grad_norm": 0.7194026112556458, "learning_rate": 8.084130123352858e-05, "loss": 5.3721, "num_input_tokens_seen": 528875520, "step": 4035 }, { "epoch": 0.6904777172170568, "grad_norm": 0.7850322723388672, "learning_rate": 8.081126545272833e-05, "loss": 5.3402, "num_input_tokens_seen": 529268736, "step": 4038 }, { "epoch": 0.6909907021481244, "grad_norm": 0.6744662523269653, "learning_rate": 8.078126312555625e-05, "loss": 5.3444, "num_input_tokens_seen": 529661952, "step": 4041 }, { "epoch": 0.6915036870791921, "grad_norm": 0.7170355916023254, "learning_rate": 8.075129418995781e-05, "loss": 5.3821, "num_input_tokens_seen": 530055168, "step": 4044 }, { "epoch": 0.6920166720102597, "grad_norm": 0.7175304889678955, "learning_rate": 8.072135858403943e-05, "loss": 5.3562, "num_input_tokens_seen": 530448384, "step": 4047 }, { "epoch": 0.6925296569413274, "grad_norm": 0.7564287185668945, "learning_rate": 8.069145624606803e-05, "loss": 5.3044, "num_input_tokens_seen": 530841600, "step": 4050 }, { "epoch": 0.693042641872395, "grad_norm": 0.7804622650146484, "learning_rate": 8.06615871144705e-05, "loss": 5.3709, "num_input_tokens_seen": 531234816, "step": 4053 }, { "epoch": 0.6935556268034626, "grad_norm": 0.7738920450210571, "learning_rate": 8.063175112783317e-05, "loss": 5.3234, "num_input_tokens_seen": 531628032, "step": 4056 }, { "epoch": 0.6940686117345303, "grad_norm": 0.7369733452796936, "learning_rate": 8.060194822490128e-05, "loss": 5.2849, "num_input_tokens_seen": 532021248, "step": 4059 }, { "epoch": 0.6945815966655979, "grad_norm": 0.6854174733161926, "learning_rate": 8.057217834457838e-05, "loss": 5.3224, "num_input_tokens_seen": 532414464, "step": 4062 }, { "epoch": 0.6950945815966656, "grad_norm": 0.6956667304039001, "learning_rate": 8.054244142592593e-05, "loss": 5.3002, "num_input_tokens_seen": 532807680, "step": 4065 }, { "epoch": 0.6956075665277333, "grad_norm": 0.8726805448532104, "learning_rate": 8.051273740816265e-05, "loss": 5.3259, "num_input_tokens_seen": 533200896, "step": 4068 }, { "epoch": 0.6961205514588009, "grad_norm": 0.7644637227058411, "learning_rate": 8.048306623066408e-05, "loss": 5.3527, "num_input_tokens_seen": 533594112, "step": 4071 }, { "epoch": 0.6966335363898686, "grad_norm": 0.7367181181907654, "learning_rate": 8.045342783296207e-05, "loss": 5.2681, "num_input_tokens_seen": 533987328, "step": 4074 }, { "epoch": 0.6971465213209362, "grad_norm": 0.8064795732498169, "learning_rate": 8.04238221547442e-05, "loss": 5.3359, "num_input_tokens_seen": 534380544, "step": 4077 }, { "epoch": 0.6976595062520039, "grad_norm": 0.7639942169189453, "learning_rate": 8.03942491358533e-05, "loss": 5.3703, "num_input_tokens_seen": 534773760, "step": 4080 }, { "epoch": 0.6981724911830715, "grad_norm": 0.7479289770126343, "learning_rate": 8.036470871628689e-05, "loss": 5.3445, "num_input_tokens_seen": 535166976, "step": 4083 }, { "epoch": 0.6986854761141391, "grad_norm": 0.9488199949264526, "learning_rate": 8.033520083619678e-05, "loss": 5.3398, "num_input_tokens_seen": 535560192, "step": 4086 }, { "epoch": 0.6991984610452068, "grad_norm": 0.8831247687339783, "learning_rate": 8.030572543588844e-05, "loss": 5.3225, "num_input_tokens_seen": 535953408, "step": 4089 }, { "epoch": 0.6997114459762744, "grad_norm": 0.787559449672699, "learning_rate": 8.027628245582056e-05, "loss": 5.3118, "num_input_tokens_seen": 536346624, "step": 4092 }, { "epoch": 0.7002244309073421, "grad_norm": 0.8601611852645874, "learning_rate": 8.024687183660457e-05, "loss": 
5.3217, "num_input_tokens_seen": 536739840, "step": 4095 }, { "epoch": 0.7007374158384098, "grad_norm": 0.762852668762207, "learning_rate": 8.021749351900399e-05, "loss": 5.3815, "num_input_tokens_seen": 537133056, "step": 4098 }, { "epoch": 0.7012504007694774, "grad_norm": 0.6953300833702087, "learning_rate": 8.018814744393415e-05, "loss": 5.3272, "num_input_tokens_seen": 537526272, "step": 4101 }, { "epoch": 0.7017633857005451, "grad_norm": 0.7371882796287537, "learning_rate": 8.01588335524615e-05, "loss": 5.2754, "num_input_tokens_seen": 537919488, "step": 4104 }, { "epoch": 0.7022763706316127, "grad_norm": 0.7776148319244385, "learning_rate": 8.01295517858032e-05, "loss": 5.3249, "num_input_tokens_seen": 538312704, "step": 4107 }, { "epoch": 0.7027893555626803, "grad_norm": 0.7305675745010376, "learning_rate": 8.010030208532664e-05, "loss": 5.3191, "num_input_tokens_seen": 538705920, "step": 4110 }, { "epoch": 0.703302340493748, "grad_norm": 0.7893403172492981, "learning_rate": 8.007108439254888e-05, "loss": 5.2771, "num_input_tokens_seen": 539099136, "step": 4113 }, { "epoch": 0.7038153254248156, "grad_norm": 0.9661571383476257, "learning_rate": 8.004189864913625e-05, "loss": 5.34, "num_input_tokens_seen": 539492352, "step": 4116 }, { "epoch": 0.7043283103558833, "grad_norm": 0.7898194789886475, "learning_rate": 8.001274479690375e-05, "loss": 5.3278, "num_input_tokens_seen": 539885568, "step": 4119 }, { "epoch": 0.7048412952869509, "grad_norm": 0.7659739851951599, "learning_rate": 7.998362277781467e-05, "loss": 5.3226, "num_input_tokens_seen": 540278784, "step": 4122 }, { "epoch": 0.7053542802180186, "grad_norm": 0.889785885810852, "learning_rate": 7.995453253398004e-05, "loss": 5.2994, "num_input_tokens_seen": 540672000, "step": 4125 }, { "epoch": 0.7058672651490863, "grad_norm": 0.7162066698074341, "learning_rate": 7.992547400765813e-05, "loss": 5.3294, "num_input_tokens_seen": 541065216, "step": 4128 }, { "epoch": 0.7063802500801539, "grad_norm": 0.6695894002914429, "learning_rate": 7.989644714125407e-05, "loss": 5.296, "num_input_tokens_seen": 541458432, "step": 4131 }, { "epoch": 0.7068932350112216, "grad_norm": 0.7473436594009399, "learning_rate": 7.986745187731925e-05, "loss": 5.3078, "num_input_tokens_seen": 541851648, "step": 4134 }, { "epoch": 0.7074062199422892, "grad_norm": 0.7301039099693298, "learning_rate": 7.983848815855091e-05, "loss": 5.3322, "num_input_tokens_seen": 542244864, "step": 4137 }, { "epoch": 0.7079192048733568, "grad_norm": 0.7744476795196533, "learning_rate": 7.980955592779166e-05, "loss": 5.3224, "num_input_tokens_seen": 542638080, "step": 4140 }, { "epoch": 0.7084321898044245, "grad_norm": 0.7293386459350586, "learning_rate": 7.978065512802896e-05, "loss": 5.3634, "num_input_tokens_seen": 543031296, "step": 4143 }, { "epoch": 0.7089451747354921, "grad_norm": 0.6727131009101868, "learning_rate": 7.975178570239474e-05, "loss": 5.3217, "num_input_tokens_seen": 543424512, "step": 4146 }, { "epoch": 0.7094581596665598, "grad_norm": 0.7421523928642273, "learning_rate": 7.972294759416482e-05, "loss": 5.3374, "num_input_tokens_seen": 543817728, "step": 4149 }, { "epoch": 0.7099711445976274, "grad_norm": 0.6859121918678284, "learning_rate": 7.969414074675855e-05, "loss": 5.3435, "num_input_tokens_seen": 544210944, "step": 4152 }, { "epoch": 0.7104841295286951, "grad_norm": 0.7437335848808289, "learning_rate": 7.966536510373822e-05, "loss": 5.3121, "num_input_tokens_seen": 544604160, "step": 4155 }, { "epoch": 0.7109971144597628, "grad_norm": 
0.7755163311958313, "learning_rate": 7.963662060880875e-05, "loss": 5.3148, "num_input_tokens_seen": 544997376, "step": 4158 }, { "epoch": 0.7115100993908304, "grad_norm": 0.7485529780387878, "learning_rate": 7.960790720581703e-05, "loss": 5.2968, "num_input_tokens_seen": 545390592, "step": 4161 }, { "epoch": 0.7120230843218981, "grad_norm": 0.7412263751029968, "learning_rate": 7.957922483875172e-05, "loss": 5.3184, "num_input_tokens_seen": 545783808, "step": 4164 }, { "epoch": 0.7125360692529656, "grad_norm": 0.7376791834831238, "learning_rate": 7.95505734517425e-05, "loss": 5.3002, "num_input_tokens_seen": 546177024, "step": 4167 }, { "epoch": 0.7130490541840333, "grad_norm": 0.7660586833953857, "learning_rate": 7.952195298905983e-05, "loss": 5.3282, "num_input_tokens_seen": 546570240, "step": 4170 }, { "epoch": 0.713562039115101, "grad_norm": 0.729190468788147, "learning_rate": 7.949336339511443e-05, "loss": 5.3018, "num_input_tokens_seen": 546963456, "step": 4173 }, { "epoch": 0.7140750240461686, "grad_norm": 0.7029862403869629, "learning_rate": 7.946480461445679e-05, "loss": 5.2991, "num_input_tokens_seen": 547356672, "step": 4176 }, { "epoch": 0.7145880089772363, "grad_norm": 0.8115814924240112, "learning_rate": 7.943627659177671e-05, "loss": 5.2578, "num_input_tokens_seen": 547749888, "step": 4179 }, { "epoch": 0.715100993908304, "grad_norm": 0.9112886190414429, "learning_rate": 7.940777927190298e-05, "loss": 5.3422, "num_input_tokens_seen": 548143104, "step": 4182 }, { "epoch": 0.7156139788393716, "grad_norm": 0.7369971871376038, "learning_rate": 7.937931259980275e-05, "loss": 5.3049, "num_input_tokens_seen": 548536320, "step": 4185 }, { "epoch": 0.7161269637704393, "grad_norm": 0.7676149606704712, "learning_rate": 7.935087652058122e-05, "loss": 5.3626, "num_input_tokens_seen": 548929536, "step": 4188 }, { "epoch": 0.7166399487015069, "grad_norm": 0.6889289617538452, "learning_rate": 7.932247097948111e-05, "loss": 5.2493, "num_input_tokens_seen": 549322752, "step": 4191 }, { "epoch": 0.7171529336325746, "grad_norm": 0.7226115465164185, "learning_rate": 7.929409592188228e-05, "loss": 5.3026, "num_input_tokens_seen": 549715968, "step": 4194 }, { "epoch": 0.7176659185636421, "grad_norm": 0.7485541701316833, "learning_rate": 7.926575129330127e-05, "loss": 5.3454, "num_input_tokens_seen": 550109184, "step": 4197 }, { "epoch": 0.7181789034947098, "grad_norm": 0.7012051343917847, "learning_rate": 7.923743703939083e-05, "loss": 5.3409, "num_input_tokens_seen": 550502400, "step": 4200 }, { "epoch": 0.7186918884257775, "grad_norm": 0.7870872616767883, "learning_rate": 7.920915310593953e-05, "loss": 5.2632, "num_input_tokens_seen": 550895616, "step": 4203 }, { "epoch": 0.7192048733568451, "grad_norm": 0.7597965598106384, "learning_rate": 7.918089943887127e-05, "loss": 5.3635, "num_input_tokens_seen": 551288832, "step": 4206 }, { "epoch": 0.7197178582879128, "grad_norm": 0.7153079509735107, "learning_rate": 7.915267598424488e-05, "loss": 5.2955, "num_input_tokens_seen": 551682048, "step": 4209 }, { "epoch": 0.7202308432189805, "grad_norm": 0.713191568851471, "learning_rate": 7.91244826882537e-05, "loss": 5.3174, "num_input_tokens_seen": 552075264, "step": 4212 }, { "epoch": 0.7207438281500481, "grad_norm": 0.774713397026062, "learning_rate": 7.909631949722512e-05, "loss": 5.3312, "num_input_tokens_seen": 552468480, "step": 4215 }, { "epoch": 0.7212568130811158, "grad_norm": 0.6564570665359497, "learning_rate": 7.90681863576202e-05, "loss": 5.2906, "num_input_tokens_seen": 552861696, "step": 
4218 }, { "epoch": 0.7217697980121834, "grad_norm": 0.7904906868934631, "learning_rate": 7.904008321603313e-05, "loss": 5.282, "num_input_tokens_seen": 553254912, "step": 4221 }, { "epoch": 0.722282782943251, "grad_norm": 0.7598406672477722, "learning_rate": 7.90120100191909e-05, "loss": 5.3137, "num_input_tokens_seen": 553648128, "step": 4224 }, { "epoch": 0.7227957678743187, "grad_norm": 0.7061671018600464, "learning_rate": 7.898396671395296e-05, "loss": 5.3025, "num_input_tokens_seen": 554041344, "step": 4227 }, { "epoch": 0.7233087528053863, "grad_norm": 0.7577418684959412, "learning_rate": 7.895595324731055e-05, "loss": 5.2867, "num_input_tokens_seen": 554434560, "step": 4230 }, { "epoch": 0.723821737736454, "grad_norm": 0.8547803163528442, "learning_rate": 7.892796956638649e-05, "loss": 5.3218, "num_input_tokens_seen": 554827776, "step": 4233 }, { "epoch": 0.7243347226675216, "grad_norm": 0.6954768300056458, "learning_rate": 7.890001561843465e-05, "loss": 5.2734, "num_input_tokens_seen": 555220992, "step": 4236 }, { "epoch": 0.7248477075985893, "grad_norm": 0.6641873121261597, "learning_rate": 7.88720913508397e-05, "loss": 5.3519, "num_input_tokens_seen": 555614208, "step": 4239 }, { "epoch": 0.725360692529657, "grad_norm": 0.7015215754508972, "learning_rate": 7.88441967111164e-05, "loss": 5.3236, "num_input_tokens_seen": 556007424, "step": 4242 }, { "epoch": 0.7258736774607246, "grad_norm": 0.8150922656059265, "learning_rate": 7.881633164690944e-05, "loss": 5.3056, "num_input_tokens_seen": 556400640, "step": 4245 }, { "epoch": 0.7263866623917923, "grad_norm": 0.7670294642448425, "learning_rate": 7.878849610599295e-05, "loss": 5.341, "num_input_tokens_seen": 556793856, "step": 4248 }, { "epoch": 0.7268996473228599, "grad_norm": 0.6963903307914734, "learning_rate": 7.876069003627009e-05, "loss": 5.3405, "num_input_tokens_seen": 557187072, "step": 4251 }, { "epoch": 0.7274126322539275, "grad_norm": 0.7592700719833374, "learning_rate": 7.873291338577257e-05, "loss": 5.3088, "num_input_tokens_seen": 557580288, "step": 4254 }, { "epoch": 0.7279256171849952, "grad_norm": 0.7837212085723877, "learning_rate": 7.870516610266037e-05, "loss": 5.3128, "num_input_tokens_seen": 557973504, "step": 4257 }, { "epoch": 0.7284386021160628, "grad_norm": 0.6755237579345703, "learning_rate": 7.86774481352212e-05, "loss": 5.321, "num_input_tokens_seen": 558366720, "step": 4260 }, { "epoch": 0.7289515870471305, "grad_norm": 0.7284892201423645, "learning_rate": 7.864975943187024e-05, "loss": 5.2798, "num_input_tokens_seen": 558759936, "step": 4263 }, { "epoch": 0.7294645719781981, "grad_norm": 0.7344948053359985, "learning_rate": 7.862209994114962e-05, "loss": 5.2867, "num_input_tokens_seen": 559153152, "step": 4266 }, { "epoch": 0.7299775569092658, "grad_norm": 0.7037729024887085, "learning_rate": 7.859446961172803e-05, "loss": 5.2836, "num_input_tokens_seen": 559546368, "step": 4269 }, { "epoch": 0.7304905418403335, "grad_norm": 0.7983783483505249, "learning_rate": 7.856686839240038e-05, "loss": 5.3308, "num_input_tokens_seen": 559939584, "step": 4272 }, { "epoch": 0.7310035267714011, "grad_norm": 0.7463051080703735, "learning_rate": 7.853929623208739e-05, "loss": 5.3328, "num_input_tokens_seen": 560332800, "step": 4275 }, { "epoch": 0.7315165117024688, "grad_norm": 0.7885538339614868, "learning_rate": 7.851175307983515e-05, "loss": 5.3424, "num_input_tokens_seen": 560726016, "step": 4278 }, { "epoch": 0.7320294966335363, "grad_norm": 0.810090959072113, "learning_rate": 7.84842388848147e-05, "loss": 
5.2868, "num_input_tokens_seen": 561119232, "step": 4281 }, { "epoch": 0.732542481564604, "grad_norm": 0.7771869897842407, "learning_rate": 7.845675359632176e-05, "loss": 5.2846, "num_input_tokens_seen": 561512448, "step": 4284 }, { "epoch": 0.7330554664956717, "grad_norm": 0.7376582026481628, "learning_rate": 7.842929716377623e-05, "loss": 5.2987, "num_input_tokens_seen": 561905664, "step": 4287 }, { "epoch": 0.7335684514267393, "grad_norm": 0.6871978640556335, "learning_rate": 7.84018695367218e-05, "loss": 5.236, "num_input_tokens_seen": 562298880, "step": 4290 }, { "epoch": 0.734081436357807, "grad_norm": 0.8098225593566895, "learning_rate": 7.837447066482563e-05, "loss": 5.3488, "num_input_tokens_seen": 562692096, "step": 4293 }, { "epoch": 0.7345944212888746, "grad_norm": 0.8568369150161743, "learning_rate": 7.834710049787791e-05, "loss": 5.3218, "num_input_tokens_seen": 563085312, "step": 4296 }, { "epoch": 0.7351074062199423, "grad_norm": 0.7187222242355347, "learning_rate": 7.831975898579147e-05, "loss": 5.3066, "num_input_tokens_seen": 563478528, "step": 4299 }, { "epoch": 0.73562039115101, "grad_norm": 0.7189285755157471, "learning_rate": 7.829244607860141e-05, "loss": 5.2294, "num_input_tokens_seen": 563871744, "step": 4302 }, { "epoch": 0.7361333760820776, "grad_norm": 1.026648759841919, "learning_rate": 7.826516172646476e-05, "loss": 5.2955, "num_input_tokens_seen": 564264960, "step": 4305 }, { "epoch": 0.7366463610131453, "grad_norm": 0.7149240374565125, "learning_rate": 7.823790587966001e-05, "loss": 5.3512, "num_input_tokens_seen": 564658176, "step": 4308 }, { "epoch": 0.7371593459442128, "grad_norm": 0.7869164943695068, "learning_rate": 7.821067848858679e-05, "loss": 5.2569, "num_input_tokens_seen": 565051392, "step": 4311 }, { "epoch": 0.7376723308752805, "grad_norm": 0.882475733757019, "learning_rate": 7.818347950376548e-05, "loss": 5.3148, "num_input_tokens_seen": 565444608, "step": 4314 }, { "epoch": 0.7381853158063482, "grad_norm": 0.818747878074646, "learning_rate": 7.815630887583679e-05, "loss": 5.3013, "num_input_tokens_seen": 565837824, "step": 4317 }, { "epoch": 0.7386983007374158, "grad_norm": 0.854594886302948, "learning_rate": 7.812916655556147e-05, "loss": 5.3028, "num_input_tokens_seen": 566231040, "step": 4320 }, { "epoch": 0.7392112856684835, "grad_norm": 1.0109381675720215, "learning_rate": 7.810205249381987e-05, "loss": 5.3257, "num_input_tokens_seen": 566624256, "step": 4323 }, { "epoch": 0.7397242705995511, "grad_norm": 0.8108871579170227, "learning_rate": 7.80749666416116e-05, "loss": 5.2564, "num_input_tokens_seen": 567017472, "step": 4326 }, { "epoch": 0.7402372555306188, "grad_norm": 0.9545855522155762, "learning_rate": 7.80479089500551e-05, "loss": 5.3024, "num_input_tokens_seen": 567410688, "step": 4329 }, { "epoch": 0.7407502404616865, "grad_norm": 0.7847074866294861, "learning_rate": 7.802087937038731e-05, "loss": 5.2754, "num_input_tokens_seen": 567803904, "step": 4332 }, { "epoch": 0.7412632253927541, "grad_norm": 0.7441365122795105, "learning_rate": 7.799387785396339e-05, "loss": 5.2589, "num_input_tokens_seen": 568197120, "step": 4335 }, { "epoch": 0.7417762103238217, "grad_norm": 0.7231647968292236, "learning_rate": 7.796690435225613e-05, "loss": 5.3153, "num_input_tokens_seen": 568590336, "step": 4338 }, { "epoch": 0.7422891952548893, "grad_norm": 0.7042517066001892, "learning_rate": 7.793995881685584e-05, "loss": 5.2829, "num_input_tokens_seen": 568983552, "step": 4341 }, { "epoch": 0.742802180185957, "grad_norm": 0.6561905741691589, 
"learning_rate": 7.791304119946978e-05, "loss": 5.2513, "num_input_tokens_seen": 569376768, "step": 4344 }, { "epoch": 0.7433151651170247, "grad_norm": 0.7863454818725586, "learning_rate": 7.788615145192192e-05, "loss": 5.2902, "num_input_tokens_seen": 569769984, "step": 4347 }, { "epoch": 0.7438281500480923, "grad_norm": 0.8196501135826111, "learning_rate": 7.785928952615248e-05, "loss": 5.2557, "num_input_tokens_seen": 570163200, "step": 4350 }, { "epoch": 0.74434113497916, "grad_norm": 0.7091982960700989, "learning_rate": 7.783245537421777e-05, "loss": 5.2603, "num_input_tokens_seen": 570556416, "step": 4353 }, { "epoch": 0.7448541199102277, "grad_norm": 0.6906124353408813, "learning_rate": 7.780564894828949e-05, "loss": 5.319, "num_input_tokens_seen": 570949632, "step": 4356 }, { "epoch": 0.7453671048412953, "grad_norm": 0.9479339718818665, "learning_rate": 7.777887020065473e-05, "loss": 5.3323, "num_input_tokens_seen": 571342848, "step": 4359 }, { "epoch": 0.745880089772363, "grad_norm": 0.6457544565200806, "learning_rate": 7.775211908371534e-05, "loss": 5.2699, "num_input_tokens_seen": 571736064, "step": 4362 }, { "epoch": 0.7463930747034306, "grad_norm": 0.8440648913383484, "learning_rate": 7.772539554998778e-05, "loss": 5.2753, "num_input_tokens_seen": 572129280, "step": 4365 }, { "epoch": 0.7469060596344982, "grad_norm": 0.8873343467712402, "learning_rate": 7.76986995521026e-05, "loss": 5.2944, "num_input_tokens_seen": 572522496, "step": 4368 }, { "epoch": 0.7474190445655658, "grad_norm": 0.7578151822090149, "learning_rate": 7.767203104280422e-05, "loss": 5.2523, "num_input_tokens_seen": 572915712, "step": 4371 }, { "epoch": 0.7479320294966335, "grad_norm": 0.7755122780799866, "learning_rate": 7.764538997495046e-05, "loss": 5.3015, "num_input_tokens_seen": 573308928, "step": 4374 }, { "epoch": 0.7484450144277012, "grad_norm": 0.8372567296028137, "learning_rate": 7.761877630151229e-05, "loss": 5.2548, "num_input_tokens_seen": 573702144, "step": 4377 }, { "epoch": 0.7489579993587688, "grad_norm": 0.8696802854537964, "learning_rate": 7.759218997557344e-05, "loss": 5.308, "num_input_tokens_seen": 574095360, "step": 4380 }, { "epoch": 0.7494709842898365, "grad_norm": 0.6701018810272217, "learning_rate": 7.756563095033e-05, "loss": 5.2979, "num_input_tokens_seen": 574488576, "step": 4383 }, { "epoch": 0.7499839692209042, "grad_norm": 0.7554642558097839, "learning_rate": 7.75390991790902e-05, "loss": 5.3501, "num_input_tokens_seen": 574881792, "step": 4386 }, { "epoch": 0.7504969541519718, "grad_norm": 0.7375338077545166, "learning_rate": 7.751259461527394e-05, "loss": 5.2445, "num_input_tokens_seen": 575275008, "step": 4389 }, { "epoch": 0.7510099390830395, "grad_norm": 0.6931892037391663, "learning_rate": 7.748611721241256e-05, "loss": 5.2891, "num_input_tokens_seen": 575668224, "step": 4392 }, { "epoch": 0.7515229240141071, "grad_norm": 0.7114129066467285, "learning_rate": 7.745966692414832e-05, "loss": 5.2936, "num_input_tokens_seen": 576061440, "step": 4395 }, { "epoch": 0.7520359089451747, "grad_norm": 0.7084842324256897, "learning_rate": 7.743324370423433e-05, "loss": 5.3014, "num_input_tokens_seen": 576454656, "step": 4398 }, { "epoch": 0.7523778988992198, "eval_accuracy": 0.18137436899527765, "eval_loss": 5.756495952606201, "eval_runtime": 110.1663, "eval_samples_per_second": 2.723, "eval_steps_per_second": 1.362, "num_input_tokens_seen": 576716800, "step": 4400 }, { "epoch": 0.7525488938762424, "grad_norm": 0.8616915941238403, "learning_rate": 7.74068475065339e-05, "loss": 
5.2833, "num_input_tokens_seen": 576847872, "step": 4401 }, { "epoch": 0.75306187880731, "grad_norm": 0.7370516061782837, "learning_rate": 7.738047828502048e-05, "loss": 5.2773, "num_input_tokens_seen": 577241088, "step": 4404 }, { "epoch": 0.7535748637383777, "grad_norm": 0.8121903538703918, "learning_rate": 7.735413599377714e-05, "loss": 5.2661, "num_input_tokens_seen": 577634304, "step": 4407 }, { "epoch": 0.7540878486694453, "grad_norm": 0.775635302066803, "learning_rate": 7.732782058699632e-05, "loss": 5.2528, "num_input_tokens_seen": 578027520, "step": 4410 }, { "epoch": 0.754600833600513, "grad_norm": 0.6981225609779358, "learning_rate": 7.730153201897945e-05, "loss": 5.3143, "num_input_tokens_seen": 578420736, "step": 4413 }, { "epoch": 0.7551138185315807, "grad_norm": 0.6825100779533386, "learning_rate": 7.727527024413663e-05, "loss": 5.2626, "num_input_tokens_seen": 578813952, "step": 4416 }, { "epoch": 0.7556268034626483, "grad_norm": 0.7106081247329712, "learning_rate": 7.724903521698631e-05, "loss": 5.272, "num_input_tokens_seen": 579207168, "step": 4419 }, { "epoch": 0.756139788393716, "grad_norm": 0.7914432287216187, "learning_rate": 7.722282689215501e-05, "loss": 5.2469, "num_input_tokens_seen": 579600384, "step": 4422 }, { "epoch": 0.7566527733247835, "grad_norm": 0.8186551928520203, "learning_rate": 7.719664522437684e-05, "loss": 5.3293, "num_input_tokens_seen": 579993600, "step": 4425 }, { "epoch": 0.7571657582558512, "grad_norm": 0.7702829837799072, "learning_rate": 7.717049016849333e-05, "loss": 5.2875, "num_input_tokens_seen": 580386816, "step": 4428 }, { "epoch": 0.7576787431869189, "grad_norm": 0.7700650095939636, "learning_rate": 7.714436167945303e-05, "loss": 5.2823, "num_input_tokens_seen": 580780032, "step": 4431 }, { "epoch": 0.7581917281179865, "grad_norm": 0.8458641767501831, "learning_rate": 7.71182597123112e-05, "loss": 5.2691, "num_input_tokens_seen": 581173248, "step": 4434 }, { "epoch": 0.7587047130490542, "grad_norm": 0.6728209853172302, "learning_rate": 7.709218422222942e-05, "loss": 5.2768, "num_input_tokens_seen": 581566464, "step": 4437 }, { "epoch": 0.7592176979801218, "grad_norm": 0.7042077779769897, "learning_rate": 7.706613516447538e-05, "loss": 5.2504, "num_input_tokens_seen": 581959680, "step": 4440 }, { "epoch": 0.7597306829111895, "grad_norm": 0.7626017332077026, "learning_rate": 7.704011249442249e-05, "loss": 5.3261, "num_input_tokens_seen": 582352896, "step": 4443 }, { "epoch": 0.7602436678422572, "grad_norm": 0.7157188057899475, "learning_rate": 7.70141161675496e-05, "loss": 5.2466, "num_input_tokens_seen": 582746112, "step": 4446 }, { "epoch": 0.7607566527733248, "grad_norm": 0.6773563623428345, "learning_rate": 7.69881461394406e-05, "loss": 5.2757, "num_input_tokens_seen": 583139328, "step": 4449 }, { "epoch": 0.7612696377043925, "grad_norm": 0.7485852241516113, "learning_rate": 7.696220236578416e-05, "loss": 5.3204, "num_input_tokens_seen": 583532544, "step": 4452 }, { "epoch": 0.76178262263546, "grad_norm": 0.7189561128616333, "learning_rate": 7.693628480237344e-05, "loss": 5.2909, "num_input_tokens_seen": 583925760, "step": 4455 }, { "epoch": 0.7622956075665277, "grad_norm": 0.809196412563324, "learning_rate": 7.691039340510571e-05, "loss": 5.313, "num_input_tokens_seen": 584318976, "step": 4458 }, { "epoch": 0.7628085924975954, "grad_norm": 0.8321667909622192, "learning_rate": 7.688452812998208e-05, "loss": 5.3164, "num_input_tokens_seen": 584712192, "step": 4461 }, { "epoch": 0.763321577428663, "grad_norm": 0.6554286479949951, 
"learning_rate": 7.685868893310715e-05, "loss": 5.2475, "num_input_tokens_seen": 585105408, "step": 4464 }, { "epoch": 0.7638345623597307, "grad_norm": 0.7397018671035767, "learning_rate": 7.683287577068874e-05, "loss": 5.2725, "num_input_tokens_seen": 585498624, "step": 4467 }, { "epoch": 0.7643475472907983, "grad_norm": 0.6564092636108398, "learning_rate": 7.680708859903753e-05, "loss": 5.2621, "num_input_tokens_seen": 585891840, "step": 4470 }, { "epoch": 0.764860532221866, "grad_norm": 0.7277629971504211, "learning_rate": 7.678132737456681e-05, "loss": 5.2376, "num_input_tokens_seen": 586285056, "step": 4473 }, { "epoch": 0.7653735171529337, "grad_norm": 0.7863820195198059, "learning_rate": 7.675559205379208e-05, "loss": 5.2615, "num_input_tokens_seen": 586678272, "step": 4476 }, { "epoch": 0.7658865020840013, "grad_norm": 0.760823130607605, "learning_rate": 7.672988259333085e-05, "loss": 5.2611, "num_input_tokens_seen": 587071488, "step": 4479 }, { "epoch": 0.7663994870150689, "grad_norm": 0.8245643377304077, "learning_rate": 7.670419894990224e-05, "loss": 5.234, "num_input_tokens_seen": 587464704, "step": 4482 }, { "epoch": 0.7669124719461365, "grad_norm": 0.8459969162940979, "learning_rate": 7.667854108032676e-05, "loss": 5.3133, "num_input_tokens_seen": 587857920, "step": 4485 }, { "epoch": 0.7674254568772042, "grad_norm": 0.7487183809280396, "learning_rate": 7.665290894152588e-05, "loss": 5.2422, "num_input_tokens_seen": 588251136, "step": 4488 }, { "epoch": 0.7679384418082719, "grad_norm": 0.6825780868530273, "learning_rate": 7.662730249052193e-05, "loss": 5.2759, "num_input_tokens_seen": 588644352, "step": 4491 }, { "epoch": 0.7684514267393395, "grad_norm": 0.7450793385505676, "learning_rate": 7.660172168443752e-05, "loss": 5.3166, "num_input_tokens_seen": 589037568, "step": 4494 }, { "epoch": 0.7689644116704072, "grad_norm": 0.7206063270568848, "learning_rate": 7.657616648049552e-05, "loss": 5.2758, "num_input_tokens_seen": 589430784, "step": 4497 }, { "epoch": 0.7694773966014749, "grad_norm": 0.7094364166259766, "learning_rate": 7.655063683601855e-05, "loss": 5.2929, "num_input_tokens_seen": 589824000, "step": 4500 }, { "epoch": 0.7699903815325425, "grad_norm": 0.6987499594688416, "learning_rate": 7.652513270842879e-05, "loss": 5.3014, "num_input_tokens_seen": 590217216, "step": 4503 }, { "epoch": 0.7705033664636102, "grad_norm": 0.699375331401825, "learning_rate": 7.649965405524765e-05, "loss": 5.2702, "num_input_tokens_seen": 590610432, "step": 4506 }, { "epoch": 0.7710163513946778, "grad_norm": 0.8014332056045532, "learning_rate": 7.647420083409549e-05, "loss": 5.1938, "num_input_tokens_seen": 591003648, "step": 4509 }, { "epoch": 0.7715293363257454, "grad_norm": 0.7827511429786682, "learning_rate": 7.64487730026913e-05, "loss": 5.2683, "num_input_tokens_seen": 591396864, "step": 4512 }, { "epoch": 0.772042321256813, "grad_norm": 0.7832793593406677, "learning_rate": 7.642337051885237e-05, "loss": 5.2913, "num_input_tokens_seen": 591790080, "step": 4515 }, { "epoch": 0.7725553061878807, "grad_norm": 0.7209380269050598, "learning_rate": 7.639799334049411e-05, "loss": 5.3039, "num_input_tokens_seen": 592183296, "step": 4518 }, { "epoch": 0.7730682911189484, "grad_norm": 0.7766376733779907, "learning_rate": 7.637264142562964e-05, "loss": 5.34, "num_input_tokens_seen": 592576512, "step": 4521 }, { "epoch": 0.773581276050016, "grad_norm": 0.7014556527137756, "learning_rate": 7.634731473236961e-05, "loss": 5.2624, "num_input_tokens_seen": 592969728, "step": 4524 }, { "epoch": 
0.7740942609810837, "grad_norm": 0.726774275302887, "learning_rate": 7.632201321892173e-05, "loss": 5.2637, "num_input_tokens_seen": 593362944, "step": 4527 }, { "epoch": 0.7746072459121514, "grad_norm": 0.7574884295463562, "learning_rate": 7.62967368435907e-05, "loss": 5.2565, "num_input_tokens_seen": 593756160, "step": 4530 }, { "epoch": 0.775120230843219, "grad_norm": 0.6125289797782898, "learning_rate": 7.627148556477777e-05, "loss": 5.2344, "num_input_tokens_seen": 594149376, "step": 4533 }, { "epoch": 0.7756332157742867, "grad_norm": 0.6760141253471375, "learning_rate": 7.624625934098054e-05, "loss": 5.2637, "num_input_tokens_seen": 594542592, "step": 4536 }, { "epoch": 0.7761462007053542, "grad_norm": 0.7754603624343872, "learning_rate": 7.622105813079257e-05, "loss": 5.3232, "num_input_tokens_seen": 594935808, "step": 4539 }, { "epoch": 0.7766591856364219, "grad_norm": 0.7657566070556641, "learning_rate": 7.619588189290318e-05, "loss": 5.284, "num_input_tokens_seen": 595329024, "step": 4542 }, { "epoch": 0.7771721705674896, "grad_norm": 0.682299017906189, "learning_rate": 7.617073058609718e-05, "loss": 5.295, "num_input_tokens_seen": 595722240, "step": 4545 }, { "epoch": 0.7776851554985572, "grad_norm": 0.7046672701835632, "learning_rate": 7.614560416925451e-05, "loss": 5.271, "num_input_tokens_seen": 596115456, "step": 4548 }, { "epoch": 0.7781981404296249, "grad_norm": 0.6913005709648132, "learning_rate": 7.612050260135002e-05, "loss": 5.2684, "num_input_tokens_seen": 596508672, "step": 4551 }, { "epoch": 0.7787111253606925, "grad_norm": 0.7652671933174133, "learning_rate": 7.609542584145313e-05, "loss": 5.2268, "num_input_tokens_seen": 596901888, "step": 4554 }, { "epoch": 0.7792241102917602, "grad_norm": 0.6569912433624268, "learning_rate": 7.607037384872765e-05, "loss": 5.3011, "num_input_tokens_seen": 597295104, "step": 4557 }, { "epoch": 0.7797370952228279, "grad_norm": 0.6658585667610168, "learning_rate": 7.604534658243135e-05, "loss": 5.296, "num_input_tokens_seen": 597688320, "step": 4560 }, { "epoch": 0.7802500801538955, "grad_norm": 0.8153283596038818, "learning_rate": 7.602034400191585e-05, "loss": 5.2633, "num_input_tokens_seen": 598081536, "step": 4563 }, { "epoch": 0.7807630650849632, "grad_norm": 0.7321149110794067, "learning_rate": 7.599536606662622e-05, "loss": 5.2841, "num_input_tokens_seen": 598474752, "step": 4566 }, { "epoch": 0.7812760500160307, "grad_norm": 0.9558836221694946, "learning_rate": 7.597041273610076e-05, "loss": 5.2623, "num_input_tokens_seen": 598867968, "step": 4569 }, { "epoch": 0.7817890349470984, "grad_norm": 0.7963069081306458, "learning_rate": 7.594548396997066e-05, "loss": 5.3027, "num_input_tokens_seen": 599261184, "step": 4572 }, { "epoch": 0.782302019878166, "grad_norm": 0.7501682043075562, "learning_rate": 7.592057972795984e-05, "loss": 5.2485, "num_input_tokens_seen": 599654400, "step": 4575 }, { "epoch": 0.7828150048092337, "grad_norm": 0.8756089806556702, "learning_rate": 7.58956999698846e-05, "loss": 5.2901, "num_input_tokens_seen": 600047616, "step": 4578 }, { "epoch": 0.7833279897403014, "grad_norm": 0.7717293500900269, "learning_rate": 7.587084465565331e-05, "loss": 5.3035, "num_input_tokens_seen": 600440832, "step": 4581 }, { "epoch": 0.783840974671369, "grad_norm": 0.874310314655304, "learning_rate": 7.584601374526627e-05, "loss": 5.2816, "num_input_tokens_seen": 600834048, "step": 4584 }, { "epoch": 0.7843539596024367, "grad_norm": 0.7195756435394287, "learning_rate": 7.582120719881527e-05, "loss": 5.2297, 
"num_input_tokens_seen": 601227264, "step": 4587 }, { "epoch": 0.7848669445335044, "grad_norm": 0.8340673446655273, "learning_rate": 7.579642497648347e-05, "loss": 5.263, "num_input_tokens_seen": 601620480, "step": 4590 }, { "epoch": 0.785379929464572, "grad_norm": 0.6328992247581482, "learning_rate": 7.577166703854501e-05, "loss": 5.3334, "num_input_tokens_seen": 602013696, "step": 4593 }, { "epoch": 0.7858929143956396, "grad_norm": 0.7836408615112305, "learning_rate": 7.574693334536489e-05, "loss": 5.2894, "num_input_tokens_seen": 602406912, "step": 4596 }, { "epoch": 0.7864058993267072, "grad_norm": 0.7437619566917419, "learning_rate": 7.572222385739856e-05, "loss": 5.2906, "num_input_tokens_seen": 602800128, "step": 4599 }, { "epoch": 0.7869188842577749, "grad_norm": 0.665663480758667, "learning_rate": 7.569753853519169e-05, "loss": 5.3065, "num_input_tokens_seen": 603193344, "step": 4602 }, { "epoch": 0.7874318691888426, "grad_norm": 0.7104794383049011, "learning_rate": 7.567287733937997e-05, "loss": 5.2908, "num_input_tokens_seen": 603586560, "step": 4605 }, { "epoch": 0.7879448541199102, "grad_norm": 0.666928231716156, "learning_rate": 7.564824023068877e-05, "loss": 5.3252, "num_input_tokens_seen": 603979776, "step": 4608 }, { "epoch": 0.7884578390509779, "grad_norm": 0.7155689597129822, "learning_rate": 7.562362716993294e-05, "loss": 5.2782, "num_input_tokens_seen": 604372992, "step": 4611 }, { "epoch": 0.7889708239820455, "grad_norm": 0.7343229055404663, "learning_rate": 7.559903811801648e-05, "loss": 5.2521, "num_input_tokens_seen": 604766208, "step": 4614 }, { "epoch": 0.7894838089131132, "grad_norm": 0.7456091046333313, "learning_rate": 7.557447303593237e-05, "loss": 5.311, "num_input_tokens_seen": 605159424, "step": 4617 }, { "epoch": 0.7899967938441809, "grad_norm": 0.7218461036682129, "learning_rate": 7.55499318847622e-05, "loss": 5.2997, "num_input_tokens_seen": 605552640, "step": 4620 }, { "epoch": 0.7905097787752485, "grad_norm": 0.783655047416687, "learning_rate": 7.552541462567598e-05, "loss": 5.2334, "num_input_tokens_seen": 605945856, "step": 4623 }, { "epoch": 0.7910227637063161, "grad_norm": 0.7241744995117188, "learning_rate": 7.550092121993191e-05, "loss": 5.2428, "num_input_tokens_seen": 606339072, "step": 4626 }, { "epoch": 0.7915357486373837, "grad_norm": 0.77818363904953, "learning_rate": 7.547645162887604e-05, "loss": 5.2869, "num_input_tokens_seen": 606732288, "step": 4629 }, { "epoch": 0.7920487335684514, "grad_norm": 0.8874224424362183, "learning_rate": 7.545200581394207e-05, "loss": 5.253, "num_input_tokens_seen": 607125504, "step": 4632 }, { "epoch": 0.7925617184995191, "grad_norm": 0.8084154725074768, "learning_rate": 7.542758373665109e-05, "loss": 5.283, "num_input_tokens_seen": 607518720, "step": 4635 }, { "epoch": 0.7930747034305867, "grad_norm": 0.6735661625862122, "learning_rate": 7.540318535861131e-05, "loss": 5.2641, "num_input_tokens_seen": 607911936, "step": 4638 }, { "epoch": 0.7935876883616544, "grad_norm": 0.7534482479095459, "learning_rate": 7.537881064151782e-05, "loss": 5.2805, "num_input_tokens_seen": 608305152, "step": 4641 }, { "epoch": 0.794100673292722, "grad_norm": 0.7664649486541748, "learning_rate": 7.535445954715228e-05, "loss": 5.2183, "num_input_tokens_seen": 608698368, "step": 4644 }, { "epoch": 0.7946136582237897, "grad_norm": 0.8066388964653015, "learning_rate": 7.53301320373828e-05, "loss": 5.3102, "num_input_tokens_seen": 609091584, "step": 4647 }, { "epoch": 0.7951266431548574, "grad_norm": 0.733974277973175, 
"learning_rate": 7.530582807416357e-05, "loss": 5.2926, "num_input_tokens_seen": 609484800, "step": 4650 }, { "epoch": 0.7956396280859249, "grad_norm": 0.7289937138557434, "learning_rate": 7.528154761953464e-05, "loss": 5.276, "num_input_tokens_seen": 609878016, "step": 4653 }, { "epoch": 0.7961526130169926, "grad_norm": 0.7927041053771973, "learning_rate": 7.52572906356217e-05, "loss": 5.2783, "num_input_tokens_seen": 610271232, "step": 4656 }, { "epoch": 0.7966655979480602, "grad_norm": 0.718377411365509, "learning_rate": 7.523305708463577e-05, "loss": 5.2949, "num_input_tokens_seen": 610664448, "step": 4659 }, { "epoch": 0.7971785828791279, "grad_norm": 0.7721161246299744, "learning_rate": 7.520884692887304e-05, "loss": 5.3164, "num_input_tokens_seen": 611057664, "step": 4662 }, { "epoch": 0.7976915678101956, "grad_norm": 0.6871947050094604, "learning_rate": 7.518466013071455e-05, "loss": 5.2669, "num_input_tokens_seen": 611450880, "step": 4665 }, { "epoch": 0.7982045527412632, "grad_norm": 0.6703811883926392, "learning_rate": 7.516049665262601e-05, "loss": 5.2861, "num_input_tokens_seen": 611844096, "step": 4668 }, { "epoch": 0.7987175376723309, "grad_norm": 0.6838527321815491, "learning_rate": 7.51363564571575e-05, "loss": 5.2608, "num_input_tokens_seen": 612237312, "step": 4671 }, { "epoch": 0.7992305226033986, "grad_norm": 0.650619626045227, "learning_rate": 7.511223950694318e-05, "loss": 5.2961, "num_input_tokens_seen": 612630528, "step": 4674 }, { "epoch": 0.7997435075344662, "grad_norm": 0.6837221384048462, "learning_rate": 7.508814576470118e-05, "loss": 5.2384, "num_input_tokens_seen": 613023744, "step": 4677 }, { "epoch": 0.8002564924655339, "grad_norm": 0.8880760669708252, "learning_rate": 7.50640751932333e-05, "loss": 5.2994, "num_input_tokens_seen": 613416960, "step": 4680 }, { "epoch": 0.8007694773966014, "grad_norm": 0.7496533989906311, "learning_rate": 7.504002775542471e-05, "loss": 5.2399, "num_input_tokens_seen": 613810176, "step": 4683 }, { "epoch": 0.8012824623276691, "grad_norm": 0.8291968703269958, "learning_rate": 7.50160034142438e-05, "loss": 5.2665, "num_input_tokens_seen": 614203392, "step": 4686 }, { "epoch": 0.8017954472587367, "grad_norm": 0.6744365692138672, "learning_rate": 7.499200213274185e-05, "loss": 5.2685, "num_input_tokens_seen": 614596608, "step": 4689 }, { "epoch": 0.8023084321898044, "grad_norm": 0.8543739914894104, "learning_rate": 7.496802387405287e-05, "loss": 5.2792, "num_input_tokens_seen": 614989824, "step": 4692 }, { "epoch": 0.8028214171208721, "grad_norm": 0.8261051177978516, "learning_rate": 7.494406860139334e-05, "loss": 5.2683, "num_input_tokens_seen": 615383040, "step": 4695 }, { "epoch": 0.8033344020519397, "grad_norm": 0.7388364672660828, "learning_rate": 7.492013627806192e-05, "loss": 5.2577, "num_input_tokens_seen": 615776256, "step": 4698 }, { "epoch": 0.8038473869830074, "grad_norm": 0.7822057008743286, "learning_rate": 7.489622686743933e-05, "loss": 5.2975, "num_input_tokens_seen": 616169472, "step": 4701 }, { "epoch": 0.8043603719140751, "grad_norm": 0.866584837436676, "learning_rate": 7.487234033298796e-05, "loss": 5.2458, "num_input_tokens_seen": 616562688, "step": 4704 }, { "epoch": 0.8048733568451427, "grad_norm": 0.6807045340538025, "learning_rate": 7.484847663825176e-05, "loss": 5.2579, "num_input_tokens_seen": 616955904, "step": 4707 }, { "epoch": 0.8053863417762103, "grad_norm": 0.7374781966209412, "learning_rate": 7.4824635746856e-05, "loss": 5.2762, "num_input_tokens_seen": 617349120, "step": 4710 }, { "epoch": 
0.8058993267072779, "grad_norm": 0.829736590385437, "learning_rate": 7.480081762250693e-05, "loss": 5.3006, "num_input_tokens_seen": 617742336, "step": 4713 }, { "epoch": 0.8064123116383456, "grad_norm": 0.768171489238739, "learning_rate": 7.477702222899166e-05, "loss": 5.2574, "num_input_tokens_seen": 618135552, "step": 4716 }, { "epoch": 0.8069252965694133, "grad_norm": 0.7289711833000183, "learning_rate": 7.47532495301779e-05, "loss": 5.2361, "num_input_tokens_seen": 618528768, "step": 4719 }, { "epoch": 0.8074382815004809, "grad_norm": 0.9484478831291199, "learning_rate": 7.472949949001368e-05, "loss": 5.2584, "num_input_tokens_seen": 618921984, "step": 4722 }, { "epoch": 0.8079512664315486, "grad_norm": 0.7003783583641052, "learning_rate": 7.470577207252715e-05, "loss": 5.2355, "num_input_tokens_seen": 619315200, "step": 4725 }, { "epoch": 0.8084642513626162, "grad_norm": 0.7656697034835815, "learning_rate": 7.468206724182646e-05, "loss": 5.2752, "num_input_tokens_seen": 619708416, "step": 4728 }, { "epoch": 0.8089772362936839, "grad_norm": 0.7776997089385986, "learning_rate": 7.465838496209931e-05, "loss": 5.2699, "num_input_tokens_seen": 620101632, "step": 4731 }, { "epoch": 0.8094902212247516, "grad_norm": 0.7209721207618713, "learning_rate": 7.463472519761289e-05, "loss": 5.2341, "num_input_tokens_seen": 620494848, "step": 4734 }, { "epoch": 0.8100032061558192, "grad_norm": 0.7924665212631226, "learning_rate": 7.461108791271363e-05, "loss": 5.2278, "num_input_tokens_seen": 620888064, "step": 4737 }, { "epoch": 0.8105161910868868, "grad_norm": 0.673541247844696, "learning_rate": 7.458747307182692e-05, "loss": 5.3302, "num_input_tokens_seen": 621281280, "step": 4740 }, { "epoch": 0.8110291760179544, "grad_norm": 0.7629460096359253, "learning_rate": 7.456388063945693e-05, "loss": 5.2465, "num_input_tokens_seen": 621674496, "step": 4743 }, { "epoch": 0.8115421609490221, "grad_norm": 0.665367603302002, "learning_rate": 7.454031058018637e-05, "loss": 5.3017, "num_input_tokens_seen": 622067712, "step": 4746 }, { "epoch": 0.8120551458800898, "grad_norm": 0.7708544731140137, "learning_rate": 7.451676285867628e-05, "loss": 5.2512, "num_input_tokens_seen": 622460928, "step": 4749 }, { "epoch": 0.8125681308111574, "grad_norm": 0.8049509525299072, "learning_rate": 7.449323743966578e-05, "loss": 5.2656, "num_input_tokens_seen": 622854144, "step": 4752 }, { "epoch": 0.8130811157422251, "grad_norm": 0.6959990859031677, "learning_rate": 7.446973428797188e-05, "loss": 5.2437, "num_input_tokens_seen": 623247360, "step": 4755 }, { "epoch": 0.8135941006732927, "grad_norm": 0.7066569328308105, "learning_rate": 7.444625336848923e-05, "loss": 5.2506, "num_input_tokens_seen": 623640576, "step": 4758 }, { "epoch": 0.8141070856043604, "grad_norm": 0.6698566675186157, "learning_rate": 7.442279464618996e-05, "loss": 5.2852, "num_input_tokens_seen": 624033792, "step": 4761 }, { "epoch": 0.8146200705354281, "grad_norm": 0.7723349332809448, "learning_rate": 7.439935808612331e-05, "loss": 5.3053, "num_input_tokens_seen": 624427008, "step": 4764 }, { "epoch": 0.8151330554664957, "grad_norm": 0.7419276237487793, "learning_rate": 7.437594365341564e-05, "loss": 5.2353, "num_input_tokens_seen": 624820224, "step": 4767 }, { "epoch": 0.8156460403975633, "grad_norm": 0.8163347840309143, "learning_rate": 7.435255131327003e-05, "loss": 5.2718, "num_input_tokens_seen": 625213440, "step": 4770 }, { "epoch": 0.8161590253286309, "grad_norm": 0.7750483751296997, "learning_rate": 7.432918103096608e-05, "loss": 5.2919, 
"num_input_tokens_seen": 625606656, "step": 4773 }, { "epoch": 0.8166720102596986, "grad_norm": 0.7649890184402466, "learning_rate": 7.430583277185981e-05, "loss": 5.2961, "num_input_tokens_seen": 625999872, "step": 4776 }, { "epoch": 0.8171849951907663, "grad_norm": 0.7891272306442261, "learning_rate": 7.428250650138333e-05, "loss": 5.255, "num_input_tokens_seen": 626393088, "step": 4779 }, { "epoch": 0.8176979801218339, "grad_norm": 0.7607313990592957, "learning_rate": 7.425920218504469e-05, "loss": 5.232, "num_input_tokens_seen": 626786304, "step": 4782 }, { "epoch": 0.8182109650529016, "grad_norm": 0.8134208917617798, "learning_rate": 7.423591978842759e-05, "loss": 5.25, "num_input_tokens_seen": 627179520, "step": 4785 }, { "epoch": 0.8187239499839692, "grad_norm": 0.7779147624969482, "learning_rate": 7.421265927719126e-05, "loss": 5.2691, "num_input_tokens_seen": 627572736, "step": 4788 }, { "epoch": 0.8192369349150369, "grad_norm": 0.7561124563217163, "learning_rate": 7.418942061707016e-05, "loss": 5.247, "num_input_tokens_seen": 627965952, "step": 4791 }, { "epoch": 0.8197499198461046, "grad_norm": 0.7412567138671875, "learning_rate": 7.416620377387388e-05, "loss": 5.3103, "num_input_tokens_seen": 628359168, "step": 4794 }, { "epoch": 0.8202629047771721, "grad_norm": 0.720973551273346, "learning_rate": 7.414300871348681e-05, "loss": 5.318, "num_input_tokens_seen": 628752384, "step": 4797 }, { "epoch": 0.8207758897082398, "grad_norm": 0.7904423475265503, "learning_rate": 7.411983540186796e-05, "loss": 5.2749, "num_input_tokens_seen": 629145600, "step": 4800 }, { "epoch": 0.8207758897082398, "eval_accuracy": 0.1849405634261521, "eval_loss": 5.730287075042725, "eval_runtime": 109.3991, "eval_samples_per_second": 2.742, "eval_steps_per_second": 1.371, "num_input_tokens_seen": 629145600, "step": 4800 }, { "epoch": 0.8212888746393074, "grad_norm": 0.7457066774368286, "learning_rate": 7.409668380505084e-05, "loss": 5.2593, "num_input_tokens_seen": 629538816, "step": 4803 }, { "epoch": 0.8218018595703751, "grad_norm": 0.7565402984619141, "learning_rate": 7.407355388914312e-05, "loss": 5.2582, "num_input_tokens_seen": 629932032, "step": 4806 }, { "epoch": 0.8223148445014428, "grad_norm": 0.7606648206710815, "learning_rate": 7.40504456203265e-05, "loss": 5.2638, "num_input_tokens_seen": 630325248, "step": 4809 }, { "epoch": 0.8228278294325104, "grad_norm": 0.7904190421104431, "learning_rate": 7.40273589648565e-05, "loss": 5.2671, "num_input_tokens_seen": 630718464, "step": 4812 }, { "epoch": 0.8233408143635781, "grad_norm": 0.736111581325531, "learning_rate": 7.400429388906221e-05, "loss": 5.271, "num_input_tokens_seen": 631111680, "step": 4815 }, { "epoch": 0.8238537992946457, "grad_norm": 0.7933962345123291, "learning_rate": 7.398125035934614e-05, "loss": 5.2411, "num_input_tokens_seen": 631504896, "step": 4818 }, { "epoch": 0.8243667842257134, "grad_norm": 0.8558014631271362, "learning_rate": 7.395822834218396e-05, "loss": 5.2648, "num_input_tokens_seen": 631898112, "step": 4821 }, { "epoch": 0.8248797691567811, "grad_norm": 0.7140774130821228, "learning_rate": 7.393522780412432e-05, "loss": 5.2415, "num_input_tokens_seen": 632291328, "step": 4824 }, { "epoch": 0.8253927540878486, "grad_norm": 0.7220463156700134, "learning_rate": 7.391224871178872e-05, "loss": 5.2234, "num_input_tokens_seen": 632684544, "step": 4827 }, { "epoch": 0.8259057390189163, "grad_norm": 0.6848180890083313, "learning_rate": 7.388929103187108e-05, "loss": 5.2656, "num_input_tokens_seen": 633077760, "step": 4830 }, { 
"epoch": 0.826418723949984, "grad_norm": 0.7403995990753174, "learning_rate": 7.386635473113787e-05, "loss": 5.2528, "num_input_tokens_seen": 633470976, "step": 4833 }, { "epoch": 0.8269317088810516, "grad_norm": 0.7096079587936401, "learning_rate": 7.384343977642759e-05, "loss": 5.2495, "num_input_tokens_seen": 633864192, "step": 4836 }, { "epoch": 0.8274446938121193, "grad_norm": 0.7836086750030518, "learning_rate": 7.382054613465076e-05, "loss": 5.2646, "num_input_tokens_seen": 634257408, "step": 4839 }, { "epoch": 0.8279576787431869, "grad_norm": 0.6579269766807556, "learning_rate": 7.379767377278969e-05, "loss": 5.2655, "num_input_tokens_seen": 634650624, "step": 4842 }, { "epoch": 0.8284706636742546, "grad_norm": 0.8051040768623352, "learning_rate": 7.377482265789823e-05, "loss": 5.2754, "num_input_tokens_seen": 635043840, "step": 4845 }, { "epoch": 0.8289836486053223, "grad_norm": 0.7173849940299988, "learning_rate": 7.375199275710157e-05, "loss": 5.2735, "num_input_tokens_seen": 635437056, "step": 4848 }, { "epoch": 0.8294966335363899, "grad_norm": 0.7025325298309326, "learning_rate": 7.372918403759613e-05, "loss": 5.2521, "num_input_tokens_seen": 635830272, "step": 4851 }, { "epoch": 0.8300096184674575, "grad_norm": 0.789348840713501, "learning_rate": 7.370639646664927e-05, "loss": 5.2772, "num_input_tokens_seen": 636223488, "step": 4854 }, { "epoch": 0.8305226033985251, "grad_norm": 0.727810263633728, "learning_rate": 7.368363001159908e-05, "loss": 5.2853, "num_input_tokens_seen": 636616704, "step": 4857 }, { "epoch": 0.8310355883295928, "grad_norm": 0.7564170956611633, "learning_rate": 7.366088463985431e-05, "loss": 5.2352, "num_input_tokens_seen": 637009920, "step": 4860 }, { "epoch": 0.8315485732606605, "grad_norm": 0.7294690012931824, "learning_rate": 7.363816031889405e-05, "loss": 5.2667, "num_input_tokens_seen": 637403136, "step": 4863 }, { "epoch": 0.8320615581917281, "grad_norm": 0.688892126083374, "learning_rate": 7.361545701626754e-05, "loss": 5.2515, "num_input_tokens_seen": 637796352, "step": 4866 }, { "epoch": 0.8325745431227958, "grad_norm": 0.7049474716186523, "learning_rate": 7.359277469959405e-05, "loss": 5.297, "num_input_tokens_seen": 638189568, "step": 4869 }, { "epoch": 0.8330875280538634, "grad_norm": 0.6854651570320129, "learning_rate": 7.35701133365627e-05, "loss": 5.1935, "num_input_tokens_seen": 638582784, "step": 4872 }, { "epoch": 0.8336005129849311, "grad_norm": 0.8255504965782166, "learning_rate": 7.354747289493207e-05, "loss": 5.2819, "num_input_tokens_seen": 638976000, "step": 4875 }, { "epoch": 0.8341134979159988, "grad_norm": 0.8443409204483032, "learning_rate": 7.35248533425303e-05, "loss": 5.2579, "num_input_tokens_seen": 639369216, "step": 4878 }, { "epoch": 0.8346264828470664, "grad_norm": 0.759077787399292, "learning_rate": 7.350225464725466e-05, "loss": 5.21, "num_input_tokens_seen": 639762432, "step": 4881 }, { "epoch": 0.835139467778134, "grad_norm": 0.6921237111091614, "learning_rate": 7.347967677707148e-05, "loss": 5.2795, "num_input_tokens_seen": 640155648, "step": 4884 }, { "epoch": 0.8356524527092016, "grad_norm": 0.6738846302032471, "learning_rate": 7.345711970001593e-05, "loss": 5.262, "num_input_tokens_seen": 640548864, "step": 4887 }, { "epoch": 0.8361654376402693, "grad_norm": 0.6597248315811157, "learning_rate": 7.343458338419179e-05, "loss": 5.2218, "num_input_tokens_seen": 640942080, "step": 4890 }, { "epoch": 0.836678422571337, "grad_norm": 0.8394546508789062, "learning_rate": 7.341206779777132e-05, "loss": 5.2289, 
"num_input_tokens_seen": 641335296, "step": 4893 }, { "epoch": 0.8371914075024046, "grad_norm": 0.7097527980804443, "learning_rate": 7.338957290899508e-05, "loss": 5.2544, "num_input_tokens_seen": 641728512, "step": 4896 }, { "epoch": 0.8377043924334723, "grad_norm": 0.7356297373771667, "learning_rate": 7.336709868617169e-05, "loss": 5.2585, "num_input_tokens_seen": 642121728, "step": 4899 }, { "epoch": 0.8382173773645399, "grad_norm": 0.7718103528022766, "learning_rate": 7.334464509767758e-05, "loss": 5.264, "num_input_tokens_seen": 642514944, "step": 4902 }, { "epoch": 0.8387303622956076, "grad_norm": 0.8452854156494141, "learning_rate": 7.332221211195707e-05, "loss": 5.2449, "num_input_tokens_seen": 642908160, "step": 4905 }, { "epoch": 0.8392433472266753, "grad_norm": 0.7810956239700317, "learning_rate": 7.329979969752183e-05, "loss": 5.264, "num_input_tokens_seen": 643301376, "step": 4908 }, { "epoch": 0.8397563321577428, "grad_norm": 0.7456896305084229, "learning_rate": 7.327740782295093e-05, "loss": 5.2423, "num_input_tokens_seen": 643694592, "step": 4911 }, { "epoch": 0.8402693170888105, "grad_norm": 0.6871115565299988, "learning_rate": 7.325503645689056e-05, "loss": 5.274, "num_input_tokens_seen": 644087808, "step": 4914 }, { "epoch": 0.8407823020198781, "grad_norm": 0.7335032820701599, "learning_rate": 7.323268556805394e-05, "loss": 5.2725, "num_input_tokens_seen": 644481024, "step": 4917 }, { "epoch": 0.8412952869509458, "grad_norm": 0.8171728849411011, "learning_rate": 7.321035512522102e-05, "loss": 5.2408, "num_input_tokens_seen": 644874240, "step": 4920 }, { "epoch": 0.8418082718820135, "grad_norm": 0.7723347544670105, "learning_rate": 7.318804509723834e-05, "loss": 5.254, "num_input_tokens_seen": 645267456, "step": 4923 }, { "epoch": 0.8423212568130811, "grad_norm": 0.7238738536834717, "learning_rate": 7.316575545301888e-05, "loss": 5.2625, "num_input_tokens_seen": 645660672, "step": 4926 }, { "epoch": 0.8428342417441488, "grad_norm": 0.638489842414856, "learning_rate": 7.314348616154184e-05, "loss": 5.2686, "num_input_tokens_seen": 646053888, "step": 4929 }, { "epoch": 0.8433472266752164, "grad_norm": 0.6416002511978149, "learning_rate": 7.31212371918525e-05, "loss": 5.2102, "num_input_tokens_seen": 646447104, "step": 4932 }, { "epoch": 0.8438602116062841, "grad_norm": 0.7499914169311523, "learning_rate": 7.309900851306195e-05, "loss": 5.2976, "num_input_tokens_seen": 646840320, "step": 4935 }, { "epoch": 0.8443731965373518, "grad_norm": 0.7242657542228699, "learning_rate": 7.307680009434705e-05, "loss": 5.3085, "num_input_tokens_seen": 647233536, "step": 4938 }, { "epoch": 0.8448861814684193, "grad_norm": 0.6967840790748596, "learning_rate": 7.30546119049501e-05, "loss": 5.247, "num_input_tokens_seen": 647626752, "step": 4941 }, { "epoch": 0.845399166399487, "grad_norm": 0.7176790237426758, "learning_rate": 7.303244391417879e-05, "loss": 5.2728, "num_input_tokens_seen": 648019968, "step": 4944 }, { "epoch": 0.8459121513305546, "grad_norm": 0.6905176043510437, "learning_rate": 7.30102960914059e-05, "loss": 5.2653, "num_input_tokens_seen": 648413184, "step": 4947 }, { "epoch": 0.8464251362616223, "grad_norm": 0.731937050819397, "learning_rate": 7.298816840606925e-05, "loss": 5.2449, "num_input_tokens_seen": 648806400, "step": 4950 }, { "epoch": 0.84693812119269, "grad_norm": 0.733711302280426, "learning_rate": 7.296606082767145e-05, "loss": 5.2083, "num_input_tokens_seen": 649199616, "step": 4953 }, { "epoch": 0.8474511061237576, "grad_norm": 0.7619096040725708, 
"learning_rate": 7.294397332577968e-05, "loss": 5.2159, "num_input_tokens_seen": 649592832, "step": 4956 }, { "epoch": 0.8479640910548253, "grad_norm": 0.7431106567382812, "learning_rate": 7.292190587002563e-05, "loss": 5.2721, "num_input_tokens_seen": 649986048, "step": 4959 }, { "epoch": 0.848477075985893, "grad_norm": 0.7448694705963135, "learning_rate": 7.28998584301052e-05, "loss": 5.2766, "num_input_tokens_seen": 650379264, "step": 4962 }, { "epoch": 0.8489900609169606, "grad_norm": 0.6657842397689819, "learning_rate": 7.287783097577849e-05, "loss": 5.2348, "num_input_tokens_seen": 650772480, "step": 4965 }, { "epoch": 0.8495030458480282, "grad_norm": 0.7611784934997559, "learning_rate": 7.28558234768694e-05, "loss": 5.2712, "num_input_tokens_seen": 651165696, "step": 4968 }, { "epoch": 0.8500160307790958, "grad_norm": 0.678893506526947, "learning_rate": 7.283383590326562e-05, "loss": 5.233, "num_input_tokens_seen": 651558912, "step": 4971 }, { "epoch": 0.8505290157101635, "grad_norm": 0.7050936818122864, "learning_rate": 7.281186822491848e-05, "loss": 5.2726, "num_input_tokens_seen": 651952128, "step": 4974 }, { "epoch": 0.8510420006412311, "grad_norm": 0.7030792236328125, "learning_rate": 7.278992041184265e-05, "loss": 5.2673, "num_input_tokens_seen": 652345344, "step": 4977 }, { "epoch": 0.8515549855722988, "grad_norm": 0.7716869115829468, "learning_rate": 7.276799243411601e-05, "loss": 5.2385, "num_input_tokens_seen": 652738560, "step": 4980 }, { "epoch": 0.8520679705033665, "grad_norm": 0.6835726499557495, "learning_rate": 7.274608426187958e-05, "loss": 5.2496, "num_input_tokens_seen": 653131776, "step": 4983 }, { "epoch": 0.8525809554344341, "grad_norm": 0.8090899586677551, "learning_rate": 7.272419586533719e-05, "loss": 5.2421, "num_input_tokens_seen": 653524992, "step": 4986 }, { "epoch": 0.8530939403655018, "grad_norm": 0.7765442728996277, "learning_rate": 7.270232721475544e-05, "loss": 5.2459, "num_input_tokens_seen": 653918208, "step": 4989 }, { "epoch": 0.8536069252965695, "grad_norm": 0.7543562650680542, "learning_rate": 7.268047828046345e-05, "loss": 5.2362, "num_input_tokens_seen": 654311424, "step": 4992 }, { "epoch": 0.8541199102276371, "grad_norm": 0.7674638628959656, "learning_rate": 7.265864903285278e-05, "loss": 5.1859, "num_input_tokens_seen": 654704640, "step": 4995 }, { "epoch": 0.8546328951587047, "grad_norm": 0.7830629348754883, "learning_rate": 7.263683944237711e-05, "loss": 5.2391, "num_input_tokens_seen": 655097856, "step": 4998 }, { "epoch": 0.8551458800897723, "grad_norm": 0.8434771299362183, "learning_rate": 7.261504947955222e-05, "loss": 5.2177, "num_input_tokens_seen": 655491072, "step": 5001 }, { "epoch": 0.85565886502084, "grad_norm": 0.6819344162940979, "learning_rate": 7.259327911495573e-05, "loss": 5.245, "num_input_tokens_seen": 655884288, "step": 5004 }, { "epoch": 0.8561718499519076, "grad_norm": 0.6312018632888794, "learning_rate": 7.257152831922706e-05, "loss": 5.2205, "num_input_tokens_seen": 656277504, "step": 5007 }, { "epoch": 0.8566848348829753, "grad_norm": 0.7521694898605347, "learning_rate": 7.254979706306706e-05, "loss": 5.2303, "num_input_tokens_seen": 656670720, "step": 5010 }, { "epoch": 0.857197819814043, "grad_norm": 0.7708764672279358, "learning_rate": 7.252808531723802e-05, "loss": 5.2255, "num_input_tokens_seen": 657063936, "step": 5013 }, { "epoch": 0.8577108047451106, "grad_norm": 0.7379157543182373, "learning_rate": 7.250639305256345e-05, "loss": 5.2368, "num_input_tokens_seen": 657457152, "step": 5016 }, { "epoch": 
0.8582237896761783, "grad_norm": 0.7467489242553711, "learning_rate": 7.248472023992787e-05, "loss": 5.2539, "num_input_tokens_seen": 657850368, "step": 5019 }, { "epoch": 0.858736774607246, "grad_norm": 0.7960140109062195, "learning_rate": 7.24630668502767e-05, "loss": 5.243, "num_input_tokens_seen": 658243584, "step": 5022 }, { "epoch": 0.8592497595383135, "grad_norm": 0.8107805252075195, "learning_rate": 7.244143285461608e-05, "loss": 5.2573, "num_input_tokens_seen": 658636800, "step": 5025 }, { "epoch": 0.8597627444693812, "grad_norm": 0.8108943104743958, "learning_rate": 7.241981822401273e-05, "loss": 5.1838, "num_input_tokens_seen": 659030016, "step": 5028 }, { "epoch": 0.8602757294004488, "grad_norm": 0.6969135403633118, "learning_rate": 7.23982229295937e-05, "loss": 5.2806, "num_input_tokens_seen": 659423232, "step": 5031 }, { "epoch": 0.8607887143315165, "grad_norm": 0.6890487670898438, "learning_rate": 7.237664694254637e-05, "loss": 5.2691, "num_input_tokens_seen": 659816448, "step": 5034 }, { "epoch": 0.8613016992625842, "grad_norm": 0.6614696979522705, "learning_rate": 7.235509023411809e-05, "loss": 5.2033, "num_input_tokens_seen": 660209664, "step": 5037 }, { "epoch": 0.8618146841936518, "grad_norm": 0.6726818680763245, "learning_rate": 7.233355277561621e-05, "loss": 5.2619, "num_input_tokens_seen": 660602880, "step": 5040 }, { "epoch": 0.8623276691247195, "grad_norm": 0.7320277094841003, "learning_rate": 7.231203453840776e-05, "loss": 5.2354, "num_input_tokens_seen": 660996096, "step": 5043 }, { "epoch": 0.8628406540557871, "grad_norm": 0.641864001750946, "learning_rate": 7.22905354939194e-05, "loss": 5.2346, "num_input_tokens_seen": 661389312, "step": 5046 }, { "epoch": 0.8633536389868548, "grad_norm": 0.7307341694831848, "learning_rate": 7.22690556136372e-05, "loss": 5.2156, "num_input_tokens_seen": 661782528, "step": 5049 }, { "epoch": 0.8638666239179225, "grad_norm": 0.7401044368743896, "learning_rate": 7.22475948691065e-05, "loss": 5.275, "num_input_tokens_seen": 662175744, "step": 5052 }, { "epoch": 0.86437960884899, "grad_norm": 0.7346693873405457, "learning_rate": 7.22261532319318e-05, "loss": 5.2385, "num_input_tokens_seen": 662568960, "step": 5055 }, { "epoch": 0.8648925937800577, "grad_norm": 0.7968463897705078, "learning_rate": 7.220473067377648e-05, "loss": 5.229, "num_input_tokens_seen": 662962176, "step": 5058 }, { "epoch": 0.8654055787111253, "grad_norm": 0.7128798961639404, "learning_rate": 7.218332716636276e-05, "loss": 5.2253, "num_input_tokens_seen": 663355392, "step": 5061 }, { "epoch": 0.865918563642193, "grad_norm": 0.6696746945381165, "learning_rate": 7.216194268147151e-05, "loss": 5.183, "num_input_tokens_seen": 663748608, "step": 5064 }, { "epoch": 0.8664315485732607, "grad_norm": 0.8338589668273926, "learning_rate": 7.214057719094208e-05, "loss": 5.1871, "num_input_tokens_seen": 664141824, "step": 5067 }, { "epoch": 0.8669445335043283, "grad_norm": 0.8238839507102966, "learning_rate": 7.211923066667213e-05, "loss": 5.2382, "num_input_tokens_seen": 664535040, "step": 5070 }, { "epoch": 0.867457518435396, "grad_norm": 0.883547306060791, "learning_rate": 7.20979030806175e-05, "loss": 5.2703, "num_input_tokens_seen": 664928256, "step": 5073 }, { "epoch": 0.8679705033664636, "grad_norm": 0.8404785394668579, "learning_rate": 7.207659440479209e-05, "loss": 5.2057, "num_input_tokens_seen": 665321472, "step": 5076 }, { "epoch": 0.8684834882975313, "grad_norm": 0.8399271368980408, "learning_rate": 7.20553046112676e-05, "loss": 5.2319, "num_input_tokens_seen": 
665714688, "step": 5079 }, { "epoch": 0.868996473228599, "grad_norm": 0.8033043742179871, "learning_rate": 7.203403367217348e-05, "loss": 5.2562, "num_input_tokens_seen": 666107904, "step": 5082 }, { "epoch": 0.8695094581596665, "grad_norm": 0.8439496159553528, "learning_rate": 7.201278155969676e-05, "loss": 5.2529, "num_input_tokens_seen": 666501120, "step": 5085 }, { "epoch": 0.8700224430907342, "grad_norm": 0.7626810669898987, "learning_rate": 7.19915482460818e-05, "loss": 5.261, "num_input_tokens_seen": 666894336, "step": 5088 }, { "epoch": 0.8705354280218018, "grad_norm": 0.8755491375923157, "learning_rate": 7.197033370363028e-05, "loss": 5.2855, "num_input_tokens_seen": 667287552, "step": 5091 }, { "epoch": 0.8710484129528695, "grad_norm": 0.8308954238891602, "learning_rate": 7.1949137904701e-05, "loss": 5.234, "num_input_tokens_seen": 667680768, "step": 5094 }, { "epoch": 0.8715613978839372, "grad_norm": 0.748672604560852, "learning_rate": 7.192796082170961e-05, "loss": 5.2332, "num_input_tokens_seen": 668073984, "step": 5097 }, { "epoch": 0.8720743828150048, "grad_norm": 0.8005703687667847, "learning_rate": 7.190680242712868e-05, "loss": 5.2514, "num_input_tokens_seen": 668467200, "step": 5100 }, { "epoch": 0.8725873677460725, "grad_norm": 0.653659462928772, "learning_rate": 7.18856626934873e-05, "loss": 5.2233, "num_input_tokens_seen": 668860416, "step": 5103 }, { "epoch": 0.8731003526771401, "grad_norm": 0.7143151760101318, "learning_rate": 7.186454159337121e-05, "loss": 5.2236, "num_input_tokens_seen": 669253632, "step": 5106 }, { "epoch": 0.8736133376082078, "grad_norm": 0.7480143904685974, "learning_rate": 7.184343909942239e-05, "loss": 5.1812, "num_input_tokens_seen": 669646848, "step": 5109 }, { "epoch": 0.8741263225392754, "grad_norm": 0.7663584351539612, "learning_rate": 7.182235518433903e-05, "loss": 5.211, "num_input_tokens_seen": 670040064, "step": 5112 }, { "epoch": 0.874639307470343, "grad_norm": 0.7375966310501099, "learning_rate": 7.180128982087541e-05, "loss": 5.2147, "num_input_tokens_seen": 670433280, "step": 5115 }, { "epoch": 0.8751522924014107, "grad_norm": 0.7116428017616272, "learning_rate": 7.178024298184173e-05, "loss": 5.2005, "num_input_tokens_seen": 670826496, "step": 5118 }, { "epoch": 0.8756652773324783, "grad_norm": 0.7579666972160339, "learning_rate": 7.175921464010388e-05, "loss": 5.2272, "num_input_tokens_seen": 671219712, "step": 5121 }, { "epoch": 0.876178262263546, "grad_norm": 0.8425345420837402, "learning_rate": 7.173820476858339e-05, "loss": 5.2347, "num_input_tokens_seen": 671612928, "step": 5124 }, { "epoch": 0.8766912471946137, "grad_norm": 0.7112694382667542, "learning_rate": 7.171721334025732e-05, "loss": 5.2224, "num_input_tokens_seen": 672006144, "step": 5127 }, { "epoch": 0.8772042321256813, "grad_norm": 0.798812747001648, "learning_rate": 7.169624032815794e-05, "loss": 5.2049, "num_input_tokens_seen": 672399360, "step": 5130 }, { "epoch": 0.877717217056749, "grad_norm": 0.8146517872810364, "learning_rate": 7.167528570537277e-05, "loss": 5.2076, "num_input_tokens_seen": 672792576, "step": 5133 }, { "epoch": 0.8782302019878166, "grad_norm": 0.7291656732559204, "learning_rate": 7.165434944504431e-05, "loss": 5.1914, "num_input_tokens_seen": 673185792, "step": 5136 }, { "epoch": 0.8787431869188843, "grad_norm": 0.9471056461334229, "learning_rate": 7.163343152036998e-05, "loss": 5.267, "num_input_tokens_seen": 673579008, "step": 5139 }, { "epoch": 0.8792561718499519, "grad_norm": 0.6661033034324646, "learning_rate": 
7.161253190460194e-05, "loss": 5.2058, "num_input_tokens_seen": 673972224, "step": 5142 }, { "epoch": 0.8797691567810195, "grad_norm": 0.7200859189033508, "learning_rate": 7.159165057104689e-05, "loss": 5.1633, "num_input_tokens_seen": 674365440, "step": 5145 }, { "epoch": 0.8802821417120872, "grad_norm": 0.8545052409172058, "learning_rate": 7.157078749306606e-05, "loss": 5.2405, "num_input_tokens_seen": 674758656, "step": 5148 }, { "epoch": 0.8807951266431548, "grad_norm": 0.785458505153656, "learning_rate": 7.154994264407493e-05, "loss": 5.2005, "num_input_tokens_seen": 675151872, "step": 5151 }, { "epoch": 0.8813081115742225, "grad_norm": 0.7209606170654297, "learning_rate": 7.152911599754318e-05, "loss": 5.2581, "num_input_tokens_seen": 675545088, "step": 5154 }, { "epoch": 0.8818210965052902, "grad_norm": 0.688242495059967, "learning_rate": 7.15083075269945e-05, "loss": 5.2225, "num_input_tokens_seen": 675938304, "step": 5157 }, { "epoch": 0.8823340814363578, "grad_norm": 0.7452808618545532, "learning_rate": 7.148751720600645e-05, "loss": 5.1834, "num_input_tokens_seen": 676331520, "step": 5160 }, { "epoch": 0.8828470663674255, "grad_norm": 0.7830954790115356, "learning_rate": 7.146674500821039e-05, "loss": 5.1773, "num_input_tokens_seen": 676724736, "step": 5163 }, { "epoch": 0.8833600512984932, "grad_norm": 0.7248668074607849, "learning_rate": 7.144599090729122e-05, "loss": 5.2314, "num_input_tokens_seen": 677117952, "step": 5166 }, { "epoch": 0.8838730362295607, "grad_norm": 0.8058164715766907, "learning_rate": 7.142525487698731e-05, "loss": 5.2789, "num_input_tokens_seen": 677511168, "step": 5169 }, { "epoch": 0.8843860211606284, "grad_norm": 0.70136958360672, "learning_rate": 7.140453689109039e-05, "loss": 5.272, "num_input_tokens_seen": 677904384, "step": 5172 }, { "epoch": 0.884899006091696, "grad_norm": 0.8600161671638489, "learning_rate": 7.138383692344537e-05, "loss": 5.2665, "num_input_tokens_seen": 678297600, "step": 5175 }, { "epoch": 0.8854119910227637, "grad_norm": 0.7736455202102661, "learning_rate": 7.136315494795016e-05, "loss": 5.2499, "num_input_tokens_seen": 678690816, "step": 5178 }, { "epoch": 0.8859249759538314, "grad_norm": 0.8731935024261475, "learning_rate": 7.134249093855563e-05, "loss": 5.2628, "num_input_tokens_seen": 679084032, "step": 5181 }, { "epoch": 0.886437960884899, "grad_norm": 0.7711248993873596, "learning_rate": 7.132184486926537e-05, "loss": 5.2459, "num_input_tokens_seen": 679477248, "step": 5184 }, { "epoch": 0.8869509458159667, "grad_norm": 0.7357232570648193, "learning_rate": 7.130121671413564e-05, "loss": 5.239, "num_input_tokens_seen": 679870464, "step": 5187 }, { "epoch": 0.8874639307470343, "grad_norm": 0.7501398324966431, "learning_rate": 7.128060644727519e-05, "loss": 5.2204, "num_input_tokens_seen": 680263680, "step": 5190 }, { "epoch": 0.887976915678102, "grad_norm": 0.7075851559638977, "learning_rate": 7.12600140428451e-05, "loss": 5.1888, "num_input_tokens_seen": 680656896, "step": 5193 }, { "epoch": 0.8884899006091697, "grad_norm": 0.7382627725601196, "learning_rate": 7.123943947505872e-05, "loss": 5.2237, "num_input_tokens_seen": 681050112, "step": 5196 }, { "epoch": 0.8890028855402372, "grad_norm": 0.6421706676483154, "learning_rate": 7.121888271818144e-05, "loss": 5.2264, "num_input_tokens_seen": 681443328, "step": 5199 }, { "epoch": 0.8891738805172598, "eval_accuracy": 0.18500407099820876, "eval_loss": 5.699267387390137, "eval_runtime": 109.3374, "eval_samples_per_second": 2.744, "eval_steps_per_second": 1.372, 
"num_input_tokens_seen": 681574400, "step": 5200 }, { "epoch": 0.8895158704713049, "grad_norm": 0.7272650599479675, "learning_rate": 7.11983437465306e-05, "loss": 5.298, "num_input_tokens_seen": 681836544, "step": 5202 }, { "epoch": 0.8900288554023725, "grad_norm": 0.6727803945541382, "learning_rate": 7.117782253447543e-05, "loss": 5.2554, "num_input_tokens_seen": 682229760, "step": 5205 }, { "epoch": 0.8905418403334402, "grad_norm": 0.713238000869751, "learning_rate": 7.115731905643676e-05, "loss": 5.2232, "num_input_tokens_seen": 682622976, "step": 5208 }, { "epoch": 0.8910548252645079, "grad_norm": 0.8077900409698486, "learning_rate": 7.1136833286887e-05, "loss": 5.188, "num_input_tokens_seen": 683016192, "step": 5211 }, { "epoch": 0.8915678101955755, "grad_norm": 0.694200873374939, "learning_rate": 7.111636520034998e-05, "loss": 5.2278, "num_input_tokens_seen": 683409408, "step": 5214 }, { "epoch": 0.8920807951266432, "grad_norm": 0.723655104637146, "learning_rate": 7.109591477140081e-05, "loss": 5.2641, "num_input_tokens_seen": 683802624, "step": 5217 }, { "epoch": 0.8925937800577108, "grad_norm": 0.7347647547721863, "learning_rate": 7.107548197466574e-05, "loss": 5.2124, "num_input_tokens_seen": 684195840, "step": 5220 }, { "epoch": 0.8931067649887785, "grad_norm": 0.7525767087936401, "learning_rate": 7.105506678482202e-05, "loss": 5.255, "num_input_tokens_seen": 684589056, "step": 5223 }, { "epoch": 0.893619749919846, "grad_norm": 0.7006353735923767, "learning_rate": 7.103466917659785e-05, "loss": 5.2275, "num_input_tokens_seen": 684982272, "step": 5226 }, { "epoch": 0.8941327348509137, "grad_norm": 0.7175182700157166, "learning_rate": 7.101428912477212e-05, "loss": 5.2545, "num_input_tokens_seen": 685375488, "step": 5229 }, { "epoch": 0.8946457197819814, "grad_norm": 0.692328691482544, "learning_rate": 7.099392660417439e-05, "loss": 5.1934, "num_input_tokens_seen": 685768704, "step": 5232 }, { "epoch": 0.895158704713049, "grad_norm": 0.6852241158485413, "learning_rate": 7.097358158968464e-05, "loss": 5.2326, "num_input_tokens_seen": 686161920, "step": 5235 }, { "epoch": 0.8956716896441167, "grad_norm": 0.7024674415588379, "learning_rate": 7.095325405623328e-05, "loss": 5.2226, "num_input_tokens_seen": 686555136, "step": 5238 }, { "epoch": 0.8961846745751844, "grad_norm": 0.6983912587165833, "learning_rate": 7.093294397880095e-05, "loss": 5.2614, "num_input_tokens_seen": 686948352, "step": 5241 }, { "epoch": 0.896697659506252, "grad_norm": 0.7533532381057739, "learning_rate": 7.091265133241835e-05, "loss": 5.2038, "num_input_tokens_seen": 687341568, "step": 5244 }, { "epoch": 0.8972106444373197, "grad_norm": 0.7239729166030884, "learning_rate": 7.08923760921662e-05, "loss": 5.2062, "num_input_tokens_seen": 687734784, "step": 5247 }, { "epoch": 0.8977236293683873, "grad_norm": 0.9106455445289612, "learning_rate": 7.087211823317505e-05, "loss": 5.1852, "num_input_tokens_seen": 688128000, "step": 5250 }, { "epoch": 0.898236614299455, "grad_norm": 0.7044593691825867, "learning_rate": 7.085187773062514e-05, "loss": 5.2111, "num_input_tokens_seen": 688521216, "step": 5253 }, { "epoch": 0.8987495992305226, "grad_norm": 0.7331400513648987, "learning_rate": 7.083165455974633e-05, "loss": 5.212, "num_input_tokens_seen": 688914432, "step": 5256 }, { "epoch": 0.8992625841615902, "grad_norm": 0.7778918743133545, "learning_rate": 7.081144869581792e-05, "loss": 5.1951, "num_input_tokens_seen": 689307648, "step": 5259 }, { "epoch": 0.8997755690926579, "grad_norm": 0.7159865498542786, 
"learning_rate": 7.079126011416861e-05, "loss": 5.1574, "num_input_tokens_seen": 689700864, "step": 5262 }, { "epoch": 0.9002885540237255, "grad_norm": 0.7375167012214661, "learning_rate": 7.077108879017622e-05, "loss": 5.2498, "num_input_tokens_seen": 690094080, "step": 5265 }, { "epoch": 0.9008015389547932, "grad_norm": 0.7126013040542603, "learning_rate": 7.075093469926772e-05, "loss": 5.2121, "num_input_tokens_seen": 690487296, "step": 5268 }, { "epoch": 0.9013145238858609, "grad_norm": 0.7177429795265198, "learning_rate": 7.073079781691898e-05, "loss": 5.2539, "num_input_tokens_seen": 690880512, "step": 5271 }, { "epoch": 0.9018275088169285, "grad_norm": 0.7073779106140137, "learning_rate": 7.071067811865475e-05, "loss": 5.2696, "num_input_tokens_seen": 691273728, "step": 5274 }, { "epoch": 0.9023404937479962, "grad_norm": 0.8069375157356262, "learning_rate": 7.069057558004847e-05, "loss": 5.1789, "num_input_tokens_seen": 691666944, "step": 5277 }, { "epoch": 0.9028534786790638, "grad_norm": 0.7410465478897095, "learning_rate": 7.067049017672214e-05, "loss": 5.2048, "num_input_tokens_seen": 692060160, "step": 5280 }, { "epoch": 0.9033664636101314, "grad_norm": 0.9009111523628235, "learning_rate": 7.065042188434626e-05, "loss": 5.2126, "num_input_tokens_seen": 692453376, "step": 5283 }, { "epoch": 0.9038794485411991, "grad_norm": 0.670901358127594, "learning_rate": 7.063037067863967e-05, "loss": 5.2405, "num_input_tokens_seen": 692846592, "step": 5286 }, { "epoch": 0.9043924334722667, "grad_norm": 0.666654646396637, "learning_rate": 7.061033653536935e-05, "loss": 5.2376, "num_input_tokens_seen": 693239808, "step": 5289 }, { "epoch": 0.9049054184033344, "grad_norm": 0.8325378894805908, "learning_rate": 7.059031943035043e-05, "loss": 5.175, "num_input_tokens_seen": 693633024, "step": 5292 }, { "epoch": 0.905418403334402, "grad_norm": 0.7429814338684082, "learning_rate": 7.0570319339446e-05, "loss": 5.2228, "num_input_tokens_seen": 694026240, "step": 5295 }, { "epoch": 0.9059313882654697, "grad_norm": 0.7891907095909119, "learning_rate": 7.055033623856699e-05, "loss": 5.2153, "num_input_tokens_seen": 694419456, "step": 5298 }, { "epoch": 0.9064443731965374, "grad_norm": 0.6809067726135254, "learning_rate": 7.053037010367201e-05, "loss": 5.1762, "num_input_tokens_seen": 694812672, "step": 5301 }, { "epoch": 0.906957358127605, "grad_norm": 0.721263587474823, "learning_rate": 7.051042091076731e-05, "loss": 5.2516, "num_input_tokens_seen": 695205888, "step": 5304 }, { "epoch": 0.9074703430586727, "grad_norm": 0.7249355912208557, "learning_rate": 7.049048863590665e-05, "loss": 5.211, "num_input_tokens_seen": 695599104, "step": 5307 }, { "epoch": 0.9079833279897404, "grad_norm": 0.7684010863304138, "learning_rate": 7.047057325519109e-05, "loss": 5.1956, "num_input_tokens_seen": 695992320, "step": 5310 }, { "epoch": 0.9084963129208079, "grad_norm": 0.8806822299957275, "learning_rate": 7.045067474476893e-05, "loss": 5.2286, "num_input_tokens_seen": 696385536, "step": 5313 }, { "epoch": 0.9090092978518756, "grad_norm": 0.8653072714805603, "learning_rate": 7.043079308083562e-05, "loss": 5.1777, "num_input_tokens_seen": 696778752, "step": 5316 }, { "epoch": 0.9095222827829432, "grad_norm": 0.8760643601417542, "learning_rate": 7.041092823963362e-05, "loss": 5.195, "num_input_tokens_seen": 697171968, "step": 5319 }, { "epoch": 0.9100352677140109, "grad_norm": 0.7858259081840515, "learning_rate": 7.03910801974522e-05, "loss": 5.2221, "num_input_tokens_seen": 697565184, "step": 5322 }, { "epoch": 
0.9105482526450785, "grad_norm": 0.7364184856414795, "learning_rate": 7.037124893062746e-05, "loss": 5.1735, "num_input_tokens_seen": 697958400, "step": 5325 }, { "epoch": 0.9110612375761462, "grad_norm": 0.6950361132621765, "learning_rate": 7.03514344155421e-05, "loss": 5.2196, "num_input_tokens_seen": 698351616, "step": 5328 }, { "epoch": 0.9115742225072139, "grad_norm": 0.7385085821151733, "learning_rate": 7.03316366286254e-05, "loss": 5.2556, "num_input_tokens_seen": 698744832, "step": 5331 }, { "epoch": 0.9120872074382815, "grad_norm": 0.6558942198753357, "learning_rate": 7.031185554635294e-05, "loss": 5.2524, "num_input_tokens_seen": 699138048, "step": 5334 }, { "epoch": 0.9126001923693492, "grad_norm": 0.7035802602767944, "learning_rate": 7.029209114524669e-05, "loss": 5.2398, "num_input_tokens_seen": 699531264, "step": 5337 }, { "epoch": 0.9131131773004167, "grad_norm": 0.754648745059967, "learning_rate": 7.027234340187475e-05, "loss": 5.253, "num_input_tokens_seen": 699924480, "step": 5340 }, { "epoch": 0.9136261622314844, "grad_norm": 0.7020716667175293, "learning_rate": 7.025261229285127e-05, "loss": 5.2531, "num_input_tokens_seen": 700317696, "step": 5343 }, { "epoch": 0.9141391471625521, "grad_norm": 0.7005373239517212, "learning_rate": 7.023289779483637e-05, "loss": 5.1856, "num_input_tokens_seen": 700710912, "step": 5346 }, { "epoch": 0.9146521320936197, "grad_norm": 0.8192885518074036, "learning_rate": 7.021319988453594e-05, "loss": 5.2202, "num_input_tokens_seen": 701104128, "step": 5349 }, { "epoch": 0.9151651170246874, "grad_norm": 0.7134819626808167, "learning_rate": 7.019351853870163e-05, "loss": 5.1995, "num_input_tokens_seen": 701497344, "step": 5352 }, { "epoch": 0.915678101955755, "grad_norm": 0.7718667387962341, "learning_rate": 7.017385373413064e-05, "loss": 5.2546, "num_input_tokens_seen": 701890560, "step": 5355 }, { "epoch": 0.9161910868868227, "grad_norm": 0.7580474615097046, "learning_rate": 7.015420544766564e-05, "loss": 5.223, "num_input_tokens_seen": 702283776, "step": 5358 }, { "epoch": 0.9167040718178904, "grad_norm": 0.761403501033783, "learning_rate": 7.013457365619473e-05, "loss": 5.2063, "num_input_tokens_seen": 702676992, "step": 5361 }, { "epoch": 0.917217056748958, "grad_norm": 0.7182543873786926, "learning_rate": 7.01149583366512e-05, "loss": 5.2481, "num_input_tokens_seen": 703070208, "step": 5364 }, { "epoch": 0.9177300416800257, "grad_norm": 0.7789894938468933, "learning_rate": 7.009535946601349e-05, "loss": 5.2004, "num_input_tokens_seen": 703463424, "step": 5367 }, { "epoch": 0.9182430266110932, "grad_norm": 0.698883593082428, "learning_rate": 7.007577702130504e-05, "loss": 5.1961, "num_input_tokens_seen": 703856640, "step": 5370 }, { "epoch": 0.9187560115421609, "grad_norm": 0.7992156744003296, "learning_rate": 7.005621097959424e-05, "loss": 5.1863, "num_input_tokens_seen": 704249856, "step": 5373 }, { "epoch": 0.9192689964732286, "grad_norm": 0.7814815044403076, "learning_rate": 7.003666131799421e-05, "loss": 5.1962, "num_input_tokens_seen": 704643072, "step": 5376 }, { "epoch": 0.9197819814042962, "grad_norm": 0.7636701464653015, "learning_rate": 7.001712801366284e-05, "loss": 5.2015, "num_input_tokens_seen": 705036288, "step": 5379 }, { "epoch": 0.9202949663353639, "grad_norm": 0.7517854571342468, "learning_rate": 6.999761104380251e-05, "loss": 5.2346, "num_input_tokens_seen": 705429504, "step": 5382 }, { "epoch": 0.9208079512664316, "grad_norm": 0.7156343460083008, "learning_rate": 6.99781103856601e-05, "loss": 5.1884, 
"num_input_tokens_seen": 705822720, "step": 5385 }, { "epoch": 0.9213209361974992, "grad_norm": 0.7727795243263245, "learning_rate": 6.995862601652685e-05, "loss": 5.1847, "num_input_tokens_seen": 706215936, "step": 5388 }, { "epoch": 0.9218339211285669, "grad_norm": 0.790640115737915, "learning_rate": 6.993915791373815e-05, "loss": 5.2489, "num_input_tokens_seen": 706609152, "step": 5391 }, { "epoch": 0.9223469060596345, "grad_norm": 0.6869045495986938, "learning_rate": 6.991970605467365e-05, "loss": 5.2265, "num_input_tokens_seen": 707002368, "step": 5394 }, { "epoch": 0.9228598909907022, "grad_norm": 0.6860256791114807, "learning_rate": 6.99002704167569e-05, "loss": 5.1901, "num_input_tokens_seen": 707395584, "step": 5397 }, { "epoch": 0.9233728759217698, "grad_norm": 0.7133844494819641, "learning_rate": 6.988085097745543e-05, "loss": 5.2146, "num_input_tokens_seen": 707788800, "step": 5400 }, { "epoch": 0.9238858608528374, "grad_norm": 0.7109991908073425, "learning_rate": 6.986144771428049e-05, "loss": 5.244, "num_input_tokens_seen": 708182016, "step": 5403 }, { "epoch": 0.9243988457839051, "grad_norm": 0.6793102622032166, "learning_rate": 6.984206060478708e-05, "loss": 5.2256, "num_input_tokens_seen": 708575232, "step": 5406 }, { "epoch": 0.9249118307149727, "grad_norm": 0.6254820823669434, "learning_rate": 6.982268962657377e-05, "loss": 5.2082, "num_input_tokens_seen": 708968448, "step": 5409 }, { "epoch": 0.9254248156460404, "grad_norm": 0.6661543846130371, "learning_rate": 6.980333475728256e-05, "loss": 5.2922, "num_input_tokens_seen": 709361664, "step": 5412 }, { "epoch": 0.9259378005771081, "grad_norm": 0.674060583114624, "learning_rate": 6.978399597459882e-05, "loss": 5.2425, "num_input_tokens_seen": 709754880, "step": 5415 }, { "epoch": 0.9264507855081757, "grad_norm": 0.7253686189651489, "learning_rate": 6.976467325625122e-05, "loss": 5.1947, "num_input_tokens_seen": 710148096, "step": 5418 }, { "epoch": 0.9269637704392434, "grad_norm": 0.6542623043060303, "learning_rate": 6.974536658001151e-05, "loss": 5.1991, "num_input_tokens_seen": 710541312, "step": 5421 }, { "epoch": 0.927476755370311, "grad_norm": 0.7277594208717346, "learning_rate": 6.972607592369453e-05, "loss": 5.2157, "num_input_tokens_seen": 710934528, "step": 5424 }, { "epoch": 0.9279897403013786, "grad_norm": 0.6952804923057556, "learning_rate": 6.970680126515798e-05, "loss": 5.2329, "num_input_tokens_seen": 711327744, "step": 5427 }, { "epoch": 0.9285027252324463, "grad_norm": 0.7163040041923523, "learning_rate": 6.968754258230246e-05, "loss": 5.2398, "num_input_tokens_seen": 711720960, "step": 5430 }, { "epoch": 0.9290157101635139, "grad_norm": 0.6704942584037781, "learning_rate": 6.966829985307124e-05, "loss": 5.1918, "num_input_tokens_seen": 712114176, "step": 5433 }, { "epoch": 0.9295286950945816, "grad_norm": 0.6688148379325867, "learning_rate": 6.964907305545022e-05, "loss": 5.2048, "num_input_tokens_seen": 712507392, "step": 5436 }, { "epoch": 0.9300416800256492, "grad_norm": 0.7081178426742554, "learning_rate": 6.962986216746778e-05, "loss": 5.2602, "num_input_tokens_seen": 712900608, "step": 5439 }, { "epoch": 0.9305546649567169, "grad_norm": 0.7875306606292725, "learning_rate": 6.961066716719472e-05, "loss": 5.2051, "num_input_tokens_seen": 713293824, "step": 5442 }, { "epoch": 0.9310676498877846, "grad_norm": 0.7292640209197998, "learning_rate": 6.959148803274413e-05, "loss": 5.2177, "num_input_tokens_seen": 713687040, "step": 5445 }, { "epoch": 0.9315806348188522, "grad_norm": 0.7992739677429199, 
"learning_rate": 6.957232474227127e-05, "loss": 5.2218, "num_input_tokens_seen": 714080256, "step": 5448 }, { "epoch": 0.9320936197499199, "grad_norm": 0.6518861651420593, "learning_rate": 6.955317727397353e-05, "loss": 5.2443, "num_input_tokens_seen": 714473472, "step": 5451 }, { "epoch": 0.9326066046809875, "grad_norm": 0.6771551966667175, "learning_rate": 6.953404560609022e-05, "loss": 5.1987, "num_input_tokens_seen": 714866688, "step": 5454 }, { "epoch": 0.9331195896120551, "grad_norm": 0.7463414072990417, "learning_rate": 6.951492971690257e-05, "loss": 5.2047, "num_input_tokens_seen": 715259904, "step": 5457 }, { "epoch": 0.9336325745431228, "grad_norm": 0.7123643755912781, "learning_rate": 6.949582958473356e-05, "loss": 5.2367, "num_input_tokens_seen": 715653120, "step": 5460 }, { "epoch": 0.9341455594741904, "grad_norm": 0.8397427201271057, "learning_rate": 6.947674518794787e-05, "loss": 5.2235, "num_input_tokens_seen": 716046336, "step": 5463 }, { "epoch": 0.9346585444052581, "grad_norm": 0.731139600276947, "learning_rate": 6.94576765049517e-05, "loss": 5.2689, "num_input_tokens_seen": 716439552, "step": 5466 }, { "epoch": 0.9351715293363257, "grad_norm": 0.7536250352859497, "learning_rate": 6.943862351419276e-05, "loss": 5.1548, "num_input_tokens_seen": 716832768, "step": 5469 }, { "epoch": 0.9356845142673934, "grad_norm": 0.7753055095672607, "learning_rate": 6.941958619416007e-05, "loss": 5.2435, "num_input_tokens_seen": 717225984, "step": 5472 }, { "epoch": 0.9361974991984611, "grad_norm": 0.677101731300354, "learning_rate": 6.940056452338398e-05, "loss": 5.1863, "num_input_tokens_seen": 717619200, "step": 5475 }, { "epoch": 0.9367104841295287, "grad_norm": 0.730702817440033, "learning_rate": 6.938155848043593e-05, "loss": 5.2294, "num_input_tokens_seen": 718012416, "step": 5478 }, { "epoch": 0.9372234690605964, "grad_norm": 0.7470299601554871, "learning_rate": 6.936256804392845e-05, "loss": 5.1684, "num_input_tokens_seen": 718405632, "step": 5481 }, { "epoch": 0.9377364539916639, "grad_norm": 0.7415927052497864, "learning_rate": 6.934359319251501e-05, "loss": 5.213, "num_input_tokens_seen": 718798848, "step": 5484 }, { "epoch": 0.9382494389227316, "grad_norm": 0.8822851777076721, "learning_rate": 6.932463390488997e-05, "loss": 5.2153, "num_input_tokens_seen": 719192064, "step": 5487 }, { "epoch": 0.9387624238537993, "grad_norm": 0.8007544279098511, "learning_rate": 6.93056901597884e-05, "loss": 5.2219, "num_input_tokens_seen": 719585280, "step": 5490 }, { "epoch": 0.9392754087848669, "grad_norm": 0.7476817965507507, "learning_rate": 6.928676193598603e-05, "loss": 5.2602, "num_input_tokens_seen": 719978496, "step": 5493 }, { "epoch": 0.9397883937159346, "grad_norm": 0.8078888654708862, "learning_rate": 6.926784921229917e-05, "loss": 5.2227, "num_input_tokens_seen": 720371712, "step": 5496 }, { "epoch": 0.9403013786470023, "grad_norm": 0.75888991355896, "learning_rate": 6.924895196758458e-05, "loss": 5.2416, "num_input_tokens_seen": 720764928, "step": 5499 }, { "epoch": 0.9408143635780699, "grad_norm": 0.6974719762802124, "learning_rate": 6.923007018073937e-05, "loss": 5.191, "num_input_tokens_seen": 721158144, "step": 5502 }, { "epoch": 0.9413273485091376, "grad_norm": 0.7393808960914612, "learning_rate": 6.921120383070087e-05, "loss": 5.2472, "num_input_tokens_seen": 721551360, "step": 5505 }, { "epoch": 0.9418403334402052, "grad_norm": 0.7024008631706238, "learning_rate": 6.919235289644663e-05, "loss": 5.2467, "num_input_tokens_seen": 721944576, "step": 5508 }, { "epoch": 
0.9423533183712729, "grad_norm": 0.7399491667747498, "learning_rate": 6.91735173569942e-05, "loss": 5.1734, "num_input_tokens_seen": 722337792, "step": 5511 }, { "epoch": 0.9428663033023404, "grad_norm": 0.6762517690658569, "learning_rate": 6.915469719140114e-05, "loss": 5.2006, "num_input_tokens_seen": 722731008, "step": 5514 }, { "epoch": 0.9433792882334081, "grad_norm": 0.7834249138832092, "learning_rate": 6.913589237876484e-05, "loss": 5.2128, "num_input_tokens_seen": 723124224, "step": 5517 }, { "epoch": 0.9438922731644758, "grad_norm": 0.7544867992401123, "learning_rate": 6.91171028982225e-05, "loss": 5.1778, "num_input_tokens_seen": 723517440, "step": 5520 }, { "epoch": 0.9444052580955434, "grad_norm": 0.7604652643203735, "learning_rate": 6.909832872895093e-05, "loss": 5.2395, "num_input_tokens_seen": 723910656, "step": 5523 }, { "epoch": 0.9449182430266111, "grad_norm": 0.7239798307418823, "learning_rate": 6.907956985016653e-05, "loss": 5.2005, "num_input_tokens_seen": 724303872, "step": 5526 }, { "epoch": 0.9454312279576788, "grad_norm": 0.6755896210670471, "learning_rate": 6.906082624112522e-05, "loss": 5.2482, "num_input_tokens_seen": 724697088, "step": 5529 }, { "epoch": 0.9459442128887464, "grad_norm": 0.6451256275177002, "learning_rate": 6.904209788112224e-05, "loss": 5.2247, "num_input_tokens_seen": 725090304, "step": 5532 }, { "epoch": 0.9464571978198141, "grad_norm": 0.6745946407318115, "learning_rate": 6.902338474949212e-05, "loss": 5.1867, "num_input_tokens_seen": 725483520, "step": 5535 }, { "epoch": 0.9469701827508817, "grad_norm": 0.7276203036308289, "learning_rate": 6.900468682560861e-05, "loss": 5.2158, "num_input_tokens_seen": 725876736, "step": 5538 }, { "epoch": 0.9474831676819493, "grad_norm": 0.7208330035209656, "learning_rate": 6.898600408888455e-05, "loss": 5.252, "num_input_tokens_seen": 726269952, "step": 5541 }, { "epoch": 0.947996152613017, "grad_norm": 0.7684881687164307, "learning_rate": 6.896733651877174e-05, "loss": 5.2424, "num_input_tokens_seen": 726663168, "step": 5544 }, { "epoch": 0.9485091375440846, "grad_norm": 0.6904166340827942, "learning_rate": 6.894868409476089e-05, "loss": 5.1711, "num_input_tokens_seen": 727056384, "step": 5547 }, { "epoch": 0.9490221224751523, "grad_norm": 0.6969854235649109, "learning_rate": 6.893004679638155e-05, "loss": 5.2098, "num_input_tokens_seen": 727449600, "step": 5550 }, { "epoch": 0.9495351074062199, "grad_norm": 0.6778176426887512, "learning_rate": 6.891142460320194e-05, "loss": 5.207, "num_input_tokens_seen": 727842816, "step": 5553 }, { "epoch": 0.9500480923372876, "grad_norm": 0.6988036632537842, "learning_rate": 6.889281749482896e-05, "loss": 5.1607, "num_input_tokens_seen": 728236032, "step": 5556 }, { "epoch": 0.9505610772683553, "grad_norm": 0.6899657845497131, "learning_rate": 6.887422545090792e-05, "loss": 5.2255, "num_input_tokens_seen": 728629248, "step": 5559 }, { "epoch": 0.9510740621994229, "grad_norm": 0.7000394463539124, "learning_rate": 6.885564845112269e-05, "loss": 5.234, "num_input_tokens_seen": 729022464, "step": 5562 }, { "epoch": 0.9515870471304906, "grad_norm": 0.7130152583122253, "learning_rate": 6.88370864751954e-05, "loss": 5.2547, "num_input_tokens_seen": 729415680, "step": 5565 }, { "epoch": 0.9521000320615582, "grad_norm": 0.7220883369445801, "learning_rate": 6.881853950288646e-05, "loss": 5.1583, "num_input_tokens_seen": 729808896, "step": 5568 }, { "epoch": 0.9526130169926258, "grad_norm": 0.700537919998169, "learning_rate": 6.88000075139944e-05, "loss": 5.2015, 
"num_input_tokens_seen": 730202112, "step": 5571 }, { "epoch": 0.9531260019236935, "grad_norm": 0.8122661113739014, "learning_rate": 6.878149048835583e-05, "loss": 5.1881, "num_input_tokens_seen": 730595328, "step": 5574 }, { "epoch": 0.9536389868547611, "grad_norm": 0.6667414307594299, "learning_rate": 6.876298840584535e-05, "loss": 5.1502, "num_input_tokens_seen": 730988544, "step": 5577 }, { "epoch": 0.9541519717858288, "grad_norm": 0.7198598980903625, "learning_rate": 6.874450124637534e-05, "loss": 5.2023, "num_input_tokens_seen": 731381760, "step": 5580 }, { "epoch": 0.9546649567168964, "grad_norm": 0.7227144837379456, "learning_rate": 6.872602898989611e-05, "loss": 5.1746, "num_input_tokens_seen": 731774976, "step": 5583 }, { "epoch": 0.9551779416479641, "grad_norm": 0.7212773561477661, "learning_rate": 6.870757161639557e-05, "loss": 5.2028, "num_input_tokens_seen": 732168192, "step": 5586 }, { "epoch": 0.9556909265790318, "grad_norm": 0.7933756113052368, "learning_rate": 6.868912910589922e-05, "loss": 5.1419, "num_input_tokens_seen": 732561408, "step": 5589 }, { "epoch": 0.9562039115100994, "grad_norm": 0.8172382712364197, "learning_rate": 6.867070143847011e-05, "loss": 5.1742, "num_input_tokens_seen": 732954624, "step": 5592 }, { "epoch": 0.9567168964411671, "grad_norm": 0.7590979337692261, "learning_rate": 6.86522885942087e-05, "loss": 5.2606, "num_input_tokens_seen": 733347840, "step": 5595 }, { "epoch": 0.9572298813722346, "grad_norm": 0.8111834526062012, "learning_rate": 6.86338905532528e-05, "loss": 5.2107, "num_input_tokens_seen": 733741056, "step": 5598 }, { "epoch": 0.9575718713262797, "eval_accuracy": 0.18844650708353688, "eval_loss": 5.67447566986084, "eval_runtime": 111.5849, "eval_samples_per_second": 2.689, "eval_steps_per_second": 1.344, "num_input_tokens_seen": 734003200, "step": 5600 }, { "epoch": 0.9577428663033023, "grad_norm": 0.7701963782310486, "learning_rate": 6.861550729577741e-05, "loss": 5.2339, "num_input_tokens_seen": 734134272, "step": 5601 }, { "epoch": 0.95825585123437, "grad_norm": 0.7521535754203796, "learning_rate": 6.85971388019947e-05, "loss": 5.1737, "num_input_tokens_seen": 734527488, "step": 5604 }, { "epoch": 0.9587688361654376, "grad_norm": 0.8248022794723511, "learning_rate": 6.857878505215393e-05, "loss": 5.202, "num_input_tokens_seen": 734920704, "step": 5607 }, { "epoch": 0.9592818210965053, "grad_norm": 0.6702877879142761, "learning_rate": 6.856044602654132e-05, "loss": 5.1872, "num_input_tokens_seen": 735313920, "step": 5610 }, { "epoch": 0.9597948060275729, "grad_norm": 0.6633716225624084, "learning_rate": 6.854212170547997e-05, "loss": 5.2188, "num_input_tokens_seen": 735707136, "step": 5613 }, { "epoch": 0.9603077909586406, "grad_norm": 0.7093427181243896, "learning_rate": 6.852381206932974e-05, "loss": 5.1756, "num_input_tokens_seen": 736100352, "step": 5616 }, { "epoch": 0.9608207758897083, "grad_norm": 0.7626616358757019, "learning_rate": 6.850551709848722e-05, "loss": 5.1181, "num_input_tokens_seen": 736493568, "step": 5619 }, { "epoch": 0.9613337608207759, "grad_norm": 0.7711624503135681, "learning_rate": 6.848723677338564e-05, "loss": 5.1677, "num_input_tokens_seen": 736886784, "step": 5622 }, { "epoch": 0.9618467457518436, "grad_norm": 0.7787635326385498, "learning_rate": 6.846897107449475e-05, "loss": 5.1462, "num_input_tokens_seen": 737280000, "step": 5625 }, { "epoch": 0.9623597306829111, "grad_norm": 0.7611491084098816, "learning_rate": 6.845071998232071e-05, "loss": 5.2339, "num_input_tokens_seen": 737673216, "step": 5628 
}, { "epoch": 0.9628727156139788, "grad_norm": 0.7201474905014038, "learning_rate": 6.843248347740607e-05, "loss": 5.2319, "num_input_tokens_seen": 738066432, "step": 5631 }, { "epoch": 0.9633857005450465, "grad_norm": 0.6865774393081665, "learning_rate": 6.841426154032964e-05, "loss": 5.1631, "num_input_tokens_seen": 738459648, "step": 5634 }, { "epoch": 0.9638986854761141, "grad_norm": 0.7156822681427002, "learning_rate": 6.839605415170637e-05, "loss": 5.2378, "num_input_tokens_seen": 738852864, "step": 5637 }, { "epoch": 0.9644116704071818, "grad_norm": 0.7458356618881226, "learning_rate": 6.837786129218738e-05, "loss": 5.1746, "num_input_tokens_seen": 739246080, "step": 5640 }, { "epoch": 0.9649246553382494, "grad_norm": 0.7389182448387146, "learning_rate": 6.835968294245973e-05, "loss": 5.1859, "num_input_tokens_seen": 739639296, "step": 5643 }, { "epoch": 0.9654376402693171, "grad_norm": 0.7108219861984253, "learning_rate": 6.834151908324644e-05, "loss": 5.1834, "num_input_tokens_seen": 740032512, "step": 5646 }, { "epoch": 0.9659506252003848, "grad_norm": 0.8195409178733826, "learning_rate": 6.832336969530635e-05, "loss": 5.1927, "num_input_tokens_seen": 740425728, "step": 5649 }, { "epoch": 0.9664636101314524, "grad_norm": 0.7212623953819275, "learning_rate": 6.830523475943408e-05, "loss": 5.1814, "num_input_tokens_seen": 740818944, "step": 5652 }, { "epoch": 0.96697659506252, "grad_norm": 0.6641290187835693, "learning_rate": 6.828711425645984e-05, "loss": 5.16, "num_input_tokens_seen": 741212160, "step": 5655 }, { "epoch": 0.9674895799935876, "grad_norm": 0.7765944004058838, "learning_rate": 6.826900816724949e-05, "loss": 5.1787, "num_input_tokens_seen": 741605376, "step": 5658 }, { "epoch": 0.9680025649246553, "grad_norm": 0.6821857690811157, "learning_rate": 6.825091647270437e-05, "loss": 5.188, "num_input_tokens_seen": 741998592, "step": 5661 }, { "epoch": 0.968515549855723, "grad_norm": 0.6633355617523193, "learning_rate": 6.823283915376123e-05, "loss": 5.2485, "num_input_tokens_seen": 742391808, "step": 5664 }, { "epoch": 0.9690285347867906, "grad_norm": 0.6869776844978333, "learning_rate": 6.821477619139209e-05, "loss": 5.222, "num_input_tokens_seen": 742785024, "step": 5667 }, { "epoch": 0.9695415197178583, "grad_norm": 0.7537380456924438, "learning_rate": 6.819672756660432e-05, "loss": 5.2192, "num_input_tokens_seen": 743178240, "step": 5670 }, { "epoch": 0.970054504648926, "grad_norm": 0.775252640247345, "learning_rate": 6.817869326044036e-05, "loss": 5.15, "num_input_tokens_seen": 743571456, "step": 5673 }, { "epoch": 0.9705674895799936, "grad_norm": 0.7099266648292542, "learning_rate": 6.816067325397775e-05, "loss": 5.1486, "num_input_tokens_seen": 743964672, "step": 5676 }, { "epoch": 0.9710804745110613, "grad_norm": 0.7244137525558472, "learning_rate": 6.814266752832903e-05, "loss": 5.1545, "num_input_tokens_seen": 744357888, "step": 5679 }, { "epoch": 0.9715934594421289, "grad_norm": 0.6794865727424622, "learning_rate": 6.812467606464162e-05, "loss": 5.2089, "num_input_tokens_seen": 744751104, "step": 5682 }, { "epoch": 0.9721064443731965, "grad_norm": 0.7153213024139404, "learning_rate": 6.81066988440978e-05, "loss": 5.2345, "num_input_tokens_seen": 745144320, "step": 5685 }, { "epoch": 0.9726194293042641, "grad_norm": 0.7132147550582886, "learning_rate": 6.808873584791457e-05, "loss": 5.2006, "num_input_tokens_seen": 745537536, "step": 5688 }, { "epoch": 0.9731324142353318, "grad_norm": 0.6356132626533508, "learning_rate": 6.807078705734362e-05, "loss": 5.203, 
"num_input_tokens_seen": 745930752, "step": 5691 }, { "epoch": 0.9736453991663995, "grad_norm": 0.6981724500656128, "learning_rate": 6.805285245367116e-05, "loss": 5.2071, "num_input_tokens_seen": 746323968, "step": 5694 }, { "epoch": 0.9741583840974671, "grad_norm": 0.6996302604675293, "learning_rate": 6.803493201821794e-05, "loss": 5.2533, "num_input_tokens_seen": 746717184, "step": 5697 }, { "epoch": 0.9746713690285348, "grad_norm": 0.8056071996688843, "learning_rate": 6.801702573233913e-05, "loss": 5.1897, "num_input_tokens_seen": 747110400, "step": 5700 }, { "epoch": 0.9751843539596025, "grad_norm": 0.7028996348381042, "learning_rate": 6.79991335774242e-05, "loss": 5.1624, "num_input_tokens_seen": 747503616, "step": 5703 }, { "epoch": 0.9756973388906701, "grad_norm": 0.8844038248062134, "learning_rate": 6.798125553489686e-05, "loss": 5.1886, "num_input_tokens_seen": 747896832, "step": 5706 }, { "epoch": 0.9762103238217378, "grad_norm": 0.7125393152236938, "learning_rate": 6.796339158621506e-05, "loss": 5.1686, "num_input_tokens_seen": 748290048, "step": 5709 }, { "epoch": 0.9767233087528054, "grad_norm": 0.7706892490386963, "learning_rate": 6.794554171287077e-05, "loss": 5.1821, "num_input_tokens_seen": 748683264, "step": 5712 }, { "epoch": 0.977236293683873, "grad_norm": 0.7665553689002991, "learning_rate": 6.792770589638998e-05, "loss": 5.1875, "num_input_tokens_seen": 749076480, "step": 5715 }, { "epoch": 0.9777492786149407, "grad_norm": 0.822076141834259, "learning_rate": 6.790988411833267e-05, "loss": 5.1802, "num_input_tokens_seen": 749469696, "step": 5718 }, { "epoch": 0.9782622635460083, "grad_norm": 0.734258770942688, "learning_rate": 6.789207636029258e-05, "loss": 5.1485, "num_input_tokens_seen": 749862912, "step": 5721 }, { "epoch": 0.978775248477076, "grad_norm": 0.851309597492218, "learning_rate": 6.787428260389725e-05, "loss": 5.2038, "num_input_tokens_seen": 750256128, "step": 5724 }, { "epoch": 0.9792882334081436, "grad_norm": 0.8018383979797363, "learning_rate": 6.785650283080797e-05, "loss": 5.1956, "num_input_tokens_seen": 750649344, "step": 5727 }, { "epoch": 0.9798012183392113, "grad_norm": 0.8261561393737793, "learning_rate": 6.78387370227195e-05, "loss": 5.2167, "num_input_tokens_seen": 751042560, "step": 5730 }, { "epoch": 0.980314203270279, "grad_norm": 0.7677420377731323, "learning_rate": 6.782098516136031e-05, "loss": 5.1776, "num_input_tokens_seen": 751435776, "step": 5733 }, { "epoch": 0.9808271882013466, "grad_norm": 0.7804604172706604, "learning_rate": 6.780324722849218e-05, "loss": 5.2386, "num_input_tokens_seen": 751828992, "step": 5736 }, { "epoch": 0.9813401731324143, "grad_norm": 0.8051115870475769, "learning_rate": 6.778552320591031e-05, "loss": 5.1832, "num_input_tokens_seen": 752222208, "step": 5739 }, { "epoch": 0.9818531580634818, "grad_norm": 0.7745217680931091, "learning_rate": 6.776781307544323e-05, "loss": 5.1991, "num_input_tokens_seen": 752615424, "step": 5742 }, { "epoch": 0.9823661429945495, "grad_norm": 0.7418707609176636, "learning_rate": 6.775011681895259e-05, "loss": 5.1977, "num_input_tokens_seen": 753008640, "step": 5745 }, { "epoch": 0.9828791279256172, "grad_norm": 0.7361640930175781, "learning_rate": 6.773243441833328e-05, "loss": 5.1713, "num_input_tokens_seen": 753401856, "step": 5748 }, { "epoch": 0.9833921128566848, "grad_norm": 0.7707425355911255, "learning_rate": 6.771476585551323e-05, "loss": 5.1812, "num_input_tokens_seen": 753795072, "step": 5751 }, { "epoch": 0.9839050977877525, "grad_norm": 0.7603244781494141, 
"learning_rate": 6.769711111245329e-05, "loss": 5.2091, "num_input_tokens_seen": 754188288, "step": 5754 }, { "epoch": 0.9844180827188201, "grad_norm": 0.771886944770813, "learning_rate": 6.767947017114727e-05, "loss": 5.1886, "num_input_tokens_seen": 754581504, "step": 5757 }, { "epoch": 0.9849310676498878, "grad_norm": 0.8924115896224976, "learning_rate": 6.766184301362177e-05, "loss": 5.1916, "num_input_tokens_seen": 754974720, "step": 5760 }, { "epoch": 0.9854440525809555, "grad_norm": 0.7867364287376404, "learning_rate": 6.764422962193624e-05, "loss": 5.1989, "num_input_tokens_seen": 755367936, "step": 5763 }, { "epoch": 0.9859570375120231, "grad_norm": 0.809303343296051, "learning_rate": 6.762662997818266e-05, "loss": 5.2264, "num_input_tokens_seen": 755761152, "step": 5766 }, { "epoch": 0.9864700224430908, "grad_norm": 0.7048402428627014, "learning_rate": 6.760904406448573e-05, "loss": 5.2237, "num_input_tokens_seen": 756154368, "step": 5769 }, { "epoch": 0.9869830073741583, "grad_norm": 0.7238898277282715, "learning_rate": 6.759147186300257e-05, "loss": 5.2013, "num_input_tokens_seen": 756547584, "step": 5772 }, { "epoch": 0.987495992305226, "grad_norm": 0.7172737121582031, "learning_rate": 6.757391335592282e-05, "loss": 5.1839, "num_input_tokens_seen": 756940800, "step": 5775 }, { "epoch": 0.9880089772362937, "grad_norm": 0.7212975025177002, "learning_rate": 6.755636852546848e-05, "loss": 5.1696, "num_input_tokens_seen": 757334016, "step": 5778 }, { "epoch": 0.9885219621673613, "grad_norm": 0.754808247089386, "learning_rate": 6.753883735389383e-05, "loss": 5.2154, "num_input_tokens_seen": 757727232, "step": 5781 }, { "epoch": 0.989034947098429, "grad_norm": 0.782254159450531, "learning_rate": 6.752131982348533e-05, "loss": 5.1923, "num_input_tokens_seen": 758120448, "step": 5784 }, { "epoch": 0.9895479320294966, "grad_norm": 0.7205175757408142, "learning_rate": 6.750381591656167e-05, "loss": 5.2076, "num_input_tokens_seen": 758513664, "step": 5787 }, { "epoch": 0.9900609169605643, "grad_norm": 0.7384617328643799, "learning_rate": 6.748632561547353e-05, "loss": 5.1796, "num_input_tokens_seen": 758906880, "step": 5790 }, { "epoch": 0.990573901891632, "grad_norm": 0.7156171202659607, "learning_rate": 6.746884890260363e-05, "loss": 5.1877, "num_input_tokens_seen": 759300096, "step": 5793 }, { "epoch": 0.9910868868226996, "grad_norm": 0.8681517243385315, "learning_rate": 6.745138576036662e-05, "loss": 5.1948, "num_input_tokens_seen": 759693312, "step": 5796 }, { "epoch": 0.9915998717537672, "grad_norm": 0.7191133499145508, "learning_rate": 6.743393617120892e-05, "loss": 5.2031, "num_input_tokens_seen": 760086528, "step": 5799 }, { "epoch": 0.9921128566848348, "grad_norm": 0.8175661563873291, "learning_rate": 6.741650011760882e-05, "loss": 5.1437, "num_input_tokens_seen": 760479744, "step": 5802 }, { "epoch": 0.9926258416159025, "grad_norm": 0.7410142421722412, "learning_rate": 6.739907758207622e-05, "loss": 5.1831, "num_input_tokens_seen": 760872960, "step": 5805 }, { "epoch": 0.9931388265469702, "grad_norm": 0.8488388061523438, "learning_rate": 6.73816685471527e-05, "loss": 5.1424, "num_input_tokens_seen": 761266176, "step": 5808 }, { "epoch": 0.9936518114780378, "grad_norm": 0.7565668821334839, "learning_rate": 6.736427299541137e-05, "loss": 5.1853, "num_input_tokens_seen": 761659392, "step": 5811 }, { "epoch": 0.9941647964091055, "grad_norm": 0.8777074217796326, "learning_rate": 6.734689090945682e-05, "loss": 5.167, "num_input_tokens_seen": 762052608, "step": 5814 }, { "epoch": 
0.9946777813401732, "grad_norm": 0.8720703125, "learning_rate": 6.732952227192505e-05, "loss": 5.1915, "num_input_tokens_seen": 762445824, "step": 5817 }, { "epoch": 0.9951907662712408, "grad_norm": 0.8000132441520691, "learning_rate": 6.731216706548339e-05, "loss": 5.1721, "num_input_tokens_seen": 762839040, "step": 5820 }, { "epoch": 0.9957037512023085, "grad_norm": 0.733505368232727, "learning_rate": 6.729482527283039e-05, "loss": 5.2092, "num_input_tokens_seen": 763232256, "step": 5823 }, { "epoch": 0.9962167361333761, "grad_norm": 0.7550384402275085, "learning_rate": 6.727749687669586e-05, "loss": 5.1846, "num_input_tokens_seen": 763625472, "step": 5826 }, { "epoch": 0.9967297210644437, "grad_norm": 0.8135012984275818, "learning_rate": 6.726018185984064e-05, "loss": 5.215, "num_input_tokens_seen": 764018688, "step": 5829 }, { "epoch": 0.9972427059955113, "grad_norm": 0.7134667634963989, "learning_rate": 6.724288020505667e-05, "loss": 5.1611, "num_input_tokens_seen": 764411904, "step": 5832 }, { "epoch": 0.997755690926579, "grad_norm": 0.7443891167640686, "learning_rate": 6.722559189516687e-05, "loss": 5.1741, "num_input_tokens_seen": 764805120, "step": 5835 }, { "epoch": 0.9982686758576467, "grad_norm": 0.6709545850753784, "learning_rate": 6.720831691302501e-05, "loss": 5.1677, "num_input_tokens_seen": 765198336, "step": 5838 }, { "epoch": 0.9987816607887143, "grad_norm": 0.7193747758865356, "learning_rate": 6.71910552415157e-05, "loss": 5.2325, "num_input_tokens_seen": 765591552, "step": 5841 }, { "epoch": 0.999294645719782, "grad_norm": 0.7146051526069641, "learning_rate": 6.71738068635543e-05, "loss": 5.1577, "num_input_tokens_seen": 765984768, "step": 5844 }, { "epoch": 0.9998076306508497, "grad_norm": 0.6998146772384644, "learning_rate": 6.715657176208689e-05, "loss": 5.2431, "num_input_tokens_seen": 766377984, "step": 5847 }, { "epoch": 0.9999786256278722, "num_input_tokens_seen": 766509056, "step": 5848, "total_flos": 4.708536848052388e+17, "train_loss": 5.594264277028972, "train_runtime": 134120.2101, "train_samples_per_second": 2.791, "train_steps_per_second": 0.044 } ], "logging_steps": 3, "max_steps": 5848, "num_input_tokens_seen": 766509056, "num_train_epochs": 1, "save_steps": 100, "total_flos": 4.708536848052388e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }
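
The state above ends with the run's summary fields (final train_loss 5.594, 5848 steps over one epoch). As an aside, a minimal sketch for inspecting these recorded metrics, assuming the JSON above is saved as `trainer_state.json` (a hypothetical filename) in the standard Hugging Face Trainer layout with a top-level "log_history" list, and that matplotlib is available; this script is an illustration appended after the document, not part of the trainer state itself:

import json

import matplotlib.pyplot as plt

# Load the trainer state (assumed filename; adjust to your checkpoint dir).
with open("trainer_state.json") as f:
    state = json.load(f)

train_steps, train_losses = [], []
eval_steps, eval_losses = [], []
for entry in state["log_history"]:
    if "loss" in entry:       # periodic training log (every `logging_steps`)
        train_steps.append(entry["step"])
        train_losses.append(entry["loss"])
    if "eval_loss" in entry:  # evaluation log (every `eval_steps`)
        eval_steps.append(entry["step"])
        eval_losses.append(entry["eval_loss"])

# Plot the training curve with eval checkpoints overlaid.
plt.plot(train_steps, train_losses, label="train loss")
plt.scatter(eval_steps, eval_losses, color="red", label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curve.png")

Run against the log above, this would show train loss falling from roughly 7.75 at the first step to about 5.2 by step 5848, with the eval-loss points (e.g. 5.699 at step 5200, 5.674 at step 5600) sitting slightly above the train curve.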