diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15763 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999786256278722, + "eval_steps": 400, + "global_step": 5848, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005129849310676499, + "grad_norm": 2.118435859680176, + "learning_rate": 3.0716723549488053e-06, + "loss": 7.7542, + "num_input_tokens_seen": 393216, + "step": 3 + }, + { + "epoch": 0.0010259698621352998, + "grad_norm": 1.6133321523666382, + "learning_rate": 6.1433447098976105e-06, + "loss": 7.78, + "num_input_tokens_seen": 786432, + "step": 6 + }, + { + "epoch": 0.0015389547932029496, + "grad_norm": 1.2574195861816406, + "learning_rate": 9.215017064846415e-06, + "loss": 7.7144, + "num_input_tokens_seen": 1179648, + "step": 9 + }, + { + "epoch": 0.0020519397242705997, + "grad_norm": 1.3165267705917358, + "learning_rate": 1.2286689419795221e-05, + "loss": 7.656, + "num_input_tokens_seen": 1572864, + "step": 12 + }, + { + "epoch": 0.0025649246553382495, + "grad_norm": 1.064561367034912, + "learning_rate": 1.5358361774744027e-05, + "loss": 7.662, + "num_input_tokens_seen": 1966080, + "step": 15 + }, + { + "epoch": 0.0030779095864058993, + "grad_norm": 1.0571919679641724, + "learning_rate": 1.843003412969283e-05, + "loss": 7.6047, + "num_input_tokens_seen": 2359296, + "step": 18 + }, + { + "epoch": 0.003590894517473549, + "grad_norm": 0.8261664509773254, + "learning_rate": 2.1501706484641635e-05, + "loss": 7.5477, + "num_input_tokens_seen": 2752512, + "step": 21 + }, + { + "epoch": 0.004103879448541199, + "grad_norm": 0.777800977230072, + "learning_rate": 2.4573378839590442e-05, + "loss": 7.5543, + "num_input_tokens_seen": 3145728, + "step": 24 + }, + { + "epoch": 0.004616864379608849, + "grad_norm": 0.6865202188491821, + "learning_rate": 2.7645051194539246e-05, + "loss": 7.4878, + "num_input_tokens_seen": 3538944, + "step": 27 + }, + { + "epoch": 0.005129849310676499, + "grad_norm": 0.694611132144928, + "learning_rate": 3.0716723549488054e-05, + "loss": 7.4608, + "num_input_tokens_seen": 3932160, + "step": 30 + }, + { + "epoch": 0.005642834241744148, + "grad_norm": 0.6728237271308899, + "learning_rate": 3.3788395904436854e-05, + "loss": 7.4121, + "num_input_tokens_seen": 4325376, + "step": 33 + }, + { + "epoch": 0.006155819172811799, + "grad_norm": 0.6782873868942261, + "learning_rate": 3.686006825938566e-05, + "loss": 7.4209, + "num_input_tokens_seen": 4718592, + "step": 36 + }, + { + "epoch": 0.006668804103879449, + "grad_norm": 0.6442039012908936, + "learning_rate": 3.993174061433447e-05, + "loss": 7.3416, + "num_input_tokens_seen": 5111808, + "step": 39 + }, + { + "epoch": 0.007181789034947098, + "grad_norm": 0.6313900947570801, + "learning_rate": 4.300341296928327e-05, + "loss": 7.3897, + "num_input_tokens_seen": 5505024, + "step": 42 + }, + { + "epoch": 0.0076947739660147485, + "grad_norm": 0.6429070830345154, + "learning_rate": 4.6075085324232084e-05, + "loss": 7.3358, + "num_input_tokens_seen": 5898240, + "step": 45 + }, + { + "epoch": 0.008207758897082399, + "grad_norm": 0.59568852186203, + "learning_rate": 4.9146757679180884e-05, + "loss": 7.2775, + "num_input_tokens_seen": 6291456, + "step": 48 + }, + { + "epoch": 0.008720743828150048, + "grad_norm": 0.6876631379127502, + "learning_rate": 5.221843003412969e-05, + "loss": 7.261, + "num_input_tokens_seen": 6684672, + "step": 51 + }, + { + "epoch": 0.009233728759217697, + "grad_norm": 0.6824859380722046, + "learning_rate": 5.529010238907849e-05, + "loss": 7.2941, + "num_input_tokens_seen": 7077888, + "step": 54 + }, + { + "epoch": 0.009746713690285349, + "grad_norm": 0.7125466465950012, + "learning_rate": 5.83617747440273e-05, + "loss": 7.23, + "num_input_tokens_seen": 7471104, + "step": 57 + }, + { + "epoch": 0.010259698621352998, + "grad_norm": 0.7616642713546753, + "learning_rate": 6.143344709897611e-05, + "loss": 7.1531, + "num_input_tokens_seen": 7864320, + "step": 60 + }, + { + "epoch": 0.010772683552420647, + "grad_norm": 0.7782993912696838, + "learning_rate": 6.450511945392491e-05, + "loss": 7.158, + "num_input_tokens_seen": 8257536, + "step": 63 + }, + { + "epoch": 0.011285668483488297, + "grad_norm": 0.7161626815795898, + "learning_rate": 6.757679180887371e-05, + "loss": 7.1733, + "num_input_tokens_seen": 8650752, + "step": 66 + }, + { + "epoch": 0.011798653414555948, + "grad_norm": 0.6889225244522095, + "learning_rate": 7.064846416382252e-05, + "loss": 7.1141, + "num_input_tokens_seen": 9043968, + "step": 69 + }, + { + "epoch": 0.012311638345623597, + "grad_norm": 0.7553271055221558, + "learning_rate": 7.372013651877132e-05, + "loss": 7.1584, + "num_input_tokens_seen": 9437184, + "step": 72 + }, + { + "epoch": 0.012824623276691247, + "grad_norm": 0.6760936975479126, + "learning_rate": 7.679180887372012e-05, + "loss": 7.0536, + "num_input_tokens_seen": 9830400, + "step": 75 + }, + { + "epoch": 0.013337608207758898, + "grad_norm": 0.6408318281173706, + "learning_rate": 7.986348122866894e-05, + "loss": 7.0168, + "num_input_tokens_seen": 10223616, + "step": 78 + }, + { + "epoch": 0.013850593138826547, + "grad_norm": 0.679551899433136, + "learning_rate": 8.293515358361774e-05, + "loss": 7.0783, + "num_input_tokens_seen": 10616832, + "step": 81 + }, + { + "epoch": 0.014363578069894196, + "grad_norm": 0.7019697427749634, + "learning_rate": 8.600682593856654e-05, + "loss": 7.1219, + "num_input_tokens_seen": 11010048, + "step": 84 + }, + { + "epoch": 0.014876563000961848, + "grad_norm": 0.7888438701629639, + "learning_rate": 8.907849829351534e-05, + "loss": 7.0021, + "num_input_tokens_seen": 11403264, + "step": 87 + }, + { + "epoch": 0.015389547932029497, + "grad_norm": 0.8103983998298645, + "learning_rate": 9.215017064846417e-05, + "loss": 7.0218, + "num_input_tokens_seen": 11796480, + "step": 90 + }, + { + "epoch": 0.015902532863097146, + "grad_norm": 0.8136906027793884, + "learning_rate": 9.522184300341297e-05, + "loss": 6.9954, + "num_input_tokens_seen": 12189696, + "step": 93 + }, + { + "epoch": 0.016415517794164797, + "grad_norm": 0.8006098866462708, + "learning_rate": 9.829351535836177e-05, + "loss": 7.0089, + "num_input_tokens_seen": 12582912, + "step": 96 + }, + { + "epoch": 0.016928502725232445, + "grad_norm": 0.7752631902694702, + "learning_rate": 0.00010136518771331057, + "loss": 7.0756, + "num_input_tokens_seen": 12976128, + "step": 99 + }, + { + "epoch": 0.017441487656300096, + "grad_norm": 0.8363745212554932, + "learning_rate": 0.00010443686006825938, + "loss": 6.9849, + "num_input_tokens_seen": 13369344, + "step": 102 + }, + { + "epoch": 0.017954472587367747, + "grad_norm": 0.9236883521080017, + "learning_rate": 0.00010750853242320818, + "loss": 6.9253, + "num_input_tokens_seen": 13762560, + "step": 105 + }, + { + "epoch": 0.018467457518435395, + "grad_norm": 0.993817925453186, + "learning_rate": 0.00011058020477815698, + "loss": 7.0273, + "num_input_tokens_seen": 14155776, + "step": 108 + }, + { + "epoch": 0.018980442449503046, + "grad_norm": 0.9351868033409119, + "learning_rate": 0.00011365187713310579, + "loss": 6.9702, + "num_input_tokens_seen": 14548992, + "step": 111 + }, + { + "epoch": 0.019493427380570697, + "grad_norm": 0.8399161696434021, + "learning_rate": 0.0001167235494880546, + "loss": 6.9723, + "num_input_tokens_seen": 14942208, + "step": 114 + }, + { + "epoch": 0.020006412311638345, + "grad_norm": 0.9522429704666138, + "learning_rate": 0.0001197952218430034, + "loss": 6.9254, + "num_input_tokens_seen": 15335424, + "step": 117 + }, + { + "epoch": 0.020519397242705996, + "grad_norm": 1.0010826587677002, + "learning_rate": 0.00012286689419795221, + "loss": 6.8872, + "num_input_tokens_seen": 15728640, + "step": 120 + }, + { + "epoch": 0.021032382173773647, + "grad_norm": 0.8947874903678894, + "learning_rate": 0.000125938566552901, + "loss": 6.9628, + "num_input_tokens_seen": 16121856, + "step": 123 + }, + { + "epoch": 0.021545367104841295, + "grad_norm": 1.0230917930603027, + "learning_rate": 0.00012901023890784982, + "loss": 6.8417, + "num_input_tokens_seen": 16515072, + "step": 126 + }, + { + "epoch": 0.022058352035908946, + "grad_norm": 0.9666141271591187, + "learning_rate": 0.00013208191126279863, + "loss": 6.9335, + "num_input_tokens_seen": 16908288, + "step": 129 + }, + { + "epoch": 0.022571336966976593, + "grad_norm": 1.1200824975967407, + "learning_rate": 0.00013515358361774742, + "loss": 6.8832, + "num_input_tokens_seen": 17301504, + "step": 132 + }, + { + "epoch": 0.023084321898044245, + "grad_norm": 0.9717016220092773, + "learning_rate": 0.00013822525597269623, + "loss": 6.8967, + "num_input_tokens_seen": 17694720, + "step": 135 + }, + { + "epoch": 0.023597306829111896, + "grad_norm": 1.024543285369873, + "learning_rate": 0.00014129692832764505, + "loss": 6.8902, + "num_input_tokens_seen": 18087936, + "step": 138 + }, + { + "epoch": 0.024110291760179543, + "grad_norm": 0.9085791707038879, + "learning_rate": 0.00014436860068259383, + "loss": 6.8873, + "num_input_tokens_seen": 18481152, + "step": 141 + }, + { + "epoch": 0.024623276691247194, + "grad_norm": 0.8841372132301331, + "learning_rate": 0.00014744027303754265, + "loss": 6.8524, + "num_input_tokens_seen": 18874368, + "step": 144 + }, + { + "epoch": 0.025136261622314846, + "grad_norm": 1.0399245023727417, + "learning_rate": 0.00015051194539249146, + "loss": 6.8402, + "num_input_tokens_seen": 19267584, + "step": 147 + }, + { + "epoch": 0.025649246553382493, + "grad_norm": 1.4088637828826904, + "learning_rate": 0.00015358361774744025, + "loss": 6.8282, + "num_input_tokens_seen": 19660800, + "step": 150 + }, + { + "epoch": 0.026162231484450144, + "grad_norm": 1.1549632549285889, + "learning_rate": 0.00015665529010238906, + "loss": 6.8356, + "num_input_tokens_seen": 20054016, + "step": 153 + }, + { + "epoch": 0.026675216415517795, + "grad_norm": 1.6987155675888062, + "learning_rate": 0.00015972696245733788, + "loss": 6.787, + "num_input_tokens_seen": 20447232, + "step": 156 + }, + { + "epoch": 0.027188201346585443, + "grad_norm": 1.1561607122421265, + "learning_rate": 0.00016279863481228666, + "loss": 6.7794, + "num_input_tokens_seen": 20840448, + "step": 159 + }, + { + "epoch": 0.027701186277653094, + "grad_norm": 1.8383941650390625, + "learning_rate": 0.00016587030716723548, + "loss": 6.7788, + "num_input_tokens_seen": 21233664, + "step": 162 + }, + { + "epoch": 0.028214171208720745, + "grad_norm": 1.4902769327163696, + "learning_rate": 0.0001689419795221843, + "loss": 6.7697, + "num_input_tokens_seen": 21626880, + "step": 165 + }, + { + "epoch": 0.028727156139788393, + "grad_norm": 1.2899839878082275, + "learning_rate": 0.00017201365187713308, + "loss": 6.8425, + "num_input_tokens_seen": 22020096, + "step": 168 + }, + { + "epoch": 0.029240141070856044, + "grad_norm": 1.4876312017440796, + "learning_rate": 0.0001750853242320819, + "loss": 6.776, + "num_input_tokens_seen": 22413312, + "step": 171 + }, + { + "epoch": 0.029753126001923695, + "grad_norm": 1.0720467567443848, + "learning_rate": 0.00017815699658703068, + "loss": 6.7536, + "num_input_tokens_seen": 22806528, + "step": 174 + }, + { + "epoch": 0.030266110932991343, + "grad_norm": 1.0955810546875, + "learning_rate": 0.0001812286689419795, + "loss": 6.7726, + "num_input_tokens_seen": 23199744, + "step": 177 + }, + { + "epoch": 0.030779095864058994, + "grad_norm": 1.5811485052108765, + "learning_rate": 0.00018430034129692833, + "loss": 6.7334, + "num_input_tokens_seen": 23592960, + "step": 180 + }, + { + "epoch": 0.031292080795126645, + "grad_norm": 1.4859919548034668, + "learning_rate": 0.0001873720136518771, + "loss": 6.7376, + "num_input_tokens_seen": 23986176, + "step": 183 + }, + { + "epoch": 0.03180506572619429, + "grad_norm": 1.0954172611236572, + "learning_rate": 0.00019044368600682594, + "loss": 6.7563, + "num_input_tokens_seen": 24379392, + "step": 186 + }, + { + "epoch": 0.03231805065726194, + "grad_norm": 1.2160760164260864, + "learning_rate": 0.00019351535836177475, + "loss": 6.7449, + "num_input_tokens_seen": 24772608, + "step": 189 + }, + { + "epoch": 0.032831035588329595, + "grad_norm": 1.6859344244003296, + "learning_rate": 0.00019658703071672354, + "loss": 6.7386, + "num_input_tokens_seen": 25165824, + "step": 192 + }, + { + "epoch": 0.03334402051939724, + "grad_norm": 1.3483397960662842, + "learning_rate": 0.00019965870307167235, + "loss": 6.7087, + "num_input_tokens_seen": 25559040, + "step": 195 + }, + { + "epoch": 0.03385700545046489, + "grad_norm": 1.5949305295944214, + "learning_rate": 0.00020273037542662114, + "loss": 6.6816, + "num_input_tokens_seen": 25952256, + "step": 198 + }, + { + "epoch": 0.034369990381532545, + "grad_norm": 1.3635272979736328, + "learning_rate": 0.00020580204778156995, + "loss": 6.6855, + "num_input_tokens_seen": 26345472, + "step": 201 + }, + { + "epoch": 0.03488297531260019, + "grad_norm": 1.3199516534805298, + "learning_rate": 0.00020887372013651877, + "loss": 6.7078, + "num_input_tokens_seen": 26738688, + "step": 204 + }, + { + "epoch": 0.03539596024366784, + "grad_norm": 1.3045519590377808, + "learning_rate": 0.00021194539249146755, + "loss": 6.6587, + "num_input_tokens_seen": 27131904, + "step": 207 + }, + { + "epoch": 0.035908945174735495, + "grad_norm": 1.1506019830703735, + "learning_rate": 0.00021501706484641637, + "loss": 6.7181, + "num_input_tokens_seen": 27525120, + "step": 210 + }, + { + "epoch": 0.03642193010580314, + "grad_norm": 1.068679690361023, + "learning_rate": 0.00021808873720136518, + "loss": 6.6458, + "num_input_tokens_seen": 27918336, + "step": 213 + }, + { + "epoch": 0.03693491503687079, + "grad_norm": 1.2845734357833862, + "learning_rate": 0.00022116040955631397, + "loss": 6.6676, + "num_input_tokens_seen": 28311552, + "step": 216 + }, + { + "epoch": 0.037447899967938444, + "grad_norm": 1.4203448295593262, + "learning_rate": 0.00022423208191126278, + "loss": 6.6261, + "num_input_tokens_seen": 28704768, + "step": 219 + }, + { + "epoch": 0.03796088489900609, + "grad_norm": 1.3917193412780762, + "learning_rate": 0.00022730375426621157, + "loss": 6.6926, + "num_input_tokens_seen": 29097984, + "step": 222 + }, + { + "epoch": 0.03847386983007374, + "grad_norm": 1.2271850109100342, + "learning_rate": 0.00023037542662116038, + "loss": 6.6867, + "num_input_tokens_seen": 29491200, + "step": 225 + }, + { + "epoch": 0.038986854761141394, + "grad_norm": 1.958269715309143, + "learning_rate": 0.0002334470989761092, + "loss": 6.6826, + "num_input_tokens_seen": 29884416, + "step": 228 + }, + { + "epoch": 0.03949983969220904, + "grad_norm": 1.7181731462478638, + "learning_rate": 0.00023651877133105799, + "loss": 6.6605, + "num_input_tokens_seen": 30277632, + "step": 231 + }, + { + "epoch": 0.04001282462327669, + "grad_norm": 1.3246721029281616, + "learning_rate": 0.0002395904436860068, + "loss": 6.6036, + "num_input_tokens_seen": 30670848, + "step": 234 + }, + { + "epoch": 0.040525809554344344, + "grad_norm": 1.2964049577713013, + "learning_rate": 0.00024266211604095561, + "loss": 6.6086, + "num_input_tokens_seen": 31064064, + "step": 237 + }, + { + "epoch": 0.04103879448541199, + "grad_norm": 1.2255417108535767, + "learning_rate": 0.00024573378839590443, + "loss": 6.6079, + "num_input_tokens_seen": 31457280, + "step": 240 + }, + { + "epoch": 0.04155177941647964, + "grad_norm": 1.6006697416305542, + "learning_rate": 0.0002488054607508532, + "loss": 6.5652, + "num_input_tokens_seen": 31850496, + "step": 243 + }, + { + "epoch": 0.042064764347547294, + "grad_norm": 1.1786364316940308, + "learning_rate": 0.000251877133105802, + "loss": 6.6156, + "num_input_tokens_seen": 32243712, + "step": 246 + }, + { + "epoch": 0.04257774927861494, + "grad_norm": 1.554391622543335, + "learning_rate": 0.00025494880546075084, + "loss": 6.6044, + "num_input_tokens_seen": 32636928, + "step": 249 + }, + { + "epoch": 0.04309073420968259, + "grad_norm": 1.8817625045776367, + "learning_rate": 0.00025802047781569963, + "loss": 6.5893, + "num_input_tokens_seen": 33030144, + "step": 252 + }, + { + "epoch": 0.043603719140750244, + "grad_norm": 1.1505640745162964, + "learning_rate": 0.0002610921501706484, + "loss": 6.578, + "num_input_tokens_seen": 33423360, + "step": 255 + }, + { + "epoch": 0.04411670407181789, + "grad_norm": 2.1356940269470215, + "learning_rate": 0.00026416382252559726, + "loss": 6.6012, + "num_input_tokens_seen": 33816576, + "step": 258 + }, + { + "epoch": 0.04462968900288554, + "grad_norm": 1.7814842462539673, + "learning_rate": 0.00026723549488054605, + "loss": 6.5964, + "num_input_tokens_seen": 34209792, + "step": 261 + }, + { + "epoch": 0.04514267393395319, + "grad_norm": 2.086648941040039, + "learning_rate": 0.00027030716723549483, + "loss": 6.6411, + "num_input_tokens_seen": 34603008, + "step": 264 + }, + { + "epoch": 0.04565565886502084, + "grad_norm": 1.9855871200561523, + "learning_rate": 0.0002733788395904437, + "loss": 6.5595, + "num_input_tokens_seen": 34996224, + "step": 267 + }, + { + "epoch": 0.04616864379608849, + "grad_norm": 1.3594361543655396, + "learning_rate": 0.00027645051194539246, + "loss": 6.6081, + "num_input_tokens_seen": 35389440, + "step": 270 + }, + { + "epoch": 0.04668162872715614, + "grad_norm": 1.9229851961135864, + "learning_rate": 0.00027952218430034125, + "loss": 6.5241, + "num_input_tokens_seen": 35782656, + "step": 273 + }, + { + "epoch": 0.04719461365822379, + "grad_norm": 1.7178096771240234, + "learning_rate": 0.0002825938566552901, + "loss": 6.5881, + "num_input_tokens_seen": 36175872, + "step": 276 + }, + { + "epoch": 0.04770759858929144, + "grad_norm": 1.7998623847961426, + "learning_rate": 0.0002856655290102389, + "loss": 6.5847, + "num_input_tokens_seen": 36569088, + "step": 279 + }, + { + "epoch": 0.04822058352035909, + "grad_norm": 2.0001308917999268, + "learning_rate": 0.00028873720136518766, + "loss": 6.5402, + "num_input_tokens_seen": 36962304, + "step": 282 + }, + { + "epoch": 0.04873356845142674, + "grad_norm": 1.416505217552185, + "learning_rate": 0.0002918088737201365, + "loss": 6.5869, + "num_input_tokens_seen": 37355520, + "step": 285 + }, + { + "epoch": 0.04924655338249439, + "grad_norm": 1.462956190109253, + "learning_rate": 0.0002948805460750853, + "loss": 6.5351, + "num_input_tokens_seen": 37748736, + "step": 288 + }, + { + "epoch": 0.049759538313562036, + "grad_norm": 2.0115163326263428, + "learning_rate": 0.0002979522184300341, + "loss": 6.5625, + "num_input_tokens_seen": 38141952, + "step": 291 + }, + { + "epoch": 0.05027252324462969, + "grad_norm": 1.7845978736877441, + "learning_rate": 0.00029948936133195183, + "loss": 6.6064, + "num_input_tokens_seen": 38535168, + "step": 294 + }, + { + "epoch": 0.05078550817569734, + "grad_norm": 1.81064772605896, + "learning_rate": 0.0002979729497586631, + "loss": 6.5544, + "num_input_tokens_seen": 38928384, + "step": 297 + }, + { + "epoch": 0.051298493106764986, + "grad_norm": 1.8805071115493774, + "learning_rate": 0.00029647934160747185, + "loss": 6.5701, + "num_input_tokens_seen": 39321600, + "step": 300 + }, + { + "epoch": 0.05181147803783264, + "grad_norm": 2.1542809009552, + "learning_rate": 0.0002950079710284063, + "loss": 6.5521, + "num_input_tokens_seen": 39714816, + "step": 303 + }, + { + "epoch": 0.05232446296890029, + "grad_norm": 1.9905306100845337, + "learning_rate": 0.0002935582916359803, + "loss": 6.5324, + "num_input_tokens_seen": 40108032, + "step": 306 + }, + { + "epoch": 0.052837447899967936, + "grad_norm": 1.9549764394760132, + "learning_rate": 0.00029212977565671515, + "loss": 6.5147, + "num_input_tokens_seen": 40501248, + "step": 309 + }, + { + "epoch": 0.05335043283103559, + "grad_norm": 1.7094388008117676, + "learning_rate": 0.0002907219131218538, + "loss": 6.5199, + "num_input_tokens_seen": 40894464, + "step": 312 + }, + { + "epoch": 0.05386341776210324, + "grad_norm": 1.6284552812576294, + "learning_rate": 0.00028933421110246486, + "loss": 6.5146, + "num_input_tokens_seen": 41287680, + "step": 315 + }, + { + "epoch": 0.054376402693170886, + "grad_norm": 1.983896017074585, + "learning_rate": 0.0002879661929843272, + "loss": 6.5427, + "num_input_tokens_seen": 41680896, + "step": 318 + }, + { + "epoch": 0.05488938762423854, + "grad_norm": 1.9521673917770386, + "learning_rate": 0.00028661739778017726, + "loss": 6.5047, + "num_input_tokens_seen": 42074112, + "step": 321 + }, + { + "epoch": 0.05540237255530619, + "grad_norm": 2.61079740524292, + "learning_rate": 0.0002852873794770615, + "loss": 6.5187, + "num_input_tokens_seen": 42467328, + "step": 324 + }, + { + "epoch": 0.055915357486373836, + "grad_norm": 2.143825054168701, + "learning_rate": 0.00028397570641669755, + "loss": 6.5335, + "num_input_tokens_seen": 42860544, + "step": 327 + }, + { + "epoch": 0.05642834241744149, + "grad_norm": 1.2572081089019775, + "learning_rate": 0.00028268196070688857, + "loss": 6.492, + "num_input_tokens_seen": 43253760, + "step": 330 + }, + { + "epoch": 0.05694132734850914, + "grad_norm": 1.3765265941619873, + "learning_rate": 0.0002814057376621684, + "loss": 6.5031, + "num_input_tokens_seen": 43646976, + "step": 333 + }, + { + "epoch": 0.057454312279576786, + "grad_norm": 1.4847021102905273, + "learning_rate": 0.00028014664527197685, + "loss": 6.5166, + "num_input_tokens_seen": 44040192, + "step": 336 + }, + { + "epoch": 0.05796729721064444, + "grad_norm": 1.3950855731964111, + "learning_rate": 0.0002789043036947781, + "loss": 6.5069, + "num_input_tokens_seen": 44433408, + "step": 339 + }, + { + "epoch": 0.05848028214171209, + "grad_norm": 1.9534938335418701, + "learning_rate": 0.0002776783447766403, + "loss": 6.4242, + "num_input_tokens_seen": 44826624, + "step": 342 + }, + { + "epoch": 0.058993267072779736, + "grad_norm": 1.5846917629241943, + "learning_rate": 0.00027646841159289, + "loss": 6.4923, + "num_input_tokens_seen": 45219840, + "step": 345 + }, + { + "epoch": 0.05950625200384739, + "grad_norm": 1.5529935359954834, + "learning_rate": 0.00027527415801154584, + "loss": 6.5015, + "num_input_tokens_seen": 45613056, + "step": 348 + }, + { + "epoch": 0.06001923693491504, + "grad_norm": 2.177635908126831, + "learning_rate": 0.000274095248277319, + "loss": 6.4769, + "num_input_tokens_seen": 46006272, + "step": 351 + }, + { + "epoch": 0.060532221865982686, + "grad_norm": 1.630927324295044, + "learning_rate": 0.0002729313566150449, + "loss": 6.4383, + "num_input_tokens_seen": 46399488, + "step": 354 + }, + { + "epoch": 0.06104520679705034, + "grad_norm": 1.6595220565795898, + "learning_rate": 0.0002717821668514831, + "loss": 6.4398, + "num_input_tokens_seen": 46792704, + "step": 357 + }, + { + "epoch": 0.06155819172811799, + "grad_norm": 2.464336395263672, + "learning_rate": 0.0002706473720544871, + "loss": 6.4676, + "num_input_tokens_seen": 47185920, + "step": 360 + }, + { + "epoch": 0.062071176659185635, + "grad_norm": 1.580959439277649, + "learning_rate": 0.0002695266741886108, + "loss": 6.4727, + "num_input_tokens_seen": 47579136, + "step": 363 + }, + { + "epoch": 0.06258416159025329, + "grad_norm": 1.2866970300674438, + "learning_rate": 0.0002684197837862717, + "loss": 6.4078, + "num_input_tokens_seen": 47972352, + "step": 366 + }, + { + "epoch": 0.06309714652132094, + "grad_norm": 1.84207022190094, + "learning_rate": 0.00026732641963364995, + "loss": 6.3775, + "num_input_tokens_seen": 48365568, + "step": 369 + }, + { + "epoch": 0.06361013145238859, + "grad_norm": 1.3852074146270752, + "learning_rate": 0.0002662463084705468, + "loss": 6.4169, + "num_input_tokens_seen": 48758784, + "step": 372 + }, + { + "epoch": 0.06412311638345623, + "grad_norm": 1.644254207611084, + "learning_rate": 0.00026517918470347554, + "loss": 6.4822, + "num_input_tokens_seen": 49152000, + "step": 375 + }, + { + "epoch": 0.06463610131452388, + "grad_norm": 1.1741936206817627, + "learning_rate": 0.0002641247901313028, + "loss": 6.4092, + "num_input_tokens_seen": 49545216, + "step": 378 + }, + { + "epoch": 0.06514908624559154, + "grad_norm": 2.0758259296417236, + "learning_rate": 0.0002630828736827938, + "loss": 6.4429, + "num_input_tokens_seen": 49938432, + "step": 381 + }, + { + "epoch": 0.06566207117665919, + "grad_norm": 1.7567379474639893, + "learning_rate": 0.00026205319116545786, + "loss": 6.4423, + "num_input_tokens_seen": 50331648, + "step": 384 + }, + { + "epoch": 0.06617505610772684, + "grad_norm": 1.2850079536437988, + "learning_rate": 0.0002610355050251228, + "loss": 6.4584, + "num_input_tokens_seen": 50724864, + "step": 387 + }, + { + "epoch": 0.06668804103879448, + "grad_norm": 1.6754025220870972, + "learning_rate": 0.00026002958411570134, + "loss": 6.461, + "num_input_tokens_seen": 51118080, + "step": 390 + }, + { + "epoch": 0.06720102596986213, + "grad_norm": 1.3733254671096802, + "learning_rate": 0.0002590352034786418, + "loss": 6.4098, + "num_input_tokens_seen": 51511296, + "step": 393 + }, + { + "epoch": 0.06771401090092978, + "grad_norm": 1.1443592309951782, + "learning_rate": 0.0002580521441315865, + "loss": 6.34, + "num_input_tokens_seen": 51904512, + "step": 396 + }, + { + "epoch": 0.06822699583199744, + "grad_norm": 1.9371726512908936, + "learning_rate": 0.0002570801928657861, + "loss": 6.4019, + "num_input_tokens_seen": 52297728, + "step": 399 + }, + { + "epoch": 0.06839799080901998, + "eval_accuracy": 0.1277788633773001, + "eval_loss": 6.768958568572998, + "eval_runtime": 112.6432, + "eval_samples_per_second": 2.663, + "eval_steps_per_second": 1.332, + "num_input_tokens_seen": 52428800, + "step": 400 + }, + { + "epoch": 0.06873998076306509, + "grad_norm": 1.4626785516738892, + "learning_rate": 0.0002561191420518449, + "loss": 6.3792, + "num_input_tokens_seen": 52690944, + "step": 402 + }, + { + "epoch": 0.06925296569413274, + "grad_norm": 1.1538268327713013, + "learning_rate": 0.0002551687894533952, + "loss": 6.3429, + "num_input_tokens_seen": 53084160, + "step": 405 + }, + { + "epoch": 0.06976595062520038, + "grad_norm": 1.803175926208496, + "learning_rate": 0.00025422893804831985, + "loss": 6.3593, + "num_input_tokens_seen": 53477376, + "step": 408 + }, + { + "epoch": 0.07027893555626803, + "grad_norm": 1.3619705438613892, + "learning_rate": 0.0002532993958571671, + "loss": 6.3489, + "num_input_tokens_seen": 53870592, + "step": 411 + }, + { + "epoch": 0.07079192048733568, + "grad_norm": 1.1999845504760742, + "learning_rate": 0.0002523799757784144, + "loss": 6.3924, + "num_input_tokens_seen": 54263808, + "step": 414 + }, + { + "epoch": 0.07130490541840333, + "grad_norm": 1.1951202154159546, + "learning_rate": 0.000251470495430264, + "loss": 6.356, + "num_input_tokens_seen": 54657024, + "step": 417 + }, + { + "epoch": 0.07181789034947099, + "grad_norm": 1.117151141166687, + "learning_rate": 0.0002505707769986641, + "loss": 6.2931, + "num_input_tokens_seen": 55050240, + "step": 420 + }, + { + "epoch": 0.07233087528053864, + "grad_norm": 1.0864099264144897, + "learning_rate": 0.00024968064709126914, + "loss": 6.3978, + "num_input_tokens_seen": 55443456, + "step": 423 + }, + { + "epoch": 0.07284386021160628, + "grad_norm": 1.4912980794906616, + "learning_rate": 0.0002487999365970663, + "loss": 6.3532, + "num_input_tokens_seen": 55836672, + "step": 426 + }, + { + "epoch": 0.07335684514267393, + "grad_norm": 1.2617565393447876, + "learning_rate": 0.0002479284805514112, + "loss": 6.3529, + "num_input_tokens_seen": 56229888, + "step": 429 + }, + { + "epoch": 0.07386983007374158, + "grad_norm": 1.585567831993103, + "learning_rate": 0.00024706611800622653, + "loss": 6.2868, + "num_input_tokens_seen": 56623104, + "step": 432 + }, + { + "epoch": 0.07438281500480923, + "grad_norm": 1.342955231666565, + "learning_rate": 0.00024621269190513393, + "loss": 6.3653, + "num_input_tokens_seen": 57016320, + "step": 435 + }, + { + "epoch": 0.07489579993587689, + "grad_norm": 1.0312269926071167, + "learning_rate": 0.00024536804896329673, + "loss": 6.3066, + "num_input_tokens_seen": 57409536, + "step": 438 + }, + { + "epoch": 0.07540878486694454, + "grad_norm": 1.047814130783081, + "learning_rate": 0.00024453203955176697, + "loss": 6.2996, + "num_input_tokens_seen": 57802752, + "step": 441 + }, + { + "epoch": 0.07592176979801218, + "grad_norm": 2.0140538215637207, + "learning_rate": 0.00024370451758613725, + "loss": 6.3169, + "num_input_tokens_seen": 58195968, + "step": 444 + }, + { + "epoch": 0.07643475472907983, + "grad_norm": 1.7778139114379883, + "learning_rate": 0.00024288534041930982, + "loss": 6.4133, + "num_input_tokens_seen": 58589184, + "step": 447 + }, + { + "epoch": 0.07694773966014748, + "grad_norm": 1.4507876634597778, + "learning_rate": 0.0002420743687382041, + "loss": 6.3126, + "num_input_tokens_seen": 58982400, + "step": 450 + }, + { + "epoch": 0.07746072459121513, + "grad_norm": 1.4134477376937866, + "learning_rate": 0.0002412714664642326, + "loss": 6.2931, + "num_input_tokens_seen": 59375616, + "step": 453 + }, + { + "epoch": 0.07797370952228279, + "grad_norm": 0.8874240517616272, + "learning_rate": 0.00024047650065738447, + "loss": 6.3292, + "num_input_tokens_seen": 59768832, + "step": 456 + }, + { + "epoch": 0.07848669445335044, + "grad_norm": 0.8925871253013611, + "learning_rate": 0.00023968934142376222, + "loss": 6.3195, + "num_input_tokens_seen": 60162048, + "step": 459 + }, + { + "epoch": 0.07899967938441808, + "grad_norm": 1.0737918615341187, + "learning_rate": 0.00023890986182642624, + "loss": 6.3276, + "num_input_tokens_seen": 60555264, + "step": 462 + }, + { + "epoch": 0.07951266431548573, + "grad_norm": 1.271182894706726, + "learning_rate": 0.00023813793779940825, + "loss": 6.271, + "num_input_tokens_seen": 60948480, + "step": 465 + }, + { + "epoch": 0.08002564924655338, + "grad_norm": 1.3476979732513428, + "learning_rate": 0.0002373734480647611, + "loss": 6.2534, + "num_input_tokens_seen": 61341696, + "step": 468 + }, + { + "epoch": 0.08053863417762103, + "grad_norm": 1.1347360610961914, + "learning_rate": 0.00023661627405251905, + "loss": 6.322, + "num_input_tokens_seen": 61734912, + "step": 471 + }, + { + "epoch": 0.08105161910868869, + "grad_norm": 1.2227282524108887, + "learning_rate": 0.00023586629982344883, + "loss": 6.2605, + "num_input_tokens_seen": 62128128, + "step": 474 + }, + { + "epoch": 0.08156460403975634, + "grad_norm": 1.0676379203796387, + "learning_rate": 0.0002351234119944769, + "loss": 6.2544, + "num_input_tokens_seen": 62521344, + "step": 477 + }, + { + "epoch": 0.08207758897082398, + "grad_norm": 1.1656867265701294, + "learning_rate": 0.00023438749966668443, + "loss": 6.2735, + "num_input_tokens_seen": 62914560, + "step": 480 + }, + { + "epoch": 0.08259057390189163, + "grad_norm": 0.9971214532852173, + "learning_rate": 0.00023365845435576572, + "loss": 6.2611, + "num_input_tokens_seen": 63307776, + "step": 483 + }, + { + "epoch": 0.08310355883295928, + "grad_norm": 1.160860300064087, + "learning_rate": 0.0002329361699248514, + "loss": 6.2435, + "num_input_tokens_seen": 63700992, + "step": 486 + }, + { + "epoch": 0.08361654376402693, + "grad_norm": 0.8638364672660828, + "learning_rate": 0.00023222054251960172, + "loss": 6.2793, + "num_input_tokens_seen": 64094208, + "step": 489 + }, + { + "epoch": 0.08412952869509459, + "grad_norm": 0.9435164928436279, + "learning_rate": 0.00023151147050548, + "loss": 6.278, + "num_input_tokens_seen": 64487424, + "step": 492 + }, + { + "epoch": 0.08464251362616224, + "grad_norm": 0.7893585562705994, + "learning_rate": 0.00023080885440712032, + "loss": 6.2768, + "num_input_tokens_seen": 64880640, + "step": 495 + }, + { + "epoch": 0.08515549855722988, + "grad_norm": 1.150215983390808, + "learning_rate": 0.00023011259684970676, + "loss": 6.2422, + "num_input_tokens_seen": 65273856, + "step": 498 + }, + { + "epoch": 0.08566848348829753, + "grad_norm": 1.1255900859832764, + "learning_rate": 0.00022942260250228647, + "loss": 6.2772, + "num_input_tokens_seen": 65667072, + "step": 501 + }, + { + "epoch": 0.08618146841936518, + "grad_norm": 1.0153459310531616, + "learning_rate": 0.00022873877802294162, + "loss": 6.2121, + "num_input_tokens_seen": 66060288, + "step": 504 + }, + { + "epoch": 0.08669445335043283, + "grad_norm": 0.8988346457481384, + "learning_rate": 0.0002280610320057476, + "loss": 6.2168, + "num_input_tokens_seen": 66453504, + "step": 507 + }, + { + "epoch": 0.08720743828150049, + "grad_norm": 1.5252900123596191, + "learning_rate": 0.00022738927492945034, + "loss": 6.269, + "num_input_tokens_seen": 66846720, + "step": 510 + }, + { + "epoch": 0.08772042321256814, + "grad_norm": 1.5005996227264404, + "learning_rate": 0.00022672341910779707, + "loss": 6.2683, + "num_input_tokens_seen": 67239936, + "step": 513 + }, + { + "epoch": 0.08823340814363578, + "grad_norm": 1.3754152059555054, + "learning_rate": 0.00022606337864145685, + "loss": 6.2707, + "num_input_tokens_seen": 67633152, + "step": 516 + }, + { + "epoch": 0.08874639307470343, + "grad_norm": 1.0265483856201172, + "learning_rate": 0.0002254090693714725, + "loss": 6.2469, + "num_input_tokens_seen": 68026368, + "step": 519 + }, + { + "epoch": 0.08925937800577108, + "grad_norm": 1.0772042274475098, + "learning_rate": 0.00022476040883418578, + "loss": 6.2262, + "num_input_tokens_seen": 68419584, + "step": 522 + }, + { + "epoch": 0.08977236293683873, + "grad_norm": 1.050948977470398, + "learning_rate": 0.00022411731621758152, + "loss": 6.2141, + "num_input_tokens_seen": 68812800, + "step": 525 + }, + { + "epoch": 0.09028534786790637, + "grad_norm": 1.455889344215393, + "learning_rate": 0.00022347971231899736, + "loss": 6.2429, + "num_input_tokens_seen": 69206016, + "step": 528 + }, + { + "epoch": 0.09079833279897404, + "grad_norm": 1.0375335216522217, + "learning_rate": 0.00022284751950415004, + "loss": 6.2112, + "num_input_tokens_seen": 69599232, + "step": 531 + }, + { + "epoch": 0.09131131773004168, + "grad_norm": 0.9029484987258911, + "learning_rate": 0.00022222066166742937, + "loss": 6.212, + "num_input_tokens_seen": 69992448, + "step": 534 + }, + { + "epoch": 0.09182430266110933, + "grad_norm": 2.0414113998413086, + "learning_rate": 0.0002215990641934136, + "loss": 6.2324, + "num_input_tokens_seen": 70385664, + "step": 537 + }, + { + "epoch": 0.09233728759217698, + "grad_norm": 1.7731704711914062, + "learning_rate": 0.00022098265391956294, + "loss": 6.2663, + "num_input_tokens_seen": 70778880, + "step": 540 + }, + { + "epoch": 0.09285027252324463, + "grad_norm": 1.9474400281906128, + "learning_rate": 0.00022037135910004776, + "loss": 6.2006, + "num_input_tokens_seen": 71172096, + "step": 543 + }, + { + "epoch": 0.09336325745431227, + "grad_norm": 1.139887809753418, + "learning_rate": 0.00021976510937067167, + "loss": 6.1835, + "num_input_tokens_seen": 71565312, + "step": 546 + }, + { + "epoch": 0.09387624238537993, + "grad_norm": 1.4211649894714355, + "learning_rate": 0.0002191638357148503, + "loss": 6.2367, + "num_input_tokens_seen": 71958528, + "step": 549 + }, + { + "epoch": 0.09438922731644758, + "grad_norm": 1.0333606004714966, + "learning_rate": 0.00021856747043060817, + "loss": 6.1811, + "num_input_tokens_seen": 72351744, + "step": 552 + }, + { + "epoch": 0.09490221224751523, + "grad_norm": 1.1093227863311768, + "learning_rate": 0.00021797594709855838, + "loss": 6.1716, + "num_input_tokens_seen": 72744960, + "step": 555 + }, + { + "epoch": 0.09541519717858288, + "grad_norm": 1.3098039627075195, + "learning_rate": 0.00021738920055083008, + "loss": 6.2282, + "num_input_tokens_seen": 73138176, + "step": 558 + }, + { + "epoch": 0.09592818210965053, + "grad_norm": 0.9683685898780823, + "learning_rate": 0.00021680716684091162, + "loss": 6.1501, + "num_input_tokens_seen": 73531392, + "step": 561 + }, + { + "epoch": 0.09644116704071817, + "grad_norm": 0.9972584247589111, + "learning_rate": 0.0002162297832143763, + "loss": 6.1839, + "num_input_tokens_seen": 73924608, + "step": 564 + }, + { + "epoch": 0.09695415197178583, + "grad_norm": 0.8736885190010071, + "learning_rate": 0.00021565698808046193, + "loss": 6.1591, + "num_input_tokens_seen": 74317824, + "step": 567 + }, + { + "epoch": 0.09746713690285348, + "grad_norm": 1.025628924369812, + "learning_rate": 0.0002150887209844738, + "loss": 6.2475, + "num_input_tokens_seen": 74711040, + "step": 570 + }, + { + "epoch": 0.09798012183392113, + "grad_norm": 1.068459153175354, + "learning_rate": 0.00021452492258098351, + "loss": 6.1618, + "num_input_tokens_seen": 75104256, + "step": 573 + }, + { + "epoch": 0.09849310676498878, + "grad_norm": 0.8722976446151733, + "learning_rate": 0.0002139655346077961, + "loss": 6.1769, + "num_input_tokens_seen": 75497472, + "step": 576 + }, + { + "epoch": 0.09900609169605643, + "grad_norm": 0.99879390001297, + "learning_rate": 0.00021341049986066098, + "loss": 6.1479, + "num_input_tokens_seen": 75890688, + "step": 579 + }, + { + "epoch": 0.09951907662712407, + "grad_norm": 1.2947535514831543, + "learning_rate": 0.00021285976216869982, + "loss": 6.1689, + "num_input_tokens_seen": 76283904, + "step": 582 + }, + { + "epoch": 0.10003206155819173, + "grad_norm": 1.198506236076355, + "learning_rate": 0.00021231326637052871, + "loss": 6.176, + "num_input_tokens_seen": 76677120, + "step": 585 + }, + { + "epoch": 0.10054504648925938, + "grad_norm": 1.9186298847198486, + "learning_rate": 0.00021177095829105132, + "loss": 6.1542, + "num_input_tokens_seen": 77070336, + "step": 588 + }, + { + "epoch": 0.10105803142032703, + "grad_norm": 1.6377959251403809, + "learning_rate": 0.00021123278471890086, + "loss": 6.1783, + "num_input_tokens_seen": 77463552, + "step": 591 + }, + { + "epoch": 0.10157101635139468, + "grad_norm": 1.5491153001785278, + "learning_rate": 0.00021069869338450912, + "loss": 6.192, + "num_input_tokens_seen": 77856768, + "step": 594 + }, + { + "epoch": 0.10208400128246232, + "grad_norm": 1.4104523658752441, + "learning_rate": 0.0002101686329387827, + "loss": 6.2015, + "num_input_tokens_seen": 78249984, + "step": 597 + }, + { + "epoch": 0.10259698621352997, + "grad_norm": 1.230622410774231, + "learning_rate": 0.00020964255293236627, + "loss": 6.1604, + "num_input_tokens_seen": 78643200, + "step": 600 + }, + { + "epoch": 0.10310997114459763, + "grad_norm": 0.9526540040969849, + "learning_rate": 0.00020912040379547395, + "loss": 6.1395, + "num_input_tokens_seen": 79036416, + "step": 603 + }, + { + "epoch": 0.10362295607566528, + "grad_norm": 1.131076455116272, + "learning_rate": 0.00020860213681827064, + "loss": 6.1538, + "num_input_tokens_seen": 79429632, + "step": 606 + }, + { + "epoch": 0.10413594100673293, + "grad_norm": 1.2227041721343994, + "learning_rate": 0.00020808770413178535, + "loss": 6.1001, + "num_input_tokens_seen": 79822848, + "step": 609 + }, + { + "epoch": 0.10464892593780058, + "grad_norm": 1.2456010580062866, + "learning_rate": 0.00020757705868933984, + "loss": 6.153, + "num_input_tokens_seen": 80216064, + "step": 612 + }, + { + "epoch": 0.10516191086886822, + "grad_norm": 1.1259610652923584, + "learning_rate": 0.00020707015424847639, + "loss": 6.1156, + "num_input_tokens_seen": 80609280, + "step": 615 + }, + { + "epoch": 0.10567489579993587, + "grad_norm": 1.1174367666244507, + "learning_rate": 0.00020656694535336808, + "loss": 6.1389, + "num_input_tokens_seen": 81002496, + "step": 618 + }, + { + "epoch": 0.10618788073100352, + "grad_norm": 1.1163129806518555, + "learning_rate": 0.00020606738731769765, + "loss": 6.1592, + "num_input_tokens_seen": 81395712, + "step": 621 + }, + { + "epoch": 0.10670086566207118, + "grad_norm": 1.0666199922561646, + "learning_rate": 0.0002055714362079892, + "loss": 6.1443, + "num_input_tokens_seen": 81788928, + "step": 624 + }, + { + "epoch": 0.10721385059313883, + "grad_norm": 1.0415575504302979, + "learning_rate": 0.00020507904882737917, + "loss": 6.1783, + "num_input_tokens_seen": 82182144, + "step": 627 + }, + { + "epoch": 0.10772683552420648, + "grad_norm": 1.8121472597122192, + "learning_rate": 0.00020459018269981298, + "loss": 6.1281, + "num_input_tokens_seen": 82575360, + "step": 630 + }, + { + "epoch": 0.10823982045527412, + "grad_norm": 1.4155458211898804, + "learning_rate": 0.00020410479605465385, + "loss": 6.1657, + "num_input_tokens_seen": 82968576, + "step": 633 + }, + { + "epoch": 0.10875280538634177, + "grad_norm": 1.4348151683807373, + "learning_rate": 0.00020362284781169176, + "loss": 6.1204, + "num_input_tokens_seen": 83361792, + "step": 636 + }, + { + "epoch": 0.10926579031740942, + "grad_norm": 1.1229435205459595, + "learning_rate": 0.00020314429756653965, + "loss": 6.1741, + "num_input_tokens_seen": 83755008, + "step": 639 + }, + { + "epoch": 0.10977877524847708, + "grad_norm": 1.871185064315796, + "learning_rate": 0.00020266910557640547, + "loss": 6.1443, + "num_input_tokens_seen": 84148224, + "step": 642 + }, + { + "epoch": 0.11029176017954473, + "grad_norm": 1.2954288721084595, + "learning_rate": 0.00020219723274622864, + "loss": 6.0994, + "num_input_tokens_seen": 84541440, + "step": 645 + }, + { + "epoch": 0.11080474511061238, + "grad_norm": 1.6798019409179688, + "learning_rate": 0.00020172864061517005, + "loss": 6.105, + "num_input_tokens_seen": 84934656, + "step": 648 + }, + { + "epoch": 0.11131773004168002, + "grad_norm": 1.2881441116333008, + "learning_rate": 0.00020126329134344468, + "loss": 6.0997, + "num_input_tokens_seen": 85327872, + "step": 651 + }, + { + "epoch": 0.11183071497274767, + "grad_norm": 1.2833970785140991, + "learning_rate": 0.000200801147699487, + "loss": 6.084, + "num_input_tokens_seen": 85721088, + "step": 654 + }, + { + "epoch": 0.11234369990381532, + "grad_norm": 1.1184037923812866, + "learning_rate": 0.00020034217304743868, + "loss": 6.0939, + "num_input_tokens_seen": 86114304, + "step": 657 + }, + { + "epoch": 0.11285668483488298, + "grad_norm": 1.2673637866973877, + "learning_rate": 0.00019988633133495007, + "loss": 6.058, + "num_input_tokens_seen": 86507520, + "step": 660 + }, + { + "epoch": 0.11336966976595063, + "grad_norm": 1.0385109186172485, + "learning_rate": 0.00019943358708128528, + "loss": 6.0954, + "num_input_tokens_seen": 86900736, + "step": 663 + }, + { + "epoch": 0.11388265469701828, + "grad_norm": 1.233398675918579, + "learning_rate": 0.00019898390536572197, + "loss": 6.1543, + "num_input_tokens_seen": 87293952, + "step": 666 + }, + { + "epoch": 0.11439563962808592, + "grad_norm": 0.970950186252594, + "learning_rate": 0.00019853725181623823, + "loss": 6.1156, + "num_input_tokens_seen": 87687168, + "step": 669 + }, + { + "epoch": 0.11490862455915357, + "grad_norm": 1.1220591068267822, + "learning_rate": 0.00019809359259847711, + "loss": 6.1181, + "num_input_tokens_seen": 88080384, + "step": 672 + }, + { + "epoch": 0.11542160949022122, + "grad_norm": 1.175992488861084, + "learning_rate": 0.00019765289440498121, + "loss": 6.1052, + "num_input_tokens_seen": 88473600, + "step": 675 + }, + { + "epoch": 0.11593459442128888, + "grad_norm": 1.138728141784668, + "learning_rate": 0.00019721512444468987, + "loss": 6.059, + "num_input_tokens_seen": 88866816, + "step": 678 + }, + { + "epoch": 0.11644757935235653, + "grad_norm": 1.0036921501159668, + "learning_rate": 0.00019678025043269053, + "loss": 6.065, + "num_input_tokens_seen": 89260032, + "step": 681 + }, + { + "epoch": 0.11696056428342418, + "grad_norm": 0.8844671249389648, + "learning_rate": 0.00019634824058021848, + "loss": 6.1027, + "num_input_tokens_seen": 89653248, + "step": 684 + }, + { + "epoch": 0.11747354921449182, + "grad_norm": 1.0989590883255005, + "learning_rate": 0.000195919063584896, + "loss": 6.1392, + "num_input_tokens_seen": 90046464, + "step": 687 + }, + { + "epoch": 0.11798653414555947, + "grad_norm": 0.9273776412010193, + "learning_rate": 0.00019549268862120603, + "loss": 6.097, + "num_input_tokens_seen": 90439680, + "step": 690 + }, + { + "epoch": 0.11849951907662712, + "grad_norm": 0.8641604781150818, + "learning_rate": 0.00019506908533119244, + "loss": 6.08, + "num_input_tokens_seen": 90832896, + "step": 693 + }, + { + "epoch": 0.11901250400769478, + "grad_norm": 0.8257124423980713, + "learning_rate": 0.00019464822381538125, + "loss": 6.0782, + "num_input_tokens_seen": 91226112, + "step": 696 + }, + { + "epoch": 0.11952548893876243, + "grad_norm": 1.1454071998596191, + "learning_rate": 0.00019423007462391608, + "loss": 6.0874, + "num_input_tokens_seen": 91619328, + "step": 699 + }, + { + "epoch": 0.12003847386983008, + "grad_norm": 1.0243558883666992, + "learning_rate": 0.0001938146087479026, + "loss": 6.0659, + "num_input_tokens_seen": 92012544, + "step": 702 + }, + { + "epoch": 0.12055145880089772, + "grad_norm": 1.1402223110198975, + "learning_rate": 0.0001934017976109553, + "loss": 6.0855, + "num_input_tokens_seen": 92405760, + "step": 705 + }, + { + "epoch": 0.12106444373196537, + "grad_norm": 1.1279404163360596, + "learning_rate": 0.00019299161306094212, + "loss": 6.0553, + "num_input_tokens_seen": 92798976, + "step": 708 + }, + { + "epoch": 0.12157742866303302, + "grad_norm": 1.2544893026351929, + "learning_rate": 0.00019258402736191987, + "loss": 6.0772, + "num_input_tokens_seen": 93192192, + "step": 711 + }, + { + "epoch": 0.12209041359410068, + "grad_norm": 1.333325743675232, + "learning_rate": 0.00019217901318625737, + "loss": 6.0254, + "num_input_tokens_seen": 93585408, + "step": 714 + }, + { + "epoch": 0.12260339852516833, + "grad_norm": 1.4925867319107056, + "learning_rate": 0.00019177654360693922, + "loss": 6.1594, + "num_input_tokens_seen": 93978624, + "step": 717 + }, + { + "epoch": 0.12311638345623598, + "grad_norm": 1.4974321126937866, + "learning_rate": 0.00019137659209004636, + "loss": 6.0507, + "num_input_tokens_seen": 94371840, + "step": 720 + }, + { + "epoch": 0.12362936838730362, + "grad_norm": 1.6772838830947876, + "learning_rate": 0.00019097913248740852, + "loss": 6.1063, + "num_input_tokens_seen": 94765056, + "step": 723 + }, + { + "epoch": 0.12414235331837127, + "grad_norm": 1.1892170906066895, + "learning_rate": 0.00019058413902942387, + "loss": 6.0525, + "num_input_tokens_seen": 95158272, + "step": 726 + }, + { + "epoch": 0.12465533824943892, + "grad_norm": 1.5597805976867676, + "learning_rate": 0.00019019158631804098, + "loss": 6.0913, + "num_input_tokens_seen": 95551488, + "step": 729 + }, + { + "epoch": 0.12516832318050658, + "grad_norm": 1.638031005859375, + "learning_rate": 0.0001898014493198996, + "loss": 6.1035, + "num_input_tokens_seen": 95944704, + "step": 732 + }, + { + "epoch": 0.1256813081115742, + "grad_norm": 1.4688829183578491, + "learning_rate": 0.00018941370335962538, + "loss": 6.0601, + "num_input_tokens_seen": 96337920, + "step": 735 + }, + { + "epoch": 0.12619429304264188, + "grad_norm": 0.9651637077331543, + "learning_rate": 0.00018902832411327452, + "loss": 6.068, + "num_input_tokens_seen": 96731136, + "step": 738 + }, + { + "epoch": 0.1267072779737095, + "grad_norm": 1.2612296342849731, + "learning_rate": 0.00018864528760192487, + "loss": 6.0142, + "num_input_tokens_seen": 97124352, + "step": 741 + }, + { + "epoch": 0.12722026290477717, + "grad_norm": 1.0166645050048828, + "learning_rate": 0.00018826457018540895, + "loss": 6.0268, + "num_input_tokens_seen": 97517568, + "step": 744 + }, + { + "epoch": 0.12773324783584483, + "grad_norm": 1.159142255783081, + "learning_rate": 0.00018788614855618575, + "loss": 6.0269, + "num_input_tokens_seen": 97910784, + "step": 747 + }, + { + "epoch": 0.12824623276691247, + "grad_norm": 0.9123517870903015, + "learning_rate": 0.00018750999973334755, + "loss": 6.0037, + "num_input_tokens_seen": 98304000, + "step": 750 + }, + { + "epoch": 0.12875921769798013, + "grad_norm": 0.9361982345581055, + "learning_rate": 0.00018713610105675787, + "loss": 6.0246, + "num_input_tokens_seen": 98697216, + "step": 753 + }, + { + "epoch": 0.12927220262904776, + "grad_norm": 0.8802709579467773, + "learning_rate": 0.00018676443018131788, + "loss": 6.0012, + "num_input_tokens_seen": 99090432, + "step": 756 + }, + { + "epoch": 0.12978518756011542, + "grad_norm": 1.0109879970550537, + "learning_rate": 0.00018639496507135743, + "loss": 6.0781, + "num_input_tokens_seen": 99483648, + "step": 759 + }, + { + "epoch": 0.13029817249118308, + "grad_norm": 0.9797167778015137, + "learning_rate": 0.00018602768399514743, + "loss": 6.0611, + "num_input_tokens_seen": 99876864, + "step": 762 + }, + { + "epoch": 0.13081115742225072, + "grad_norm": 1.554618000984192, + "learning_rate": 0.0001856625655195309, + "loss": 6.0584, + "num_input_tokens_seen": 100270080, + "step": 765 + }, + { + "epoch": 0.13132414235331838, + "grad_norm": 1.2623248100280762, + "learning_rate": 0.00018529958850466993, + "loss": 6.0177, + "num_input_tokens_seen": 100663296, + "step": 768 + }, + { + "epoch": 0.131837127284386, + "grad_norm": 1.2730941772460938, + "learning_rate": 0.000184938732098904, + "loss": 6.0187, + "num_input_tokens_seen": 101056512, + "step": 771 + }, + { + "epoch": 0.13235011221545367, + "grad_norm": 1.1981124877929688, + "learning_rate": 0.00018457997573371942, + "loss": 6.0086, + "num_input_tokens_seen": 101449728, + "step": 774 + }, + { + "epoch": 0.1328630971465213, + "grad_norm": 1.0194454193115234, + "learning_rate": 0.00018422329911882464, + "loss": 6.0152, + "num_input_tokens_seen": 101842944, + "step": 777 + }, + { + "epoch": 0.13337608207758897, + "grad_norm": 1.068668246269226, + "learning_rate": 0.0001838686822373302, + "loss": 6.0552, + "num_input_tokens_seen": 102236160, + "step": 780 + }, + { + "epoch": 0.13388906700865663, + "grad_norm": 1.023908019065857, + "learning_rate": 0.00018351610534103057, + "loss": 6.0448, + "num_input_tokens_seen": 102629376, + "step": 783 + }, + { + "epoch": 0.13440205193972427, + "grad_norm": 0.8301390409469604, + "learning_rate": 0.0001831655489457848, + "loss": 5.979, + "num_input_tokens_seen": 103022592, + "step": 786 + }, + { + "epoch": 0.13491503687079193, + "grad_norm": 1.0532444715499878, + "learning_rate": 0.00018281699382699399, + "loss": 6.0356, + "num_input_tokens_seen": 103415808, + "step": 789 + }, + { + "epoch": 0.13542802180185956, + "grad_norm": 0.9144531488418579, + "learning_rate": 0.00018247042101517312, + "loss": 5.9964, + "num_input_tokens_seen": 103809024, + "step": 792 + }, + { + "epoch": 0.13594100673292722, + "grad_norm": 0.9134213328361511, + "learning_rate": 0.00018212581179161483, + "loss": 6.0034, + "num_input_tokens_seen": 104202240, + "step": 795 + }, + { + "epoch": 0.13645399166399488, + "grad_norm": 0.9252693057060242, + "learning_rate": 0.0001817831476841428, + "loss": 6.0547, + "num_input_tokens_seen": 104595456, + "step": 798 + }, + { + "epoch": 0.13679598161803996, + "eval_accuracy": 0.14596971177332682, + "eval_loss": 6.421415328979492, + "eval_runtime": 112.8503, + "eval_samples_per_second": 2.658, + "eval_steps_per_second": 1.329, + "num_input_tokens_seen": 104857600, + "step": 800 + }, + { + "epoch": 0.13696697659506252, + "grad_norm": 0.7578924298286438, + "learning_rate": 0.00018144241046295307, + "loss": 6.0183, + "num_input_tokens_seen": 104988672, + "step": 801 + }, + { + "epoch": 0.13747996152613018, + "grad_norm": 0.8038005232810974, + "learning_rate": 0.0001811035821365402, + "loss": 6.0242, + "num_input_tokens_seen": 105381888, + "step": 804 + }, + { + "epoch": 0.1379929464571978, + "grad_norm": 0.8382763266563416, + "learning_rate": 0.0001807666449477075, + "loss": 6.0535, + "num_input_tokens_seen": 105775104, + "step": 807 + }, + { + "epoch": 0.13850593138826547, + "grad_norm": 0.8043891787528992, + "learning_rate": 0.0001804315813696581, + "loss": 6.0307, + "num_input_tokens_seen": 106168320, + "step": 810 + }, + { + "epoch": 0.1390189163193331, + "grad_norm": 1.0049474239349365, + "learning_rate": 0.00018009837410216546, + "loss": 5.9799, + "num_input_tokens_seen": 106561536, + "step": 813 + }, + { + "epoch": 0.13953190125040077, + "grad_norm": 1.1410833597183228, + "learning_rate": 0.00017976700606782165, + "loss": 5.9542, + "num_input_tokens_seen": 106954752, + "step": 816 + }, + { + "epoch": 0.14004488618146843, + "grad_norm": 1.1920111179351807, + "learning_rate": 0.0001794374604083612, + "loss": 6.0421, + "num_input_tokens_seen": 107347968, + "step": 819 + }, + { + "epoch": 0.14055787111253606, + "grad_norm": 1.410753607749939, + "learning_rate": 0.00017910972048105852, + "loss": 6.0533, + "num_input_tokens_seen": 107741184, + "step": 822 + }, + { + "epoch": 0.14107085604360373, + "grad_norm": 1.0810803174972534, + "learning_rate": 0.00017878376985519786, + "loss": 6.0042, + "num_input_tokens_seen": 108134400, + "step": 825 + }, + { + "epoch": 0.14158384097467136, + "grad_norm": 0.937443196773529, + "learning_rate": 0.00017845959230861343, + "loss": 5.9796, + "num_input_tokens_seen": 108527616, + "step": 828 + }, + { + "epoch": 0.14209682590573902, + "grad_norm": 0.9939092397689819, + "learning_rate": 0.00017813717182429826, + "loss": 5.9832, + "num_input_tokens_seen": 108920832, + "step": 831 + }, + { + "epoch": 0.14260981083680666, + "grad_norm": 0.9864884614944458, + "learning_rate": 0.00017781649258708038, + "loss": 5.9771, + "num_input_tokens_seen": 109314048, + "step": 834 + }, + { + "epoch": 0.14312279576787432, + "grad_norm": 1.1324708461761475, + "learning_rate": 0.0001774975389803645, + "loss": 6.0005, + "num_input_tokens_seen": 109707264, + "step": 837 + }, + { + "epoch": 0.14363578069894198, + "grad_norm": 1.1927917003631592, + "learning_rate": 0.00017718029558293758, + "loss": 6.0208, + "num_input_tokens_seen": 110100480, + "step": 840 + }, + { + "epoch": 0.1441487656300096, + "grad_norm": 1.1395940780639648, + "learning_rate": 0.00017686474716583739, + "loss": 5.9824, + "num_input_tokens_seen": 110493696, + "step": 843 + }, + { + "epoch": 0.14466175056107727, + "grad_norm": 1.562849998474121, + "learning_rate": 0.00017655087868928166, + "loss": 5.9618, + "num_input_tokens_seen": 110886912, + "step": 846 + }, + { + "epoch": 0.1451747354921449, + "grad_norm": 1.2808341979980469, + "learning_rate": 0.00017623867529965745, + "loss": 6.0031, + "num_input_tokens_seen": 111280128, + "step": 849 + }, + { + "epoch": 0.14568772042321257, + "grad_norm": 1.1808559894561768, + "learning_rate": 0.00017592812232656866, + "loss": 5.9783, + "num_input_tokens_seen": 111673344, + "step": 852 + }, + { + "epoch": 0.14620070535428023, + "grad_norm": 1.1873037815093994, + "learning_rate": 0.00017561920527994052, + "loss": 5.9943, + "num_input_tokens_seen": 112066560, + "step": 855 + }, + { + "epoch": 0.14671369028534786, + "grad_norm": 1.1178746223449707, + "learning_rate": 0.00017531190984717987, + "loss": 5.938, + "num_input_tokens_seen": 112459776, + "step": 858 + }, + { + "epoch": 0.14722667521641553, + "grad_norm": 1.8096652030944824, + "learning_rate": 0.00017500622189039, + "loss": 6.0226, + "num_input_tokens_seen": 112852992, + "step": 861 + }, + { + "epoch": 0.14773966014748316, + "grad_norm": 1.264701247215271, + "learning_rate": 0.00017470212744363856, + "loss": 6.0132, + "num_input_tokens_seen": 113246208, + "step": 864 + }, + { + "epoch": 0.14825264507855082, + "grad_norm": 1.6313904523849487, + "learning_rate": 0.00017439961271027758, + "loss": 5.9518, + "num_input_tokens_seen": 113639424, + "step": 867 + }, + { + "epoch": 0.14876563000961845, + "grad_norm": 1.2754333019256592, + "learning_rate": 0.00017409866406031439, + "loss": 5.9424, + "num_input_tokens_seen": 114032640, + "step": 870 + }, + { + "epoch": 0.14927861494068612, + "grad_norm": 1.5135891437530518, + "learning_rate": 0.00017379926802783236, + "loss": 5.9789, + "num_input_tokens_seen": 114425856, + "step": 873 + }, + { + "epoch": 0.14979159987175378, + "grad_norm": 0.951026439666748, + "learning_rate": 0.00017350141130845995, + "loss": 5.9601, + "num_input_tokens_seen": 114819072, + "step": 876 + }, + { + "epoch": 0.1503045848028214, + "grad_norm": 1.3585782051086426, + "learning_rate": 0.00017320508075688773, + "loss": 5.9518, + "num_input_tokens_seen": 115212288, + "step": 879 + }, + { + "epoch": 0.15081756973388907, + "grad_norm": 1.0978291034698486, + "learning_rate": 0.0001729102633844315, + "loss": 5.9492, + "num_input_tokens_seen": 115605504, + "step": 882 + }, + { + "epoch": 0.1513305546649567, + "grad_norm": 1.389070987701416, + "learning_rate": 0.0001726169463566411, + "loss": 5.9284, + "num_input_tokens_seen": 115998720, + "step": 885 + }, + { + "epoch": 0.15184353959602437, + "grad_norm": 0.956652045249939, + "learning_rate": 0.00017232511699095387, + "loss": 6.0113, + "num_input_tokens_seen": 116391936, + "step": 888 + }, + { + "epoch": 0.15235652452709203, + "grad_norm": 1.1337711811065674, + "learning_rate": 0.00017203476275439095, + "loss": 5.904, + "num_input_tokens_seen": 116785152, + "step": 891 + }, + { + "epoch": 0.15286950945815966, + "grad_norm": 0.9648370742797852, + "learning_rate": 0.00017174587126129703, + "loss": 5.9447, + "num_input_tokens_seen": 117178368, + "step": 894 + }, + { + "epoch": 0.15338249438922733, + "grad_norm": 0.9501051902770996, + "learning_rate": 0.00017145843027112077, + "loss": 5.9404, + "num_input_tokens_seen": 117571584, + "step": 897 + }, + { + "epoch": 0.15389547932029496, + "grad_norm": 0.9637885093688965, + "learning_rate": 0.00017117242768623688, + "loss": 5.9458, + "num_input_tokens_seen": 117964800, + "step": 900 + }, + { + "epoch": 0.15440846425136262, + "grad_norm": 1.0075721740722656, + "learning_rate": 0.00017088785154980728, + "loss": 5.9596, + "num_input_tokens_seen": 118358016, + "step": 903 + }, + { + "epoch": 0.15492144918243025, + "grad_norm": 1.1554243564605713, + "learning_rate": 0.00017060469004368157, + "loss": 5.9451, + "num_input_tokens_seen": 118751232, + "step": 906 + }, + { + "epoch": 0.15543443411349792, + "grad_norm": 0.8994986414909363, + "learning_rate": 0.0001703229314863357, + "loss": 5.9276, + "num_input_tokens_seen": 119144448, + "step": 909 + }, + { + "epoch": 0.15594741904456558, + "grad_norm": 0.9552657604217529, + "learning_rate": 0.0001700425643308478, + "loss": 5.9344, + "num_input_tokens_seen": 119537664, + "step": 912 + }, + { + "epoch": 0.1564604039756332, + "grad_norm": 1.1688953638076782, + "learning_rate": 0.00016976357716291072, + "loss": 5.9453, + "num_input_tokens_seen": 119930880, + "step": 915 + }, + { + "epoch": 0.15697338890670087, + "grad_norm": 0.9850606918334961, + "learning_rate": 0.0001694859586988799, + "loss": 5.9218, + "num_input_tokens_seen": 120324096, + "step": 918 + }, + { + "epoch": 0.1574863738377685, + "grad_norm": 0.9341318607330322, + "learning_rate": 0.00016920969778385703, + "loss": 5.8967, + "num_input_tokens_seen": 120717312, + "step": 921 + }, + { + "epoch": 0.15799935876883617, + "grad_norm": 1.1161161661148071, + "learning_rate": 0.00016893478338980708, + "loss": 5.9717, + "num_input_tokens_seen": 121110528, + "step": 924 + }, + { + "epoch": 0.1585123436999038, + "grad_norm": 1.0169016122817993, + "learning_rate": 0.00016866120461370946, + "loss": 5.9749, + "num_input_tokens_seen": 121503744, + "step": 927 + }, + { + "epoch": 0.15902532863097146, + "grad_norm": 0.8988534212112427, + "learning_rate": 0.00016838895067574185, + "loss": 5.9448, + "num_input_tokens_seen": 121896960, + "step": 930 + }, + { + "epoch": 0.15953831356203912, + "grad_norm": 0.7732037305831909, + "learning_rate": 0.00016811801091749597, + "loss": 5.928, + "num_input_tokens_seen": 122290176, + "step": 933 + }, + { + "epoch": 0.16005129849310676, + "grad_norm": 0.7758464217185974, + "learning_rate": 0.00016784837480022532, + "loss": 5.9203, + "num_input_tokens_seen": 122683392, + "step": 936 + }, + { + "epoch": 0.16056428342417442, + "grad_norm": 0.688848614692688, + "learning_rate": 0.0001675800319031231, + "loss": 5.9253, + "num_input_tokens_seen": 123076608, + "step": 939 + }, + { + "epoch": 0.16107726835524205, + "grad_norm": 0.8796403408050537, + "learning_rate": 0.00016731297192163077, + "loss": 5.9361, + "num_input_tokens_seen": 123469824, + "step": 942 + }, + { + "epoch": 0.16159025328630972, + "grad_norm": 1.0328707695007324, + "learning_rate": 0.00016704718466577608, + "loss": 5.94, + "num_input_tokens_seen": 123863040, + "step": 945 + }, + { + "epoch": 0.16210323821737738, + "grad_norm": 1.015504240989685, + "learning_rate": 0.00016678266005854003, + "loss": 5.9536, + "num_input_tokens_seen": 124256256, + "step": 948 + }, + { + "epoch": 0.162616223148445, + "grad_norm": 0.8219988346099854, + "learning_rate": 0.00016651938813425204, + "loss": 5.9519, + "num_input_tokens_seen": 124649472, + "step": 951 + }, + { + "epoch": 0.16312920807951267, + "grad_norm": 1.0020133256912231, + "learning_rate": 0.00016625735903701302, + "loss": 5.9622, + "num_input_tokens_seen": 125042688, + "step": 954 + }, + { + "epoch": 0.1636421930105803, + "grad_norm": 0.8336507678031921, + "learning_rate": 0.00016599656301914591, + "loss": 5.9003, + "num_input_tokens_seen": 125435904, + "step": 957 + }, + { + "epoch": 0.16415517794164797, + "grad_norm": 0.9200095534324646, + "learning_rate": 0.0001657369904396722, + "loss": 5.8612, + "num_input_tokens_seen": 125829120, + "step": 960 + }, + { + "epoch": 0.1646681628727156, + "grad_norm": 0.858650267124176, + "learning_rate": 0.0001654786317628154, + "loss": 5.9106, + "num_input_tokens_seen": 126222336, + "step": 963 + }, + { + "epoch": 0.16518114780378326, + "grad_norm": 0.8560724258422852, + "learning_rate": 0.00016522147755652932, + "loss": 5.9117, + "num_input_tokens_seen": 126615552, + "step": 966 + }, + { + "epoch": 0.16569413273485092, + "grad_norm": 0.8329317569732666, + "learning_rate": 0.00016496551849105217, + "loss": 5.9295, + "num_input_tokens_seen": 127008768, + "step": 969 + }, + { + "epoch": 0.16620711766591856, + "grad_norm": 0.9380521774291992, + "learning_rate": 0.00016471074533748437, + "loss": 5.9553, + "num_input_tokens_seen": 127401984, + "step": 972 + }, + { + "epoch": 0.16672010259698622, + "grad_norm": 1.0690613985061646, + "learning_rate": 0.00016445714896639137, + "loss": 5.9229, + "num_input_tokens_seen": 127795200, + "step": 975 + }, + { + "epoch": 0.16723308752805385, + "grad_norm": 1.3576298952102661, + "learning_rate": 0.00016420472034642939, + "loss": 5.9454, + "num_input_tokens_seen": 128188416, + "step": 978 + }, + { + "epoch": 0.16774607245912151, + "grad_norm": 1.040351390838623, + "learning_rate": 0.00016395345054299445, + "loss": 5.9541, + "num_input_tokens_seen": 128581632, + "step": 981 + }, + { + "epoch": 0.16825905739018918, + "grad_norm": 0.8309887647628784, + "learning_rate": 0.0001637033307168943, + "loss": 5.8765, + "num_input_tokens_seen": 128974848, + "step": 984 + }, + { + "epoch": 0.1687720423212568, + "grad_norm": 0.7443415522575378, + "learning_rate": 0.00016345435212304236, + "loss": 5.9289, + "num_input_tokens_seen": 129368064, + "step": 987 + }, + { + "epoch": 0.16928502725232447, + "grad_norm": 0.8244521617889404, + "learning_rate": 0.00016320650610917334, + "loss": 5.9387, + "num_input_tokens_seen": 129761280, + "step": 990 + }, + { + "epoch": 0.1697980121833921, + "grad_norm": 0.8613632321357727, + "learning_rate": 0.0001629597841145805, + "loss": 5.9035, + "num_input_tokens_seen": 130154496, + "step": 993 + }, + { + "epoch": 0.17031099711445977, + "grad_norm": 0.9145336151123047, + "learning_rate": 0.00016271417766887378, + "loss": 5.9495, + "num_input_tokens_seen": 130547712, + "step": 996 + }, + { + "epoch": 0.1708239820455274, + "grad_norm": 1.0651459693908691, + "learning_rate": 0.00016246967839075817, + "loss": 5.9386, + "num_input_tokens_seen": 130940928, + "step": 999 + }, + { + "epoch": 0.17133696697659506, + "grad_norm": 1.1779698133468628, + "learning_rate": 0.00016222627798683257, + "loss": 5.8951, + "num_input_tokens_seen": 131334144, + "step": 1002 + }, + { + "epoch": 0.17184995190766272, + "grad_norm": 0.9768991470336914, + "learning_rate": 0.00016198396825040817, + "loss": 5.9006, + "num_input_tokens_seen": 131727360, + "step": 1005 + }, + { + "epoch": 0.17236293683873036, + "grad_norm": 0.8397189974784851, + "learning_rate": 0.00016174274106034645, + "loss": 5.9255, + "num_input_tokens_seen": 132120576, + "step": 1008 + }, + { + "epoch": 0.17287592176979802, + "grad_norm": 0.7937173843383789, + "learning_rate": 0.00016150258837991562, + "loss": 5.8806, + "num_input_tokens_seen": 132513792, + "step": 1011 + }, + { + "epoch": 0.17338890670086565, + "grad_norm": 0.8085054159164429, + "learning_rate": 0.00016126350225566634, + "loss": 5.893, + "num_input_tokens_seen": 132907008, + "step": 1014 + }, + { + "epoch": 0.17390189163193331, + "grad_norm": 0.836296021938324, + "learning_rate": 0.0001610254748163253, + "loss": 5.8943, + "num_input_tokens_seen": 133300224, + "step": 1017 + }, + { + "epoch": 0.17441487656300098, + "grad_norm": 0.8128154873847961, + "learning_rate": 0.0001607884982717066, + "loss": 5.9156, + "num_input_tokens_seen": 133693440, + "step": 1020 + }, + { + "epoch": 0.1749278614940686, + "grad_norm": 1.1624642610549927, + "learning_rate": 0.00016055256491164112, + "loss": 5.8715, + "num_input_tokens_seen": 134086656, + "step": 1023 + }, + { + "epoch": 0.17544084642513627, + "grad_norm": 0.7400571703910828, + "learning_rate": 0.000160317667104923, + "loss": 5.9113, + "num_input_tokens_seen": 134479872, + "step": 1026 + }, + { + "epoch": 0.1759538313562039, + "grad_norm": 0.864396333694458, + "learning_rate": 0.0001600837972982725, + "loss": 5.8986, + "num_input_tokens_seen": 134873088, + "step": 1029 + }, + { + "epoch": 0.17646681628727157, + "grad_norm": 0.8100318312644958, + "learning_rate": 0.00015985094801531627, + "loss": 5.9568, + "num_input_tokens_seen": 135266304, + "step": 1032 + }, + { + "epoch": 0.1769798012183392, + "grad_norm": 0.739395797252655, + "learning_rate": 0.0001596191118555833, + "loss": 5.9094, + "num_input_tokens_seen": 135659520, + "step": 1035 + }, + { + "epoch": 0.17749278614940686, + "grad_norm": 0.8485215306282043, + "learning_rate": 0.0001593882814935171, + "loss": 5.9509, + "num_input_tokens_seen": 136052736, + "step": 1038 + }, + { + "epoch": 0.17800577108047452, + "grad_norm": 0.8100500106811523, + "learning_rate": 0.00015915844967750344, + "loss": 5.8577, + "num_input_tokens_seen": 136445952, + "step": 1041 + }, + { + "epoch": 0.17851875601154216, + "grad_norm": 0.9649944305419922, + "learning_rate": 0.00015892960922891358, + "loss": 5.901, + "num_input_tokens_seen": 136839168, + "step": 1044 + }, + { + "epoch": 0.17903174094260982, + "grad_norm": 1.0253026485443115, + "learning_rate": 0.00015870175304116244, + "loss": 5.8591, + "num_input_tokens_seen": 137232384, + "step": 1047 + }, + { + "epoch": 0.17954472587367745, + "grad_norm": 1.2840728759765625, + "learning_rate": 0.00015847487407878166, + "loss": 5.9175, + "num_input_tokens_seen": 137625600, + "step": 1050 + }, + { + "epoch": 0.1800577108047451, + "grad_norm": 0.9364380240440369, + "learning_rate": 0.0001582489653765074, + "loss": 5.8954, + "num_input_tokens_seen": 138018816, + "step": 1053 + }, + { + "epoch": 0.18057069573581275, + "grad_norm": 0.835299551486969, + "learning_rate": 0.0001580240200383818, + "loss": 5.866, + "num_input_tokens_seen": 138412032, + "step": 1056 + }, + { + "epoch": 0.1810836806668804, + "grad_norm": 1.4820502996444702, + "learning_rate": 0.0001578000312368693, + "loss": 5.8628, + "num_input_tokens_seen": 138805248, + "step": 1059 + }, + { + "epoch": 0.18159666559794807, + "grad_norm": 1.0213690996170044, + "learning_rate": 0.0001575769922119859, + "loss": 5.8842, + "num_input_tokens_seen": 139198464, + "step": 1062 + }, + { + "epoch": 0.1821096505290157, + "grad_norm": 1.2635438442230225, + "learning_rate": 0.0001573548962704424, + "loss": 5.8898, + "num_input_tokens_seen": 139591680, + "step": 1065 + }, + { + "epoch": 0.18262263546008337, + "grad_norm": 0.9599955677986145, + "learning_rate": 0.00015713373678480076, + "loss": 5.9028, + "num_input_tokens_seen": 139984896, + "step": 1068 + }, + { + "epoch": 0.183135620391151, + "grad_norm": 1.3038582801818848, + "learning_rate": 0.00015691350719264352, + "loss": 5.898, + "num_input_tokens_seen": 140378112, + "step": 1071 + }, + { + "epoch": 0.18364860532221866, + "grad_norm": 1.0840892791748047, + "learning_rate": 0.00015669420099575582, + "loss": 5.8597, + "num_input_tokens_seen": 140771328, + "step": 1074 + }, + { + "epoch": 0.18416159025328632, + "grad_norm": 1.0119037628173828, + "learning_rate": 0.00015647581175932002, + "loss": 5.8969, + "num_input_tokens_seen": 141164544, + "step": 1077 + }, + { + "epoch": 0.18467457518435396, + "grad_norm": 0.8829667568206787, + "learning_rate": 0.00015625833311112293, + "loss": 5.8546, + "num_input_tokens_seen": 141557760, + "step": 1080 + }, + { + "epoch": 0.18518756011542162, + "grad_norm": 0.8768784403800964, + "learning_rate": 0.00015604175874077463, + "loss": 5.8718, + "num_input_tokens_seen": 141950976, + "step": 1083 + }, + { + "epoch": 0.18570054504648925, + "grad_norm": 0.9819164276123047, + "learning_rate": 0.00015582608239893955, + "loss": 5.8585, + "num_input_tokens_seen": 142344192, + "step": 1086 + }, + { + "epoch": 0.1862135299775569, + "grad_norm": 0.8090091943740845, + "learning_rate": 0.00015561129789657898, + "loss": 5.8592, + "num_input_tokens_seen": 142737408, + "step": 1089 + }, + { + "epoch": 0.18672651490862455, + "grad_norm": 0.878285825252533, + "learning_rate": 0.0001553973991042052, + "loss": 5.9156, + "num_input_tokens_seen": 143130624, + "step": 1092 + }, + { + "epoch": 0.1872394998396922, + "grad_norm": 0.8004051446914673, + "learning_rate": 0.00015518437995114688, + "loss": 5.8415, + "num_input_tokens_seen": 143523840, + "step": 1095 + }, + { + "epoch": 0.18775248477075987, + "grad_norm": 0.8821293711662292, + "learning_rate": 0.0001549722344248251, + "loss": 5.8736, + "num_input_tokens_seen": 143917056, + "step": 1098 + }, + { + "epoch": 0.1882654697018275, + "grad_norm": 0.8214676380157471, + "learning_rate": 0.00015476095657004097, + "loss": 5.8634, + "num_input_tokens_seen": 144310272, + "step": 1101 + }, + { + "epoch": 0.18877845463289517, + "grad_norm": 0.9714133739471436, + "learning_rate": 0.00015455054048827327, + "loss": 5.8468, + "num_input_tokens_seen": 144703488, + "step": 1104 + }, + { + "epoch": 0.1892914395639628, + "grad_norm": 0.9200727939605713, + "learning_rate": 0.00015434098033698665, + "loss": 5.8887, + "num_input_tokens_seen": 145096704, + "step": 1107 + }, + { + "epoch": 0.18980442449503046, + "grad_norm": 0.8920320868492126, + "learning_rate": 0.00015413227032895076, + "loss": 5.8708, + "num_input_tokens_seen": 145489920, + "step": 1110 + }, + { + "epoch": 0.19031740942609812, + "grad_norm": 0.7470078468322754, + "learning_rate": 0.00015392440473156833, + "loss": 5.8795, + "num_input_tokens_seen": 145883136, + "step": 1113 + }, + { + "epoch": 0.19083039435716576, + "grad_norm": 0.8388041853904724, + "learning_rate": 0.0001537173778662143, + "loss": 5.823, + "num_input_tokens_seen": 146276352, + "step": 1116 + }, + { + "epoch": 0.19134337928823342, + "grad_norm": 0.767766535282135, + "learning_rate": 0.00015351118410758416, + "loss": 5.8513, + "num_input_tokens_seen": 146669568, + "step": 1119 + }, + { + "epoch": 0.19185636421930105, + "grad_norm": 0.8732740879058838, + "learning_rate": 0.00015330581788305177, + "loss": 5.8557, + "num_input_tokens_seen": 147062784, + "step": 1122 + }, + { + "epoch": 0.1923693491503687, + "grad_norm": 0.8404717445373535, + "learning_rate": 0.0001531012736720371, + "loss": 5.8324, + "num_input_tokens_seen": 147456000, + "step": 1125 + }, + { + "epoch": 0.19288233408143635, + "grad_norm": 0.9904493093490601, + "learning_rate": 0.0001528975460053826, + "loss": 5.8828, + "num_input_tokens_seen": 147849216, + "step": 1128 + }, + { + "epoch": 0.193395319012504, + "grad_norm": 1.0272847414016724, + "learning_rate": 0.00015269462946473922, + "loss": 5.8644, + "num_input_tokens_seen": 148242432, + "step": 1131 + }, + { + "epoch": 0.19390830394357167, + "grad_norm": 0.9573217630386353, + "learning_rate": 0.00015249251868196107, + "loss": 5.8459, + "num_input_tokens_seen": 148635648, + "step": 1134 + }, + { + "epoch": 0.1944212888746393, + "grad_norm": 0.9274708032608032, + "learning_rate": 0.00015229120833850902, + "loss": 5.8528, + "num_input_tokens_seen": 149028864, + "step": 1137 + }, + { + "epoch": 0.19493427380570696, + "grad_norm": 0.8022924661636353, + "learning_rate": 0.0001520906931648627, + "loss": 5.8369, + "num_input_tokens_seen": 149422080, + "step": 1140 + }, + { + "epoch": 0.1954472587367746, + "grad_norm": 1.0030078887939453, + "learning_rate": 0.00015189096793994132, + "loss": 5.8514, + "num_input_tokens_seen": 149815296, + "step": 1143 + }, + { + "epoch": 0.19596024366784226, + "grad_norm": 1.0330564975738525, + "learning_rate": 0.00015169202749053254, + "loss": 5.877, + "num_input_tokens_seen": 150208512, + "step": 1146 + }, + { + "epoch": 0.1964732285989099, + "grad_norm": 0.8017846345901489, + "learning_rate": 0.00015149386669072978, + "loss": 5.9143, + "num_input_tokens_seen": 150601728, + "step": 1149 + }, + { + "epoch": 0.19698621352997756, + "grad_norm": 0.9474872946739197, + "learning_rate": 0.00015129648046137753, + "loss": 5.8327, + "num_input_tokens_seen": 150994944, + "step": 1152 + }, + { + "epoch": 0.19749919846104522, + "grad_norm": 0.8487737774848938, + "learning_rate": 0.0001510998637695244, + "loss": 5.9011, + "num_input_tokens_seen": 151388160, + "step": 1155 + }, + { + "epoch": 0.19801218339211285, + "grad_norm": 0.8053280711174011, + "learning_rate": 0.00015090401162788414, + "loss": 5.8002, + "num_input_tokens_seen": 151781376, + "step": 1158 + }, + { + "epoch": 0.1985251683231805, + "grad_norm": 0.7677531838417053, + "learning_rate": 0.00015070891909430456, + "loss": 5.8301, + "num_input_tokens_seen": 152174592, + "step": 1161 + }, + { + "epoch": 0.19903815325424815, + "grad_norm": 0.8054178357124329, + "learning_rate": 0.0001505145812712434, + "loss": 5.8065, + "num_input_tokens_seen": 152567808, + "step": 1164 + }, + { + "epoch": 0.1995511381853158, + "grad_norm": 1.0484933853149414, + "learning_rate": 0.00015032099330525203, + "loss": 5.8322, + "num_input_tokens_seen": 152961024, + "step": 1167 + }, + { + "epoch": 0.20006412311638347, + "grad_norm": 0.8616706132888794, + "learning_rate": 0.0001501281503864666, + "loss": 5.8494, + "num_input_tokens_seen": 153354240, + "step": 1170 + }, + { + "epoch": 0.2005771080474511, + "grad_norm": 1.0345689058303833, + "learning_rate": 0.00014993604774810574, + "loss": 5.8326, + "num_input_tokens_seen": 153747456, + "step": 1173 + }, + { + "epoch": 0.20109009297851876, + "grad_norm": 0.9733481407165527, + "learning_rate": 0.00014974468066597592, + "loss": 5.8334, + "num_input_tokens_seen": 154140672, + "step": 1176 + }, + { + "epoch": 0.2016030779095864, + "grad_norm": 0.9011761546134949, + "learning_rate": 0.0001495540444579833, + "loss": 5.7905, + "num_input_tokens_seen": 154533888, + "step": 1179 + }, + { + "epoch": 0.20211606284065406, + "grad_norm": 0.8033789992332458, + "learning_rate": 0.00014936413448365292, + "loss": 5.8216, + "num_input_tokens_seen": 154927104, + "step": 1182 + }, + { + "epoch": 0.2026290477717217, + "grad_norm": 0.8766458630561829, + "learning_rate": 0.00014917494614365384, + "loss": 5.8183, + "num_input_tokens_seen": 155320320, + "step": 1185 + }, + { + "epoch": 0.20314203270278935, + "grad_norm": 0.8766114711761475, + "learning_rate": 0.00014898647487933156, + "loss": 5.8486, + "num_input_tokens_seen": 155713536, + "step": 1188 + }, + { + "epoch": 0.20365501763385702, + "grad_norm": 0.988873302936554, + "learning_rate": 0.00014879871617224662, + "loss": 5.8093, + "num_input_tokens_seen": 156106752, + "step": 1191 + }, + { + "epoch": 0.20416800256492465, + "grad_norm": 0.9486966729164124, + "learning_rate": 0.00014861166554371963, + "loss": 5.7705, + "num_input_tokens_seen": 156499968, + "step": 1194 + }, + { + "epoch": 0.2046809874959923, + "grad_norm": 0.8582146763801575, + "learning_rate": 0.00014842531855438251, + "loss": 5.7997, + "num_input_tokens_seen": 156893184, + "step": 1197 + }, + { + "epoch": 0.20519397242705995, + "grad_norm": 0.7633207440376282, + "learning_rate": 0.00014823967080373592, + "loss": 5.8133, + "num_input_tokens_seen": 157286400, + "step": 1200 + }, + { + "epoch": 0.20519397242705995, + "eval_accuracy": 0.15496498941540465, + "eval_loss": 6.256631851196289, + "eval_runtime": 110.7502, + "eval_samples_per_second": 2.709, + "eval_steps_per_second": 1.354, + "num_input_tokens_seen": 157286400, + "step": 1200 + }, + { + "epoch": 0.2057069573581276, + "grad_norm": 0.7945148944854736, + "learning_rate": 0.000148054717929713, + "loss": 5.8051, + "num_input_tokens_seen": 157679616, + "step": 1203 + }, + { + "epoch": 0.20621994228919527, + "grad_norm": 0.6455596089363098, + "learning_rate": 0.00014787045560824864, + "loss": 5.7968, + "num_input_tokens_seen": 158072832, + "step": 1206 + }, + { + "epoch": 0.2067329272202629, + "grad_norm": 0.7505893111228943, + "learning_rate": 0.00014768687955285517, + "loss": 5.7999, + "num_input_tokens_seen": 158466048, + "step": 1209 + }, + { + "epoch": 0.20724591215133056, + "grad_norm": 0.7592454552650452, + "learning_rate": 0.00014750398551420315, + "loss": 5.8526, + "num_input_tokens_seen": 158859264, + "step": 1212 + }, + { + "epoch": 0.2077588970823982, + "grad_norm": 0.8120488524436951, + "learning_rate": 0.00014732176927970863, + "loss": 5.8122, + "num_input_tokens_seen": 159252480, + "step": 1215 + }, + { + "epoch": 0.20827188201346586, + "grad_norm": 0.9561596512794495, + "learning_rate": 0.0001471402266731254, + "loss": 5.7933, + "num_input_tokens_seen": 159645696, + "step": 1218 + }, + { + "epoch": 0.2087848669445335, + "grad_norm": 1.0503617525100708, + "learning_rate": 0.00014695935355414297, + "loss": 5.8015, + "num_input_tokens_seen": 160038912, + "step": 1221 + }, + { + "epoch": 0.20929785187560115, + "grad_norm": 0.7630879282951355, + "learning_rate": 0.00014677914581799015, + "loss": 5.8202, + "num_input_tokens_seen": 160432128, + "step": 1224 + }, + { + "epoch": 0.20981083680666882, + "grad_norm": 0.8692222237586975, + "learning_rate": 0.00014659959939504366, + "loss": 5.8312, + "num_input_tokens_seen": 160825344, + "step": 1227 + }, + { + "epoch": 0.21032382173773645, + "grad_norm": 0.9576478004455566, + "learning_rate": 0.00014642071025044203, + "loss": 5.8261, + "num_input_tokens_seen": 161218560, + "step": 1230 + }, + { + "epoch": 0.2108368066688041, + "grad_norm": 1.0278856754302979, + "learning_rate": 0.000146242474383705, + "loss": 5.8203, + "num_input_tokens_seen": 161611776, + "step": 1233 + }, + { + "epoch": 0.21134979159987174, + "grad_norm": 1.0111125707626343, + "learning_rate": 0.00014606488782835757, + "loss": 5.78, + "num_input_tokens_seen": 162004992, + "step": 1236 + }, + { + "epoch": 0.2118627765309394, + "grad_norm": 1.093246340751648, + "learning_rate": 0.00014588794665155937, + "loss": 5.8341, + "num_input_tokens_seen": 162398208, + "step": 1239 + }, + { + "epoch": 0.21237576146200704, + "grad_norm": 0.9989133477210999, + "learning_rate": 0.0001457116469537388, + "loss": 5.8199, + "num_input_tokens_seen": 162791424, + "step": 1242 + }, + { + "epoch": 0.2128887463930747, + "grad_norm": 0.9265642762184143, + "learning_rate": 0.00014553598486823202, + "loss": 5.833, + "num_input_tokens_seen": 163184640, + "step": 1245 + }, + { + "epoch": 0.21340173132414236, + "grad_norm": 0.9529325366020203, + "learning_rate": 0.0001453609565609269, + "loss": 5.7129, + "num_input_tokens_seen": 163577856, + "step": 1248 + }, + { + "epoch": 0.21391471625521, + "grad_norm": 0.8420143723487854, + "learning_rate": 0.00014518655822991146, + "loss": 5.7953, + "num_input_tokens_seen": 163971072, + "step": 1251 + }, + { + "epoch": 0.21442770118627766, + "grad_norm": 0.977813720703125, + "learning_rate": 0.0001450127861051269, + "loss": 5.8014, + "num_input_tokens_seen": 164364288, + "step": 1254 + }, + { + "epoch": 0.2149406861173453, + "grad_norm": 0.8765429854393005, + "learning_rate": 0.00014483963644802545, + "loss": 5.8276, + "num_input_tokens_seen": 164757504, + "step": 1257 + }, + { + "epoch": 0.21545367104841295, + "grad_norm": 0.8605163097381592, + "learning_rate": 0.00014466710555123243, + "loss": 5.726, + "num_input_tokens_seen": 165150720, + "step": 1260 + }, + { + "epoch": 0.21596665597948062, + "grad_norm": 1.005022644996643, + "learning_rate": 0.000144495189738213, + "loss": 5.8138, + "num_input_tokens_seen": 165543936, + "step": 1263 + }, + { + "epoch": 0.21647964091054825, + "grad_norm": 0.8421231508255005, + "learning_rate": 0.00014432388536294303, + "loss": 5.8112, + "num_input_tokens_seen": 165937152, + "step": 1266 + }, + { + "epoch": 0.2169926258416159, + "grad_norm": 0.8746516108512878, + "learning_rate": 0.00014415318880958418, + "loss": 5.8362, + "num_input_tokens_seen": 166330368, + "step": 1269 + }, + { + "epoch": 0.21750561077268354, + "grad_norm": 0.9044854044914246, + "learning_rate": 0.0001439830964921636, + "loss": 5.8002, + "num_input_tokens_seen": 166723584, + "step": 1272 + }, + { + "epoch": 0.2180185957037512, + "grad_norm": 1.2117860317230225, + "learning_rate": 0.00014381360485425735, + "loss": 5.8231, + "num_input_tokens_seen": 167116800, + "step": 1275 + }, + { + "epoch": 0.21853158063481884, + "grad_norm": 1.057684302330017, + "learning_rate": 0.00014364471036867806, + "loss": 5.8179, + "num_input_tokens_seen": 167510016, + "step": 1278 + }, + { + "epoch": 0.2190445655658865, + "grad_norm": 0.8717806339263916, + "learning_rate": 0.00014347640953716679, + "loss": 5.7661, + "num_input_tokens_seen": 167903232, + "step": 1281 + }, + { + "epoch": 0.21955755049695416, + "grad_norm": 0.7429178953170776, + "learning_rate": 0.00014330869889008863, + "loss": 5.7949, + "num_input_tokens_seen": 168296448, + "step": 1284 + }, + { + "epoch": 0.2200705354280218, + "grad_norm": 0.7271013855934143, + "learning_rate": 0.00014314157498613212, + "loss": 5.8544, + "num_input_tokens_seen": 168689664, + "step": 1287 + }, + { + "epoch": 0.22058352035908946, + "grad_norm": 0.8151692748069763, + "learning_rate": 0.0001429750344120129, + "loss": 5.808, + "num_input_tokens_seen": 169082880, + "step": 1290 + }, + { + "epoch": 0.2210965052901571, + "grad_norm": 0.7940250039100647, + "learning_rate": 0.00014280907378218079, + "loss": 5.8021, + "num_input_tokens_seen": 169476096, + "step": 1293 + }, + { + "epoch": 0.22160949022122475, + "grad_norm": 0.7792456150054932, + "learning_rate": 0.00014264368973853074, + "loss": 5.8054, + "num_input_tokens_seen": 169869312, + "step": 1296 + }, + { + "epoch": 0.22212247515229241, + "grad_norm": 0.7758190035820007, + "learning_rate": 0.00014247887895011744, + "loss": 5.7821, + "num_input_tokens_seen": 170262528, + "step": 1299 + }, + { + "epoch": 0.22263546008336005, + "grad_norm": 1.006454348564148, + "learning_rate": 0.00014231463811287352, + "loss": 5.8103, + "num_input_tokens_seen": 170655744, + "step": 1302 + }, + { + "epoch": 0.2231484450144277, + "grad_norm": 0.9445181488990784, + "learning_rate": 0.00014215096394933147, + "loss": 5.7801, + "num_input_tokens_seen": 171048960, + "step": 1305 + }, + { + "epoch": 0.22366142994549534, + "grad_norm": 1.0699506998062134, + "learning_rate": 0.00014198785320834877, + "loss": 5.7746, + "num_input_tokens_seen": 171442176, + "step": 1308 + }, + { + "epoch": 0.224174414876563, + "grad_norm": 0.8202515244483948, + "learning_rate": 0.0001418253026648367, + "loss": 5.7614, + "num_input_tokens_seen": 171835392, + "step": 1311 + }, + { + "epoch": 0.22468739980763064, + "grad_norm": 0.8101188540458679, + "learning_rate": 0.00014166330911949266, + "loss": 5.772, + "num_input_tokens_seen": 172228608, + "step": 1314 + }, + { + "epoch": 0.2252003847386983, + "grad_norm": 0.8872966766357422, + "learning_rate": 0.00014150186939853544, + "loss": 5.7827, + "num_input_tokens_seen": 172621824, + "step": 1317 + }, + { + "epoch": 0.22571336966976596, + "grad_norm": 0.9942976236343384, + "learning_rate": 0.00014134098035344428, + "loss": 5.7677, + "num_input_tokens_seen": 173015040, + "step": 1320 + }, + { + "epoch": 0.2262263546008336, + "grad_norm": 0.8756197094917297, + "learning_rate": 0.00014118063886070086, + "loss": 5.7569, + "num_input_tokens_seen": 173408256, + "step": 1323 + }, + { + "epoch": 0.22673933953190126, + "grad_norm": 0.9162293672561646, + "learning_rate": 0.00014102084182153463, + "loss": 5.7365, + "num_input_tokens_seen": 173801472, + "step": 1326 + }, + { + "epoch": 0.2272523244629689, + "grad_norm": 0.7763002514839172, + "learning_rate": 0.00014086158616167125, + "loss": 5.7591, + "num_input_tokens_seen": 174194688, + "step": 1329 + }, + { + "epoch": 0.22776530939403655, + "grad_norm": 0.7513299584388733, + "learning_rate": 0.0001407028688310842, + "loss": 5.7568, + "num_input_tokens_seen": 174587904, + "step": 1332 + }, + { + "epoch": 0.2282782943251042, + "grad_norm": 0.8344148397445679, + "learning_rate": 0.0001405446868037495, + "loss": 5.7722, + "num_input_tokens_seen": 174981120, + "step": 1335 + }, + { + "epoch": 0.22879127925617185, + "grad_norm": 0.9340312480926514, + "learning_rate": 0.00014038703707740325, + "loss": 5.774, + "num_input_tokens_seen": 175374336, + "step": 1338 + }, + { + "epoch": 0.2293042641872395, + "grad_norm": 0.8781185746192932, + "learning_rate": 0.0001402299166733024, + "loss": 5.7994, + "num_input_tokens_seen": 175767552, + "step": 1341 + }, + { + "epoch": 0.22981724911830714, + "grad_norm": 0.7272279262542725, + "learning_rate": 0.00014007332263598843, + "loss": 5.8141, + "num_input_tokens_seen": 176160768, + "step": 1344 + }, + { + "epoch": 0.2303302340493748, + "grad_norm": 0.8324047327041626, + "learning_rate": 0.0001399172520330537, + "loss": 5.7908, + "num_input_tokens_seen": 176553984, + "step": 1347 + }, + { + "epoch": 0.23084321898044244, + "grad_norm": 1.018019199371338, + "learning_rate": 0.00013976170195491086, + "loss": 5.7865, + "num_input_tokens_seen": 176947200, + "step": 1350 + }, + { + "epoch": 0.2313562039115101, + "grad_norm": 0.9174796342849731, + "learning_rate": 0.00013960666951456512, + "loss": 5.7007, + "num_input_tokens_seen": 177340416, + "step": 1353 + }, + { + "epoch": 0.23186918884257776, + "grad_norm": 0.9878000617027283, + "learning_rate": 0.00013945215184738905, + "loss": 5.7603, + "num_input_tokens_seen": 177733632, + "step": 1356 + }, + { + "epoch": 0.2323821737736454, + "grad_norm": 0.9942120909690857, + "learning_rate": 0.00013929814611090044, + "loss": 5.7836, + "num_input_tokens_seen": 178126848, + "step": 1359 + }, + { + "epoch": 0.23289515870471306, + "grad_norm": 1.0599764585494995, + "learning_rate": 0.00013914464948454254, + "loss": 5.7588, + "num_input_tokens_seen": 178520064, + "step": 1362 + }, + { + "epoch": 0.2334081436357807, + "grad_norm": 0.9592947959899902, + "learning_rate": 0.00013899165916946712, + "loss": 5.7805, + "num_input_tokens_seen": 178913280, + "step": 1365 + }, + { + "epoch": 0.23392112856684835, + "grad_norm": 0.8882340788841248, + "learning_rate": 0.00013883917238832015, + "loss": 5.722, + "num_input_tokens_seen": 179306496, + "step": 1368 + }, + { + "epoch": 0.23443411349791599, + "grad_norm": 1.0210182666778564, + "learning_rate": 0.00013868718638503002, + "loss": 5.7775, + "num_input_tokens_seen": 179699712, + "step": 1371 + }, + { + "epoch": 0.23494709842898365, + "grad_norm": 1.022925853729248, + "learning_rate": 0.00013853569842459833, + "loss": 5.7809, + "num_input_tokens_seen": 180092928, + "step": 1374 + }, + { + "epoch": 0.2354600833600513, + "grad_norm": 0.926238477230072, + "learning_rate": 0.00013838470579289325, + "loss": 5.7873, + "num_input_tokens_seen": 180486144, + "step": 1377 + }, + { + "epoch": 0.23597306829111894, + "grad_norm": 0.9598406553268433, + "learning_rate": 0.000138234205796445, + "loss": 5.7696, + "num_input_tokens_seen": 180879360, + "step": 1380 + }, + { + "epoch": 0.2364860532221866, + "grad_norm": 0.8754788637161255, + "learning_rate": 0.00013808419576224448, + "loss": 5.7407, + "num_input_tokens_seen": 181272576, + "step": 1383 + }, + { + "epoch": 0.23699903815325424, + "grad_norm": 0.8794489502906799, + "learning_rate": 0.0001379346730375435, + "loss": 5.7445, + "num_input_tokens_seen": 181665792, + "step": 1386 + }, + { + "epoch": 0.2375120230843219, + "grad_norm": 0.7969871759414673, + "learning_rate": 0.0001377856349896579, + "loss": 5.7955, + "num_input_tokens_seen": 182059008, + "step": 1389 + }, + { + "epoch": 0.23802500801538956, + "grad_norm": 0.9116011261940002, + "learning_rate": 0.00013763707900577292, + "loss": 5.7498, + "num_input_tokens_seen": 182452224, + "step": 1392 + }, + { + "epoch": 0.2385379929464572, + "grad_norm": 1.0278836488723755, + "learning_rate": 0.0001374890024927507, + "loss": 5.7964, + "num_input_tokens_seen": 182845440, + "step": 1395 + }, + { + "epoch": 0.23905097787752486, + "grad_norm": 0.8830828070640564, + "learning_rate": 0.00013734140287694022, + "loss": 5.7192, + "num_input_tokens_seen": 183238656, + "step": 1398 + }, + { + "epoch": 0.2395639628085925, + "grad_norm": 0.9770954847335815, + "learning_rate": 0.0001371942776039894, + "loss": 5.7229, + "num_input_tokens_seen": 183631872, + "step": 1401 + }, + { + "epoch": 0.24007694773966015, + "grad_norm": 0.7094408273696899, + "learning_rate": 0.0001370476241386595, + "loss": 5.7209, + "num_input_tokens_seen": 184025088, + "step": 1404 + }, + { + "epoch": 0.24058993267072779, + "grad_norm": 0.9037938117980957, + "learning_rate": 0.00013690143996464142, + "loss": 5.8047, + "num_input_tokens_seen": 184418304, + "step": 1407 + }, + { + "epoch": 0.24110291760179545, + "grad_norm": 0.7936687469482422, + "learning_rate": 0.00013675572258437476, + "loss": 5.7976, + "num_input_tokens_seen": 184811520, + "step": 1410 + }, + { + "epoch": 0.2416159025328631, + "grad_norm": 0.7403805255889893, + "learning_rate": 0.00013661046951886816, + "loss": 5.7839, + "num_input_tokens_seen": 185204736, + "step": 1413 + }, + { + "epoch": 0.24212888746393074, + "grad_norm": 0.7771823406219482, + "learning_rate": 0.00013646567830752246, + "loss": 5.7907, + "num_input_tokens_seen": 185597952, + "step": 1416 + }, + { + "epoch": 0.2426418723949984, + "grad_norm": 0.6545729637145996, + "learning_rate": 0.0001363213465079555, + "loss": 5.6881, + "num_input_tokens_seen": 185991168, + "step": 1419 + }, + { + "epoch": 0.24315485732606604, + "grad_norm": 0.8105820417404175, + "learning_rate": 0.00013617747169582915, + "loss": 5.7786, + "num_input_tokens_seen": 186384384, + "step": 1422 + }, + { + "epoch": 0.2436678422571337, + "grad_norm": 0.9515424370765686, + "learning_rate": 0.00013603405146467827, + "loss": 5.7501, + "num_input_tokens_seen": 186777600, + "step": 1425 + }, + { + "epoch": 0.24418082718820136, + "grad_norm": 0.8837321400642395, + "learning_rate": 0.00013589108342574154, + "loss": 5.7674, + "num_input_tokens_seen": 187170816, + "step": 1428 + }, + { + "epoch": 0.244693812119269, + "grad_norm": 0.7665640711784363, + "learning_rate": 0.0001357485652077945, + "loss": 5.7271, + "num_input_tokens_seen": 187564032, + "step": 1431 + }, + { + "epoch": 0.24520679705033666, + "grad_norm": 0.7923889756202698, + "learning_rate": 0.00013560649445698437, + "loss": 5.7707, + "num_input_tokens_seen": 187957248, + "step": 1434 + }, + { + "epoch": 0.2457197819814043, + "grad_norm": 0.806024968624115, + "learning_rate": 0.00013546486883666656, + "loss": 5.7383, + "num_input_tokens_seen": 188350464, + "step": 1437 + }, + { + "epoch": 0.24623276691247195, + "grad_norm": 0.8087600469589233, + "learning_rate": 0.00013532368602724355, + "loss": 5.746, + "num_input_tokens_seen": 188743680, + "step": 1440 + }, + { + "epoch": 0.24674575184353958, + "grad_norm": 0.7250015735626221, + "learning_rate": 0.00013518294372600513, + "loss": 5.7275, + "num_input_tokens_seen": 189136896, + "step": 1443 + }, + { + "epoch": 0.24725873677460725, + "grad_norm": 0.8613927364349365, + "learning_rate": 0.00013504263964697066, + "loss": 5.7787, + "num_input_tokens_seen": 189530112, + "step": 1446 + }, + { + "epoch": 0.2477717217056749, + "grad_norm": 0.839963972568512, + "learning_rate": 0.00013490277152073324, + "loss": 5.7404, + "num_input_tokens_seen": 189923328, + "step": 1449 + }, + { + "epoch": 0.24828470663674254, + "grad_norm": 1.002913236618042, + "learning_rate": 0.0001347633370943054, + "loss": 5.7409, + "num_input_tokens_seen": 190316544, + "step": 1452 + }, + { + "epoch": 0.2487976915678102, + "grad_norm": 0.9510200619697571, + "learning_rate": 0.00013462433413096678, + "loss": 5.7916, + "num_input_tokens_seen": 190709760, + "step": 1455 + }, + { + "epoch": 0.24931067649887784, + "grad_norm": 0.8563526272773743, + "learning_rate": 0.00013448576041011335, + "loss": 5.692, + "num_input_tokens_seen": 191102976, + "step": 1458 + }, + { + "epoch": 0.2498236614299455, + "grad_norm": 1.1031644344329834, + "learning_rate": 0.0001343476137271086, + "loss": 5.7583, + "num_input_tokens_seen": 191496192, + "step": 1461 + }, + { + "epoch": 0.25033664636101316, + "grad_norm": 1.085344672203064, + "learning_rate": 0.00013420989189313586, + "loss": 5.7663, + "num_input_tokens_seen": 191889408, + "step": 1464 + }, + { + "epoch": 0.2508496312920808, + "grad_norm": 0.8237776160240173, + "learning_rate": 0.00013407259273505302, + "loss": 5.7291, + "num_input_tokens_seen": 192282624, + "step": 1467 + }, + { + "epoch": 0.2513626162231484, + "grad_norm": 0.7341859340667725, + "learning_rate": 0.00013393571409524825, + "loss": 5.7751, + "num_input_tokens_seen": 192675840, + "step": 1470 + }, + { + "epoch": 0.2518756011542161, + "grad_norm": 0.7785446047782898, + "learning_rate": 0.0001337992538314978, + "loss": 5.705, + "num_input_tokens_seen": 193069056, + "step": 1473 + }, + { + "epoch": 0.25238858608528375, + "grad_norm": 0.9106130003929138, + "learning_rate": 0.00013366320981682498, + "loss": 5.7583, + "num_input_tokens_seen": 193462272, + "step": 1476 + }, + { + "epoch": 0.2529015710163514, + "grad_norm": 0.8381192684173584, + "learning_rate": 0.0001335275799393611, + "loss": 5.725, + "num_input_tokens_seen": 193855488, + "step": 1479 + }, + { + "epoch": 0.253414555947419, + "grad_norm": 0.8275421261787415, + "learning_rate": 0.00013339236210220762, + "loss": 5.727, + "num_input_tokens_seen": 194248704, + "step": 1482 + }, + { + "epoch": 0.2539275408784867, + "grad_norm": 0.9162700176239014, + "learning_rate": 0.00013325755422330005, + "loss": 5.7105, + "num_input_tokens_seen": 194641920, + "step": 1485 + }, + { + "epoch": 0.25444052580955434, + "grad_norm": 0.7449747323989868, + "learning_rate": 0.0001331231542352734, + "loss": 5.7681, + "num_input_tokens_seen": 195035136, + "step": 1488 + }, + { + "epoch": 0.254953510740622, + "grad_norm": 0.7723684310913086, + "learning_rate": 0.00013298916008532878, + "loss": 5.7201, + "num_input_tokens_seen": 195428352, + "step": 1491 + }, + { + "epoch": 0.25546649567168966, + "grad_norm": 0.7799750566482544, + "learning_rate": 0.000132855569735102, + "loss": 5.7213, + "num_input_tokens_seen": 195821568, + "step": 1494 + }, + { + "epoch": 0.25597948060275727, + "grad_norm": 1.017113447189331, + "learning_rate": 0.00013272238116053312, + "loss": 5.7102, + "num_input_tokens_seen": 196214784, + "step": 1497 + }, + { + "epoch": 0.25649246553382493, + "grad_norm": 0.9866935014724731, + "learning_rate": 0.00013258959235173777, + "loss": 5.7064, + "num_input_tokens_seen": 196608000, + "step": 1500 + }, + { + "epoch": 0.2570054504648926, + "grad_norm": 1.0269203186035156, + "learning_rate": 0.0001324572013128796, + "loss": 5.7444, + "num_input_tokens_seen": 197001216, + "step": 1503 + }, + { + "epoch": 0.25751843539596025, + "grad_norm": 1.189927101135254, + "learning_rate": 0.00013232520606204452, + "loss": 5.6448, + "num_input_tokens_seen": 197394432, + "step": 1506 + }, + { + "epoch": 0.2580314203270279, + "grad_norm": 1.0342806577682495, + "learning_rate": 0.0001321936046311159, + "loss": 5.6614, + "num_input_tokens_seen": 197787648, + "step": 1509 + }, + { + "epoch": 0.2585444052580955, + "grad_norm": 0.8518815040588379, + "learning_rate": 0.0001320623950656514, + "loss": 5.7448, + "num_input_tokens_seen": 198180864, + "step": 1512 + }, + { + "epoch": 0.2590573901891632, + "grad_norm": 0.7017694711685181, + "learning_rate": 0.00013193157542476102, + "loss": 5.7127, + "num_input_tokens_seen": 198574080, + "step": 1515 + }, + { + "epoch": 0.25957037512023085, + "grad_norm": 0.7911322712898254, + "learning_rate": 0.00013180114378098651, + "loss": 5.6725, + "num_input_tokens_seen": 198967296, + "step": 1518 + }, + { + "epoch": 0.2600833600512985, + "grad_norm": 0.7968602776527405, + "learning_rate": 0.0001316710982201822, + "loss": 5.7116, + "num_input_tokens_seen": 199360512, + "step": 1521 + }, + { + "epoch": 0.26059634498236617, + "grad_norm": 0.8894298672676086, + "learning_rate": 0.0001315414368413969, + "loss": 5.7022, + "num_input_tokens_seen": 199753728, + "step": 1524 + }, + { + "epoch": 0.2611093299134338, + "grad_norm": 0.9683092832565308, + "learning_rate": 0.00013141215775675717, + "loss": 5.7418, + "num_input_tokens_seen": 200146944, + "step": 1527 + }, + { + "epoch": 0.26162231484450144, + "grad_norm": 0.947652280330658, + "learning_rate": 0.000131283259091352, + "loss": 5.7165, + "num_input_tokens_seen": 200540160, + "step": 1530 + }, + { + "epoch": 0.2621352997755691, + "grad_norm": 0.7747431993484497, + "learning_rate": 0.00013115473898311848, + "loss": 5.7537, + "num_input_tokens_seen": 200933376, + "step": 1533 + }, + { + "epoch": 0.26264828470663676, + "grad_norm": 0.719679594039917, + "learning_rate": 0.00013102659558272893, + "loss": 5.7737, + "num_input_tokens_seen": 201326592, + "step": 1536 + }, + { + "epoch": 0.26316126963770436, + "grad_norm": 0.7812837958335876, + "learning_rate": 0.000130898827053479, + "loss": 5.7438, + "num_input_tokens_seen": 201719808, + "step": 1539 + }, + { + "epoch": 0.263674254568772, + "grad_norm": 0.8203752040863037, + "learning_rate": 0.00013077143157117724, + "loss": 5.7158, + "num_input_tokens_seen": 202113024, + "step": 1542 + }, + { + "epoch": 0.2641872394998397, + "grad_norm": 0.9190983772277832, + "learning_rate": 0.00013064440732403566, + "loss": 5.6373, + "num_input_tokens_seen": 202506240, + "step": 1545 + }, + { + "epoch": 0.26470022443090735, + "grad_norm": 0.894138753414154, + "learning_rate": 0.0001305177525125614, + "loss": 5.7164, + "num_input_tokens_seen": 202899456, + "step": 1548 + }, + { + "epoch": 0.265213209361975, + "grad_norm": 1.0378000736236572, + "learning_rate": 0.00013039146534944986, + "loss": 5.7008, + "num_input_tokens_seen": 203292672, + "step": 1551 + }, + { + "epoch": 0.2657261942930426, + "grad_norm": 0.9447348117828369, + "learning_rate": 0.00013026554405947864, + "loss": 5.7429, + "num_input_tokens_seen": 203685888, + "step": 1554 + }, + { + "epoch": 0.2662391792241103, + "grad_norm": 0.8675848841667175, + "learning_rate": 0.00013013998687940264, + "loss": 5.7207, + "num_input_tokens_seen": 204079104, + "step": 1557 + }, + { + "epoch": 0.26675216415517794, + "grad_norm": 1.0198581218719482, + "learning_rate": 0.00013001479205785067, + "loss": 5.7049, + "num_input_tokens_seen": 204472320, + "step": 1560 + }, + { + "epoch": 0.2672651490862456, + "grad_norm": 0.863447368144989, + "learning_rate": 0.0001298899578552225, + "loss": 5.753, + "num_input_tokens_seen": 204865536, + "step": 1563 + }, + { + "epoch": 0.26777813401731326, + "grad_norm": 0.9340473413467407, + "learning_rate": 0.0001297654825435875, + "loss": 5.6951, + "num_input_tokens_seen": 205258752, + "step": 1566 + }, + { + "epoch": 0.26829111894838087, + "grad_norm": 0.826303243637085, + "learning_rate": 0.0001296413644065842, + "loss": 5.6856, + "num_input_tokens_seen": 205651968, + "step": 1569 + }, + { + "epoch": 0.26880410387944853, + "grad_norm": 0.8205902576446533, + "learning_rate": 0.0001295176017393209, + "loss": 5.7166, + "num_input_tokens_seen": 206045184, + "step": 1572 + }, + { + "epoch": 0.2693170888105162, + "grad_norm": 0.8139010667800903, + "learning_rate": 0.00012939419284827716, + "loss": 5.7012, + "num_input_tokens_seen": 206438400, + "step": 1575 + }, + { + "epoch": 0.26983007374158385, + "grad_norm": 0.8484136462211609, + "learning_rate": 0.00012927113605120665, + "loss": 5.7306, + "num_input_tokens_seen": 206831616, + "step": 1578 + }, + { + "epoch": 0.2703430586726515, + "grad_norm": 0.8395585417747498, + "learning_rate": 0.00012914842967704074, + "loss": 5.7253, + "num_input_tokens_seen": 207224832, + "step": 1581 + }, + { + "epoch": 0.2708560436037191, + "grad_norm": 0.8231976628303528, + "learning_rate": 0.00012902607206579324, + "loss": 5.6968, + "num_input_tokens_seen": 207618048, + "step": 1584 + }, + { + "epoch": 0.2713690285347868, + "grad_norm": 0.9288123250007629, + "learning_rate": 0.000128904061568466, + "loss": 5.6838, + "num_input_tokens_seen": 208011264, + "step": 1587 + }, + { + "epoch": 0.27188201346585444, + "grad_norm": 1.3254705667495728, + "learning_rate": 0.00012878239654695573, + "loss": 5.7356, + "num_input_tokens_seen": 208404480, + "step": 1590 + }, + { + "epoch": 0.2723949983969221, + "grad_norm": 0.9656926393508911, + "learning_rate": 0.0001286610753739614, + "loss": 5.7178, + "num_input_tokens_seen": 208797696, + "step": 1593 + }, + { + "epoch": 0.27290798332798977, + "grad_norm": 0.7304670214653015, + "learning_rate": 0.00012854009643289304, + "loss": 5.7276, + "num_input_tokens_seen": 209190912, + "step": 1596 + }, + { + "epoch": 0.2734209682590574, + "grad_norm": 0.7497060298919678, + "learning_rate": 0.0001284194581177811, + "loss": 5.7212, + "num_input_tokens_seen": 209584128, + "step": 1599 + }, + { + "epoch": 0.2735919632360799, + "eval_accuracy": 0.16197361993160722, + "eval_loss": 6.1410651206970215, + "eval_runtime": 110.7292, + "eval_samples_per_second": 2.709, + "eval_steps_per_second": 1.355, + "num_input_tokens_seen": 209715200, + "step": 1600 + }, + { + "epoch": 0.27393395319012503, + "grad_norm": 0.8801946640014648, + "learning_rate": 0.0001282991588331871, + "loss": 5.6859, + "num_input_tokens_seen": 209977344, + "step": 1602 + }, + { + "epoch": 0.2744469381211927, + "grad_norm": 0.8806388974189758, + "learning_rate": 0.00012817919699411473, + "loss": 5.6518, + "num_input_tokens_seen": 210370560, + "step": 1605 + }, + { + "epoch": 0.27495992305226036, + "grad_norm": 0.8406085968017578, + "learning_rate": 0.00012805957102592246, + "loss": 5.6947, + "num_input_tokens_seen": 210763776, + "step": 1608 + }, + { + "epoch": 0.27547290798332796, + "grad_norm": 0.985683798789978, + "learning_rate": 0.0001279402793642365, + "loss": 5.6736, + "num_input_tokens_seen": 211156992, + "step": 1611 + }, + { + "epoch": 0.2759858929143956, + "grad_norm": 1.0117186307907104, + "learning_rate": 0.00012782132045486498, + "loss": 5.688, + "num_input_tokens_seen": 211550208, + "step": 1614 + }, + { + "epoch": 0.2764988778454633, + "grad_norm": 0.7577993869781494, + "learning_rate": 0.00012770269275371276, + "loss": 5.6798, + "num_input_tokens_seen": 211943424, + "step": 1617 + }, + { + "epoch": 0.27701186277653095, + "grad_norm": 1.0011804103851318, + "learning_rate": 0.0001275843947266976, + "loss": 5.6961, + "num_input_tokens_seen": 212336640, + "step": 1620 + }, + { + "epoch": 0.2775248477075986, + "grad_norm": 0.7885441184043884, + "learning_rate": 0.00012746642484966631, + "loss": 5.7085, + "num_input_tokens_seen": 212729856, + "step": 1623 + }, + { + "epoch": 0.2780378326386662, + "grad_norm": 0.7686509490013123, + "learning_rate": 0.00012734878160831288, + "loss": 5.7337, + "num_input_tokens_seen": 213123072, + "step": 1626 + }, + { + "epoch": 0.2785508175697339, + "grad_norm": 0.7514293789863586, + "learning_rate": 0.00012723146349809627, + "loss": 5.6868, + "num_input_tokens_seen": 213516288, + "step": 1629 + }, + { + "epoch": 0.27906380250080154, + "grad_norm": 0.7466459274291992, + "learning_rate": 0.00012711446902415993, + "loss": 5.6678, + "num_input_tokens_seen": 213909504, + "step": 1632 + }, + { + "epoch": 0.2795767874318692, + "grad_norm": 0.7304571866989136, + "learning_rate": 0.00012699779670125177, + "loss": 5.677, + "num_input_tokens_seen": 214302720, + "step": 1635 + }, + { + "epoch": 0.28008977236293686, + "grad_norm": 0.7830135226249695, + "learning_rate": 0.00012688144505364484, + "loss": 5.6716, + "num_input_tokens_seen": 214695936, + "step": 1638 + }, + { + "epoch": 0.28060275729400447, + "grad_norm": 0.8072492480278015, + "learning_rate": 0.00012676541261505907, + "loss": 5.7114, + "num_input_tokens_seen": 215089152, + "step": 1641 + }, + { + "epoch": 0.28111574222507213, + "grad_norm": 0.828734815120697, + "learning_rate": 0.00012664969792858355, + "loss": 5.6624, + "num_input_tokens_seen": 215482368, + "step": 1644 + }, + { + "epoch": 0.2816287271561398, + "grad_norm": 1.1204993724822998, + "learning_rate": 0.00012653429954659974, + "loss": 5.6849, + "num_input_tokens_seen": 215875584, + "step": 1647 + }, + { + "epoch": 0.28214171208720745, + "grad_norm": 0.8877683877944946, + "learning_rate": 0.00012641921603070546, + "loss": 5.7019, + "num_input_tokens_seen": 216268800, + "step": 1650 + }, + { + "epoch": 0.2826546970182751, + "grad_norm": 0.9409082531929016, + "learning_rate": 0.00012630444595163954, + "loss": 5.6977, + "num_input_tokens_seen": 216662016, + "step": 1653 + }, + { + "epoch": 0.2831676819493427, + "grad_norm": 1.058484435081482, + "learning_rate": 0.0001261899878892072, + "loss": 5.716, + "num_input_tokens_seen": 217055232, + "step": 1656 + }, + { + "epoch": 0.2836806668804104, + "grad_norm": 0.8193328380584717, + "learning_rate": 0.00012607584043220635, + "loss": 5.6651, + "num_input_tokens_seen": 217448448, + "step": 1659 + }, + { + "epoch": 0.28419365181147804, + "grad_norm": 0.8166428208351135, + "learning_rate": 0.00012596200217835447, + "loss": 5.6376, + "num_input_tokens_seen": 217841664, + "step": 1662 + }, + { + "epoch": 0.2847066367425457, + "grad_norm": 0.9603435397148132, + "learning_rate": 0.00012584847173421627, + "loss": 5.7189, + "num_input_tokens_seen": 218234880, + "step": 1665 + }, + { + "epoch": 0.2852196216736133, + "grad_norm": 0.8467148542404175, + "learning_rate": 0.000125735247715132, + "loss": 5.6642, + "num_input_tokens_seen": 218628096, + "step": 1668 + }, + { + "epoch": 0.28573260660468097, + "grad_norm": 0.8301926255226135, + "learning_rate": 0.00012562232874514657, + "loss": 5.6971, + "num_input_tokens_seen": 219021312, + "step": 1671 + }, + { + "epoch": 0.28624559153574863, + "grad_norm": 0.8587663769721985, + "learning_rate": 0.0001255097134569393, + "loss": 5.6961, + "num_input_tokens_seen": 219414528, + "step": 1674 + }, + { + "epoch": 0.2867585764668163, + "grad_norm": 0.9333747625350952, + "learning_rate": 0.00012539740049175436, + "loss": 5.6821, + "num_input_tokens_seen": 219807744, + "step": 1677 + }, + { + "epoch": 0.28727156139788396, + "grad_norm": 0.7336766123771667, + "learning_rate": 0.00012528538849933206, + "loss": 5.678, + "num_input_tokens_seen": 220200960, + "step": 1680 + }, + { + "epoch": 0.28778454632895156, + "grad_norm": 0.7425720691680908, + "learning_rate": 0.00012517367613784042, + "loss": 5.6576, + "num_input_tokens_seen": 220594176, + "step": 1683 + }, + { + "epoch": 0.2882975312600192, + "grad_norm": 0.7801720499992371, + "learning_rate": 0.00012506226207380784, + "loss": 5.6991, + "num_input_tokens_seen": 220987392, + "step": 1686 + }, + { + "epoch": 0.2888105161910869, + "grad_norm": 0.7652679681777954, + "learning_rate": 0.00012495114498205616, + "loss": 5.6489, + "num_input_tokens_seen": 221380608, + "step": 1689 + }, + { + "epoch": 0.28932350112215455, + "grad_norm": 0.8375071287155151, + "learning_rate": 0.00012484032354563457, + "loss": 5.6568, + "num_input_tokens_seen": 221773824, + "step": 1692 + }, + { + "epoch": 0.2898364860532222, + "grad_norm": 0.7536932826042175, + "learning_rate": 0.0001247297964557539, + "loss": 5.6788, + "num_input_tokens_seen": 222167040, + "step": 1695 + }, + { + "epoch": 0.2903494709842898, + "grad_norm": 0.807601273059845, + "learning_rate": 0.0001246195624117219, + "loss": 5.6461, + "num_input_tokens_seen": 222560256, + "step": 1698 + }, + { + "epoch": 0.2908624559153575, + "grad_norm": 0.700380802154541, + "learning_rate": 0.0001245096201208786, + "loss": 5.6886, + "num_input_tokens_seen": 222953472, + "step": 1701 + }, + { + "epoch": 0.29137544084642514, + "grad_norm": 0.7354556322097778, + "learning_rate": 0.00012439996829853315, + "loss": 5.6543, + "num_input_tokens_seen": 223346688, + "step": 1704 + }, + { + "epoch": 0.2918884257774928, + "grad_norm": 0.8027917146682739, + "learning_rate": 0.00012429060566790032, + "loss": 5.7035, + "num_input_tokens_seen": 223739904, + "step": 1707 + }, + { + "epoch": 0.29240141070856046, + "grad_norm": 0.9573730826377869, + "learning_rate": 0.0001241815309600383, + "loss": 5.6513, + "num_input_tokens_seen": 224133120, + "step": 1710 + }, + { + "epoch": 0.29291439563962807, + "grad_norm": 1.0752445459365845, + "learning_rate": 0.00012407274291378672, + "loss": 5.6466, + "num_input_tokens_seen": 224526336, + "step": 1713 + }, + { + "epoch": 0.29342738057069573, + "grad_norm": 1.0931577682495117, + "learning_rate": 0.0001239642402757056, + "loss": 5.6842, + "num_input_tokens_seen": 224919552, + "step": 1716 + }, + { + "epoch": 0.2939403655017634, + "grad_norm": 0.7436386346817017, + "learning_rate": 0.00012385602180001445, + "loss": 5.645, + "num_input_tokens_seen": 225312768, + "step": 1719 + }, + { + "epoch": 0.29445335043283105, + "grad_norm": 0.7766621708869934, + "learning_rate": 0.0001237480862485324, + "loss": 5.6611, + "num_input_tokens_seen": 225705984, + "step": 1722 + }, + { + "epoch": 0.2949663353638987, + "grad_norm": 0.8995407223701477, + "learning_rate": 0.0001236404323906186, + "loss": 5.6912, + "num_input_tokens_seen": 226099200, + "step": 1725 + }, + { + "epoch": 0.2954793202949663, + "grad_norm": 0.9217162728309631, + "learning_rate": 0.00012353305900311327, + "loss": 5.6695, + "num_input_tokens_seen": 226492416, + "step": 1728 + }, + { + "epoch": 0.295992305226034, + "grad_norm": 1.1063520908355713, + "learning_rate": 0.00012342596487027938, + "loss": 5.6392, + "num_input_tokens_seen": 226885632, + "step": 1731 + }, + { + "epoch": 0.29650529015710164, + "grad_norm": 1.064606785774231, + "learning_rate": 0.00012331914878374486, + "loss": 5.6703, + "num_input_tokens_seen": 227278848, + "step": 1734 + }, + { + "epoch": 0.2970182750881693, + "grad_norm": 0.8207647800445557, + "learning_rate": 0.00012321260954244523, + "loss": 5.6174, + "num_input_tokens_seen": 227672064, + "step": 1737 + }, + { + "epoch": 0.2975312600192369, + "grad_norm": 0.9838363528251648, + "learning_rate": 0.00012310634595256696, + "loss": 5.6604, + "num_input_tokens_seen": 228065280, + "step": 1740 + }, + { + "epoch": 0.29804424495030457, + "grad_norm": 1.094573974609375, + "learning_rate": 0.0001230003568274913, + "loss": 5.6628, + "num_input_tokens_seen": 228458496, + "step": 1743 + }, + { + "epoch": 0.29855722988137223, + "grad_norm": 0.8992403745651245, + "learning_rate": 0.00012289464098773857, + "loss": 5.6452, + "num_input_tokens_seen": 228851712, + "step": 1746 + }, + { + "epoch": 0.2990702148124399, + "grad_norm": 0.9246178269386292, + "learning_rate": 0.00012278919726091303, + "loss": 5.6766, + "num_input_tokens_seen": 229244928, + "step": 1749 + }, + { + "epoch": 0.29958319974350756, + "grad_norm": 0.8067333102226257, + "learning_rate": 0.00012268402448164836, + "loss": 5.6916, + "num_input_tokens_seen": 229638144, + "step": 1752 + }, + { + "epoch": 0.30009618467457516, + "grad_norm": 0.8054561614990234, + "learning_rate": 0.00012257912149155346, + "loss": 5.6736, + "num_input_tokens_seen": 230031360, + "step": 1755 + }, + { + "epoch": 0.3006091696056428, + "grad_norm": 0.8408488631248474, + "learning_rate": 0.00012247448713915892, + "loss": 5.7006, + "num_input_tokens_seen": 230424576, + "step": 1758 + }, + { + "epoch": 0.3011221545367105, + "grad_norm": 0.8754010200500488, + "learning_rate": 0.00012237012027986385, + "loss": 5.6319, + "num_input_tokens_seen": 230817792, + "step": 1761 + }, + { + "epoch": 0.30163513946777815, + "grad_norm": 0.9594668745994568, + "learning_rate": 0.00012226601977588348, + "loss": 5.6849, + "num_input_tokens_seen": 231211008, + "step": 1764 + }, + { + "epoch": 0.3021481243988458, + "grad_norm": 0.7682183384895325, + "learning_rate": 0.0001221621844961969, + "loss": 5.6842, + "num_input_tokens_seen": 231604224, + "step": 1767 + }, + { + "epoch": 0.3026611093299134, + "grad_norm": 0.8172413110733032, + "learning_rate": 0.00012205861331649545, + "loss": 5.7111, + "num_input_tokens_seen": 231997440, + "step": 1770 + }, + { + "epoch": 0.3031740942609811, + "grad_norm": 0.893372118473053, + "learning_rate": 0.0001219553051191317, + "loss": 5.6743, + "num_input_tokens_seen": 232390656, + "step": 1773 + }, + { + "epoch": 0.30368707919204874, + "grad_norm": 0.8585564494132996, + "learning_rate": 0.00012185225879306862, + "loss": 5.6044, + "num_input_tokens_seen": 232783872, + "step": 1776 + }, + { + "epoch": 0.3042000641231164, + "grad_norm": 0.7689789533615112, + "learning_rate": 0.00012174947323382965, + "loss": 5.6363, + "num_input_tokens_seen": 233177088, + "step": 1779 + }, + { + "epoch": 0.30471304905418406, + "grad_norm": 0.7540669441223145, + "learning_rate": 0.00012164694734344876, + "loss": 5.6607, + "num_input_tokens_seen": 233570304, + "step": 1782 + }, + { + "epoch": 0.30522603398525167, + "grad_norm": 0.9404392242431641, + "learning_rate": 0.00012154468003042123, + "loss": 5.6585, + "num_input_tokens_seen": 233963520, + "step": 1785 + }, + { + "epoch": 0.3057390189163193, + "grad_norm": 1.020546793937683, + "learning_rate": 0.00012144267020965491, + "loss": 5.629, + "num_input_tokens_seen": 234356736, + "step": 1788 + }, + { + "epoch": 0.306252003847387, + "grad_norm": 0.9944835305213928, + "learning_rate": 0.00012134091680242182, + "loss": 5.6689, + "num_input_tokens_seen": 234749952, + "step": 1791 + }, + { + "epoch": 0.30676498877845465, + "grad_norm": 0.9175474047660828, + "learning_rate": 0.00012123941873631032, + "loss": 5.652, + "num_input_tokens_seen": 235143168, + "step": 1794 + }, + { + "epoch": 0.30727797370952226, + "grad_norm": 0.8481835722923279, + "learning_rate": 0.00012113817494517742, + "loss": 5.6716, + "num_input_tokens_seen": 235536384, + "step": 1797 + }, + { + "epoch": 0.3077909586405899, + "grad_norm": 0.8672162294387817, + "learning_rate": 0.00012103718436910204, + "loss": 5.6861, + "num_input_tokens_seen": 235929600, + "step": 1800 + }, + { + "epoch": 0.3083039435716576, + "grad_norm": 0.730492353439331, + "learning_rate": 0.00012093644595433816, + "loss": 5.6866, + "num_input_tokens_seen": 236322816, + "step": 1803 + }, + { + "epoch": 0.30881692850272524, + "grad_norm": 0.7994592189788818, + "learning_rate": 0.00012083595865326879, + "loss": 5.6567, + "num_input_tokens_seen": 236716032, + "step": 1806 + }, + { + "epoch": 0.3093299134337929, + "grad_norm": 0.8021286129951477, + "learning_rate": 0.00012073572142436013, + "loss": 5.6251, + "num_input_tokens_seen": 237109248, + "step": 1809 + }, + { + "epoch": 0.3098428983648605, + "grad_norm": 0.721517026424408, + "learning_rate": 0.0001206357332321163, + "loss": 5.6599, + "num_input_tokens_seen": 237502464, + "step": 1812 + }, + { + "epoch": 0.31035588329592817, + "grad_norm": 0.9358505606651306, + "learning_rate": 0.00012053599304703434, + "loss": 5.6717, + "num_input_tokens_seen": 237895680, + "step": 1815 + }, + { + "epoch": 0.31086886822699583, + "grad_norm": 1.005147099494934, + "learning_rate": 0.0001204364998455597, + "loss": 5.6534, + "num_input_tokens_seen": 238288896, + "step": 1818 + }, + { + "epoch": 0.3113818531580635, + "grad_norm": 0.7538228034973145, + "learning_rate": 0.00012033725261004223, + "loss": 5.6279, + "num_input_tokens_seen": 238682112, + "step": 1821 + }, + { + "epoch": 0.31189483808913115, + "grad_norm": 0.7190991640090942, + "learning_rate": 0.00012023825032869223, + "loss": 5.6295, + "num_input_tokens_seen": 239075328, + "step": 1824 + }, + { + "epoch": 0.31240782302019876, + "grad_norm": 0.8383211493492126, + "learning_rate": 0.00012013949199553745, + "loss": 5.7043, + "num_input_tokens_seen": 239468544, + "step": 1827 + }, + { + "epoch": 0.3129208079512664, + "grad_norm": 0.8222533464431763, + "learning_rate": 0.00012004097661037986, + "loss": 5.6339, + "num_input_tokens_seen": 239861760, + "step": 1830 + }, + { + "epoch": 0.3134337928823341, + "grad_norm": 0.7332392930984497, + "learning_rate": 0.00011994270317875327, + "loss": 5.6096, + "num_input_tokens_seen": 240254976, + "step": 1833 + }, + { + "epoch": 0.31394677781340175, + "grad_norm": 0.7699775099754333, + "learning_rate": 0.00011984467071188111, + "loss": 5.6614, + "num_input_tokens_seen": 240648192, + "step": 1836 + }, + { + "epoch": 0.3144597627444694, + "grad_norm": 0.6810494065284729, + "learning_rate": 0.00011974687822663465, + "loss": 5.6252, + "num_input_tokens_seen": 241041408, + "step": 1839 + }, + { + "epoch": 0.314972747675537, + "grad_norm": 0.7561641931533813, + "learning_rate": 0.00011964932474549163, + "loss": 5.5747, + "num_input_tokens_seen": 241434624, + "step": 1842 + }, + { + "epoch": 0.3154857326066047, + "grad_norm": 0.9112014770507812, + "learning_rate": 0.00011955200929649517, + "loss": 5.6209, + "num_input_tokens_seen": 241827840, + "step": 1845 + }, + { + "epoch": 0.31599871753767234, + "grad_norm": 0.8621751666069031, + "learning_rate": 0.00011945493091321312, + "loss": 5.6557, + "num_input_tokens_seen": 242221056, + "step": 1848 + }, + { + "epoch": 0.31651170246874, + "grad_norm": 0.7570284605026245, + "learning_rate": 0.00011935808863469773, + "loss": 5.6446, + "num_input_tokens_seen": 242614272, + "step": 1851 + }, + { + "epoch": 0.3170246873998076, + "grad_norm": 0.8017822504043579, + "learning_rate": 0.00011926148150544575, + "loss": 5.6767, + "num_input_tokens_seen": 243007488, + "step": 1854 + }, + { + "epoch": 0.31753767233087526, + "grad_norm": 0.7850795984268188, + "learning_rate": 0.00011916510857535883, + "loss": 5.62, + "num_input_tokens_seen": 243400704, + "step": 1857 + }, + { + "epoch": 0.3180506572619429, + "grad_norm": 0.774463951587677, + "learning_rate": 0.00011906896889970413, + "loss": 5.6407, + "num_input_tokens_seen": 243793920, + "step": 1860 + }, + { + "epoch": 0.3185636421930106, + "grad_norm": 0.8820813894271851, + "learning_rate": 0.00011897306153907562, + "loss": 5.6399, + "num_input_tokens_seen": 244187136, + "step": 1863 + }, + { + "epoch": 0.31907662712407825, + "grad_norm": 0.8753612041473389, + "learning_rate": 0.00011887738555935545, + "loss": 5.6036, + "num_input_tokens_seen": 244580352, + "step": 1866 + }, + { + "epoch": 0.31958961205514586, + "grad_norm": 0.8607076406478882, + "learning_rate": 0.00011878194003167571, + "loss": 5.6037, + "num_input_tokens_seen": 244973568, + "step": 1869 + }, + { + "epoch": 0.3201025969862135, + "grad_norm": 0.9494906663894653, + "learning_rate": 0.00011868672403238055, + "loss": 5.5947, + "num_input_tokens_seen": 245366784, + "step": 1872 + }, + { + "epoch": 0.3206155819172812, + "grad_norm": 0.9436231851577759, + "learning_rate": 0.00011859173664298873, + "loss": 5.6752, + "num_input_tokens_seen": 245760000, + "step": 1875 + }, + { + "epoch": 0.32112856684834884, + "grad_norm": 0.8782811760902405, + "learning_rate": 0.00011849697695015632, + "loss": 5.6304, + "num_input_tokens_seen": 246153216, + "step": 1878 + }, + { + "epoch": 0.3216415517794165, + "grad_norm": 0.8508527278900146, + "learning_rate": 0.00011840244404563977, + "loss": 5.6076, + "num_input_tokens_seen": 246546432, + "step": 1881 + }, + { + "epoch": 0.3221545367104841, + "grad_norm": 0.7841192483901978, + "learning_rate": 0.00011830813702625953, + "loss": 5.568, + "num_input_tokens_seen": 246939648, + "step": 1884 + }, + { + "epoch": 0.32266752164155177, + "grad_norm": 0.8864216804504395, + "learning_rate": 0.0001182140549938636, + "loss": 5.6292, + "num_input_tokens_seen": 247332864, + "step": 1887 + }, + { + "epoch": 0.32318050657261943, + "grad_norm": 0.7558512091636658, + "learning_rate": 0.00011812019705529174, + "loss": 5.6347, + "num_input_tokens_seen": 247726080, + "step": 1890 + }, + { + "epoch": 0.3236934915036871, + "grad_norm": 0.8658297657966614, + "learning_rate": 0.00011802656232233979, + "loss": 5.6167, + "num_input_tokens_seen": 248119296, + "step": 1893 + }, + { + "epoch": 0.32420647643475475, + "grad_norm": 0.7362368702888489, + "learning_rate": 0.00011793314991172442, + "loss": 5.5635, + "num_input_tokens_seen": 248512512, + "step": 1896 + }, + { + "epoch": 0.32471946136582236, + "grad_norm": 0.7577558755874634, + "learning_rate": 0.00011783995894504806, + "loss": 5.6168, + "num_input_tokens_seen": 248905728, + "step": 1899 + }, + { + "epoch": 0.32523244629689, + "grad_norm": 0.7319400906562805, + "learning_rate": 0.00011774698854876431, + "loss": 5.6247, + "num_input_tokens_seen": 249298944, + "step": 1902 + }, + { + "epoch": 0.3257454312279577, + "grad_norm": 0.7369369864463806, + "learning_rate": 0.00011765423785414348, + "loss": 5.6446, + "num_input_tokens_seen": 249692160, + "step": 1905 + }, + { + "epoch": 0.32625841615902534, + "grad_norm": 0.7148075103759766, + "learning_rate": 0.00011756170599723845, + "loss": 5.5845, + "num_input_tokens_seen": 250085376, + "step": 1908 + }, + { + "epoch": 0.326771401090093, + "grad_norm": 0.8362352252006531, + "learning_rate": 0.00011746939211885098, + "loss": 5.6083, + "num_input_tokens_seen": 250478592, + "step": 1911 + }, + { + "epoch": 0.3272843860211606, + "grad_norm": 0.7224807739257812, + "learning_rate": 0.00011737729536449814, + "loss": 5.5792, + "num_input_tokens_seen": 250871808, + "step": 1914 + }, + { + "epoch": 0.3277973709522283, + "grad_norm": 0.8332315683364868, + "learning_rate": 0.00011728541488437912, + "loss": 5.6376, + "num_input_tokens_seen": 251265024, + "step": 1917 + }, + { + "epoch": 0.32831035588329593, + "grad_norm": 0.89626544713974, + "learning_rate": 0.00011719374983334221, + "loss": 5.6722, + "num_input_tokens_seen": 251658240, + "step": 1920 + }, + { + "epoch": 0.3288233408143636, + "grad_norm": 0.6044060587882996, + "learning_rate": 0.0001171022993708523, + "loss": 5.6473, + "num_input_tokens_seen": 252051456, + "step": 1923 + }, + { + "epoch": 0.3293363257454312, + "grad_norm": 0.6592041850090027, + "learning_rate": 0.00011701106266095837, + "loss": 5.624, + "num_input_tokens_seen": 252444672, + "step": 1926 + }, + { + "epoch": 0.32984931067649886, + "grad_norm": 0.7988864183425903, + "learning_rate": 0.00011692003887226147, + "loss": 5.604, + "num_input_tokens_seen": 252837888, + "step": 1929 + }, + { + "epoch": 0.3303622956075665, + "grad_norm": 0.9502041935920715, + "learning_rate": 0.00011682922717788286, + "loss": 5.659, + "num_input_tokens_seen": 253231104, + "step": 1932 + }, + { + "epoch": 0.3308752805386342, + "grad_norm": 1.0182008743286133, + "learning_rate": 0.0001167386267554325, + "loss": 5.6019, + "num_input_tokens_seen": 253624320, + "step": 1935 + }, + { + "epoch": 0.33138826546970185, + "grad_norm": 0.9730847477912903, + "learning_rate": 0.00011664823678697777, + "loss": 5.6701, + "num_input_tokens_seen": 254017536, + "step": 1938 + }, + { + "epoch": 0.33190125040076945, + "grad_norm": 0.7735222578048706, + "learning_rate": 0.00011655805645901238, + "loss": 5.5851, + "num_input_tokens_seen": 254410752, + "step": 1941 + }, + { + "epoch": 0.3324142353318371, + "grad_norm": 0.7792080044746399, + "learning_rate": 0.0001164680849624257, + "loss": 5.6015, + "num_input_tokens_seen": 254803968, + "step": 1944 + }, + { + "epoch": 0.3329272202629048, + "grad_norm": 0.7463663816452026, + "learning_rate": 0.0001163783214924723, + "loss": 5.6114, + "num_input_tokens_seen": 255197184, + "step": 1947 + }, + { + "epoch": 0.33344020519397244, + "grad_norm": 0.6915552020072937, + "learning_rate": 0.00011628876524874155, + "loss": 5.6049, + "num_input_tokens_seen": 255590400, + "step": 1950 + }, + { + "epoch": 0.3339531901250401, + "grad_norm": 0.7225996851921082, + "learning_rate": 0.00011619941543512788, + "loss": 5.6132, + "num_input_tokens_seen": 255983616, + "step": 1953 + }, + { + "epoch": 0.3344661750561077, + "grad_norm": 0.7305698990821838, + "learning_rate": 0.00011611027125980086, + "loss": 5.6121, + "num_input_tokens_seen": 256376832, + "step": 1956 + }, + { + "epoch": 0.33497915998717537, + "grad_norm": 0.699140191078186, + "learning_rate": 0.00011602133193517582, + "loss": 5.5685, + "num_input_tokens_seen": 256770048, + "step": 1959 + }, + { + "epoch": 0.33549214491824303, + "grad_norm": 0.7200695872306824, + "learning_rate": 0.00011593259667788463, + "loss": 5.639, + "num_input_tokens_seen": 257163264, + "step": 1962 + }, + { + "epoch": 0.3360051298493107, + "grad_norm": 0.84376060962677, + "learning_rate": 0.0001158440647087466, + "loss": 5.6694, + "num_input_tokens_seen": 257556480, + "step": 1965 + }, + { + "epoch": 0.33651811478037835, + "grad_norm": 0.8291401863098145, + "learning_rate": 0.00011575573525274, + "loss": 5.5855, + "num_input_tokens_seen": 257949696, + "step": 1968 + }, + { + "epoch": 0.33703109971144596, + "grad_norm": 0.743291437625885, + "learning_rate": 0.0001156676075389733, + "loss": 5.5808, + "num_input_tokens_seen": 258342912, + "step": 1971 + }, + { + "epoch": 0.3375440846425136, + "grad_norm": 0.9974462389945984, + "learning_rate": 0.000115579680800657, + "loss": 5.5863, + "num_input_tokens_seen": 258736128, + "step": 1974 + }, + { + "epoch": 0.3380570695735813, + "grad_norm": 1.0641543865203857, + "learning_rate": 0.00011549195427507569, + "loss": 5.6162, + "num_input_tokens_seen": 259129344, + "step": 1977 + }, + { + "epoch": 0.33857005450464894, + "grad_norm": 1.3743962049484253, + "learning_rate": 0.00011540442720356016, + "loss": 5.6052, + "num_input_tokens_seen": 259522560, + "step": 1980 + }, + { + "epoch": 0.33908303943571655, + "grad_norm": 0.8301076889038086, + "learning_rate": 0.0001153170988314599, + "loss": 5.5922, + "num_input_tokens_seen": 259915776, + "step": 1983 + }, + { + "epoch": 0.3395960243667842, + "grad_norm": 0.7537018656730652, + "learning_rate": 0.00011522996840811572, + "loss": 5.5989, + "num_input_tokens_seen": 260308992, + "step": 1986 + }, + { + "epoch": 0.34010900929785187, + "grad_norm": 0.9778670072555542, + "learning_rate": 0.00011514303518683271, + "loss": 5.614, + "num_input_tokens_seen": 260702208, + "step": 1989 + }, + { + "epoch": 0.34062199422891953, + "grad_norm": 0.9663039445877075, + "learning_rate": 0.00011505629842485338, + "loss": 5.6108, + "num_input_tokens_seen": 261095424, + "step": 1992 + }, + { + "epoch": 0.3411349791599872, + "grad_norm": 0.9101848006248474, + "learning_rate": 0.00011496975738333083, + "loss": 5.5891, + "num_input_tokens_seen": 261488640, + "step": 1995 + }, + { + "epoch": 0.3416479640910548, + "grad_norm": 0.8312708735466003, + "learning_rate": 0.00011488341132730259, + "loss": 5.6175, + "num_input_tokens_seen": 261881856, + "step": 1998 + }, + { + "epoch": 0.3419899540450999, + "eval_accuracy": 0.16692558215274386, + "eval_loss": 6.050157070159912, + "eval_runtime": 112.2283, + "eval_samples_per_second": 2.673, + "eval_steps_per_second": 1.337, + "num_input_tokens_seen": 262144000, + "step": 2000 + }, + { + "epoch": 0.34216094902212246, + "grad_norm": 0.836723268032074, + "learning_rate": 0.00011479725952566419, + "loss": 5.5988, + "num_input_tokens_seen": 262275072, + "step": 2001 + }, + { + "epoch": 0.3426739339531901, + "grad_norm": 0.7804883122444153, + "learning_rate": 0.00011471130125114323, + "loss": 5.6296, + "num_input_tokens_seen": 262668288, + "step": 2004 + }, + { + "epoch": 0.3431869188842578, + "grad_norm": 0.9573701620101929, + "learning_rate": 0.00011462553578027366, + "loss": 5.5844, + "num_input_tokens_seen": 263061504, + "step": 2007 + }, + { + "epoch": 0.34369990381532545, + "grad_norm": 0.9088414311408997, + "learning_rate": 0.00011453996239337006, + "loss": 5.5551, + "num_input_tokens_seen": 263454720, + "step": 2010 + }, + { + "epoch": 0.34421288874639305, + "grad_norm": 0.8941265940666199, + "learning_rate": 0.00011445458037450239, + "loss": 5.5586, + "num_input_tokens_seen": 263847936, + "step": 2013 + }, + { + "epoch": 0.3447258736774607, + "grad_norm": 0.7578518390655518, + "learning_rate": 0.00011436938901147081, + "loss": 5.6165, + "num_input_tokens_seen": 264241152, + "step": 2016 + }, + { + "epoch": 0.3452388586085284, + "grad_norm": 0.7264053821563721, + "learning_rate": 0.00011428438759578074, + "loss": 5.5951, + "num_input_tokens_seen": 264634368, + "step": 2019 + }, + { + "epoch": 0.34575184353959604, + "grad_norm": 0.7484355568885803, + "learning_rate": 0.00011419957542261805, + "loss": 5.6023, + "num_input_tokens_seen": 265027584, + "step": 2022 + }, + { + "epoch": 0.3462648284706637, + "grad_norm": 0.7840693593025208, + "learning_rate": 0.0001141149517908246, + "loss": 5.5978, + "num_input_tokens_seen": 265420800, + "step": 2025 + }, + { + "epoch": 0.3467778134017313, + "grad_norm": 0.7896338701248169, + "learning_rate": 0.0001140305160028738, + "loss": 5.6215, + "num_input_tokens_seen": 265814016, + "step": 2028 + }, + { + "epoch": 0.34729079833279897, + "grad_norm": 0.8777353763580322, + "learning_rate": 0.00011394626736484653, + "loss": 5.5965, + "num_input_tokens_seen": 266207232, + "step": 2031 + }, + { + "epoch": 0.34780378326386663, + "grad_norm": 0.8751804232597351, + "learning_rate": 0.00011386220518640724, + "loss": 5.6445, + "num_input_tokens_seen": 266600448, + "step": 2034 + }, + { + "epoch": 0.3483167681949343, + "grad_norm": 0.8036639094352722, + "learning_rate": 0.00011377832878078, + "loss": 5.5957, + "num_input_tokens_seen": 266993664, + "step": 2037 + }, + { + "epoch": 0.34882975312600195, + "grad_norm": 0.8183003664016724, + "learning_rate": 0.00011369463746472517, + "loss": 5.6243, + "num_input_tokens_seen": 267386880, + "step": 2040 + }, + { + "epoch": 0.34934273805706956, + "grad_norm": 0.8002908825874329, + "learning_rate": 0.00011361113055851587, + "loss": 5.5953, + "num_input_tokens_seen": 267780096, + "step": 2043 + }, + { + "epoch": 0.3498557229881372, + "grad_norm": 0.6631841063499451, + "learning_rate": 0.00011352780738591478, + "loss": 5.6013, + "num_input_tokens_seen": 268173312, + "step": 2046 + }, + { + "epoch": 0.3503687079192049, + "grad_norm": 0.7779785394668579, + "learning_rate": 0.00011344466727415132, + "loss": 5.6058, + "num_input_tokens_seen": 268566528, + "step": 2049 + }, + { + "epoch": 0.35088169285027254, + "grad_norm": 0.722675621509552, + "learning_rate": 0.00011336170955389853, + "loss": 5.6014, + "num_input_tokens_seen": 268959744, + "step": 2052 + }, + { + "epoch": 0.35139467778134015, + "grad_norm": 0.725713312625885, + "learning_rate": 0.00011327893355925084, + "loss": 5.6318, + "num_input_tokens_seen": 269352960, + "step": 2055 + }, + { + "epoch": 0.3519076627124078, + "grad_norm": 0.7272054553031921, + "learning_rate": 0.0001131963386277012, + "loss": 5.5913, + "num_input_tokens_seen": 269746176, + "step": 2058 + }, + { + "epoch": 0.35242064764347547, + "grad_norm": 0.7865688800811768, + "learning_rate": 0.00011311392410011913, + "loss": 5.5727, + "num_input_tokens_seen": 270139392, + "step": 2061 + }, + { + "epoch": 0.35293363257454313, + "grad_norm": 0.754695475101471, + "learning_rate": 0.00011303168932072842, + "loss": 5.5762, + "num_input_tokens_seen": 270532608, + "step": 2064 + }, + { + "epoch": 0.3534466175056108, + "grad_norm": 0.7348251342773438, + "learning_rate": 0.00011294963363708538, + "loss": 5.5913, + "num_input_tokens_seen": 270925824, + "step": 2067 + }, + { + "epoch": 0.3539596024366784, + "grad_norm": 0.8406401872634888, + "learning_rate": 0.00011286775640005698, + "loss": 5.5496, + "num_input_tokens_seen": 271319040, + "step": 2070 + }, + { + "epoch": 0.35447258736774606, + "grad_norm": 0.8418242335319519, + "learning_rate": 0.00011278605696379935, + "loss": 5.5903, + "num_input_tokens_seen": 271712256, + "step": 2073 + }, + { + "epoch": 0.3549855722988137, + "grad_norm": 0.6441095471382141, + "learning_rate": 0.00011270453468573625, + "loss": 5.5503, + "num_input_tokens_seen": 272105472, + "step": 2076 + }, + { + "epoch": 0.3554985572298814, + "grad_norm": 0.7358053922653198, + "learning_rate": 0.00011262318892653804, + "loss": 5.5992, + "num_input_tokens_seen": 272498688, + "step": 2079 + }, + { + "epoch": 0.35601154216094905, + "grad_norm": 0.7976645231246948, + "learning_rate": 0.00011254201905010056, + "loss": 5.594, + "num_input_tokens_seen": 272891904, + "step": 2082 + }, + { + "epoch": 0.35652452709201665, + "grad_norm": 0.9197404980659485, + "learning_rate": 0.00011246102442352411, + "loss": 5.5648, + "num_input_tokens_seen": 273285120, + "step": 2085 + }, + { + "epoch": 0.3570375120230843, + "grad_norm": 0.9233546853065491, + "learning_rate": 0.00011238020441709289, + "loss": 5.5126, + "num_input_tokens_seen": 273678336, + "step": 2088 + }, + { + "epoch": 0.357550496954152, + "grad_norm": 0.8510675430297852, + "learning_rate": 0.00011229955840425433, + "loss": 5.5816, + "num_input_tokens_seen": 274071552, + "step": 2091 + }, + { + "epoch": 0.35806348188521964, + "grad_norm": 0.7524028420448303, + "learning_rate": 0.00011221908576159871, + "loss": 5.5925, + "num_input_tokens_seen": 274464768, + "step": 2094 + }, + { + "epoch": 0.3585764668162873, + "grad_norm": 0.9298664927482605, + "learning_rate": 0.00011213878586883904, + "loss": 5.5632, + "num_input_tokens_seen": 274857984, + "step": 2097 + }, + { + "epoch": 0.3590894517473549, + "grad_norm": 0.8467714190483093, + "learning_rate": 0.00011205865810879076, + "loss": 5.5589, + "num_input_tokens_seen": 275251200, + "step": 2100 + }, + { + "epoch": 0.35960243667842257, + "grad_norm": 0.7781933546066284, + "learning_rate": 0.00011197870186735193, + "loss": 5.5408, + "num_input_tokens_seen": 275644416, + "step": 2103 + }, + { + "epoch": 0.3601154216094902, + "grad_norm": 0.8665353655815125, + "learning_rate": 0.00011189891653348355, + "loss": 5.6069, + "num_input_tokens_seen": 276037632, + "step": 2106 + }, + { + "epoch": 0.3606284065405579, + "grad_norm": 0.8551245927810669, + "learning_rate": 0.00011181930149918981, + "loss": 5.5846, + "num_input_tokens_seen": 276430848, + "step": 2109 + }, + { + "epoch": 0.3611413914716255, + "grad_norm": 0.7167636752128601, + "learning_rate": 0.00011173985615949868, + "loss": 5.5516, + "num_input_tokens_seen": 276824064, + "step": 2112 + }, + { + "epoch": 0.36165437640269316, + "grad_norm": 0.6893343329429626, + "learning_rate": 0.00011166057991244258, + "loss": 5.5724, + "num_input_tokens_seen": 277217280, + "step": 2115 + }, + { + "epoch": 0.3621673613337608, + "grad_norm": 0.9303981065750122, + "learning_rate": 0.00011158147215903933, + "loss": 5.5756, + "num_input_tokens_seen": 277610496, + "step": 2118 + }, + { + "epoch": 0.3626803462648285, + "grad_norm": 0.7787050008773804, + "learning_rate": 0.00011150253230327296, + "loss": 5.5545, + "num_input_tokens_seen": 278003712, + "step": 2121 + }, + { + "epoch": 0.36319333119589614, + "grad_norm": 0.7556629180908203, + "learning_rate": 0.00011142375975207502, + "loss": 5.5149, + "num_input_tokens_seen": 278396928, + "step": 2124 + }, + { + "epoch": 0.36370631612696375, + "grad_norm": 0.8528605103492737, + "learning_rate": 0.00011134515391530575, + "loss": 5.5498, + "num_input_tokens_seen": 278790144, + "step": 2127 + }, + { + "epoch": 0.3642193010580314, + "grad_norm": 0.8373255729675293, + "learning_rate": 0.00011126671420573558, + "loss": 5.5908, + "num_input_tokens_seen": 279183360, + "step": 2130 + }, + { + "epoch": 0.36473228598909907, + "grad_norm": 0.778972327709198, + "learning_rate": 0.0001111884400390267, + "loss": 5.579, + "num_input_tokens_seen": 279576576, + "step": 2133 + }, + { + "epoch": 0.36524527092016673, + "grad_norm": 0.7942299246788025, + "learning_rate": 0.00011111033083371468, + "loss": 5.5897, + "num_input_tokens_seen": 279969792, + "step": 2136 + }, + { + "epoch": 0.3657582558512344, + "grad_norm": 0.7748274207115173, + "learning_rate": 0.00011103238601119048, + "loss": 5.5885, + "num_input_tokens_seen": 280363008, + "step": 2139 + }, + { + "epoch": 0.366271240782302, + "grad_norm": 0.7882058024406433, + "learning_rate": 0.00011095460499568234, + "loss": 5.6277, + "num_input_tokens_seen": 280756224, + "step": 2142 + }, + { + "epoch": 0.36678422571336966, + "grad_norm": 0.7407231330871582, + "learning_rate": 0.00011087698721423798, + "loss": 5.562, + "num_input_tokens_seen": 281149440, + "step": 2145 + }, + { + "epoch": 0.3672972106444373, + "grad_norm": 0.8401018381118774, + "learning_rate": 0.0001107995320967068, + "loss": 5.5526, + "num_input_tokens_seen": 281542656, + "step": 2148 + }, + { + "epoch": 0.367810195575505, + "grad_norm": 0.7504671812057495, + "learning_rate": 0.00011072223907572236, + "loss": 5.5194, + "num_input_tokens_seen": 281935872, + "step": 2151 + }, + { + "epoch": 0.36832318050657264, + "grad_norm": 0.8152016401290894, + "learning_rate": 0.0001106451075866849, + "loss": 5.5838, + "num_input_tokens_seen": 282329088, + "step": 2154 + }, + { + "epoch": 0.36883616543764025, + "grad_norm": 0.764788031578064, + "learning_rate": 0.00011056813706774403, + "loss": 5.5741, + "num_input_tokens_seen": 282722304, + "step": 2157 + }, + { + "epoch": 0.3693491503687079, + "grad_norm": 0.7509755492210388, + "learning_rate": 0.00011049132695978147, + "loss": 5.5572, + "num_input_tokens_seen": 283115520, + "step": 2160 + }, + { + "epoch": 0.3698621352997756, + "grad_norm": 1.0009026527404785, + "learning_rate": 0.0001104146767063941, + "loss": 5.5376, + "num_input_tokens_seen": 283508736, + "step": 2163 + }, + { + "epoch": 0.37037512023084324, + "grad_norm": 0.7605016231536865, + "learning_rate": 0.00011033818575387697, + "loss": 5.5529, + "num_input_tokens_seen": 283901952, + "step": 2166 + }, + { + "epoch": 0.37088810516191084, + "grad_norm": 0.7910396456718445, + "learning_rate": 0.00011026185355120653, + "loss": 5.5987, + "num_input_tokens_seen": 284295168, + "step": 2169 + }, + { + "epoch": 0.3714010900929785, + "grad_norm": 0.7264419794082642, + "learning_rate": 0.00011018567955002388, + "loss": 5.5723, + "num_input_tokens_seen": 284688384, + "step": 2172 + }, + { + "epoch": 0.37191407502404616, + "grad_norm": 0.6918433308601379, + "learning_rate": 0.00011010966320461834, + "loss": 5.5759, + "num_input_tokens_seen": 285081600, + "step": 2175 + }, + { + "epoch": 0.3724270599551138, + "grad_norm": 0.7263005375862122, + "learning_rate": 0.00011003380397191095, + "loss": 5.5918, + "num_input_tokens_seen": 285474816, + "step": 2178 + }, + { + "epoch": 0.3729400448861815, + "grad_norm": 0.8244202733039856, + "learning_rate": 0.00010995810131143818, + "loss": 5.5039, + "num_input_tokens_seen": 285868032, + "step": 2181 + }, + { + "epoch": 0.3734530298172491, + "grad_norm": 0.8637891411781311, + "learning_rate": 0.00010988255468533583, + "loss": 5.5602, + "num_input_tokens_seen": 286261248, + "step": 2184 + }, + { + "epoch": 0.37396601474831676, + "grad_norm": 0.7446593046188354, + "learning_rate": 0.0001098071635583229, + "loss": 5.573, + "num_input_tokens_seen": 286654464, + "step": 2187 + }, + { + "epoch": 0.3744789996793844, + "grad_norm": 0.8517831563949585, + "learning_rate": 0.00010973192739768566, + "loss": 5.5137, + "num_input_tokens_seen": 287047680, + "step": 2190 + }, + { + "epoch": 0.3749919846104521, + "grad_norm": 0.8291754722595215, + "learning_rate": 0.00010965684567326188, + "loss": 5.5965, + "num_input_tokens_seen": 287440896, + "step": 2193 + }, + { + "epoch": 0.37550496954151974, + "grad_norm": 0.7934954762458801, + "learning_rate": 0.00010958191785742515, + "loss": 5.5761, + "num_input_tokens_seen": 287834112, + "step": 2196 + }, + { + "epoch": 0.37601795447258735, + "grad_norm": 1.1249563694000244, + "learning_rate": 0.00010950714342506926, + "loss": 5.5857, + "num_input_tokens_seen": 288227328, + "step": 2199 + }, + { + "epoch": 0.376530939403655, + "grad_norm": 0.9391211867332458, + "learning_rate": 0.00010943252185359275, + "loss": 5.5403, + "num_input_tokens_seen": 288620544, + "step": 2202 + }, + { + "epoch": 0.37704392433472267, + "grad_norm": 0.8083456754684448, + "learning_rate": 0.00010935805262288362, + "loss": 5.5593, + "num_input_tokens_seen": 289013760, + "step": 2205 + }, + { + "epoch": 0.37755690926579033, + "grad_norm": 0.8864873051643372, + "learning_rate": 0.00010928373521530409, + "loss": 5.5901, + "num_input_tokens_seen": 289406976, + "step": 2208 + }, + { + "epoch": 0.378069894196858, + "grad_norm": 0.7634648084640503, + "learning_rate": 0.00010920956911567537, + "loss": 5.5755, + "num_input_tokens_seen": 289800192, + "step": 2211 + }, + { + "epoch": 0.3785828791279256, + "grad_norm": 0.9811239242553711, + "learning_rate": 0.00010913555381126287, + "loss": 5.5405, + "num_input_tokens_seen": 290193408, + "step": 2214 + }, + { + "epoch": 0.37909586405899326, + "grad_norm": 0.8969237208366394, + "learning_rate": 0.00010906168879176115, + "loss": 5.6022, + "num_input_tokens_seen": 290586624, + "step": 2217 + }, + { + "epoch": 0.3796088489900609, + "grad_norm": 0.8845155835151672, + "learning_rate": 0.00010898797354927919, + "loss": 5.554, + "num_input_tokens_seen": 290979840, + "step": 2220 + }, + { + "epoch": 0.3801218339211286, + "grad_norm": 0.7560102343559265, + "learning_rate": 0.0001089144075783257, + "loss": 5.6002, + "num_input_tokens_seen": 291373056, + "step": 2223 + }, + { + "epoch": 0.38063481885219624, + "grad_norm": 0.7519661784172058, + "learning_rate": 0.00010884099037579465, + "loss": 5.5744, + "num_input_tokens_seen": 291766272, + "step": 2226 + }, + { + "epoch": 0.38114780378326385, + "grad_norm": 0.7482137084007263, + "learning_rate": 0.00010876772144095075, + "loss": 5.4982, + "num_input_tokens_seen": 292159488, + "step": 2229 + }, + { + "epoch": 0.3816607887143315, + "grad_norm": 0.7390425205230713, + "learning_rate": 0.00010869460027541504, + "loss": 5.5839, + "num_input_tokens_seen": 292552704, + "step": 2232 + }, + { + "epoch": 0.3821737736453992, + "grad_norm": 0.7970213890075684, + "learning_rate": 0.00010862162638315081, + "loss": 5.5299, + "num_input_tokens_seen": 292945920, + "step": 2235 + }, + { + "epoch": 0.38268675857646683, + "grad_norm": 1.0196114778518677, + "learning_rate": 0.00010854879927044931, + "loss": 5.5759, + "num_input_tokens_seen": 293339136, + "step": 2238 + }, + { + "epoch": 0.38319974350753444, + "grad_norm": 0.8929862380027771, + "learning_rate": 0.00010847611844591587, + "loss": 5.5529, + "num_input_tokens_seen": 293732352, + "step": 2241 + }, + { + "epoch": 0.3837127284386021, + "grad_norm": 0.7914404273033142, + "learning_rate": 0.00010840358342045581, + "loss": 5.5529, + "num_input_tokens_seen": 294125568, + "step": 2244 + }, + { + "epoch": 0.38422571336966976, + "grad_norm": 0.8195559978485107, + "learning_rate": 0.00010833119370726075, + "loss": 5.5227, + "num_input_tokens_seen": 294518784, + "step": 2247 + }, + { + "epoch": 0.3847386983007374, + "grad_norm": 0.947847306728363, + "learning_rate": 0.00010825894882179485, + "loss": 5.5733, + "num_input_tokens_seen": 294912000, + "step": 2250 + }, + { + "epoch": 0.3852516832318051, + "grad_norm": 0.9748887419700623, + "learning_rate": 0.00010818684828178117, + "loss": 5.5793, + "num_input_tokens_seen": 295305216, + "step": 2253 + }, + { + "epoch": 0.3857646681628727, + "grad_norm": 0.8942933678627014, + "learning_rate": 0.00010811489160718815, + "loss": 5.5403, + "num_input_tokens_seen": 295698432, + "step": 2256 + }, + { + "epoch": 0.38627765309394035, + "grad_norm": 0.7008263468742371, + "learning_rate": 0.00010804307832021618, + "loss": 5.5767, + "num_input_tokens_seen": 296091648, + "step": 2259 + }, + { + "epoch": 0.386790638025008, + "grad_norm": 0.7309878468513489, + "learning_rate": 0.0001079714079452843, + "loss": 5.5122, + "num_input_tokens_seen": 296484864, + "step": 2262 + }, + { + "epoch": 0.3873036229560757, + "grad_norm": 0.9144716858863831, + "learning_rate": 0.000107899880009017, + "loss": 5.57, + "num_input_tokens_seen": 296878080, + "step": 2265 + }, + { + "epoch": 0.38781660788714334, + "grad_norm": 0.961901068687439, + "learning_rate": 0.00010782849404023096, + "loss": 5.5156, + "num_input_tokens_seen": 297271296, + "step": 2268 + }, + { + "epoch": 0.38832959281821094, + "grad_norm": 0.7054316997528076, + "learning_rate": 0.00010775724956992224, + "loss": 5.5626, + "num_input_tokens_seen": 297664512, + "step": 2271 + }, + { + "epoch": 0.3888425777492786, + "grad_norm": 0.8600339889526367, + "learning_rate": 0.00010768614613125303, + "loss": 5.5521, + "num_input_tokens_seen": 298057728, + "step": 2274 + }, + { + "epoch": 0.38935556268034627, + "grad_norm": 0.7894279956817627, + "learning_rate": 0.0001076151832595391, + "loss": 5.5412, + "num_input_tokens_seen": 298450944, + "step": 2277 + }, + { + "epoch": 0.38986854761141393, + "grad_norm": 0.7673327922821045, + "learning_rate": 0.0001075443604922369, + "loss": 5.584, + "num_input_tokens_seen": 298844160, + "step": 2280 + }, + { + "epoch": 0.3903815325424816, + "grad_norm": 0.7792801856994629, + "learning_rate": 0.00010747367736893089, + "loss": 5.5592, + "num_input_tokens_seen": 299237376, + "step": 2283 + }, + { + "epoch": 0.3908945174735492, + "grad_norm": 0.8032937049865723, + "learning_rate": 0.00010740313343132098, + "loss": 5.5543, + "num_input_tokens_seen": 299630592, + "step": 2286 + }, + { + "epoch": 0.39140750240461686, + "grad_norm": 0.731970489025116, + "learning_rate": 0.00010733272822321011, + "loss": 5.5259, + "num_input_tokens_seen": 300023808, + "step": 2289 + }, + { + "epoch": 0.3919204873356845, + "grad_norm": 0.7217367887496948, + "learning_rate": 0.00010726246129049176, + "loss": 5.5442, + "num_input_tokens_seen": 300417024, + "step": 2292 + }, + { + "epoch": 0.3924334722667522, + "grad_norm": 0.7392825484275818, + "learning_rate": 0.00010719233218113771, + "loss": 5.5274, + "num_input_tokens_seen": 300810240, + "step": 2295 + }, + { + "epoch": 0.3929464571978198, + "grad_norm": 0.7724013924598694, + "learning_rate": 0.00010712234044518587, + "loss": 5.5069, + "num_input_tokens_seen": 301203456, + "step": 2298 + }, + { + "epoch": 0.39345944212888745, + "grad_norm": 0.709718644618988, + "learning_rate": 0.00010705248563472809, + "loss": 5.5211, + "num_input_tokens_seen": 301596672, + "step": 2301 + }, + { + "epoch": 0.3939724270599551, + "grad_norm": 0.7409883141517639, + "learning_rate": 0.00010698276730389805, + "loss": 5.5102, + "num_input_tokens_seen": 301989888, + "step": 2304 + }, + { + "epoch": 0.39448541199102277, + "grad_norm": 0.8129496574401855, + "learning_rate": 0.0001069131850088595, + "loss": 5.5545, + "num_input_tokens_seen": 302383104, + "step": 2307 + }, + { + "epoch": 0.39499839692209043, + "grad_norm": 1.0027920007705688, + "learning_rate": 0.00010684373830779422, + "loss": 5.5445, + "num_input_tokens_seen": 302776320, + "step": 2310 + }, + { + "epoch": 0.39551138185315804, + "grad_norm": 0.8895756006240845, + "learning_rate": 0.0001067744267608903, + "loss": 5.5625, + "num_input_tokens_seen": 303169536, + "step": 2313 + }, + { + "epoch": 0.3960243667842257, + "grad_norm": 0.8327840566635132, + "learning_rate": 0.00010670524993033049, + "loss": 5.5472, + "num_input_tokens_seen": 303562752, + "step": 2316 + }, + { + "epoch": 0.39653735171529336, + "grad_norm": 1.1410460472106934, + "learning_rate": 0.00010663620738028051, + "loss": 5.5659, + "num_input_tokens_seen": 303955968, + "step": 2319 + }, + { + "epoch": 0.397050336646361, + "grad_norm": 0.889072835445404, + "learning_rate": 0.0001065672986768775, + "loss": 5.5594, + "num_input_tokens_seen": 304349184, + "step": 2322 + }, + { + "epoch": 0.3975633215774287, + "grad_norm": 0.8344824910163879, + "learning_rate": 0.0001064985233882187, + "loss": 5.553, + "num_input_tokens_seen": 304742400, + "step": 2325 + }, + { + "epoch": 0.3980763065084963, + "grad_norm": 0.7969459891319275, + "learning_rate": 0.00010642988108434991, + "loss": 5.5389, + "num_input_tokens_seen": 305135616, + "step": 2328 + }, + { + "epoch": 0.39858929143956395, + "grad_norm": 0.9471580386161804, + "learning_rate": 0.00010636137133725434, + "loss": 5.5615, + "num_input_tokens_seen": 305528832, + "step": 2331 + }, + { + "epoch": 0.3991022763706316, + "grad_norm": 0.8734350204467773, + "learning_rate": 0.00010629299372084134, + "loss": 5.5455, + "num_input_tokens_seen": 305922048, + "step": 2334 + }, + { + "epoch": 0.3996152613016993, + "grad_norm": 0.8351041078567505, + "learning_rate": 0.00010622474781093524, + "loss": 5.5332, + "num_input_tokens_seen": 306315264, + "step": 2337 + }, + { + "epoch": 0.40012824623276694, + "grad_norm": 0.8178178668022156, + "learning_rate": 0.00010615663318526436, + "loss": 5.5456, + "num_input_tokens_seen": 306708480, + "step": 2340 + }, + { + "epoch": 0.40064123116383454, + "grad_norm": 0.8616231679916382, + "learning_rate": 0.00010608864942345, + "loss": 5.5559, + "num_input_tokens_seen": 307101696, + "step": 2343 + }, + { + "epoch": 0.4011542160949022, + "grad_norm": 0.9160477519035339, + "learning_rate": 0.00010602079610699554, + "loss": 5.5369, + "num_input_tokens_seen": 307494912, + "step": 2346 + }, + { + "epoch": 0.40166720102596987, + "grad_norm": 0.8481185436248779, + "learning_rate": 0.00010595307281927571, + "loss": 5.5697, + "num_input_tokens_seen": 307888128, + "step": 2349 + }, + { + "epoch": 0.40218018595703753, + "grad_norm": 0.7999601364135742, + "learning_rate": 0.00010588547914552566, + "loss": 5.5475, + "num_input_tokens_seen": 308281344, + "step": 2352 + }, + { + "epoch": 0.40269317088810513, + "grad_norm": 0.8336549997329712, + "learning_rate": 0.00010581801467283045, + "loss": 5.5177, + "num_input_tokens_seen": 308674560, + "step": 2355 + }, + { + "epoch": 0.4032061558191728, + "grad_norm": 0.9885947108268738, + "learning_rate": 0.00010575067899011441, + "loss": 5.5241, + "num_input_tokens_seen": 309067776, + "step": 2358 + }, + { + "epoch": 0.40371914075024046, + "grad_norm": 0.811789870262146, + "learning_rate": 0.00010568347168813064, + "loss": 5.5635, + "num_input_tokens_seen": 309460992, + "step": 2361 + }, + { + "epoch": 0.4042321256813081, + "grad_norm": 0.7453095316886902, + "learning_rate": 0.00010561639235945043, + "loss": 5.5208, + "num_input_tokens_seen": 309854208, + "step": 2364 + }, + { + "epoch": 0.4047451106123758, + "grad_norm": 0.7175402045249939, + "learning_rate": 0.00010554944059845314, + "loss": 5.5112, + "num_input_tokens_seen": 310247424, + "step": 2367 + }, + { + "epoch": 0.4052580955434434, + "grad_norm": 0.7702206373214722, + "learning_rate": 0.00010548261600131565, + "loss": 5.5175, + "num_input_tokens_seen": 310640640, + "step": 2370 + }, + { + "epoch": 0.40577108047451105, + "grad_norm": 0.7227572798728943, + "learning_rate": 0.00010541591816600227, + "loss": 5.5596, + "num_input_tokens_seen": 311033856, + "step": 2373 + }, + { + "epoch": 0.4062840654055787, + "grad_norm": 0.8014532327651978, + "learning_rate": 0.00010534934669225456, + "loss": 5.4984, + "num_input_tokens_seen": 311427072, + "step": 2376 + }, + { + "epoch": 0.40679705033664637, + "grad_norm": 0.867141604423523, + "learning_rate": 0.0001052829011815812, + "loss": 5.5651, + "num_input_tokens_seen": 311820288, + "step": 2379 + }, + { + "epoch": 0.40731003526771403, + "grad_norm": 0.6916822791099548, + "learning_rate": 0.00010521658123724799, + "loss": 5.5142, + "num_input_tokens_seen": 312213504, + "step": 2382 + }, + { + "epoch": 0.40782302019878164, + "grad_norm": 0.7513076066970825, + "learning_rate": 0.00010515038646426796, + "loss": 5.5373, + "num_input_tokens_seen": 312606720, + "step": 2385 + }, + { + "epoch": 0.4083360051298493, + "grad_norm": 0.7861223220825195, + "learning_rate": 0.00010508431646939135, + "loss": 5.5649, + "num_input_tokens_seen": 312999936, + "step": 2388 + }, + { + "epoch": 0.40884899006091696, + "grad_norm": 0.7609034180641174, + "learning_rate": 0.00010501837086109599, + "loss": 5.5171, + "num_input_tokens_seen": 313393152, + "step": 2391 + }, + { + "epoch": 0.4093619749919846, + "grad_norm": 0.6941331624984741, + "learning_rate": 0.00010495254924957736, + "loss": 5.5279, + "num_input_tokens_seen": 313786368, + "step": 2394 + }, + { + "epoch": 0.4098749599230523, + "grad_norm": 0.6960221529006958, + "learning_rate": 0.00010488685124673906, + "loss": 5.53, + "num_input_tokens_seen": 314179584, + "step": 2397 + }, + { + "epoch": 0.4103879448541199, + "grad_norm": 0.7227168083190918, + "learning_rate": 0.00010482127646618314, + "loss": 5.5014, + "num_input_tokens_seen": 314572800, + "step": 2400 + }, + { + "epoch": 0.4103879448541199, + "eval_accuracy": 0.16866471258752647, + "eval_loss": 5.9827094078063965, + "eval_runtime": 115.7853, + "eval_samples_per_second": 2.591, + "eval_steps_per_second": 1.296, + "num_input_tokens_seen": 314572800, + "step": 2400 + }, + { + "epoch": 0.41090092978518755, + "grad_norm": 0.7502657771110535, + "learning_rate": 0.00010475582452320052, + "loss": 5.5249, + "num_input_tokens_seen": 314966016, + "step": 2403 + }, + { + "epoch": 0.4114139147162552, + "grad_norm": 0.709670901298523, + "learning_rate": 0.00010469049503476158, + "loss": 5.5021, + "num_input_tokens_seen": 315359232, + "step": 2406 + }, + { + "epoch": 0.4119268996473229, + "grad_norm": 0.8333126902580261, + "learning_rate": 0.00010462528761950672, + "loss": 5.5293, + "num_input_tokens_seen": 315752448, + "step": 2409 + }, + { + "epoch": 0.41243988457839054, + "grad_norm": 0.7035155296325684, + "learning_rate": 0.00010456020189773697, + "loss": 5.5508, + "num_input_tokens_seen": 316145664, + "step": 2412 + }, + { + "epoch": 0.41295286950945814, + "grad_norm": 0.7859997749328613, + "learning_rate": 0.00010449523749140482, + "loss": 5.5175, + "num_input_tokens_seen": 316538880, + "step": 2415 + }, + { + "epoch": 0.4134658544405258, + "grad_norm": 0.7992687821388245, + "learning_rate": 0.00010443039402410475, + "loss": 5.5136, + "num_input_tokens_seen": 316932096, + "step": 2418 + }, + { + "epoch": 0.41397883937159347, + "grad_norm": 0.8525195717811584, + "learning_rate": 0.00010436567112106444, + "loss": 5.54, + "num_input_tokens_seen": 317325312, + "step": 2421 + }, + { + "epoch": 0.4144918243026611, + "grad_norm": 0.6531423330307007, + "learning_rate": 0.00010430106840913532, + "loss": 5.4994, + "num_input_tokens_seen": 317718528, + "step": 2424 + }, + { + "epoch": 0.41500480923372873, + "grad_norm": 0.6764413714408875, + "learning_rate": 0.00010423658551678376, + "loss": 5.496, + "num_input_tokens_seen": 318111744, + "step": 2427 + }, + { + "epoch": 0.4155177941647964, + "grad_norm": 0.6776644587516785, + "learning_rate": 0.00010417222207408196, + "loss": 5.5749, + "num_input_tokens_seen": 318504960, + "step": 2430 + }, + { + "epoch": 0.41603077909586406, + "grad_norm": 0.790438711643219, + "learning_rate": 0.00010410797771269917, + "loss": 5.5339, + "num_input_tokens_seen": 318898176, + "step": 2433 + }, + { + "epoch": 0.4165437640269317, + "grad_norm": 0.8480133414268494, + "learning_rate": 0.00010404385206589268, + "loss": 5.5411, + "num_input_tokens_seen": 319291392, + "step": 2436 + }, + { + "epoch": 0.4170567489579994, + "grad_norm": 0.832699716091156, + "learning_rate": 0.00010397984476849915, + "loss": 5.5524, + "num_input_tokens_seen": 319684608, + "step": 2439 + }, + { + "epoch": 0.417569733889067, + "grad_norm": 0.8338193297386169, + "learning_rate": 0.00010391595545692583, + "loss": 5.4927, + "num_input_tokens_seen": 320077824, + "step": 2442 + }, + { + "epoch": 0.41808271882013465, + "grad_norm": 0.7665431499481201, + "learning_rate": 0.00010385218376914195, + "loss": 5.5396, + "num_input_tokens_seen": 320471040, + "step": 2445 + }, + { + "epoch": 0.4185957037512023, + "grad_norm": 0.719890832901001, + "learning_rate": 0.00010378852934466992, + "loss": 5.5215, + "num_input_tokens_seen": 320864256, + "step": 2448 + }, + { + "epoch": 0.41910868868226997, + "grad_norm": 0.7127965688705444, + "learning_rate": 0.000103724991824577, + "loss": 5.517, + "num_input_tokens_seen": 321257472, + "step": 2451 + }, + { + "epoch": 0.41962167361333763, + "grad_norm": 0.841174840927124, + "learning_rate": 0.00010366157085146666, + "loss": 5.5549, + "num_input_tokens_seen": 321650688, + "step": 2454 + }, + { + "epoch": 0.42013465854440524, + "grad_norm": 0.9073885083198547, + "learning_rate": 0.00010359826606947015, + "loss": 5.5391, + "num_input_tokens_seen": 322043904, + "step": 2457 + }, + { + "epoch": 0.4206476434754729, + "grad_norm": 0.7966265082359314, + "learning_rate": 0.00010353507712423819, + "loss": 5.5094, + "num_input_tokens_seen": 322437120, + "step": 2460 + }, + { + "epoch": 0.42116062840654056, + "grad_norm": 0.7716068029403687, + "learning_rate": 0.00010347200366293252, + "loss": 5.5039, + "num_input_tokens_seen": 322830336, + "step": 2463 + }, + { + "epoch": 0.4216736133376082, + "grad_norm": 1.1247206926345825, + "learning_rate": 0.00010340904533421777, + "loss": 5.5764, + "num_input_tokens_seen": 323223552, + "step": 2466 + }, + { + "epoch": 0.4221865982686759, + "grad_norm": 1.0969719886779785, + "learning_rate": 0.00010334620178825307, + "loss": 5.5091, + "num_input_tokens_seen": 323616768, + "step": 2469 + }, + { + "epoch": 0.4226995831997435, + "grad_norm": 0.7282251119613647, + "learning_rate": 0.00010328347267668404, + "loss": 5.5254, + "num_input_tokens_seen": 324009984, + "step": 2472 + }, + { + "epoch": 0.42321256813081115, + "grad_norm": 0.7151831984519958, + "learning_rate": 0.0001032208576526346, + "loss": 5.5016, + "num_input_tokens_seen": 324403200, + "step": 2475 + }, + { + "epoch": 0.4237255530618788, + "grad_norm": 0.7762022018432617, + "learning_rate": 0.0001031583563706989, + "loss": 5.4836, + "num_input_tokens_seen": 324796416, + "step": 2478 + }, + { + "epoch": 0.4242385379929465, + "grad_norm": 0.7001504898071289, + "learning_rate": 0.00010309596848693339, + "loss": 5.5204, + "num_input_tokens_seen": 325189632, + "step": 2481 + }, + { + "epoch": 0.4247515229240141, + "grad_norm": 0.8730801343917847, + "learning_rate": 0.00010303369365884883, + "loss": 5.5267, + "num_input_tokens_seen": 325582848, + "step": 2484 + }, + { + "epoch": 0.42526450785508174, + "grad_norm": 0.9579359889030457, + "learning_rate": 0.00010297153154540234, + "loss": 5.4667, + "num_input_tokens_seen": 325976064, + "step": 2487 + }, + { + "epoch": 0.4257774927861494, + "grad_norm": 0.7636502385139465, + "learning_rate": 0.00010290948180698962, + "loss": 5.5169, + "num_input_tokens_seen": 326369280, + "step": 2490 + }, + { + "epoch": 0.42629047771721706, + "grad_norm": 0.7500234246253967, + "learning_rate": 0.00010284754410543722, + "loss": 5.4919, + "num_input_tokens_seen": 326762496, + "step": 2493 + }, + { + "epoch": 0.4268034626482847, + "grad_norm": 0.9112027883529663, + "learning_rate": 0.0001027857181039946, + "loss": 5.5179, + "num_input_tokens_seen": 327155712, + "step": 2496 + }, + { + "epoch": 0.42731644757935233, + "grad_norm": 0.820213794708252, + "learning_rate": 0.00010272400346732667, + "loss": 5.5183, + "num_input_tokens_seen": 327548928, + "step": 2499 + }, + { + "epoch": 0.42782943251042, + "grad_norm": 0.8203981518745422, + "learning_rate": 0.00010266239986150597, + "loss": 5.532, + "num_input_tokens_seen": 327942144, + "step": 2502 + }, + { + "epoch": 0.42834241744148766, + "grad_norm": 0.7337089776992798, + "learning_rate": 0.00010260090695400518, + "loss": 5.4943, + "num_input_tokens_seen": 328335360, + "step": 2505 + }, + { + "epoch": 0.4288554023725553, + "grad_norm": 0.7428235411643982, + "learning_rate": 0.00010253952441368959, + "loss": 5.4861, + "num_input_tokens_seen": 328728576, + "step": 2508 + }, + { + "epoch": 0.429368387303623, + "grad_norm": 0.7361696362495422, + "learning_rate": 0.00010247825191080954, + "loss": 5.5521, + "num_input_tokens_seen": 329121792, + "step": 2511 + }, + { + "epoch": 0.4298813722346906, + "grad_norm": 0.6974912285804749, + "learning_rate": 0.00010241708911699302, + "loss": 5.456, + "num_input_tokens_seen": 329515008, + "step": 2514 + }, + { + "epoch": 0.43039435716575825, + "grad_norm": 0.8302775621414185, + "learning_rate": 0.00010235603570523828, + "loss": 5.507, + "num_input_tokens_seen": 329908224, + "step": 2517 + }, + { + "epoch": 0.4309073420968259, + "grad_norm": 0.8034529089927673, + "learning_rate": 0.00010229509134990649, + "loss": 5.4999, + "num_input_tokens_seen": 330301440, + "step": 2520 + }, + { + "epoch": 0.43142032702789357, + "grad_norm": 0.812213122844696, + "learning_rate": 0.00010223425572671442, + "loss": 5.5129, + "num_input_tokens_seen": 330694656, + "step": 2523 + }, + { + "epoch": 0.43193331195896123, + "grad_norm": 0.9187823534011841, + "learning_rate": 0.00010217352851272726, + "loss": 5.5116, + "num_input_tokens_seen": 331087872, + "step": 2526 + }, + { + "epoch": 0.43244629689002884, + "grad_norm": 0.9143493175506592, + "learning_rate": 0.00010211290938635132, + "loss": 5.4893, + "num_input_tokens_seen": 331481088, + "step": 2529 + }, + { + "epoch": 0.4329592818210965, + "grad_norm": 0.6335490942001343, + "learning_rate": 0.00010205239802732692, + "loss": 5.4349, + "num_input_tokens_seen": 331874304, + "step": 2532 + }, + { + "epoch": 0.43347226675216416, + "grad_norm": 0.7708688974380493, + "learning_rate": 0.00010199199411672136, + "loss": 5.4729, + "num_input_tokens_seen": 332267520, + "step": 2535 + }, + { + "epoch": 0.4339852516832318, + "grad_norm": 0.7687310576438904, + "learning_rate": 0.00010193169733692172, + "loss": 5.5084, + "num_input_tokens_seen": 332660736, + "step": 2538 + }, + { + "epoch": 0.4344982366142995, + "grad_norm": 0.8705409169197083, + "learning_rate": 0.00010187150737162795, + "loss": 5.5312, + "num_input_tokens_seen": 333053952, + "step": 2541 + }, + { + "epoch": 0.4350112215453671, + "grad_norm": 0.7544601559638977, + "learning_rate": 0.00010181142390584588, + "loss": 5.5007, + "num_input_tokens_seen": 333447168, + "step": 2544 + }, + { + "epoch": 0.43552420647643475, + "grad_norm": 0.7180626392364502, + "learning_rate": 0.00010175144662588028, + "loss": 5.5181, + "num_input_tokens_seen": 333840384, + "step": 2547 + }, + { + "epoch": 0.4360371914075024, + "grad_norm": 0.8776599764823914, + "learning_rate": 0.00010169157521932794, + "loss": 5.5253, + "num_input_tokens_seen": 334233600, + "step": 2550 + }, + { + "epoch": 0.4365501763385701, + "grad_norm": 0.8420502543449402, + "learning_rate": 0.00010163180937507096, + "loss": 5.4906, + "num_input_tokens_seen": 334626816, + "step": 2553 + }, + { + "epoch": 0.4370631612696377, + "grad_norm": 0.7820854783058167, + "learning_rate": 0.00010157214878326983, + "loss": 5.4878, + "num_input_tokens_seen": 335020032, + "step": 2556 + }, + { + "epoch": 0.43757614620070534, + "grad_norm": 0.7041083574295044, + "learning_rate": 0.00010151259313535675, + "loss": 5.5047, + "num_input_tokens_seen": 335413248, + "step": 2559 + }, + { + "epoch": 0.438089131131773, + "grad_norm": 0.7176365852355957, + "learning_rate": 0.00010145314212402889, + "loss": 5.4753, + "num_input_tokens_seen": 335806464, + "step": 2562 + }, + { + "epoch": 0.43860211606284066, + "grad_norm": 0.6593925356864929, + "learning_rate": 0.00010139379544324182, + "loss": 5.5398, + "num_input_tokens_seen": 336199680, + "step": 2565 + }, + { + "epoch": 0.4391151009939083, + "grad_norm": 0.8498782515525818, + "learning_rate": 0.00010133455278820273, + "loss": 5.5204, + "num_input_tokens_seen": 336592896, + "step": 2568 + }, + { + "epoch": 0.43962808592497593, + "grad_norm": 0.7824472188949585, + "learning_rate": 0.00010127541385536402, + "loss": 5.4865, + "num_input_tokens_seen": 336986112, + "step": 2571 + }, + { + "epoch": 0.4401410708560436, + "grad_norm": 0.7467549443244934, + "learning_rate": 0.00010121637834241672, + "loss": 5.4581, + "num_input_tokens_seen": 337379328, + "step": 2574 + }, + { + "epoch": 0.44065405578711125, + "grad_norm": 0.8907204866409302, + "learning_rate": 0.00010115744594828388, + "loss": 5.5488, + "num_input_tokens_seen": 337772544, + "step": 2577 + }, + { + "epoch": 0.4411670407181789, + "grad_norm": 0.9740023016929626, + "learning_rate": 0.00010109861637311432, + "loss": 5.5207, + "num_input_tokens_seen": 338165760, + "step": 2580 + }, + { + "epoch": 0.4416800256492466, + "grad_norm": 0.9154981970787048, + "learning_rate": 0.00010103988931827606, + "loss": 5.4704, + "num_input_tokens_seen": 338558976, + "step": 2583 + }, + { + "epoch": 0.4421930105803142, + "grad_norm": 0.8180415630340576, + "learning_rate": 0.00010098126448635004, + "loss": 5.5134, + "num_input_tokens_seen": 338952192, + "step": 2586 + }, + { + "epoch": 0.44270599551138184, + "grad_norm": 0.8424299359321594, + "learning_rate": 0.00010092274158112377, + "loss": 5.5021, + "num_input_tokens_seen": 339345408, + "step": 2589 + }, + { + "epoch": 0.4432189804424495, + "grad_norm": 0.7481082081794739, + "learning_rate": 0.00010086432030758502, + "loss": 5.4675, + "num_input_tokens_seen": 339738624, + "step": 2592 + }, + { + "epoch": 0.44373196537351717, + "grad_norm": 0.9558732509613037, + "learning_rate": 0.00010080600037191566, + "loss": 5.4996, + "num_input_tokens_seen": 340131840, + "step": 2595 + }, + { + "epoch": 0.44424495030458483, + "grad_norm": 0.9134954810142517, + "learning_rate": 0.00010074778148148528, + "loss": 5.5097, + "num_input_tokens_seen": 340525056, + "step": 2598 + }, + { + "epoch": 0.44475793523565244, + "grad_norm": 0.7287417054176331, + "learning_rate": 0.00010068966334484521, + "loss": 5.5043, + "num_input_tokens_seen": 340918272, + "step": 2601 + }, + { + "epoch": 0.4452709201667201, + "grad_norm": 0.9164344668388367, + "learning_rate": 0.00010063164567172234, + "loss": 5.5109, + "num_input_tokens_seen": 341311488, + "step": 2604 + }, + { + "epoch": 0.44578390509778776, + "grad_norm": 0.7782894968986511, + "learning_rate": 0.00010057372817301295, + "loss": 5.5104, + "num_input_tokens_seen": 341704704, + "step": 2607 + }, + { + "epoch": 0.4462968900288554, + "grad_norm": 0.8201740980148315, + "learning_rate": 0.00010051591056077674, + "loss": 5.5767, + "num_input_tokens_seen": 342097920, + "step": 2610 + }, + { + "epoch": 0.446809874959923, + "grad_norm": 0.8876894116401672, + "learning_rate": 0.00010045819254823074, + "loss": 5.4695, + "num_input_tokens_seen": 342491136, + "step": 2613 + }, + { + "epoch": 0.4473228598909907, + "grad_norm": 0.708038866519928, + "learning_rate": 0.0001004005738497435, + "loss": 5.4691, + "num_input_tokens_seen": 342884352, + "step": 2616 + }, + { + "epoch": 0.44783584482205835, + "grad_norm": 0.79575514793396, + "learning_rate": 0.0001003430541808289, + "loss": 5.4651, + "num_input_tokens_seen": 343277568, + "step": 2619 + }, + { + "epoch": 0.448348829753126, + "grad_norm": 0.8515266180038452, + "learning_rate": 0.00010028563325814057, + "loss": 5.4751, + "num_input_tokens_seen": 343670784, + "step": 2622 + }, + { + "epoch": 0.44886181468419367, + "grad_norm": 0.8329154253005981, + "learning_rate": 0.00010022831079946566, + "loss": 5.5093, + "num_input_tokens_seen": 344064000, + "step": 2625 + }, + { + "epoch": 0.4493747996152613, + "grad_norm": 0.871041476726532, + "learning_rate": 0.00010017108652371934, + "loss": 5.4697, + "num_input_tokens_seen": 344457216, + "step": 2628 + }, + { + "epoch": 0.44988778454632894, + "grad_norm": 0.649675190448761, + "learning_rate": 0.0001001139601509388, + "loss": 5.4813, + "num_input_tokens_seen": 344850432, + "step": 2631 + }, + { + "epoch": 0.4504007694773966, + "grad_norm": 0.7647591829299927, + "learning_rate": 0.00010005693140227763, + "loss": 5.4817, + "num_input_tokens_seen": 345243648, + "step": 2634 + }, + { + "epoch": 0.45091375440846426, + "grad_norm": 0.8665961027145386, + "learning_rate": 9.999999999999999e-05, + "loss": 5.4681, + "num_input_tokens_seen": 345636864, + "step": 2637 + }, + { + "epoch": 0.4514267393395319, + "grad_norm": 0.7397317290306091, + "learning_rate": 9.994316566747503e-05, + "loss": 5.4471, + "num_input_tokens_seen": 346030080, + "step": 2640 + }, + { + "epoch": 0.45193972427059953, + "grad_norm": 0.7339237928390503, + "learning_rate": 9.988642812917122e-05, + "loss": 5.4694, + "num_input_tokens_seen": 346423296, + "step": 2643 + }, + { + "epoch": 0.4524527092016672, + "grad_norm": 0.7354670166969299, + "learning_rate": 9.98297871106506e-05, + "loss": 5.4715, + "num_input_tokens_seen": 346816512, + "step": 2646 + }, + { + "epoch": 0.45296569413273485, + "grad_norm": 0.7800298929214478, + "learning_rate": 9.977324233856346e-05, + "loss": 5.5178, + "num_input_tokens_seen": 347209728, + "step": 2649 + }, + { + "epoch": 0.4534786790638025, + "grad_norm": 0.7750239968299866, + "learning_rate": 9.971679354064264e-05, + "loss": 5.4624, + "num_input_tokens_seen": 347602944, + "step": 2652 + }, + { + "epoch": 0.4539916639948702, + "grad_norm": 0.8229271173477173, + "learning_rate": 9.966044044569793e-05, + "loss": 5.5072, + "num_input_tokens_seen": 347996160, + "step": 2655 + }, + { + "epoch": 0.4545046489259378, + "grad_norm": 0.7565569877624512, + "learning_rate": 9.960418278361088e-05, + "loss": 5.5323, + "num_input_tokens_seen": 348389376, + "step": 2658 + }, + { + "epoch": 0.45501763385700544, + "grad_norm": 0.7968035936355591, + "learning_rate": 9.954802028532911e-05, + "loss": 5.4457, + "num_input_tokens_seen": 348782592, + "step": 2661 + }, + { + "epoch": 0.4555306187880731, + "grad_norm": 1.092264175415039, + "learning_rate": 9.949195268286099e-05, + "loss": 5.5038, + "num_input_tokens_seen": 349175808, + "step": 2664 + }, + { + "epoch": 0.45604360371914077, + "grad_norm": 0.9976704120635986, + "learning_rate": 9.943597970927025e-05, + "loss": 5.4596, + "num_input_tokens_seen": 349569024, + "step": 2667 + }, + { + "epoch": 0.4565565886502084, + "grad_norm": 0.8629297018051147, + "learning_rate": 9.938010109867075e-05, + "loss": 5.4183, + "num_input_tokens_seen": 349962240, + "step": 2670 + }, + { + "epoch": 0.45706957358127603, + "grad_norm": 0.9318130016326904, + "learning_rate": 9.932431658622104e-05, + "loss": 5.5005, + "num_input_tokens_seen": 350355456, + "step": 2673 + }, + { + "epoch": 0.4575825585123437, + "grad_norm": 0.7205513715744019, + "learning_rate": 9.926862590811912e-05, + "loss": 5.4921, + "num_input_tokens_seen": 350748672, + "step": 2676 + }, + { + "epoch": 0.45809554344341136, + "grad_norm": 0.8188350200653076, + "learning_rate": 9.921302880159722e-05, + "loss": 5.4676, + "num_input_tokens_seen": 351141888, + "step": 2679 + }, + { + "epoch": 0.458608528374479, + "grad_norm": 0.7740142941474915, + "learning_rate": 9.915752500491666e-05, + "loss": 5.4431, + "num_input_tokens_seen": 351535104, + "step": 2682 + }, + { + "epoch": 0.4591215133055466, + "grad_norm": 0.7324191331863403, + "learning_rate": 9.910211425736248e-05, + "loss": 5.4386, + "num_input_tokens_seen": 351928320, + "step": 2685 + }, + { + "epoch": 0.4596344982366143, + "grad_norm": 0.7066339254379272, + "learning_rate": 9.904679629923856e-05, + "loss": 5.5035, + "num_input_tokens_seen": 352321536, + "step": 2688 + }, + { + "epoch": 0.46014748316768195, + "grad_norm": 0.7234504222869873, + "learning_rate": 9.899157087186225e-05, + "loss": 5.4922, + "num_input_tokens_seen": 352714752, + "step": 2691 + }, + { + "epoch": 0.4606604680987496, + "grad_norm": 0.7445735335350037, + "learning_rate": 9.893643771755952e-05, + "loss": 5.4954, + "num_input_tokens_seen": 353107968, + "step": 2694 + }, + { + "epoch": 0.46117345302981727, + "grad_norm": 0.7736021876335144, + "learning_rate": 9.88813965796597e-05, + "loss": 5.4861, + "num_input_tokens_seen": 353501184, + "step": 2697 + }, + { + "epoch": 0.4616864379608849, + "grad_norm": 0.8421680927276611, + "learning_rate": 9.882644720249061e-05, + "loss": 5.4398, + "num_input_tokens_seen": 353894400, + "step": 2700 + }, + { + "epoch": 0.46219942289195254, + "grad_norm": 0.8024502992630005, + "learning_rate": 9.877158933137354e-05, + "loss": 5.4792, + "num_input_tokens_seen": 354287616, + "step": 2703 + }, + { + "epoch": 0.4627124078230202, + "grad_norm": 0.6904874444007874, + "learning_rate": 9.871682271261825e-05, + "loss": 5.4698, + "num_input_tokens_seen": 354680832, + "step": 2706 + }, + { + "epoch": 0.46322539275408786, + "grad_norm": 0.7565279603004456, + "learning_rate": 9.866214709351803e-05, + "loss": 5.4867, + "num_input_tokens_seen": 355074048, + "step": 2709 + }, + { + "epoch": 0.4637383776851555, + "grad_norm": 0.7363823056221008, + "learning_rate": 9.860756222234493e-05, + "loss": 5.5109, + "num_input_tokens_seen": 355467264, + "step": 2712 + }, + { + "epoch": 0.46425136261622313, + "grad_norm": 0.74873948097229, + "learning_rate": 9.855306784834474e-05, + "loss": 5.4505, + "num_input_tokens_seen": 355860480, + "step": 2715 + }, + { + "epoch": 0.4647643475472908, + "grad_norm": 0.7366782426834106, + "learning_rate": 9.849866372173222e-05, + "loss": 5.5185, + "num_input_tokens_seen": 356253696, + "step": 2718 + }, + { + "epoch": 0.46527733247835845, + "grad_norm": 0.8568825125694275, + "learning_rate": 9.84443495936863e-05, + "loss": 5.4669, + "num_input_tokens_seen": 356646912, + "step": 2721 + }, + { + "epoch": 0.4657903174094261, + "grad_norm": 0.7414649724960327, + "learning_rate": 9.839012521634527e-05, + "loss": 5.4915, + "num_input_tokens_seen": 357040128, + "step": 2724 + }, + { + "epoch": 0.4663033023404938, + "grad_norm": 0.8652457594871521, + "learning_rate": 9.83359903428021e-05, + "loss": 5.4666, + "num_input_tokens_seen": 357433344, + "step": 2727 + }, + { + "epoch": 0.4668162872715614, + "grad_norm": 0.6931141018867493, + "learning_rate": 9.828194472709959e-05, + "loss": 5.4974, + "num_input_tokens_seen": 357826560, + "step": 2730 + }, + { + "epoch": 0.46732927220262904, + "grad_norm": 0.8068703413009644, + "learning_rate": 9.822798812422577e-05, + "loss": 5.4778, + "num_input_tokens_seen": 358219776, + "step": 2733 + }, + { + "epoch": 0.4678422571336967, + "grad_norm": 0.7154794931411743, + "learning_rate": 9.817412029010924e-05, + "loss": 5.4866, + "num_input_tokens_seen": 358612992, + "step": 2736 + }, + { + "epoch": 0.46835524206476437, + "grad_norm": 0.6981579065322876, + "learning_rate": 9.81203409816145e-05, + "loss": 5.4718, + "num_input_tokens_seen": 359006208, + "step": 2739 + }, + { + "epoch": 0.46886822699583197, + "grad_norm": 0.8242044448852539, + "learning_rate": 9.806664995653737e-05, + "loss": 5.4839, + "num_input_tokens_seen": 359399424, + "step": 2742 + }, + { + "epoch": 0.46938121192689963, + "grad_norm": 0.7493621706962585, + "learning_rate": 9.80130469736003e-05, + "loss": 5.4704, + "num_input_tokens_seen": 359792640, + "step": 2745 + }, + { + "epoch": 0.4698941968579673, + "grad_norm": 0.7475427389144897, + "learning_rate": 9.7959531792448e-05, + "loss": 5.4021, + "num_input_tokens_seen": 360185856, + "step": 2748 + }, + { + "epoch": 0.47040718178903496, + "grad_norm": 0.7679263353347778, + "learning_rate": 9.79061041736428e-05, + "loss": 5.5059, + "num_input_tokens_seen": 360579072, + "step": 2751 + }, + { + "epoch": 0.4709201667201026, + "grad_norm": 0.7734596729278564, + "learning_rate": 9.785276387866011e-05, + "loss": 5.4497, + "num_input_tokens_seen": 360972288, + "step": 2754 + }, + { + "epoch": 0.4714331516511702, + "grad_norm": 0.7543107867240906, + "learning_rate": 9.779951066988407e-05, + "loss": 5.4706, + "num_input_tokens_seen": 361365504, + "step": 2757 + }, + { + "epoch": 0.4719461365822379, + "grad_norm": 0.6790075898170471, + "learning_rate": 9.774634431060301e-05, + "loss": 5.4785, + "num_input_tokens_seen": 361758720, + "step": 2760 + }, + { + "epoch": 0.47245912151330555, + "grad_norm": 0.7988852262496948, + "learning_rate": 9.769326456500506e-05, + "loss": 5.4941, + "num_input_tokens_seen": 362151936, + "step": 2763 + }, + { + "epoch": 0.4729721064443732, + "grad_norm": 0.6978922486305237, + "learning_rate": 9.76402711981738e-05, + "loss": 5.4991, + "num_input_tokens_seen": 362545152, + "step": 2766 + }, + { + "epoch": 0.47348509137544087, + "grad_norm": 0.7577718496322632, + "learning_rate": 9.758736397608374e-05, + "loss": 5.4926, + "num_input_tokens_seen": 362938368, + "step": 2769 + }, + { + "epoch": 0.4739980763065085, + "grad_norm": 0.6399986147880554, + "learning_rate": 9.753454266559622e-05, + "loss": 5.4783, + "num_input_tokens_seen": 363331584, + "step": 2772 + }, + { + "epoch": 0.47451106123757614, + "grad_norm": 0.7283456921577454, + "learning_rate": 9.74818070344549e-05, + "loss": 5.5014, + "num_input_tokens_seen": 363724800, + "step": 2775 + }, + { + "epoch": 0.4750240461686438, + "grad_norm": 0.8536427021026611, + "learning_rate": 9.742915685128152e-05, + "loss": 5.5269, + "num_input_tokens_seen": 364118016, + "step": 2778 + }, + { + "epoch": 0.47553703109971146, + "grad_norm": 0.6984924077987671, + "learning_rate": 9.737659188557171e-05, + "loss": 5.467, + "num_input_tokens_seen": 364511232, + "step": 2781 + }, + { + "epoch": 0.4760500160307791, + "grad_norm": 0.8646853566169739, + "learning_rate": 9.732411190769063e-05, + "loss": 5.4627, + "num_input_tokens_seen": 364904448, + "step": 2784 + }, + { + "epoch": 0.47656300096184673, + "grad_norm": 0.7577709555625916, + "learning_rate": 9.727171668886887e-05, + "loss": 5.4473, + "num_input_tokens_seen": 365297664, + "step": 2787 + }, + { + "epoch": 0.4770759858929144, + "grad_norm": 0.7367388010025024, + "learning_rate": 9.721940600119815e-05, + "loss": 5.4691, + "num_input_tokens_seen": 365690880, + "step": 2790 + }, + { + "epoch": 0.47758897082398205, + "grad_norm": 0.7202689051628113, + "learning_rate": 9.716717961762733e-05, + "loss": 5.5151, + "num_input_tokens_seen": 366084096, + "step": 2793 + }, + { + "epoch": 0.4781019557550497, + "grad_norm": 0.7965312004089355, + "learning_rate": 9.711503731195804e-05, + "loss": 5.4734, + "num_input_tokens_seen": 366477312, + "step": 2796 + }, + { + "epoch": 0.4786149406861173, + "grad_norm": 0.7794646620750427, + "learning_rate": 9.706297885884074e-05, + "loss": 5.4882, + "num_input_tokens_seen": 366870528, + "step": 2799 + }, + { + "epoch": 0.47878593566313987, + "eval_accuracy": 0.17308907344080768, + "eval_loss": 5.920318603515625, + "eval_runtime": 109.4183, + "eval_samples_per_second": 2.742, + "eval_steps_per_second": 1.371, + "num_input_tokens_seen": 367001600, + "step": 2800 + }, + { + "epoch": 0.479127925617185, + "grad_norm": 0.6703746914863586, + "learning_rate": 9.701100403377059e-05, + "loss": 5.5227, + "num_input_tokens_seen": 367263744, + "step": 2802 + }, + { + "epoch": 0.47964091054825264, + "grad_norm": 0.649358868598938, + "learning_rate": 9.695911261308335e-05, + "loss": 5.5274, + "num_input_tokens_seen": 367656960, + "step": 2805 + }, + { + "epoch": 0.4801538954793203, + "grad_norm": 0.7234277725219727, + "learning_rate": 9.69073043739513e-05, + "loss": 5.4809, + "num_input_tokens_seen": 368050176, + "step": 2808 + }, + { + "epoch": 0.48066688041038796, + "grad_norm": 0.8192933797836304, + "learning_rate": 9.685557909437936e-05, + "loss": 5.4696, + "num_input_tokens_seen": 368443392, + "step": 2811 + }, + { + "epoch": 0.48117986534145557, + "grad_norm": 0.8990679383277893, + "learning_rate": 9.680393655320099e-05, + "loss": 5.4669, + "num_input_tokens_seen": 368836608, + "step": 2814 + }, + { + "epoch": 0.48169285027252323, + "grad_norm": 0.8200316429138184, + "learning_rate": 9.67523765300742e-05, + "loss": 5.4802, + "num_input_tokens_seen": 369229824, + "step": 2817 + }, + { + "epoch": 0.4822058352035909, + "grad_norm": 0.9789479970932007, + "learning_rate": 9.670089880547766e-05, + "loss": 5.4415, + "num_input_tokens_seen": 369623040, + "step": 2820 + }, + { + "epoch": 0.48271882013465855, + "grad_norm": 0.7308698892593384, + "learning_rate": 9.664950316070681e-05, + "loss": 5.5066, + "num_input_tokens_seen": 370016256, + "step": 2823 + }, + { + "epoch": 0.4832318050657262, + "grad_norm": 0.6770713925361633, + "learning_rate": 9.659818937786982e-05, + "loss": 5.4506, + "num_input_tokens_seen": 370409472, + "step": 2826 + }, + { + "epoch": 0.4837447899967938, + "grad_norm": 0.6626781225204468, + "learning_rate": 9.654695723988381e-05, + "loss": 5.4453, + "num_input_tokens_seen": 370802688, + "step": 2829 + }, + { + "epoch": 0.4842577749278615, + "grad_norm": 0.741698682308197, + "learning_rate": 9.649580653047106e-05, + "loss": 5.4754, + "num_input_tokens_seen": 371195904, + "step": 2832 + }, + { + "epoch": 0.48477075985892915, + "grad_norm": 0.7719034552574158, + "learning_rate": 9.644473703415494e-05, + "loss": 5.4889, + "num_input_tokens_seen": 371589120, + "step": 2835 + }, + { + "epoch": 0.4852837447899968, + "grad_norm": 0.7475199103355408, + "learning_rate": 9.63937485362564e-05, + "loss": 5.4272, + "num_input_tokens_seen": 371982336, + "step": 2838 + }, + { + "epoch": 0.48579672972106447, + "grad_norm": 0.7785484790802002, + "learning_rate": 9.634284082288993e-05, + "loss": 5.4695, + "num_input_tokens_seen": 372375552, + "step": 2841 + }, + { + "epoch": 0.4863097146521321, + "grad_norm": 0.6962818503379822, + "learning_rate": 9.629201368095994e-05, + "loss": 5.4975, + "num_input_tokens_seen": 372768768, + "step": 2844 + }, + { + "epoch": 0.48682269958319974, + "grad_norm": 0.7307711839675903, + "learning_rate": 9.624126689815691e-05, + "loss": 5.4963, + "num_input_tokens_seen": 373161984, + "step": 2847 + }, + { + "epoch": 0.4873356845142674, + "grad_norm": 0.9694541692733765, + "learning_rate": 9.61906002629538e-05, + "loss": 5.4076, + "num_input_tokens_seen": 373555200, + "step": 2850 + }, + { + "epoch": 0.48784866944533506, + "grad_norm": 0.7529440522193909, + "learning_rate": 9.614001356460217e-05, + "loss": 5.4576, + "num_input_tokens_seen": 373948416, + "step": 2853 + }, + { + "epoch": 0.4883616543764027, + "grad_norm": 0.8865497708320618, + "learning_rate": 9.608950659312869e-05, + "loss": 5.4596, + "num_input_tokens_seen": 374341632, + "step": 2856 + }, + { + "epoch": 0.4888746393074703, + "grad_norm": 0.8430432677268982, + "learning_rate": 9.603907913933133e-05, + "loss": 5.4806, + "num_input_tokens_seen": 374734848, + "step": 2859 + }, + { + "epoch": 0.489387624238538, + "grad_norm": 0.6669235229492188, + "learning_rate": 9.598873099477574e-05, + "loss": 5.4507, + "num_input_tokens_seen": 375128064, + "step": 2862 + }, + { + "epoch": 0.48990060916960565, + "grad_norm": 0.8070666790008545, + "learning_rate": 9.593846195179174e-05, + "loss": 5.454, + "num_input_tokens_seen": 375521280, + "step": 2865 + }, + { + "epoch": 0.4904135941006733, + "grad_norm": 0.8225948214530945, + "learning_rate": 9.588827180346961e-05, + "loss": 5.4561, + "num_input_tokens_seen": 375914496, + "step": 2868 + }, + { + "epoch": 0.4909265790317409, + "grad_norm": 0.7412498593330383, + "learning_rate": 9.583816034365655e-05, + "loss": 5.4657, + "num_input_tokens_seen": 376307712, + "step": 2871 + }, + { + "epoch": 0.4914395639628086, + "grad_norm": 0.7847759127616882, + "learning_rate": 9.578812736695315e-05, + "loss": 5.4268, + "num_input_tokens_seen": 376700928, + "step": 2874 + }, + { + "epoch": 0.49195254889387624, + "grad_norm": 0.7222384214401245, + "learning_rate": 9.573817266870979e-05, + "loss": 5.4318, + "num_input_tokens_seen": 377094144, + "step": 2877 + }, + { + "epoch": 0.4924655338249439, + "grad_norm": 0.6922466158866882, + "learning_rate": 9.568829604502318e-05, + "loss": 5.5184, + "num_input_tokens_seen": 377487360, + "step": 2880 + }, + { + "epoch": 0.49297851875601156, + "grad_norm": 0.7505002021789551, + "learning_rate": 9.563849729273287e-05, + "loss": 5.4798, + "num_input_tokens_seen": 377880576, + "step": 2883 + }, + { + "epoch": 0.49349150368707917, + "grad_norm": 0.705833911895752, + "learning_rate": 9.558877620941768e-05, + "loss": 5.4977, + "num_input_tokens_seen": 378273792, + "step": 2886 + }, + { + "epoch": 0.49400448861814683, + "grad_norm": 0.7315141558647156, + "learning_rate": 9.553913259339242e-05, + "loss": 5.428, + "num_input_tokens_seen": 378667008, + "step": 2889 + }, + { + "epoch": 0.4945174735492145, + "grad_norm": 0.7071065902709961, + "learning_rate": 9.548956624370426e-05, + "loss": 5.4213, + "num_input_tokens_seen": 379060224, + "step": 2892 + }, + { + "epoch": 0.49503045848028215, + "grad_norm": 0.7307475209236145, + "learning_rate": 9.54400769601295e-05, + "loss": 5.4333, + "num_input_tokens_seen": 379453440, + "step": 2895 + }, + { + "epoch": 0.4955434434113498, + "grad_norm": 0.7351901531219482, + "learning_rate": 9.539066454316994e-05, + "loss": 5.4426, + "num_input_tokens_seen": 379846656, + "step": 2898 + }, + { + "epoch": 0.4960564283424174, + "grad_norm": 0.7794121503829956, + "learning_rate": 9.534132879404975e-05, + "loss": 5.4382, + "num_input_tokens_seen": 380239872, + "step": 2901 + }, + { + "epoch": 0.4965694132734851, + "grad_norm": 0.7838775515556335, + "learning_rate": 9.529206951471193e-05, + "loss": 5.4391, + "num_input_tokens_seen": 380633088, + "step": 2904 + }, + { + "epoch": 0.49708239820455274, + "grad_norm": 0.729987382888794, + "learning_rate": 9.524288650781515e-05, + "loss": 5.4505, + "num_input_tokens_seen": 381026304, + "step": 2907 + }, + { + "epoch": 0.4975953831356204, + "grad_norm": 0.7023864984512329, + "learning_rate": 9.519377957673018e-05, + "loss": 5.4664, + "num_input_tokens_seen": 381419520, + "step": 2910 + }, + { + "epoch": 0.49810836806668807, + "grad_norm": 0.7854782342910767, + "learning_rate": 9.51447485255368e-05, + "loss": 5.4174, + "num_input_tokens_seen": 381812736, + "step": 2913 + }, + { + "epoch": 0.4986213529977557, + "grad_norm": 0.8409479856491089, + "learning_rate": 9.509579315902049e-05, + "loss": 5.413, + "num_input_tokens_seen": 382205952, + "step": 2916 + }, + { + "epoch": 0.49913433792882334, + "grad_norm": 0.9801114201545715, + "learning_rate": 9.504691328266901e-05, + "loss": 5.4627, + "num_input_tokens_seen": 382599168, + "step": 2919 + }, + { + "epoch": 0.499647322859891, + "grad_norm": 0.699877917766571, + "learning_rate": 9.499810870266937e-05, + "loss": 5.4361, + "num_input_tokens_seen": 382992384, + "step": 2922 + }, + { + "epoch": 0.5001603077909587, + "grad_norm": 0.7811382412910461, + "learning_rate": 9.494937922590444e-05, + "loss": 5.4357, + "num_input_tokens_seen": 383385600, + "step": 2925 + }, + { + "epoch": 0.5006732927220263, + "grad_norm": 0.6606900095939636, + "learning_rate": 9.49007246599498e-05, + "loss": 5.4534, + "num_input_tokens_seen": 383778816, + "step": 2928 + }, + { + "epoch": 0.501186277653094, + "grad_norm": 0.7077214121818542, + "learning_rate": 9.485214481307057e-05, + "loss": 5.4578, + "num_input_tokens_seen": 384172032, + "step": 2931 + }, + { + "epoch": 0.5016992625841616, + "grad_norm": 0.7163876891136169, + "learning_rate": 9.480363949421822e-05, + "loss": 5.4555, + "num_input_tokens_seen": 384565248, + "step": 2934 + }, + { + "epoch": 0.5022122475152292, + "grad_norm": 0.7486274838447571, + "learning_rate": 9.475520851302736e-05, + "loss": 5.4676, + "num_input_tokens_seen": 384958464, + "step": 2937 + }, + { + "epoch": 0.5027252324462969, + "grad_norm": 0.8384826183319092, + "learning_rate": 9.470685167981269e-05, + "loss": 5.4334, + "num_input_tokens_seen": 385351680, + "step": 2940 + }, + { + "epoch": 0.5032382173773645, + "grad_norm": 0.7711573243141174, + "learning_rate": 9.465856880556584e-05, + "loss": 5.3987, + "num_input_tokens_seen": 385744896, + "step": 2943 + }, + { + "epoch": 0.5037512023084322, + "grad_norm": 0.7309294939041138, + "learning_rate": 9.461035970195224e-05, + "loss": 5.4298, + "num_input_tokens_seen": 386138112, + "step": 2946 + }, + { + "epoch": 0.5042641872394998, + "grad_norm": 0.7861335873603821, + "learning_rate": 9.45622241813081e-05, + "loss": 5.4336, + "num_input_tokens_seen": 386531328, + "step": 2949 + }, + { + "epoch": 0.5047771721705675, + "grad_norm": 0.6272249221801758, + "learning_rate": 9.451416205663726e-05, + "loss": 5.4306, + "num_input_tokens_seen": 386924544, + "step": 2952 + }, + { + "epoch": 0.5052901571016352, + "grad_norm": 0.805742084980011, + "learning_rate": 9.446617314160821e-05, + "loss": 5.4855, + "num_input_tokens_seen": 387317760, + "step": 2955 + }, + { + "epoch": 0.5058031420327028, + "grad_norm": 0.7901838421821594, + "learning_rate": 9.441825725055105e-05, + "loss": 5.4566, + "num_input_tokens_seen": 387710976, + "step": 2958 + }, + { + "epoch": 0.5063161269637705, + "grad_norm": 0.7398175597190857, + "learning_rate": 9.437041419845438e-05, + "loss": 5.4069, + "num_input_tokens_seen": 388104192, + "step": 2961 + }, + { + "epoch": 0.506829111894838, + "grad_norm": 0.6904522776603699, + "learning_rate": 9.432264380096243e-05, + "loss": 5.45, + "num_input_tokens_seen": 388497408, + "step": 2964 + }, + { + "epoch": 0.5073420968259057, + "grad_norm": 0.8031904697418213, + "learning_rate": 9.4274945874372e-05, + "loss": 5.413, + "num_input_tokens_seen": 388890624, + "step": 2967 + }, + { + "epoch": 0.5078550817569734, + "grad_norm": 0.892519474029541, + "learning_rate": 9.422732023562952e-05, + "loss": 5.4264, + "num_input_tokens_seen": 389283840, + "step": 2970 + }, + { + "epoch": 0.508368066688041, + "grad_norm": 0.8635051846504211, + "learning_rate": 9.417976670232808e-05, + "loss": 5.4243, + "num_input_tokens_seen": 389677056, + "step": 2973 + }, + { + "epoch": 0.5088810516191087, + "grad_norm": 0.785637617111206, + "learning_rate": 9.413228509270448e-05, + "loss": 5.4366, + "num_input_tokens_seen": 390070272, + "step": 2976 + }, + { + "epoch": 0.5093940365501763, + "grad_norm": 0.6426643133163452, + "learning_rate": 9.408487522563637e-05, + "loss": 5.4627, + "num_input_tokens_seen": 390463488, + "step": 2979 + }, + { + "epoch": 0.509907021481244, + "grad_norm": 0.7258966565132141, + "learning_rate": 9.403753692063932e-05, + "loss": 5.4237, + "num_input_tokens_seen": 390856704, + "step": 2982 + }, + { + "epoch": 0.5104200064123117, + "grad_norm": 0.7019416093826294, + "learning_rate": 9.39902699978639e-05, + "loss": 5.3984, + "num_input_tokens_seen": 391249920, + "step": 2985 + }, + { + "epoch": 0.5109329913433793, + "grad_norm": 0.6688271760940552, + "learning_rate": 9.394307427809288e-05, + "loss": 5.4452, + "num_input_tokens_seen": 391643136, + "step": 2988 + }, + { + "epoch": 0.511445976274447, + "grad_norm": 0.6243648529052734, + "learning_rate": 9.389594958273828e-05, + "loss": 5.4079, + "num_input_tokens_seen": 392036352, + "step": 2991 + }, + { + "epoch": 0.5119589612055145, + "grad_norm": 0.7106574177742004, + "learning_rate": 9.384889573383865e-05, + "loss": 5.4911, + "num_input_tokens_seen": 392429568, + "step": 2994 + }, + { + "epoch": 0.5124719461365822, + "grad_norm": 0.8549032211303711, + "learning_rate": 9.380191255405614e-05, + "loss": 5.4275, + "num_input_tokens_seen": 392822784, + "step": 2997 + }, + { + "epoch": 0.5129849310676499, + "grad_norm": 0.7520581483840942, + "learning_rate": 9.375499986667377e-05, + "loss": 5.4381, + "num_input_tokens_seen": 393216000, + "step": 3000 + }, + { + "epoch": 0.5134979159987175, + "grad_norm": 1.052897572517395, + "learning_rate": 9.370815749559257e-05, + "loss": 5.4058, + "num_input_tokens_seen": 393609216, + "step": 3003 + }, + { + "epoch": 0.5140109009297852, + "grad_norm": 0.8729445338249207, + "learning_rate": 9.366138526532885e-05, + "loss": 5.4444, + "num_input_tokens_seen": 394002432, + "step": 3006 + }, + { + "epoch": 0.5145238858608528, + "grad_norm": 0.7415926456451416, + "learning_rate": 9.361468300101144e-05, + "loss": 5.4457, + "num_input_tokens_seen": 394395648, + "step": 3009 + }, + { + "epoch": 0.5150368707919205, + "grad_norm": 0.7771437764167786, + "learning_rate": 9.356805052837894e-05, + "loss": 5.4288, + "num_input_tokens_seen": 394788864, + "step": 3012 + }, + { + "epoch": 0.5155498557229882, + "grad_norm": 0.8604034781455994, + "learning_rate": 9.352148767377697e-05, + "loss": 5.4217, + "num_input_tokens_seen": 395182080, + "step": 3015 + }, + { + "epoch": 0.5160628406540558, + "grad_norm": 0.9626191854476929, + "learning_rate": 9.347499426415546e-05, + "loss": 5.4037, + "num_input_tokens_seen": 395575296, + "step": 3018 + }, + { + "epoch": 0.5165758255851234, + "grad_norm": 0.8346033096313477, + "learning_rate": 9.342857012706596e-05, + "loss": 5.414, + "num_input_tokens_seen": 395968512, + "step": 3021 + }, + { + "epoch": 0.517088810516191, + "grad_norm": 0.7749696969985962, + "learning_rate": 9.338221509065894e-05, + "loss": 5.4262, + "num_input_tokens_seen": 396361728, + "step": 3024 + }, + { + "epoch": 0.5176017954472587, + "grad_norm": 0.844420850276947, + "learning_rate": 9.333592898368119e-05, + "loss": 5.4352, + "num_input_tokens_seen": 396754944, + "step": 3027 + }, + { + "epoch": 0.5181147803783264, + "grad_norm": 0.7230442762374878, + "learning_rate": 9.328971163547297e-05, + "loss": 5.5282, + "num_input_tokens_seen": 397148160, + "step": 3030 + }, + { + "epoch": 0.518627765309394, + "grad_norm": 0.9192338585853577, + "learning_rate": 9.324356287596562e-05, + "loss": 5.4224, + "num_input_tokens_seen": 397541376, + "step": 3033 + }, + { + "epoch": 0.5191407502404617, + "grad_norm": 0.9499441385269165, + "learning_rate": 9.319748253567871e-05, + "loss": 5.4896, + "num_input_tokens_seen": 397934592, + "step": 3036 + }, + { + "epoch": 0.5196537351715294, + "grad_norm": 0.7503839135169983, + "learning_rate": 9.315147044571765e-05, + "loss": 5.4228, + "num_input_tokens_seen": 398327808, + "step": 3039 + }, + { + "epoch": 0.520166720102597, + "grad_norm": 0.754388153553009, + "learning_rate": 9.310552643777079e-05, + "loss": 5.4824, + "num_input_tokens_seen": 398721024, + "step": 3042 + }, + { + "epoch": 0.5206797050336647, + "grad_norm": 0.8488169312477112, + "learning_rate": 9.305965034410718e-05, + "loss": 5.4188, + "num_input_tokens_seen": 399114240, + "step": 3045 + }, + { + "epoch": 0.5211926899647323, + "grad_norm": 0.8866889476776123, + "learning_rate": 9.301384199757371e-05, + "loss": 5.4401, + "num_input_tokens_seen": 399507456, + "step": 3048 + }, + { + "epoch": 0.5217056748957999, + "grad_norm": 0.8220815062522888, + "learning_rate": 9.296810123159271e-05, + "loss": 5.459, + "num_input_tokens_seen": 399900672, + "step": 3051 + }, + { + "epoch": 0.5222186598268675, + "grad_norm": 0.7505759000778198, + "learning_rate": 9.292242788015935e-05, + "loss": 5.4215, + "num_input_tokens_seen": 400293888, + "step": 3054 + }, + { + "epoch": 0.5227316447579352, + "grad_norm": 0.767932116985321, + "learning_rate": 9.287682177783917e-05, + "loss": 5.4263, + "num_input_tokens_seen": 400687104, + "step": 3057 + }, + { + "epoch": 0.5232446296890029, + "grad_norm": 0.7310931086540222, + "learning_rate": 9.283128275976545e-05, + "loss": 5.4805, + "num_input_tokens_seen": 401080320, + "step": 3060 + }, + { + "epoch": 0.5237576146200705, + "grad_norm": 0.7426648736000061, + "learning_rate": 9.278581066163683e-05, + "loss": 5.4506, + "num_input_tokens_seen": 401473536, + "step": 3063 + }, + { + "epoch": 0.5242705995511382, + "grad_norm": 0.8278624415397644, + "learning_rate": 9.27404053197147e-05, + "loss": 5.4258, + "num_input_tokens_seen": 401866752, + "step": 3066 + }, + { + "epoch": 0.5247835844822059, + "grad_norm": 0.7115198373794556, + "learning_rate": 9.269506657082087e-05, + "loss": 5.3972, + "num_input_tokens_seen": 402259968, + "step": 3069 + }, + { + "epoch": 0.5252965694132735, + "grad_norm": 0.8112291693687439, + "learning_rate": 9.264979425233496e-05, + "loss": 5.4267, + "num_input_tokens_seen": 402653184, + "step": 3072 + }, + { + "epoch": 0.5258095543443412, + "grad_norm": 0.7363867163658142, + "learning_rate": 9.260458820219201e-05, + "loss": 5.4101, + "num_input_tokens_seen": 403046400, + "step": 3075 + }, + { + "epoch": 0.5263225392754087, + "grad_norm": 0.8078411221504211, + "learning_rate": 9.25594482588801e-05, + "loss": 5.4623, + "num_input_tokens_seen": 403439616, + "step": 3078 + }, + { + "epoch": 0.5268355242064764, + "grad_norm": 0.8161293268203735, + "learning_rate": 9.251437426143784e-05, + "loss": 5.4603, + "num_input_tokens_seen": 403832832, + "step": 3081 + }, + { + "epoch": 0.527348509137544, + "grad_norm": 0.778689444065094, + "learning_rate": 9.2469366049452e-05, + "loss": 5.4116, + "num_input_tokens_seen": 404226048, + "step": 3084 + }, + { + "epoch": 0.5278614940686117, + "grad_norm": 0.9100328683853149, + "learning_rate": 9.24244234630551e-05, + "loss": 5.4603, + "num_input_tokens_seen": 404619264, + "step": 3087 + }, + { + "epoch": 0.5283744789996794, + "grad_norm": 0.8067951798439026, + "learning_rate": 9.237954634292307e-05, + "loss": 5.4426, + "num_input_tokens_seen": 405012480, + "step": 3090 + }, + { + "epoch": 0.528887463930747, + "grad_norm": 0.7273784279823303, + "learning_rate": 9.233473453027276e-05, + "loss": 5.3976, + "num_input_tokens_seen": 405405696, + "step": 3093 + }, + { + "epoch": 0.5294004488618147, + "grad_norm": 0.9762039184570312, + "learning_rate": 9.228998786685971e-05, + "loss": 5.4157, + "num_input_tokens_seen": 405798912, + "step": 3096 + }, + { + "epoch": 0.5299134337928824, + "grad_norm": 0.7714497447013855, + "learning_rate": 9.22453061949758e-05, + "loss": 5.4444, + "num_input_tokens_seen": 406192128, + "step": 3099 + }, + { + "epoch": 0.53042641872395, + "grad_norm": 0.7574513554573059, + "learning_rate": 9.220068935744674e-05, + "loss": 5.4365, + "num_input_tokens_seen": 406585344, + "step": 3102 + }, + { + "epoch": 0.5309394036550177, + "grad_norm": 0.8316619396209717, + "learning_rate": 9.215613719763e-05, + "loss": 5.4588, + "num_input_tokens_seen": 406978560, + "step": 3105 + }, + { + "epoch": 0.5314523885860852, + "grad_norm": 0.819129228591919, + "learning_rate": 9.211164955941232e-05, + "loss": 5.4262, + "num_input_tokens_seen": 407371776, + "step": 3108 + }, + { + "epoch": 0.5319653735171529, + "grad_norm": 0.7421338558197021, + "learning_rate": 9.206722628720746e-05, + "loss": 5.4259, + "num_input_tokens_seen": 407764992, + "step": 3111 + }, + { + "epoch": 0.5324783584482206, + "grad_norm": 0.7415031790733337, + "learning_rate": 9.202286722595394e-05, + "loss": 5.3804, + "num_input_tokens_seen": 408158208, + "step": 3114 + }, + { + "epoch": 0.5329913433792882, + "grad_norm": 0.7613891959190369, + "learning_rate": 9.197857222111274e-05, + "loss": 5.4539, + "num_input_tokens_seen": 408551424, + "step": 3117 + }, + { + "epoch": 0.5335043283103559, + "grad_norm": 0.7640434503555298, + "learning_rate": 9.19343411186651e-05, + "loss": 5.4272, + "num_input_tokens_seen": 408944640, + "step": 3120 + }, + { + "epoch": 0.5340173132414235, + "grad_norm": 0.7618759870529175, + "learning_rate": 9.189017376511012e-05, + "loss": 5.4546, + "num_input_tokens_seen": 409337856, + "step": 3123 + }, + { + "epoch": 0.5345302981724912, + "grad_norm": 0.7451426386833191, + "learning_rate": 9.184607000746269e-05, + "loss": 5.4063, + "num_input_tokens_seen": 409731072, + "step": 3126 + }, + { + "epoch": 0.5350432831035589, + "grad_norm": 0.8144820928573608, + "learning_rate": 9.18020296932512e-05, + "loss": 5.3909, + "num_input_tokens_seen": 410124288, + "step": 3129 + }, + { + "epoch": 0.5355562680346265, + "grad_norm": 0.7408854365348816, + "learning_rate": 9.175805267051529e-05, + "loss": 5.4057, + "num_input_tokens_seen": 410517504, + "step": 3132 + }, + { + "epoch": 0.5360692529656942, + "grad_norm": 0.6958907246589661, + "learning_rate": 9.171413878780367e-05, + "loss": 5.4055, + "num_input_tokens_seen": 410910720, + "step": 3135 + }, + { + "epoch": 0.5365822378967617, + "grad_norm": 0.948639452457428, + "learning_rate": 9.167028789417202e-05, + "loss": 5.4399, + "num_input_tokens_seen": 411303936, + "step": 3138 + }, + { + "epoch": 0.5370952228278294, + "grad_norm": 0.9219982028007507, + "learning_rate": 9.162649983918063e-05, + "loss": 5.4244, + "num_input_tokens_seen": 411697152, + "step": 3141 + }, + { + "epoch": 0.5376082077588971, + "grad_norm": 0.8096942901611328, + "learning_rate": 9.15827744728924e-05, + "loss": 5.4215, + "num_input_tokens_seen": 412090368, + "step": 3144 + }, + { + "epoch": 0.5381211926899647, + "grad_norm": 0.7412171363830566, + "learning_rate": 9.153911164587056e-05, + "loss": 5.3988, + "num_input_tokens_seen": 412483584, + "step": 3147 + }, + { + "epoch": 0.5386341776210324, + "grad_norm": 0.878857433795929, + "learning_rate": 9.149551120917665e-05, + "loss": 5.4578, + "num_input_tokens_seen": 412876800, + "step": 3150 + }, + { + "epoch": 0.5391471625521, + "grad_norm": 0.6796370148658752, + "learning_rate": 9.145197301436826e-05, + "loss": 5.4093, + "num_input_tokens_seen": 413270016, + "step": 3153 + }, + { + "epoch": 0.5396601474831677, + "grad_norm": 0.7488420009613037, + "learning_rate": 9.140849691349699e-05, + "loss": 5.415, + "num_input_tokens_seen": 413663232, + "step": 3156 + }, + { + "epoch": 0.5401731324142354, + "grad_norm": 0.7959953546524048, + "learning_rate": 9.136508275910631e-05, + "loss": 5.4424, + "num_input_tokens_seen": 414056448, + "step": 3159 + }, + { + "epoch": 0.540686117345303, + "grad_norm": 0.7761291265487671, + "learning_rate": 9.132173040422948e-05, + "loss": 5.3982, + "num_input_tokens_seen": 414449664, + "step": 3162 + }, + { + "epoch": 0.5411991022763706, + "grad_norm": 0.7197316884994507, + "learning_rate": 9.127843970238739e-05, + "loss": 5.4369, + "num_input_tokens_seen": 414842880, + "step": 3165 + }, + { + "epoch": 0.5417120872074382, + "grad_norm": 0.8164528012275696, + "learning_rate": 9.123521050758656e-05, + "loss": 5.3976, + "num_input_tokens_seen": 415236096, + "step": 3168 + }, + { + "epoch": 0.5422250721385059, + "grad_norm": 0.7773632407188416, + "learning_rate": 9.119204267431711e-05, + "loss": 5.4227, + "num_input_tokens_seen": 415629312, + "step": 3171 + }, + { + "epoch": 0.5427380570695736, + "grad_norm": 0.7797026634216309, + "learning_rate": 9.114893605755055e-05, + "loss": 5.4028, + "num_input_tokens_seen": 416022528, + "step": 3174 + }, + { + "epoch": 0.5432510420006412, + "grad_norm": 0.8309365510940552, + "learning_rate": 9.110589051273787e-05, + "loss": 5.38, + "num_input_tokens_seen": 416415744, + "step": 3177 + }, + { + "epoch": 0.5437640269317089, + "grad_norm": 0.7560862302780151, + "learning_rate": 9.106290589580741e-05, + "loss": 5.4071, + "num_input_tokens_seen": 416808960, + "step": 3180 + }, + { + "epoch": 0.5442770118627765, + "grad_norm": 0.9633191227912903, + "learning_rate": 9.101998206316296e-05, + "loss": 5.454, + "num_input_tokens_seen": 417202176, + "step": 3183 + }, + { + "epoch": 0.5447899967938442, + "grad_norm": 0.9686263203620911, + "learning_rate": 9.097711887168163e-05, + "loss": 5.4424, + "num_input_tokens_seen": 417595392, + "step": 3186 + }, + { + "epoch": 0.5453029817249119, + "grad_norm": 0.7610862255096436, + "learning_rate": 9.093431617871184e-05, + "loss": 5.4298, + "num_input_tokens_seen": 417988608, + "step": 3189 + }, + { + "epoch": 0.5458159666559795, + "grad_norm": 0.8192333579063416, + "learning_rate": 9.08915738420714e-05, + "loss": 5.3675, + "num_input_tokens_seen": 418381824, + "step": 3192 + }, + { + "epoch": 0.5463289515870471, + "grad_norm": 0.8405448198318481, + "learning_rate": 9.084889172004556e-05, + "loss": 5.4132, + "num_input_tokens_seen": 418775040, + "step": 3195 + }, + { + "epoch": 0.5468419365181147, + "grad_norm": 0.8114674687385559, + "learning_rate": 9.080626967138484e-05, + "loss": 5.3972, + "num_input_tokens_seen": 419168256, + "step": 3198 + }, + { + "epoch": 0.5471839264721599, + "eval_accuracy": 0.17817944960104218, + "eval_loss": 5.861401081085205, + "eval_runtime": 114.3884, + "eval_samples_per_second": 2.623, + "eval_steps_per_second": 1.311, + "num_input_tokens_seen": 419430400, + "step": 3200 + }, + { + "epoch": 0.5473549214491824, + "grad_norm": 0.7660786509513855, + "learning_rate": 9.076370755530334e-05, + "loss": 5.4147, + "num_input_tokens_seen": 419561472, + "step": 3201 + }, + { + "epoch": 0.5478679063802501, + "grad_norm": 0.8808948397636414, + "learning_rate": 9.072120523147654e-05, + "loss": 5.4063, + "num_input_tokens_seen": 419954688, + "step": 3204 + }, + { + "epoch": 0.5483808913113177, + "grad_norm": 0.751136302947998, + "learning_rate": 9.067876256003947e-05, + "loss": 5.393, + "num_input_tokens_seen": 420347904, + "step": 3207 + }, + { + "epoch": 0.5488938762423854, + "grad_norm": 0.7583978772163391, + "learning_rate": 9.063637940158486e-05, + "loss": 5.441, + "num_input_tokens_seen": 420741120, + "step": 3210 + }, + { + "epoch": 0.549406861173453, + "grad_norm": 0.685180127620697, + "learning_rate": 9.059405561716102e-05, + "loss": 5.3506, + "num_input_tokens_seen": 421134336, + "step": 3213 + }, + { + "epoch": 0.5499198461045207, + "grad_norm": 0.7538748979568481, + "learning_rate": 9.05517910682701e-05, + "loss": 5.4059, + "num_input_tokens_seen": 421527552, + "step": 3216 + }, + { + "epoch": 0.5504328310355884, + "grad_norm": 0.7428572773933411, + "learning_rate": 9.050958561686607e-05, + "loss": 5.4318, + "num_input_tokens_seen": 421920768, + "step": 3219 + }, + { + "epoch": 0.5509458159666559, + "grad_norm": 0.6747097373008728, + "learning_rate": 9.046743912535294e-05, + "loss": 5.412, + "num_input_tokens_seen": 422313984, + "step": 3222 + }, + { + "epoch": 0.5514588008977236, + "grad_norm": 0.7077423930168152, + "learning_rate": 9.042535145658275e-05, + "loss": 5.4399, + "num_input_tokens_seen": 422707200, + "step": 3225 + }, + { + "epoch": 0.5519717858287913, + "grad_norm": 0.7870452404022217, + "learning_rate": 9.038332247385375e-05, + "loss": 5.4146, + "num_input_tokens_seen": 423100416, + "step": 3228 + }, + { + "epoch": 0.5524847707598589, + "grad_norm": 0.788215160369873, + "learning_rate": 9.034135204090863e-05, + "loss": 5.3864, + "num_input_tokens_seen": 423493632, + "step": 3231 + }, + { + "epoch": 0.5529977556909266, + "grad_norm": 0.8093016743659973, + "learning_rate": 9.029944002193249e-05, + "loss": 5.4134, + "num_input_tokens_seen": 423886848, + "step": 3234 + }, + { + "epoch": 0.5535107406219942, + "grad_norm": 0.8293936252593994, + "learning_rate": 9.025758628155108e-05, + "loss": 5.403, + "num_input_tokens_seen": 424280064, + "step": 3237 + }, + { + "epoch": 0.5540237255530619, + "grad_norm": 0.8486737012863159, + "learning_rate": 9.021579068482906e-05, + "loss": 5.4037, + "num_input_tokens_seen": 424673280, + "step": 3240 + }, + { + "epoch": 0.5545367104841296, + "grad_norm": 0.7581062316894531, + "learning_rate": 9.017405309726795e-05, + "loss": 5.4085, + "num_input_tokens_seen": 425066496, + "step": 3243 + }, + { + "epoch": 0.5550496954151972, + "grad_norm": 0.7119733691215515, + "learning_rate": 9.013237338480452e-05, + "loss": 5.412, + "num_input_tokens_seen": 425459712, + "step": 3246 + }, + { + "epoch": 0.5555626803462649, + "grad_norm": 0.6932626962661743, + "learning_rate": 9.009075141380889e-05, + "loss": 5.3921, + "num_input_tokens_seen": 425852928, + "step": 3249 + }, + { + "epoch": 0.5560756652773324, + "grad_norm": 0.7145546078681946, + "learning_rate": 9.004918705108273e-05, + "loss": 5.4198, + "num_input_tokens_seen": 426246144, + "step": 3252 + }, + { + "epoch": 0.5565886502084001, + "grad_norm": 0.7427055239677429, + "learning_rate": 9.000768016385747e-05, + "loss": 5.4181, + "num_input_tokens_seen": 426639360, + "step": 3255 + }, + { + "epoch": 0.5571016351394678, + "grad_norm": 0.794049859046936, + "learning_rate": 8.996623061979255e-05, + "loss": 5.4207, + "num_input_tokens_seen": 427032576, + "step": 3258 + }, + { + "epoch": 0.5576146200705354, + "grad_norm": 0.7688232064247131, + "learning_rate": 8.992483828697364e-05, + "loss": 5.3995, + "num_input_tokens_seen": 427425792, + "step": 3261 + }, + { + "epoch": 0.5581276050016031, + "grad_norm": 0.78509521484375, + "learning_rate": 8.988350303391082e-05, + "loss": 5.3776, + "num_input_tokens_seen": 427819008, + "step": 3264 + }, + { + "epoch": 0.5586405899326707, + "grad_norm": 0.7266300320625305, + "learning_rate": 8.984222472953694e-05, + "loss": 5.3997, + "num_input_tokens_seen": 428212224, + "step": 3267 + }, + { + "epoch": 0.5591535748637384, + "grad_norm": 0.7247098684310913, + "learning_rate": 8.980100324320567e-05, + "loss": 5.3709, + "num_input_tokens_seen": 428605440, + "step": 3270 + }, + { + "epoch": 0.5596665597948061, + "grad_norm": 0.7840573191642761, + "learning_rate": 8.975983844469008e-05, + "loss": 5.3795, + "num_input_tokens_seen": 428998656, + "step": 3273 + }, + { + "epoch": 0.5601795447258737, + "grad_norm": 0.766008734703064, + "learning_rate": 8.97187302041806e-05, + "loss": 5.392, + "num_input_tokens_seen": 429391872, + "step": 3276 + }, + { + "epoch": 0.5606925296569413, + "grad_norm": 0.7803478837013245, + "learning_rate": 8.967767839228347e-05, + "loss": 5.3733, + "num_input_tokens_seen": 429785088, + "step": 3279 + }, + { + "epoch": 0.5612055145880089, + "grad_norm": 0.7566540837287903, + "learning_rate": 8.963668288001898e-05, + "loss": 5.4469, + "num_input_tokens_seen": 430178304, + "step": 3282 + }, + { + "epoch": 0.5617184995190766, + "grad_norm": 0.7852625250816345, + "learning_rate": 8.959574353881981e-05, + "loss": 5.4397, + "num_input_tokens_seen": 430571520, + "step": 3285 + }, + { + "epoch": 0.5622314844501443, + "grad_norm": 0.7693859934806824, + "learning_rate": 8.955486024052926e-05, + "loss": 5.4515, + "num_input_tokens_seen": 430964736, + "step": 3288 + }, + { + "epoch": 0.5627444693812119, + "grad_norm": 0.7919835448265076, + "learning_rate": 8.951403285739966e-05, + "loss": 5.3651, + "num_input_tokens_seen": 431357952, + "step": 3291 + }, + { + "epoch": 0.5632574543122796, + "grad_norm": 0.7930036187171936, + "learning_rate": 8.947326126209056e-05, + "loss": 5.3632, + "num_input_tokens_seen": 431751168, + "step": 3294 + }, + { + "epoch": 0.5637704392433472, + "grad_norm": 0.8255593776702881, + "learning_rate": 8.943254532766725e-05, + "loss": 5.4164, + "num_input_tokens_seen": 432144384, + "step": 3297 + }, + { + "epoch": 0.5642834241744149, + "grad_norm": 0.7580281496047974, + "learning_rate": 8.939188492759893e-05, + "loss": 5.4217, + "num_input_tokens_seen": 432537600, + "step": 3300 + }, + { + "epoch": 0.5647964091054826, + "grad_norm": 0.8095049858093262, + "learning_rate": 8.935127993575714e-05, + "loss": 5.4079, + "num_input_tokens_seen": 432930816, + "step": 3303 + }, + { + "epoch": 0.5653093940365502, + "grad_norm": 0.8694627285003662, + "learning_rate": 8.93107302264141e-05, + "loss": 5.4076, + "num_input_tokens_seen": 433324032, + "step": 3306 + }, + { + "epoch": 0.5658223789676178, + "grad_norm": 0.7595628499984741, + "learning_rate": 8.927023567424106e-05, + "loss": 5.3759, + "num_input_tokens_seen": 433717248, + "step": 3309 + }, + { + "epoch": 0.5663353638986854, + "grad_norm": 0.7479428648948669, + "learning_rate": 8.922979615430672e-05, + "loss": 5.4004, + "num_input_tokens_seen": 434110464, + "step": 3312 + }, + { + "epoch": 0.5668483488297531, + "grad_norm": 0.7877030372619629, + "learning_rate": 8.918941154207554e-05, + "loss": 5.3521, + "num_input_tokens_seen": 434503680, + "step": 3315 + }, + { + "epoch": 0.5673613337608208, + "grad_norm": 0.711147129535675, + "learning_rate": 8.914908171340622e-05, + "loss": 5.3964, + "num_input_tokens_seen": 434896896, + "step": 3318 + }, + { + "epoch": 0.5678743186918884, + "grad_norm": 0.8448489904403687, + "learning_rate": 8.910880654455001e-05, + "loss": 5.3609, + "num_input_tokens_seen": 435290112, + "step": 3321 + }, + { + "epoch": 0.5683873036229561, + "grad_norm": 1.0078827142715454, + "learning_rate": 8.906858591214913e-05, + "loss": 5.3788, + "num_input_tokens_seen": 435683328, + "step": 3324 + }, + { + "epoch": 0.5689002885540237, + "grad_norm": 0.9142970442771912, + "learning_rate": 8.902841969323526e-05, + "loss": 5.4028, + "num_input_tokens_seen": 436076544, + "step": 3327 + }, + { + "epoch": 0.5694132734850914, + "grad_norm": 0.7522501945495605, + "learning_rate": 8.898830776522789e-05, + "loss": 5.371, + "num_input_tokens_seen": 436469760, + "step": 3330 + }, + { + "epoch": 0.5699262584161591, + "grad_norm": 0.7678289413452148, + "learning_rate": 8.894825000593272e-05, + "loss": 5.4068, + "num_input_tokens_seen": 436862976, + "step": 3333 + }, + { + "epoch": 0.5704392433472266, + "grad_norm": 0.8633313179016113, + "learning_rate": 8.890824629354019e-05, + "loss": 5.4327, + "num_input_tokens_seen": 437256192, + "step": 3336 + }, + { + "epoch": 0.5709522282782943, + "grad_norm": 0.7590151429176331, + "learning_rate": 8.886829650662388e-05, + "loss": 5.428, + "num_input_tokens_seen": 437649408, + "step": 3339 + }, + { + "epoch": 0.5714652132093619, + "grad_norm": 0.847149133682251, + "learning_rate": 8.882840052413889e-05, + "loss": 5.3417, + "num_input_tokens_seen": 438042624, + "step": 3342 + }, + { + "epoch": 0.5719781981404296, + "grad_norm": 0.898617684841156, + "learning_rate": 8.878855822542044e-05, + "loss": 5.4506, + "num_input_tokens_seen": 438435840, + "step": 3345 + }, + { + "epoch": 0.5724911830714973, + "grad_norm": 0.763965368270874, + "learning_rate": 8.874876949018225e-05, + "loss": 5.3538, + "num_input_tokens_seen": 438829056, + "step": 3348 + }, + { + "epoch": 0.5730041680025649, + "grad_norm": 0.6895188093185425, + "learning_rate": 8.8709034198515e-05, + "loss": 5.3798, + "num_input_tokens_seen": 439222272, + "step": 3351 + }, + { + "epoch": 0.5735171529336326, + "grad_norm": 0.7596139907836914, + "learning_rate": 8.866935223088484e-05, + "loss": 5.4335, + "num_input_tokens_seen": 439615488, + "step": 3354 + }, + { + "epoch": 0.5740301378647003, + "grad_norm": 0.7017662525177002, + "learning_rate": 8.86297234681319e-05, + "loss": 5.3408, + "num_input_tokens_seen": 440008704, + "step": 3357 + }, + { + "epoch": 0.5745431227957679, + "grad_norm": 0.68537437915802, + "learning_rate": 8.859014779146879e-05, + "loss": 5.389, + "num_input_tokens_seen": 440401920, + "step": 3360 + }, + { + "epoch": 0.5750561077268356, + "grad_norm": 0.7110214233398438, + "learning_rate": 8.855062508247906e-05, + "loss": 5.388, + "num_input_tokens_seen": 440795136, + "step": 3363 + }, + { + "epoch": 0.5755690926579031, + "grad_norm": 0.7052029371261597, + "learning_rate": 8.851115522311569e-05, + "loss": 5.369, + "num_input_tokens_seen": 441188352, + "step": 3366 + }, + { + "epoch": 0.5760820775889708, + "grad_norm": 0.7604022026062012, + "learning_rate": 8.847173809569973e-05, + "loss": 5.4126, + "num_input_tokens_seen": 441581568, + "step": 3369 + }, + { + "epoch": 0.5765950625200384, + "grad_norm": 0.9355179071426392, + "learning_rate": 8.843237358291869e-05, + "loss": 5.4177, + "num_input_tokens_seen": 441974784, + "step": 3372 + }, + { + "epoch": 0.5771080474511061, + "grad_norm": 0.7173782587051392, + "learning_rate": 8.839306156782517e-05, + "loss": 5.3681, + "num_input_tokens_seen": 442368000, + "step": 3375 + }, + { + "epoch": 0.5776210323821738, + "grad_norm": 0.6853963136672974, + "learning_rate": 8.835380193383536e-05, + "loss": 5.4111, + "num_input_tokens_seen": 442761216, + "step": 3378 + }, + { + "epoch": 0.5781340173132414, + "grad_norm": 0.8973987102508545, + "learning_rate": 8.831459456472757e-05, + "loss": 5.3506, + "num_input_tokens_seen": 443154432, + "step": 3381 + }, + { + "epoch": 0.5786470022443091, + "grad_norm": 0.9155653119087219, + "learning_rate": 8.827543934464083e-05, + "loss": 5.4238, + "num_input_tokens_seen": 443547648, + "step": 3384 + }, + { + "epoch": 0.5791599871753768, + "grad_norm": 0.8291952013969421, + "learning_rate": 8.823633615807338e-05, + "loss": 5.3837, + "num_input_tokens_seen": 443940864, + "step": 3387 + }, + { + "epoch": 0.5796729721064444, + "grad_norm": 0.8799310922622681, + "learning_rate": 8.81972848898814e-05, + "loss": 5.4034, + "num_input_tokens_seen": 444334080, + "step": 3390 + }, + { + "epoch": 0.580185957037512, + "grad_norm": 0.8494542837142944, + "learning_rate": 8.815828542527734e-05, + "loss": 5.3643, + "num_input_tokens_seen": 444727296, + "step": 3393 + }, + { + "epoch": 0.5806989419685796, + "grad_norm": 0.8374956250190735, + "learning_rate": 8.811933764982872e-05, + "loss": 5.3538, + "num_input_tokens_seen": 445120512, + "step": 3396 + }, + { + "epoch": 0.5812119268996473, + "grad_norm": 0.7371034026145935, + "learning_rate": 8.80804414494566e-05, + "loss": 5.3759, + "num_input_tokens_seen": 445513728, + "step": 3399 + }, + { + "epoch": 0.581724911830715, + "grad_norm": 0.905910074710846, + "learning_rate": 8.804159671043426e-05, + "loss": 5.3869, + "num_input_tokens_seen": 445906944, + "step": 3402 + }, + { + "epoch": 0.5822378967617826, + "grad_norm": 0.7442747950553894, + "learning_rate": 8.80028033193857e-05, + "loss": 5.3816, + "num_input_tokens_seen": 446300160, + "step": 3405 + }, + { + "epoch": 0.5827508816928503, + "grad_norm": 0.7881158590316772, + "learning_rate": 8.796406116328433e-05, + "loss": 5.4244, + "num_input_tokens_seen": 446693376, + "step": 3408 + }, + { + "epoch": 0.5832638666239179, + "grad_norm": 0.7259749174118042, + "learning_rate": 8.792537012945155e-05, + "loss": 5.3949, + "num_input_tokens_seen": 447086592, + "step": 3411 + }, + { + "epoch": 0.5837768515549856, + "grad_norm": 0.7151450514793396, + "learning_rate": 8.788673010555546e-05, + "loss": 5.3901, + "num_input_tokens_seen": 447479808, + "step": 3414 + }, + { + "epoch": 0.5842898364860533, + "grad_norm": 0.7341894507408142, + "learning_rate": 8.784814097960928e-05, + "loss": 5.3559, + "num_input_tokens_seen": 447873024, + "step": 3417 + }, + { + "epoch": 0.5848028214171209, + "grad_norm": 0.805620014667511, + "learning_rate": 8.780960263997026e-05, + "loss": 5.3723, + "num_input_tokens_seen": 448266240, + "step": 3420 + }, + { + "epoch": 0.5853158063481885, + "grad_norm": 0.6728326082229614, + "learning_rate": 8.777111497533811e-05, + "loss": 5.4222, + "num_input_tokens_seen": 448659456, + "step": 3423 + }, + { + "epoch": 0.5858287912792561, + "grad_norm": 0.7800388336181641, + "learning_rate": 8.773267787475375e-05, + "loss": 5.4013, + "num_input_tokens_seen": 449052672, + "step": 3426 + }, + { + "epoch": 0.5863417762103238, + "grad_norm": 0.7759156823158264, + "learning_rate": 8.769429122759794e-05, + "loss": 5.3962, + "num_input_tokens_seen": 449445888, + "step": 3429 + }, + { + "epoch": 0.5868547611413915, + "grad_norm": 0.7210240364074707, + "learning_rate": 8.765595492358994e-05, + "loss": 5.4314, + "num_input_tokens_seen": 449839104, + "step": 3432 + }, + { + "epoch": 0.5873677460724591, + "grad_norm": 0.7143703699111938, + "learning_rate": 8.761766885278622e-05, + "loss": 5.4162, + "num_input_tokens_seen": 450232320, + "step": 3435 + }, + { + "epoch": 0.5878807310035268, + "grad_norm": 0.8158110976219177, + "learning_rate": 8.757943290557907e-05, + "loss": 5.4427, + "num_input_tokens_seen": 450625536, + "step": 3438 + }, + { + "epoch": 0.5883937159345944, + "grad_norm": 1.0660381317138672, + "learning_rate": 8.754124697269531e-05, + "loss": 5.3696, + "num_input_tokens_seen": 451018752, + "step": 3441 + }, + { + "epoch": 0.5889067008656621, + "grad_norm": 0.7566826939582825, + "learning_rate": 8.7503110945195e-05, + "loss": 5.3177, + "num_input_tokens_seen": 451411968, + "step": 3444 + }, + { + "epoch": 0.5894196857967298, + "grad_norm": 1.0346297025680542, + "learning_rate": 8.746502471447013e-05, + "loss": 5.3814, + "num_input_tokens_seen": 451805184, + "step": 3447 + }, + { + "epoch": 0.5899326707277974, + "grad_norm": 0.7817288637161255, + "learning_rate": 8.742698817224326e-05, + "loss": 5.3748, + "num_input_tokens_seen": 452198400, + "step": 3450 + }, + { + "epoch": 0.590445655658865, + "grad_norm": 0.6449154019355774, + "learning_rate": 8.738900121056633e-05, + "loss": 5.3848, + "num_input_tokens_seen": 452591616, + "step": 3453 + }, + { + "epoch": 0.5909586405899326, + "grad_norm": 0.789215624332428, + "learning_rate": 8.735106372181928e-05, + "loss": 5.3952, + "num_input_tokens_seen": 452984832, + "step": 3456 + }, + { + "epoch": 0.5914716255210003, + "grad_norm": 0.7104623317718506, + "learning_rate": 8.731317559870881e-05, + "loss": 5.3851, + "num_input_tokens_seen": 453378048, + "step": 3459 + }, + { + "epoch": 0.591984610452068, + "grad_norm": 0.6851987838745117, + "learning_rate": 8.727533673426715e-05, + "loss": 5.3495, + "num_input_tokens_seen": 453771264, + "step": 3462 + }, + { + "epoch": 0.5924975953831356, + "grad_norm": 0.661577582359314, + "learning_rate": 8.723754702185069e-05, + "loss": 5.4102, + "num_input_tokens_seen": 454164480, + "step": 3465 + }, + { + "epoch": 0.5930105803142033, + "grad_norm": 0.7876046895980835, + "learning_rate": 8.719980635513879e-05, + "loss": 5.3651, + "num_input_tokens_seen": 454557696, + "step": 3468 + }, + { + "epoch": 0.593523565245271, + "grad_norm": 0.6847128868103027, + "learning_rate": 8.716211462813248e-05, + "loss": 5.374, + "num_input_tokens_seen": 454950912, + "step": 3471 + }, + { + "epoch": 0.5940365501763386, + "grad_norm": 0.7161657214164734, + "learning_rate": 8.712447173515334e-05, + "loss": 5.3707, + "num_input_tokens_seen": 455344128, + "step": 3474 + }, + { + "epoch": 0.5945495351074063, + "grad_norm": 0.7502399682998657, + "learning_rate": 8.708687757084202e-05, + "loss": 5.3479, + "num_input_tokens_seen": 455737344, + "step": 3477 + }, + { + "epoch": 0.5950625200384738, + "grad_norm": 0.7003999948501587, + "learning_rate": 8.704933203015719e-05, + "loss": 5.3245, + "num_input_tokens_seen": 456130560, + "step": 3480 + }, + { + "epoch": 0.5955755049695415, + "grad_norm": 0.7405338287353516, + "learning_rate": 8.701183500837426e-05, + "loss": 5.3814, + "num_input_tokens_seen": 456523776, + "step": 3483 + }, + { + "epoch": 0.5960884899006091, + "grad_norm": 0.7183709144592285, + "learning_rate": 8.697438640108417e-05, + "loss": 5.3444, + "num_input_tokens_seen": 456916992, + "step": 3486 + }, + { + "epoch": 0.5966014748316768, + "grad_norm": 0.7054752707481384, + "learning_rate": 8.693698610419203e-05, + "loss": 5.4152, + "num_input_tokens_seen": 457310208, + "step": 3489 + }, + { + "epoch": 0.5971144597627445, + "grad_norm": 0.7215176820755005, + "learning_rate": 8.689963401391618e-05, + "loss": 5.3406, + "num_input_tokens_seen": 457703424, + "step": 3492 + }, + { + "epoch": 0.5976274446938121, + "grad_norm": 0.7162594795227051, + "learning_rate": 8.686233002678664e-05, + "loss": 5.4583, + "num_input_tokens_seen": 458096640, + "step": 3495 + }, + { + "epoch": 0.5981404296248798, + "grad_norm": 0.7248669862747192, + "learning_rate": 8.682507403964426e-05, + "loss": 5.339, + "num_input_tokens_seen": 458489856, + "step": 3498 + }, + { + "epoch": 0.5986534145559474, + "grad_norm": 0.7000369429588318, + "learning_rate": 8.67878659496392e-05, + "loss": 5.3227, + "num_input_tokens_seen": 458883072, + "step": 3501 + }, + { + "epoch": 0.5991663994870151, + "grad_norm": 0.7467644810676575, + "learning_rate": 8.675070565422998e-05, + "loss": 5.421, + "num_input_tokens_seen": 459276288, + "step": 3504 + }, + { + "epoch": 0.5996793844180828, + "grad_norm": 0.7262928485870361, + "learning_rate": 8.671359305118213e-05, + "loss": 5.3765, + "num_input_tokens_seen": 459669504, + "step": 3507 + }, + { + "epoch": 0.6001923693491503, + "grad_norm": 0.721593976020813, + "learning_rate": 8.667652803856712e-05, + "loss": 5.415, + "num_input_tokens_seen": 460062720, + "step": 3510 + }, + { + "epoch": 0.600705354280218, + "grad_norm": 0.712061882019043, + "learning_rate": 8.663951051476112e-05, + "loss": 5.4037, + "num_input_tokens_seen": 460455936, + "step": 3513 + }, + { + "epoch": 0.6012183392112856, + "grad_norm": 0.6668177247047424, + "learning_rate": 8.660254037844386e-05, + "loss": 5.3052, + "num_input_tokens_seen": 460849152, + "step": 3516 + }, + { + "epoch": 0.6017313241423533, + "grad_norm": 0.6952577233314514, + "learning_rate": 8.656561752859744e-05, + "loss": 5.3895, + "num_input_tokens_seen": 461242368, + "step": 3519 + }, + { + "epoch": 0.602244309073421, + "grad_norm": 0.8409635424613953, + "learning_rate": 8.652874186450518e-05, + "loss": 5.3989, + "num_input_tokens_seen": 461635584, + "step": 3522 + }, + { + "epoch": 0.6027572940044886, + "grad_norm": 0.7468051314353943, + "learning_rate": 8.64919132857505e-05, + "loss": 5.3822, + "num_input_tokens_seen": 462028800, + "step": 3525 + }, + { + "epoch": 0.6032702789355563, + "grad_norm": 0.6477757692337036, + "learning_rate": 8.645513169221575e-05, + "loss": 5.3758, + "num_input_tokens_seen": 462422016, + "step": 3528 + }, + { + "epoch": 0.603783263866624, + "grad_norm": 0.7509175539016724, + "learning_rate": 8.641839698408096e-05, + "loss": 5.3906, + "num_input_tokens_seen": 462815232, + "step": 3531 + }, + { + "epoch": 0.6042962487976916, + "grad_norm": 0.7417723536491394, + "learning_rate": 8.638170906182295e-05, + "loss": 5.4307, + "num_input_tokens_seen": 463208448, + "step": 3534 + }, + { + "epoch": 0.6048092337287592, + "grad_norm": 0.9790907502174377, + "learning_rate": 8.634506782621394e-05, + "loss": 5.387, + "num_input_tokens_seen": 463601664, + "step": 3537 + }, + { + "epoch": 0.6053222186598268, + "grad_norm": 0.7534716129302979, + "learning_rate": 8.630847317832056e-05, + "loss": 5.3362, + "num_input_tokens_seen": 463994880, + "step": 3540 + }, + { + "epoch": 0.6058352035908945, + "grad_norm": 0.8518982529640198, + "learning_rate": 8.627192501950274e-05, + "loss": 5.4182, + "num_input_tokens_seen": 464388096, + "step": 3543 + }, + { + "epoch": 0.6063481885219622, + "grad_norm": 0.9109683632850647, + "learning_rate": 8.623542325141249e-05, + "loss": 5.3571, + "num_input_tokens_seen": 464781312, + "step": 3546 + }, + { + "epoch": 0.6068611734530298, + "grad_norm": 0.7174829840660095, + "learning_rate": 8.619896777599289e-05, + "loss": 5.4502, + "num_input_tokens_seen": 465174528, + "step": 3549 + }, + { + "epoch": 0.6073741583840975, + "grad_norm": 0.796428918838501, + "learning_rate": 8.616255849547694e-05, + "loss": 5.3815, + "num_input_tokens_seen": 465567744, + "step": 3552 + }, + { + "epoch": 0.6078871433151651, + "grad_norm": 0.9209778308868408, + "learning_rate": 8.612619531238647e-05, + "loss": 5.4294, + "num_input_tokens_seen": 465960960, + "step": 3555 + }, + { + "epoch": 0.6084001282462328, + "grad_norm": 0.9266733527183533, + "learning_rate": 8.6089878129531e-05, + "loss": 5.3943, + "num_input_tokens_seen": 466354176, + "step": 3558 + }, + { + "epoch": 0.6089131131773005, + "grad_norm": 0.7748308181762695, + "learning_rate": 8.60536068500068e-05, + "loss": 5.3581, + "num_input_tokens_seen": 466747392, + "step": 3561 + }, + { + "epoch": 0.6094260981083681, + "grad_norm": 0.8532904386520386, + "learning_rate": 8.601738137719548e-05, + "loss": 5.3198, + "num_input_tokens_seen": 467140608, + "step": 3564 + }, + { + "epoch": 0.6099390830394357, + "grad_norm": 0.8617231845855713, + "learning_rate": 8.598120161476338e-05, + "loss": 5.4047, + "num_input_tokens_seen": 467533824, + "step": 3567 + }, + { + "epoch": 0.6104520679705033, + "grad_norm": 0.7476005554199219, + "learning_rate": 8.594506746665999e-05, + "loss": 5.3728, + "num_input_tokens_seen": 467927040, + "step": 3570 + }, + { + "epoch": 0.610965052901571, + "grad_norm": 0.7166175842285156, + "learning_rate": 8.590897883711732e-05, + "loss": 5.3925, + "num_input_tokens_seen": 468320256, + "step": 3573 + }, + { + "epoch": 0.6114780378326387, + "grad_norm": 0.6815547347068787, + "learning_rate": 8.587293563064851e-05, + "loss": 5.3407, + "num_input_tokens_seen": 468713472, + "step": 3576 + }, + { + "epoch": 0.6119910227637063, + "grad_norm": 0.7247835397720337, + "learning_rate": 8.583693775204695e-05, + "loss": 5.3833, + "num_input_tokens_seen": 469106688, + "step": 3579 + }, + { + "epoch": 0.612504007694774, + "grad_norm": 0.715282142162323, + "learning_rate": 8.580098510638516e-05, + "loss": 5.3863, + "num_input_tokens_seen": 469499904, + "step": 3582 + }, + { + "epoch": 0.6130169926258416, + "grad_norm": 0.8423024415969849, + "learning_rate": 8.576507759901377e-05, + "loss": 5.4177, + "num_input_tokens_seen": 469893120, + "step": 3585 + }, + { + "epoch": 0.6135299775569093, + "grad_norm": 0.7851782441139221, + "learning_rate": 8.572921513556039e-05, + "loss": 5.375, + "num_input_tokens_seen": 470286336, + "step": 3588 + }, + { + "epoch": 0.614042962487977, + "grad_norm": 0.9765253067016602, + "learning_rate": 8.569339762192868e-05, + "loss": 5.3584, + "num_input_tokens_seen": 470679552, + "step": 3591 + }, + { + "epoch": 0.6145559474190445, + "grad_norm": 0.799020528793335, + "learning_rate": 8.565762496429728e-05, + "loss": 5.3448, + "num_input_tokens_seen": 471072768, + "step": 3594 + }, + { + "epoch": 0.6150689323501122, + "grad_norm": 0.711087703704834, + "learning_rate": 8.562189706911872e-05, + "loss": 5.3743, + "num_input_tokens_seen": 471465984, + "step": 3597 + }, + { + "epoch": 0.6155819172811798, + "grad_norm": 0.8044856190681458, + "learning_rate": 8.558621384311844e-05, + "loss": 5.3983, + "num_input_tokens_seen": 471859200, + "step": 3600 + }, + { + "epoch": 0.6155819172811798, + "eval_accuracy": 0.17727405959941378, + "eval_loss": 5.83395528793335, + "eval_runtime": 110.1105, + "eval_samples_per_second": 2.725, + "eval_steps_per_second": 1.362, + "num_input_tokens_seen": 471859200, + "step": 3600 + }, + { + "epoch": 0.6160949022122475, + "grad_norm": 0.6745046973228455, + "learning_rate": 8.555057519329377e-05, + "loss": 5.3712, + "num_input_tokens_seen": 472252416, + "step": 3603 + }, + { + "epoch": 0.6166078871433152, + "grad_norm": 0.7151539921760559, + "learning_rate": 8.551498102691291e-05, + "loss": 5.3079, + "num_input_tokens_seen": 472645632, + "step": 3606 + }, + { + "epoch": 0.6171208720743828, + "grad_norm": 0.7192103266716003, + "learning_rate": 8.547943125151391e-05, + "loss": 5.3905, + "num_input_tokens_seen": 473038848, + "step": 3609 + }, + { + "epoch": 0.6176338570054505, + "grad_norm": 0.734131932258606, + "learning_rate": 8.544392577490364e-05, + "loss": 5.3244, + "num_input_tokens_seen": 473432064, + "step": 3612 + }, + { + "epoch": 0.6181468419365181, + "grad_norm": 0.6585795283317566, + "learning_rate": 8.54084645051568e-05, + "loss": 5.3767, + "num_input_tokens_seen": 473825280, + "step": 3615 + }, + { + "epoch": 0.6186598268675858, + "grad_norm": 0.7334539294242859, + "learning_rate": 8.537304735061498e-05, + "loss": 5.3683, + "num_input_tokens_seen": 474218496, + "step": 3618 + }, + { + "epoch": 0.6191728117986535, + "grad_norm": 0.7433605790138245, + "learning_rate": 8.533767421988556e-05, + "loss": 5.3546, + "num_input_tokens_seen": 474611712, + "step": 3621 + }, + { + "epoch": 0.619685796729721, + "grad_norm": 0.7147760987281799, + "learning_rate": 8.530234502184079e-05, + "loss": 5.3851, + "num_input_tokens_seen": 475004928, + "step": 3624 + }, + { + "epoch": 0.6201987816607887, + "grad_norm": 0.7629379034042358, + "learning_rate": 8.526705966561678e-05, + "loss": 5.3321, + "num_input_tokens_seen": 475398144, + "step": 3627 + }, + { + "epoch": 0.6207117665918563, + "grad_norm": 0.7201533317565918, + "learning_rate": 8.523181806061252e-05, + "loss": 5.3973, + "num_input_tokens_seen": 475791360, + "step": 3630 + }, + { + "epoch": 0.621224751522924, + "grad_norm": 0.7145413756370544, + "learning_rate": 8.519662011648894e-05, + "loss": 5.357, + "num_input_tokens_seen": 476184576, + "step": 3633 + }, + { + "epoch": 0.6217377364539917, + "grad_norm": 0.6908425092697144, + "learning_rate": 8.516146574316785e-05, + "loss": 5.3227, + "num_input_tokens_seen": 476577792, + "step": 3636 + }, + { + "epoch": 0.6222507213850593, + "grad_norm": 0.7059429883956909, + "learning_rate": 8.512635485083106e-05, + "loss": 5.361, + "num_input_tokens_seen": 476971008, + "step": 3639 + }, + { + "epoch": 0.622763706316127, + "grad_norm": 0.7348899841308594, + "learning_rate": 8.509128734991941e-05, + "loss": 5.3768, + "num_input_tokens_seen": 477364224, + "step": 3642 + }, + { + "epoch": 0.6232766912471946, + "grad_norm": 0.8156319260597229, + "learning_rate": 8.505626315113171e-05, + "loss": 5.3989, + "num_input_tokens_seen": 477757440, + "step": 3645 + }, + { + "epoch": 0.6237896761782623, + "grad_norm": 0.776240885257721, + "learning_rate": 8.50212821654239e-05, + "loss": 5.3562, + "num_input_tokens_seen": 478150656, + "step": 3648 + }, + { + "epoch": 0.6243026611093299, + "grad_norm": 0.7777855396270752, + "learning_rate": 8.498634430400809e-05, + "loss": 5.3648, + "num_input_tokens_seen": 478543872, + "step": 3651 + }, + { + "epoch": 0.6248156460403975, + "grad_norm": 0.8572577834129333, + "learning_rate": 8.495144947835149e-05, + "loss": 5.3517, + "num_input_tokens_seen": 478937088, + "step": 3654 + }, + { + "epoch": 0.6253286309714652, + "grad_norm": 0.9028589725494385, + "learning_rate": 8.491659760017563e-05, + "loss": 5.3633, + "num_input_tokens_seen": 479330304, + "step": 3657 + }, + { + "epoch": 0.6258416159025328, + "grad_norm": 0.8123112320899963, + "learning_rate": 8.488178858145536e-05, + "loss": 5.3626, + "num_input_tokens_seen": 479723520, + "step": 3660 + }, + { + "epoch": 0.6263546008336005, + "grad_norm": 0.6874297261238098, + "learning_rate": 8.484702233441784e-05, + "loss": 5.3445, + "num_input_tokens_seen": 480116736, + "step": 3663 + }, + { + "epoch": 0.6268675857646682, + "grad_norm": 0.7794182300567627, + "learning_rate": 8.481229877154171e-05, + "loss": 5.4051, + "num_input_tokens_seen": 480509952, + "step": 3666 + }, + { + "epoch": 0.6273805706957358, + "grad_norm": 0.9119608402252197, + "learning_rate": 8.477761780555616e-05, + "loss": 5.3604, + "num_input_tokens_seen": 480903168, + "step": 3669 + }, + { + "epoch": 0.6278935556268035, + "grad_norm": 0.8943549394607544, + "learning_rate": 8.474297934943995e-05, + "loss": 5.3822, + "num_input_tokens_seen": 481296384, + "step": 3672 + }, + { + "epoch": 0.6284065405578712, + "grad_norm": 0.7565066814422607, + "learning_rate": 8.470838331642053e-05, + "loss": 5.3687, + "num_input_tokens_seen": 481689600, + "step": 3675 + }, + { + "epoch": 0.6289195254889388, + "grad_norm": 0.8296188116073608, + "learning_rate": 8.46738296199731e-05, + "loss": 5.4102, + "num_input_tokens_seen": 482082816, + "step": 3678 + }, + { + "epoch": 0.6294325104200064, + "grad_norm": 0.7473737597465515, + "learning_rate": 8.463931817381974e-05, + "loss": 5.3368, + "num_input_tokens_seen": 482476032, + "step": 3681 + }, + { + "epoch": 0.629945495351074, + "grad_norm": 0.7469894289970398, + "learning_rate": 8.460484889192852e-05, + "loss": 5.347, + "num_input_tokens_seen": 482869248, + "step": 3684 + }, + { + "epoch": 0.6304584802821417, + "grad_norm": 0.7461103796958923, + "learning_rate": 8.457042168851248e-05, + "loss": 5.376, + "num_input_tokens_seen": 483262464, + "step": 3687 + }, + { + "epoch": 0.6309714652132093, + "grad_norm": 0.7393492460250854, + "learning_rate": 8.45360364780289e-05, + "loss": 5.3428, + "num_input_tokens_seen": 483655680, + "step": 3690 + }, + { + "epoch": 0.631484450144277, + "grad_norm": 0.7223976850509644, + "learning_rate": 8.450169317517828e-05, + "loss": 5.3517, + "num_input_tokens_seen": 484048896, + "step": 3693 + }, + { + "epoch": 0.6319974350753447, + "grad_norm": 0.7259080410003662, + "learning_rate": 8.446739169490354e-05, + "loss": 5.3331, + "num_input_tokens_seen": 484442112, + "step": 3696 + }, + { + "epoch": 0.6325104200064123, + "grad_norm": 0.7238535284996033, + "learning_rate": 8.443313195238902e-05, + "loss": 5.3805, + "num_input_tokens_seen": 484835328, + "step": 3699 + }, + { + "epoch": 0.63302340493748, + "grad_norm": 0.7401497960090637, + "learning_rate": 8.439891386305977e-05, + "loss": 5.399, + "num_input_tokens_seen": 485228544, + "step": 3702 + }, + { + "epoch": 0.6335363898685477, + "grad_norm": 0.8473367094993591, + "learning_rate": 8.436473734258046e-05, + "loss": 5.3433, + "num_input_tokens_seen": 485621760, + "step": 3705 + }, + { + "epoch": 0.6340493747996152, + "grad_norm": 0.8806385397911072, + "learning_rate": 8.433060230685473e-05, + "loss": 5.4077, + "num_input_tokens_seen": 486014976, + "step": 3708 + }, + { + "epoch": 0.6345623597306829, + "grad_norm": 0.6987698078155518, + "learning_rate": 8.429650867202415e-05, + "loss": 5.356, + "num_input_tokens_seen": 486408192, + "step": 3711 + }, + { + "epoch": 0.6350753446617505, + "grad_norm": 0.74212247133255, + "learning_rate": 8.426245635446741e-05, + "loss": 5.3557, + "num_input_tokens_seen": 486801408, + "step": 3714 + }, + { + "epoch": 0.6355883295928182, + "grad_norm": 0.7430636882781982, + "learning_rate": 8.422844527079955e-05, + "loss": 5.3656, + "num_input_tokens_seen": 487194624, + "step": 3717 + }, + { + "epoch": 0.6361013145238859, + "grad_norm": 0.7818143963813782, + "learning_rate": 8.419447533787093e-05, + "loss": 5.3528, + "num_input_tokens_seen": 487587840, + "step": 3720 + }, + { + "epoch": 0.6366142994549535, + "grad_norm": 0.7322751879692078, + "learning_rate": 8.416054647276643e-05, + "loss": 5.35, + "num_input_tokens_seen": 487981056, + "step": 3723 + }, + { + "epoch": 0.6371272843860212, + "grad_norm": 0.7790704369544983, + "learning_rate": 8.41266585928048e-05, + "loss": 5.3617, + "num_input_tokens_seen": 488374272, + "step": 3726 + }, + { + "epoch": 0.6376402693170888, + "grad_norm": 0.8650009632110596, + "learning_rate": 8.409281161553747e-05, + "loss": 5.3689, + "num_input_tokens_seen": 488767488, + "step": 3729 + }, + { + "epoch": 0.6381532542481565, + "grad_norm": 0.8796480298042297, + "learning_rate": 8.405900545874799e-05, + "loss": 5.362, + "num_input_tokens_seen": 489160704, + "step": 3732 + }, + { + "epoch": 0.6386662391792242, + "grad_norm": 0.7147157788276672, + "learning_rate": 8.402524004045107e-05, + "loss": 5.3659, + "num_input_tokens_seen": 489553920, + "step": 3735 + }, + { + "epoch": 0.6391792241102917, + "grad_norm": 0.7321900725364685, + "learning_rate": 8.399151527889171e-05, + "loss": 5.4014, + "num_input_tokens_seen": 489947136, + "step": 3738 + }, + { + "epoch": 0.6396922090413594, + "grad_norm": 0.7102051973342896, + "learning_rate": 8.39578310925445e-05, + "loss": 5.3454, + "num_input_tokens_seen": 490340352, + "step": 3741 + }, + { + "epoch": 0.640205193972427, + "grad_norm": 0.7498131990432739, + "learning_rate": 8.392418740011266e-05, + "loss": 5.3246, + "num_input_tokens_seen": 490733568, + "step": 3744 + }, + { + "epoch": 0.6407181789034947, + "grad_norm": 0.7187573313713074, + "learning_rate": 8.389058412052728e-05, + "loss": 5.2761, + "num_input_tokens_seen": 491126784, + "step": 3747 + }, + { + "epoch": 0.6412311638345624, + "grad_norm": 0.6994777917861938, + "learning_rate": 8.385702117294651e-05, + "loss": 5.329, + "num_input_tokens_seen": 491520000, + "step": 3750 + }, + { + "epoch": 0.64174414876563, + "grad_norm": 0.7428282499313354, + "learning_rate": 8.382349847675467e-05, + "loss": 5.3071, + "num_input_tokens_seen": 491913216, + "step": 3753 + }, + { + "epoch": 0.6422571336966977, + "grad_norm": 0.7503423094749451, + "learning_rate": 8.379001595156155e-05, + "loss": 5.3232, + "num_input_tokens_seen": 492306432, + "step": 3756 + }, + { + "epoch": 0.6427701186277653, + "grad_norm": 0.7488569617271423, + "learning_rate": 8.375657351720148e-05, + "loss": 5.3871, + "num_input_tokens_seen": 492699648, + "step": 3759 + }, + { + "epoch": 0.643283103558833, + "grad_norm": 0.8150780200958252, + "learning_rate": 8.372317109373264e-05, + "loss": 5.3916, + "num_input_tokens_seen": 493092864, + "step": 3762 + }, + { + "epoch": 0.6437960884899007, + "grad_norm": 0.8473458290100098, + "learning_rate": 8.368980860143615e-05, + "loss": 5.3534, + "num_input_tokens_seen": 493486080, + "step": 3765 + }, + { + "epoch": 0.6443090734209682, + "grad_norm": 0.6835038661956787, + "learning_rate": 8.365648596081538e-05, + "loss": 5.3142, + "num_input_tokens_seen": 493879296, + "step": 3768 + }, + { + "epoch": 0.6448220583520359, + "grad_norm": 0.7488033771514893, + "learning_rate": 8.362320309259501e-05, + "loss": 5.3631, + "num_input_tokens_seen": 494272512, + "step": 3771 + }, + { + "epoch": 0.6453350432831035, + "grad_norm": 0.8691450357437134, + "learning_rate": 8.35899599177204e-05, + "loss": 5.3643, + "num_input_tokens_seen": 494665728, + "step": 3774 + }, + { + "epoch": 0.6458480282141712, + "grad_norm": 0.7763018012046814, + "learning_rate": 8.355675635735668e-05, + "loss": 5.2909, + "num_input_tokens_seen": 495058944, + "step": 3777 + }, + { + "epoch": 0.6463610131452389, + "grad_norm": 0.8163045644760132, + "learning_rate": 8.352359233288804e-05, + "loss": 5.3112, + "num_input_tokens_seen": 495452160, + "step": 3780 + }, + { + "epoch": 0.6468739980763065, + "grad_norm": 0.7238712906837463, + "learning_rate": 8.349046776591689e-05, + "loss": 5.325, + "num_input_tokens_seen": 495845376, + "step": 3783 + }, + { + "epoch": 0.6473869830073742, + "grad_norm": 0.6693491339683533, + "learning_rate": 8.345738257826312e-05, + "loss": 5.3193, + "num_input_tokens_seen": 496238592, + "step": 3786 + }, + { + "epoch": 0.6478999679384418, + "grad_norm": 0.6386433243751526, + "learning_rate": 8.342433669196334e-05, + "loss": 5.3531, + "num_input_tokens_seen": 496631808, + "step": 3789 + }, + { + "epoch": 0.6484129528695095, + "grad_norm": 0.6837417483329773, + "learning_rate": 8.339133002927001e-05, + "loss": 5.309, + "num_input_tokens_seen": 497025024, + "step": 3792 + }, + { + "epoch": 0.6489259378005771, + "grad_norm": 0.7823799252510071, + "learning_rate": 8.335836251265084e-05, + "loss": 5.3361, + "num_input_tokens_seen": 497418240, + "step": 3795 + }, + { + "epoch": 0.6494389227316447, + "grad_norm": 0.7094940543174744, + "learning_rate": 8.332543406478784e-05, + "loss": 5.3467, + "num_input_tokens_seen": 497811456, + "step": 3798 + }, + { + "epoch": 0.6499519076627124, + "grad_norm": 0.6330589056015015, + "learning_rate": 8.329254460857673e-05, + "loss": 5.2885, + "num_input_tokens_seen": 498204672, + "step": 3801 + }, + { + "epoch": 0.65046489259378, + "grad_norm": 0.716102123260498, + "learning_rate": 8.325969406712602e-05, + "loss": 5.3826, + "num_input_tokens_seen": 498597888, + "step": 3804 + }, + { + "epoch": 0.6509778775248477, + "grad_norm": 0.7873062491416931, + "learning_rate": 8.322688236375638e-05, + "loss": 5.3422, + "num_input_tokens_seen": 498991104, + "step": 3807 + }, + { + "epoch": 0.6514908624559154, + "grad_norm": 0.8517410159111023, + "learning_rate": 8.319410942199984e-05, + "loss": 5.3181, + "num_input_tokens_seen": 499384320, + "step": 3810 + }, + { + "epoch": 0.652003847386983, + "grad_norm": 0.7410987615585327, + "learning_rate": 8.316137516559907e-05, + "loss": 5.3124, + "num_input_tokens_seen": 499777536, + "step": 3813 + }, + { + "epoch": 0.6525168323180507, + "grad_norm": 0.7573698163032532, + "learning_rate": 8.312867951850651e-05, + "loss": 5.3404, + "num_input_tokens_seen": 500170752, + "step": 3816 + }, + { + "epoch": 0.6530298172491183, + "grad_norm": 0.876664936542511, + "learning_rate": 8.309602240488386e-05, + "loss": 5.3598, + "num_input_tokens_seen": 500563968, + "step": 3819 + }, + { + "epoch": 0.653542802180186, + "grad_norm": 0.7487987875938416, + "learning_rate": 8.306340374910112e-05, + "loss": 5.3531, + "num_input_tokens_seen": 500957184, + "step": 3822 + }, + { + "epoch": 0.6540557871112536, + "grad_norm": 0.865337610244751, + "learning_rate": 8.303082347573595e-05, + "loss": 5.3075, + "num_input_tokens_seen": 501350400, + "step": 3825 + }, + { + "epoch": 0.6545687720423212, + "grad_norm": 0.7958502769470215, + "learning_rate": 8.299828150957296e-05, + "loss": 5.3461, + "num_input_tokens_seen": 501743616, + "step": 3828 + }, + { + "epoch": 0.6550817569733889, + "grad_norm": 0.7690322399139404, + "learning_rate": 8.29657777756029e-05, + "loss": 5.2989, + "num_input_tokens_seen": 502136832, + "step": 3831 + }, + { + "epoch": 0.6555947419044565, + "grad_norm": 0.7034088969230652, + "learning_rate": 8.29333121990221e-05, + "loss": 5.3167, + "num_input_tokens_seen": 502530048, + "step": 3834 + }, + { + "epoch": 0.6561077268355242, + "grad_norm": 0.7316693067550659, + "learning_rate": 8.29008847052315e-05, + "loss": 5.397, + "num_input_tokens_seen": 502923264, + "step": 3837 + }, + { + "epoch": 0.6566207117665919, + "grad_norm": 0.7567901611328125, + "learning_rate": 8.28684952198361e-05, + "loss": 5.31, + "num_input_tokens_seen": 503316480, + "step": 3840 + }, + { + "epoch": 0.6571336966976595, + "grad_norm": 0.7767483592033386, + "learning_rate": 8.283614366864425e-05, + "loss": 5.3663, + "num_input_tokens_seen": 503709696, + "step": 3843 + }, + { + "epoch": 0.6576466816287272, + "grad_norm": 0.7799834609031677, + "learning_rate": 8.280382997766685e-05, + "loss": 5.3231, + "num_input_tokens_seen": 504102912, + "step": 3846 + }, + { + "epoch": 0.6581596665597949, + "grad_norm": 0.810688316822052, + "learning_rate": 8.277155407311666e-05, + "loss": 5.3251, + "num_input_tokens_seen": 504496128, + "step": 3849 + }, + { + "epoch": 0.6586726514908624, + "grad_norm": 0.7437490820884705, + "learning_rate": 8.27393158814077e-05, + "loss": 5.3464, + "num_input_tokens_seen": 504889344, + "step": 3852 + }, + { + "epoch": 0.6591856364219301, + "grad_norm": 0.7459340691566467, + "learning_rate": 8.270711532915435e-05, + "loss": 5.3044, + "num_input_tokens_seen": 505282560, + "step": 3855 + }, + { + "epoch": 0.6596986213529977, + "grad_norm": 0.8228518962860107, + "learning_rate": 8.267495234317081e-05, + "loss": 5.3375, + "num_input_tokens_seen": 505675776, + "step": 3858 + }, + { + "epoch": 0.6602116062840654, + "grad_norm": 0.6991084218025208, + "learning_rate": 8.264282685047038e-05, + "loss": 5.3671, + "num_input_tokens_seen": 506068992, + "step": 3861 + }, + { + "epoch": 0.660724591215133, + "grad_norm": 0.7868938446044922, + "learning_rate": 8.261073877826466e-05, + "loss": 5.3483, + "num_input_tokens_seen": 506462208, + "step": 3864 + }, + { + "epoch": 0.6612375761462007, + "grad_norm": 0.7679697275161743, + "learning_rate": 8.2578688053963e-05, + "loss": 5.3693, + "num_input_tokens_seen": 506855424, + "step": 3867 + }, + { + "epoch": 0.6617505610772684, + "grad_norm": 0.7753176689147949, + "learning_rate": 8.254667460517166e-05, + "loss": 5.3721, + "num_input_tokens_seen": 507248640, + "step": 3870 + }, + { + "epoch": 0.662263546008336, + "grad_norm": 0.8868235945701599, + "learning_rate": 8.251469835969328e-05, + "loss": 5.3506, + "num_input_tokens_seen": 507641856, + "step": 3873 + }, + { + "epoch": 0.6627765309394037, + "grad_norm": 0.9391675591468811, + "learning_rate": 8.248275924552608e-05, + "loss": 5.2984, + "num_input_tokens_seen": 508035072, + "step": 3876 + }, + { + "epoch": 0.6632895158704714, + "grad_norm": 0.757840633392334, + "learning_rate": 8.245085719086321e-05, + "loss": 5.3594, + "num_input_tokens_seen": 508428288, + "step": 3879 + }, + { + "epoch": 0.6638025008015389, + "grad_norm": 0.7130749821662903, + "learning_rate": 8.24189921240921e-05, + "loss": 5.338, + "num_input_tokens_seen": 508821504, + "step": 3882 + }, + { + "epoch": 0.6643154857326066, + "grad_norm": 0.8119063973426819, + "learning_rate": 8.238716397379381e-05, + "loss": 5.3518, + "num_input_tokens_seen": 509214720, + "step": 3885 + }, + { + "epoch": 0.6648284706636742, + "grad_norm": 0.7790501117706299, + "learning_rate": 8.235537266874219e-05, + "loss": 5.3012, + "num_input_tokens_seen": 509607936, + "step": 3888 + }, + { + "epoch": 0.6653414555947419, + "grad_norm": 0.7022935152053833, + "learning_rate": 8.232361813790342e-05, + "loss": 5.2907, + "num_input_tokens_seen": 510001152, + "step": 3891 + }, + { + "epoch": 0.6658544405258096, + "grad_norm": 0.8035649657249451, + "learning_rate": 8.229190031043528e-05, + "loss": 5.2965, + "num_input_tokens_seen": 510394368, + "step": 3894 + }, + { + "epoch": 0.6663674254568772, + "grad_norm": 0.6912944316864014, + "learning_rate": 8.22602191156864e-05, + "loss": 5.3651, + "num_input_tokens_seen": 510787584, + "step": 3897 + }, + { + "epoch": 0.6668804103879449, + "grad_norm": 0.6894614100456238, + "learning_rate": 8.222857448319569e-05, + "loss": 5.2651, + "num_input_tokens_seen": 511180800, + "step": 3900 + }, + { + "epoch": 0.6673933953190125, + "grad_norm": 0.7291402220726013, + "learning_rate": 8.219696634269164e-05, + "loss": 5.3479, + "num_input_tokens_seen": 511574016, + "step": 3903 + }, + { + "epoch": 0.6679063802500802, + "grad_norm": 0.7843152284622192, + "learning_rate": 8.21653946240917e-05, + "loss": 5.2789, + "num_input_tokens_seen": 511967232, + "step": 3906 + }, + { + "epoch": 0.6684193651811478, + "grad_norm": 0.6679741144180298, + "learning_rate": 8.21338592575016e-05, + "loss": 5.2875, + "num_input_tokens_seen": 512360448, + "step": 3909 + }, + { + "epoch": 0.6689323501122154, + "grad_norm": 0.709000289440155, + "learning_rate": 8.210236017321469e-05, + "loss": 5.3695, + "num_input_tokens_seen": 512753664, + "step": 3912 + }, + { + "epoch": 0.6694453350432831, + "grad_norm": 0.701045036315918, + "learning_rate": 8.207089730171132e-05, + "loss": 5.3274, + "num_input_tokens_seen": 513146880, + "step": 3915 + }, + { + "epoch": 0.6699583199743507, + "grad_norm": 0.741085946559906, + "learning_rate": 8.203947057365817e-05, + "loss": 5.3338, + "num_input_tokens_seen": 513540096, + "step": 3918 + }, + { + "epoch": 0.6704713049054184, + "grad_norm": 0.7441378235816956, + "learning_rate": 8.200807991990765e-05, + "loss": 5.3587, + "num_input_tokens_seen": 513933312, + "step": 3921 + }, + { + "epoch": 0.6709842898364861, + "grad_norm": 0.6489686965942383, + "learning_rate": 8.197672527149723e-05, + "loss": 5.3287, + "num_input_tokens_seen": 514326528, + "step": 3924 + }, + { + "epoch": 0.6714972747675537, + "grad_norm": 0.6942331194877625, + "learning_rate": 8.194540655964876e-05, + "loss": 5.3292, + "num_input_tokens_seen": 514719744, + "step": 3927 + }, + { + "epoch": 0.6720102596986214, + "grad_norm": 0.8609412312507629, + "learning_rate": 8.191412371576794e-05, + "loss": 5.3108, + "num_input_tokens_seen": 515112960, + "step": 3930 + }, + { + "epoch": 0.672523244629689, + "grad_norm": 0.866399347782135, + "learning_rate": 8.188287667144362e-05, + "loss": 5.3481, + "num_input_tokens_seen": 515506176, + "step": 3933 + }, + { + "epoch": 0.6730362295607567, + "grad_norm": 0.6309357285499573, + "learning_rate": 8.185166535844714e-05, + "loss": 5.3815, + "num_input_tokens_seen": 515899392, + "step": 3936 + }, + { + "epoch": 0.6735492144918243, + "grad_norm": 0.8205288052558899, + "learning_rate": 8.182048970873184e-05, + "loss": 5.359, + "num_input_tokens_seen": 516292608, + "step": 3939 + }, + { + "epoch": 0.6740621994228919, + "grad_norm": 0.8117387294769287, + "learning_rate": 8.178934965443225e-05, + "loss": 5.2779, + "num_input_tokens_seen": 516685824, + "step": 3942 + }, + { + "epoch": 0.6745751843539596, + "grad_norm": 0.7551442980766296, + "learning_rate": 8.175824512786359e-05, + "loss": 5.363, + "num_input_tokens_seen": 517079040, + "step": 3945 + }, + { + "epoch": 0.6750881692850272, + "grad_norm": 0.7865480184555054, + "learning_rate": 8.172717606152118e-05, + "loss": 5.3067, + "num_input_tokens_seen": 517472256, + "step": 3948 + }, + { + "epoch": 0.6756011542160949, + "grad_norm": 0.7011180520057678, + "learning_rate": 8.16961423880797e-05, + "loss": 5.2977, + "num_input_tokens_seen": 517865472, + "step": 3951 + }, + { + "epoch": 0.6761141391471626, + "grad_norm": 0.7099843621253967, + "learning_rate": 8.166514404039269e-05, + "loss": 5.3637, + "num_input_tokens_seen": 518258688, + "step": 3954 + }, + { + "epoch": 0.6766271240782302, + "grad_norm": 0.760845422744751, + "learning_rate": 8.16341809514919e-05, + "loss": 5.307, + "num_input_tokens_seen": 518651904, + "step": 3957 + }, + { + "epoch": 0.6771401090092979, + "grad_norm": 0.8513478636741638, + "learning_rate": 8.160325305458667e-05, + "loss": 5.3293, + "num_input_tokens_seen": 519045120, + "step": 3960 + }, + { + "epoch": 0.6776530939403655, + "grad_norm": 0.804071307182312, + "learning_rate": 8.157236028306331e-05, + "loss": 5.3303, + "num_input_tokens_seen": 519438336, + "step": 3963 + }, + { + "epoch": 0.6781660788714331, + "grad_norm": 0.7225372791290283, + "learning_rate": 8.154150257048457e-05, + "loss": 5.3751, + "num_input_tokens_seen": 519831552, + "step": 3966 + }, + { + "epoch": 0.6786790638025008, + "grad_norm": 0.7206907272338867, + "learning_rate": 8.151067985058899e-05, + "loss": 5.3409, + "num_input_tokens_seen": 520224768, + "step": 3969 + }, + { + "epoch": 0.6791920487335684, + "grad_norm": 0.7497385740280151, + "learning_rate": 8.147989205729026e-05, + "loss": 5.3236, + "num_input_tokens_seen": 520617984, + "step": 3972 + }, + { + "epoch": 0.6797050336646361, + "grad_norm": 0.7918210029602051, + "learning_rate": 8.14491391246767e-05, + "loss": 5.3549, + "num_input_tokens_seen": 521011200, + "step": 3975 + }, + { + "epoch": 0.6802180185957037, + "grad_norm": 0.8181982636451721, + "learning_rate": 8.141842098701066e-05, + "loss": 5.3461, + "num_input_tokens_seen": 521404416, + "step": 3978 + }, + { + "epoch": 0.6807310035267714, + "grad_norm": 0.7581628561019897, + "learning_rate": 8.138773757872787e-05, + "loss": 5.3411, + "num_input_tokens_seen": 521797632, + "step": 3981 + }, + { + "epoch": 0.6812439884578391, + "grad_norm": 0.7180889844894409, + "learning_rate": 8.135708883443689e-05, + "loss": 5.3157, + "num_input_tokens_seen": 522190848, + "step": 3984 + }, + { + "epoch": 0.6817569733889067, + "grad_norm": 0.7033380270004272, + "learning_rate": 8.132647468891857e-05, + "loss": 5.3305, + "num_input_tokens_seen": 522584064, + "step": 3987 + }, + { + "epoch": 0.6822699583199744, + "grad_norm": 0.7060420513153076, + "learning_rate": 8.129589507712537e-05, + "loss": 5.3815, + "num_input_tokens_seen": 522977280, + "step": 3990 + }, + { + "epoch": 0.682782943251042, + "grad_norm": 0.7359711527824402, + "learning_rate": 8.126534993418085e-05, + "loss": 5.3349, + "num_input_tokens_seen": 523370496, + "step": 3993 + }, + { + "epoch": 0.6832959281821096, + "grad_norm": 0.8105011582374573, + "learning_rate": 8.123483919537908e-05, + "loss": 5.3192, + "num_input_tokens_seen": 523763712, + "step": 3996 + }, + { + "epoch": 0.6838089131131773, + "grad_norm": 0.7450350522994995, + "learning_rate": 8.120436279618406e-05, + "loss": 5.3175, + "num_input_tokens_seen": 524156928, + "step": 3999 + }, + { + "epoch": 0.6839799080901998, + "eval_accuracy": 0.18136948379742712, + "eval_loss": 5.791625022888184, + "eval_runtime": 111.6249, + "eval_samples_per_second": 2.688, + "eval_steps_per_second": 1.344, + "num_input_tokens_seen": 524288000, + "step": 4000 + }, + { + "epoch": 0.6843218980442449, + "grad_norm": 0.742009162902832, + "learning_rate": 8.117392067222913e-05, + "loss": 5.2929, + "num_input_tokens_seen": 524550144, + "step": 4002 + }, + { + "epoch": 0.6848348829753126, + "grad_norm": 0.8065789937973022, + "learning_rate": 8.114351275931643e-05, + "loss": 5.3041, + "num_input_tokens_seen": 524943360, + "step": 4005 + }, + { + "epoch": 0.6853478679063802, + "grad_norm": 0.8113951086997986, + "learning_rate": 8.111313899341628e-05, + "loss": 5.3378, + "num_input_tokens_seen": 525336576, + "step": 4008 + }, + { + "epoch": 0.6858608528374479, + "grad_norm": 0.7892742156982422, + "learning_rate": 8.10827993106667e-05, + "loss": 5.3148, + "num_input_tokens_seen": 525729792, + "step": 4011 + }, + { + "epoch": 0.6863738377685156, + "grad_norm": 0.8820670247077942, + "learning_rate": 8.105249364737273e-05, + "loss": 5.2681, + "num_input_tokens_seen": 526123008, + "step": 4014 + }, + { + "epoch": 0.6868868226995832, + "grad_norm": 0.7888779640197754, + "learning_rate": 8.102222194000602e-05, + "loss": 5.3129, + "num_input_tokens_seen": 526516224, + "step": 4017 + }, + { + "epoch": 0.6873998076306509, + "grad_norm": 0.8002054691314697, + "learning_rate": 8.099198412520408e-05, + "loss": 5.2667, + "num_input_tokens_seen": 526909440, + "step": 4020 + }, + { + "epoch": 0.6879127925617184, + "grad_norm": 0.8794166445732117, + "learning_rate": 8.096178013976995e-05, + "loss": 5.3393, + "num_input_tokens_seen": 527302656, + "step": 4023 + }, + { + "epoch": 0.6884257774927861, + "grad_norm": 0.8726845979690552, + "learning_rate": 8.093160992067137e-05, + "loss": 5.3492, + "num_input_tokens_seen": 527695872, + "step": 4026 + }, + { + "epoch": 0.6889387624238538, + "grad_norm": 0.7601503133773804, + "learning_rate": 8.09014734050405e-05, + "loss": 5.3517, + "num_input_tokens_seen": 528089088, + "step": 4029 + }, + { + "epoch": 0.6894517473549214, + "grad_norm": 0.8088439106941223, + "learning_rate": 8.087137053017323e-05, + "loss": 5.3387, + "num_input_tokens_seen": 528482304, + "step": 4032 + }, + { + "epoch": 0.6899647322859891, + "grad_norm": 0.7194026112556458, + "learning_rate": 8.084130123352858e-05, + "loss": 5.3721, + "num_input_tokens_seen": 528875520, + "step": 4035 + }, + { + "epoch": 0.6904777172170568, + "grad_norm": 0.7850322723388672, + "learning_rate": 8.081126545272833e-05, + "loss": 5.3402, + "num_input_tokens_seen": 529268736, + "step": 4038 + }, + { + "epoch": 0.6909907021481244, + "grad_norm": 0.6744662523269653, + "learning_rate": 8.078126312555625e-05, + "loss": 5.3444, + "num_input_tokens_seen": 529661952, + "step": 4041 + }, + { + "epoch": 0.6915036870791921, + "grad_norm": 0.7170355916023254, + "learning_rate": 8.075129418995781e-05, + "loss": 5.3821, + "num_input_tokens_seen": 530055168, + "step": 4044 + }, + { + "epoch": 0.6920166720102597, + "grad_norm": 0.7175304889678955, + "learning_rate": 8.072135858403943e-05, + "loss": 5.3562, + "num_input_tokens_seen": 530448384, + "step": 4047 + }, + { + "epoch": 0.6925296569413274, + "grad_norm": 0.7564287185668945, + "learning_rate": 8.069145624606803e-05, + "loss": 5.3044, + "num_input_tokens_seen": 530841600, + "step": 4050 + }, + { + "epoch": 0.693042641872395, + "grad_norm": 0.7804622650146484, + "learning_rate": 8.06615871144705e-05, + "loss": 5.3709, + "num_input_tokens_seen": 531234816, + "step": 4053 + }, + { + "epoch": 0.6935556268034626, + "grad_norm": 0.7738920450210571, + "learning_rate": 8.063175112783317e-05, + "loss": 5.3234, + "num_input_tokens_seen": 531628032, + "step": 4056 + }, + { + "epoch": 0.6940686117345303, + "grad_norm": 0.7369733452796936, + "learning_rate": 8.060194822490128e-05, + "loss": 5.2849, + "num_input_tokens_seen": 532021248, + "step": 4059 + }, + { + "epoch": 0.6945815966655979, + "grad_norm": 0.6854174733161926, + "learning_rate": 8.057217834457838e-05, + "loss": 5.3224, + "num_input_tokens_seen": 532414464, + "step": 4062 + }, + { + "epoch": 0.6950945815966656, + "grad_norm": 0.6956667304039001, + "learning_rate": 8.054244142592593e-05, + "loss": 5.3002, + "num_input_tokens_seen": 532807680, + "step": 4065 + }, + { + "epoch": 0.6956075665277333, + "grad_norm": 0.8726805448532104, + "learning_rate": 8.051273740816265e-05, + "loss": 5.3259, + "num_input_tokens_seen": 533200896, + "step": 4068 + }, + { + "epoch": 0.6961205514588009, + "grad_norm": 0.7644637227058411, + "learning_rate": 8.048306623066408e-05, + "loss": 5.3527, + "num_input_tokens_seen": 533594112, + "step": 4071 + }, + { + "epoch": 0.6966335363898686, + "grad_norm": 0.7367181181907654, + "learning_rate": 8.045342783296207e-05, + "loss": 5.2681, + "num_input_tokens_seen": 533987328, + "step": 4074 + }, + { + "epoch": 0.6971465213209362, + "grad_norm": 0.8064795732498169, + "learning_rate": 8.04238221547442e-05, + "loss": 5.3359, + "num_input_tokens_seen": 534380544, + "step": 4077 + }, + { + "epoch": 0.6976595062520039, + "grad_norm": 0.7639942169189453, + "learning_rate": 8.03942491358533e-05, + "loss": 5.3703, + "num_input_tokens_seen": 534773760, + "step": 4080 + }, + { + "epoch": 0.6981724911830715, + "grad_norm": 0.7479289770126343, + "learning_rate": 8.036470871628689e-05, + "loss": 5.3445, + "num_input_tokens_seen": 535166976, + "step": 4083 + }, + { + "epoch": 0.6986854761141391, + "grad_norm": 0.9488199949264526, + "learning_rate": 8.033520083619678e-05, + "loss": 5.3398, + "num_input_tokens_seen": 535560192, + "step": 4086 + }, + { + "epoch": 0.6991984610452068, + "grad_norm": 0.8831247687339783, + "learning_rate": 8.030572543588844e-05, + "loss": 5.3225, + "num_input_tokens_seen": 535953408, + "step": 4089 + }, + { + "epoch": 0.6997114459762744, + "grad_norm": 0.787559449672699, + "learning_rate": 8.027628245582056e-05, + "loss": 5.3118, + "num_input_tokens_seen": 536346624, + "step": 4092 + }, + { + "epoch": 0.7002244309073421, + "grad_norm": 0.8601611852645874, + "learning_rate": 8.024687183660457e-05, + "loss": 5.3217, + "num_input_tokens_seen": 536739840, + "step": 4095 + }, + { + "epoch": 0.7007374158384098, + "grad_norm": 0.762852668762207, + "learning_rate": 8.021749351900399e-05, + "loss": 5.3815, + "num_input_tokens_seen": 537133056, + "step": 4098 + }, + { + "epoch": 0.7012504007694774, + "grad_norm": 0.6953300833702087, + "learning_rate": 8.018814744393415e-05, + "loss": 5.3272, + "num_input_tokens_seen": 537526272, + "step": 4101 + }, + { + "epoch": 0.7017633857005451, + "grad_norm": 0.7371882796287537, + "learning_rate": 8.01588335524615e-05, + "loss": 5.2754, + "num_input_tokens_seen": 537919488, + "step": 4104 + }, + { + "epoch": 0.7022763706316127, + "grad_norm": 0.7776148319244385, + "learning_rate": 8.01295517858032e-05, + "loss": 5.3249, + "num_input_tokens_seen": 538312704, + "step": 4107 + }, + { + "epoch": 0.7027893555626803, + "grad_norm": 0.7305675745010376, + "learning_rate": 8.010030208532664e-05, + "loss": 5.3191, + "num_input_tokens_seen": 538705920, + "step": 4110 + }, + { + "epoch": 0.703302340493748, + "grad_norm": 0.7893403172492981, + "learning_rate": 8.007108439254888e-05, + "loss": 5.2771, + "num_input_tokens_seen": 539099136, + "step": 4113 + }, + { + "epoch": 0.7038153254248156, + "grad_norm": 0.9661571383476257, + "learning_rate": 8.004189864913625e-05, + "loss": 5.34, + "num_input_tokens_seen": 539492352, + "step": 4116 + }, + { + "epoch": 0.7043283103558833, + "grad_norm": 0.7898194789886475, + "learning_rate": 8.001274479690375e-05, + "loss": 5.3278, + "num_input_tokens_seen": 539885568, + "step": 4119 + }, + { + "epoch": 0.7048412952869509, + "grad_norm": 0.7659739851951599, + "learning_rate": 7.998362277781467e-05, + "loss": 5.3226, + "num_input_tokens_seen": 540278784, + "step": 4122 + }, + { + "epoch": 0.7053542802180186, + "grad_norm": 0.889785885810852, + "learning_rate": 7.995453253398004e-05, + "loss": 5.2994, + "num_input_tokens_seen": 540672000, + "step": 4125 + }, + { + "epoch": 0.7058672651490863, + "grad_norm": 0.7162066698074341, + "learning_rate": 7.992547400765813e-05, + "loss": 5.3294, + "num_input_tokens_seen": 541065216, + "step": 4128 + }, + { + "epoch": 0.7063802500801539, + "grad_norm": 0.6695894002914429, + "learning_rate": 7.989644714125407e-05, + "loss": 5.296, + "num_input_tokens_seen": 541458432, + "step": 4131 + }, + { + "epoch": 0.7068932350112216, + "grad_norm": 0.7473436594009399, + "learning_rate": 7.986745187731925e-05, + "loss": 5.3078, + "num_input_tokens_seen": 541851648, + "step": 4134 + }, + { + "epoch": 0.7074062199422892, + "grad_norm": 0.7301039099693298, + "learning_rate": 7.983848815855091e-05, + "loss": 5.3322, + "num_input_tokens_seen": 542244864, + "step": 4137 + }, + { + "epoch": 0.7079192048733568, + "grad_norm": 0.7744476795196533, + "learning_rate": 7.980955592779166e-05, + "loss": 5.3224, + "num_input_tokens_seen": 542638080, + "step": 4140 + }, + { + "epoch": 0.7084321898044245, + "grad_norm": 0.7293386459350586, + "learning_rate": 7.978065512802896e-05, + "loss": 5.3634, + "num_input_tokens_seen": 543031296, + "step": 4143 + }, + { + "epoch": 0.7089451747354921, + "grad_norm": 0.6727131009101868, + "learning_rate": 7.975178570239474e-05, + "loss": 5.3217, + "num_input_tokens_seen": 543424512, + "step": 4146 + }, + { + "epoch": 0.7094581596665598, + "grad_norm": 0.7421523928642273, + "learning_rate": 7.972294759416482e-05, + "loss": 5.3374, + "num_input_tokens_seen": 543817728, + "step": 4149 + }, + { + "epoch": 0.7099711445976274, + "grad_norm": 0.6859121918678284, + "learning_rate": 7.969414074675855e-05, + "loss": 5.3435, + "num_input_tokens_seen": 544210944, + "step": 4152 + }, + { + "epoch": 0.7104841295286951, + "grad_norm": 0.7437335848808289, + "learning_rate": 7.966536510373822e-05, + "loss": 5.3121, + "num_input_tokens_seen": 544604160, + "step": 4155 + }, + { + "epoch": 0.7109971144597628, + "grad_norm": 0.7755163311958313, + "learning_rate": 7.963662060880875e-05, + "loss": 5.3148, + "num_input_tokens_seen": 544997376, + "step": 4158 + }, + { + "epoch": 0.7115100993908304, + "grad_norm": 0.7485529780387878, + "learning_rate": 7.960790720581703e-05, + "loss": 5.2968, + "num_input_tokens_seen": 545390592, + "step": 4161 + }, + { + "epoch": 0.7120230843218981, + "grad_norm": 0.7412263751029968, + "learning_rate": 7.957922483875172e-05, + "loss": 5.3184, + "num_input_tokens_seen": 545783808, + "step": 4164 + }, + { + "epoch": 0.7125360692529656, + "grad_norm": 0.7376791834831238, + "learning_rate": 7.95505734517425e-05, + "loss": 5.3002, + "num_input_tokens_seen": 546177024, + "step": 4167 + }, + { + "epoch": 0.7130490541840333, + "grad_norm": 0.7660586833953857, + "learning_rate": 7.952195298905983e-05, + "loss": 5.3282, + "num_input_tokens_seen": 546570240, + "step": 4170 + }, + { + "epoch": 0.713562039115101, + "grad_norm": 0.729190468788147, + "learning_rate": 7.949336339511443e-05, + "loss": 5.3018, + "num_input_tokens_seen": 546963456, + "step": 4173 + }, + { + "epoch": 0.7140750240461686, + "grad_norm": 0.7029862403869629, + "learning_rate": 7.946480461445679e-05, + "loss": 5.2991, + "num_input_tokens_seen": 547356672, + "step": 4176 + }, + { + "epoch": 0.7145880089772363, + "grad_norm": 0.8115814924240112, + "learning_rate": 7.943627659177671e-05, + "loss": 5.2578, + "num_input_tokens_seen": 547749888, + "step": 4179 + }, + { + "epoch": 0.715100993908304, + "grad_norm": 0.9112886190414429, + "learning_rate": 7.940777927190298e-05, + "loss": 5.3422, + "num_input_tokens_seen": 548143104, + "step": 4182 + }, + { + "epoch": 0.7156139788393716, + "grad_norm": 0.7369971871376038, + "learning_rate": 7.937931259980275e-05, + "loss": 5.3049, + "num_input_tokens_seen": 548536320, + "step": 4185 + }, + { + "epoch": 0.7161269637704393, + "grad_norm": 0.7676149606704712, + "learning_rate": 7.935087652058122e-05, + "loss": 5.3626, + "num_input_tokens_seen": 548929536, + "step": 4188 + }, + { + "epoch": 0.7166399487015069, + "grad_norm": 0.6889289617538452, + "learning_rate": 7.932247097948111e-05, + "loss": 5.2493, + "num_input_tokens_seen": 549322752, + "step": 4191 + }, + { + "epoch": 0.7171529336325746, + "grad_norm": 0.7226115465164185, + "learning_rate": 7.929409592188228e-05, + "loss": 5.3026, + "num_input_tokens_seen": 549715968, + "step": 4194 + }, + { + "epoch": 0.7176659185636421, + "grad_norm": 0.7485541701316833, + "learning_rate": 7.926575129330127e-05, + "loss": 5.3454, + "num_input_tokens_seen": 550109184, + "step": 4197 + }, + { + "epoch": 0.7181789034947098, + "grad_norm": 0.7012051343917847, + "learning_rate": 7.923743703939083e-05, + "loss": 5.3409, + "num_input_tokens_seen": 550502400, + "step": 4200 + }, + { + "epoch": 0.7186918884257775, + "grad_norm": 0.7870872616767883, + "learning_rate": 7.920915310593953e-05, + "loss": 5.2632, + "num_input_tokens_seen": 550895616, + "step": 4203 + }, + { + "epoch": 0.7192048733568451, + "grad_norm": 0.7597965598106384, + "learning_rate": 7.918089943887127e-05, + "loss": 5.3635, + "num_input_tokens_seen": 551288832, + "step": 4206 + }, + { + "epoch": 0.7197178582879128, + "grad_norm": 0.7153079509735107, + "learning_rate": 7.915267598424488e-05, + "loss": 5.2955, + "num_input_tokens_seen": 551682048, + "step": 4209 + }, + { + "epoch": 0.7202308432189805, + "grad_norm": 0.713191568851471, + "learning_rate": 7.91244826882537e-05, + "loss": 5.3174, + "num_input_tokens_seen": 552075264, + "step": 4212 + }, + { + "epoch": 0.7207438281500481, + "grad_norm": 0.774713397026062, + "learning_rate": 7.909631949722512e-05, + "loss": 5.3312, + "num_input_tokens_seen": 552468480, + "step": 4215 + }, + { + "epoch": 0.7212568130811158, + "grad_norm": 0.6564570665359497, + "learning_rate": 7.90681863576202e-05, + "loss": 5.2906, + "num_input_tokens_seen": 552861696, + "step": 4218 + }, + { + "epoch": 0.7217697980121834, + "grad_norm": 0.7904906868934631, + "learning_rate": 7.904008321603313e-05, + "loss": 5.282, + "num_input_tokens_seen": 553254912, + "step": 4221 + }, + { + "epoch": 0.722282782943251, + "grad_norm": 0.7598406672477722, + "learning_rate": 7.90120100191909e-05, + "loss": 5.3137, + "num_input_tokens_seen": 553648128, + "step": 4224 + }, + { + "epoch": 0.7227957678743187, + "grad_norm": 0.7061671018600464, + "learning_rate": 7.898396671395296e-05, + "loss": 5.3025, + "num_input_tokens_seen": 554041344, + "step": 4227 + }, + { + "epoch": 0.7233087528053863, + "grad_norm": 0.7577418684959412, + "learning_rate": 7.895595324731055e-05, + "loss": 5.2867, + "num_input_tokens_seen": 554434560, + "step": 4230 + }, + { + "epoch": 0.723821737736454, + "grad_norm": 0.8547803163528442, + "learning_rate": 7.892796956638649e-05, + "loss": 5.3218, + "num_input_tokens_seen": 554827776, + "step": 4233 + }, + { + "epoch": 0.7243347226675216, + "grad_norm": 0.6954768300056458, + "learning_rate": 7.890001561843465e-05, + "loss": 5.2734, + "num_input_tokens_seen": 555220992, + "step": 4236 + }, + { + "epoch": 0.7248477075985893, + "grad_norm": 0.6641873121261597, + "learning_rate": 7.88720913508397e-05, + "loss": 5.3519, + "num_input_tokens_seen": 555614208, + "step": 4239 + }, + { + "epoch": 0.725360692529657, + "grad_norm": 0.7015215754508972, + "learning_rate": 7.88441967111164e-05, + "loss": 5.3236, + "num_input_tokens_seen": 556007424, + "step": 4242 + }, + { + "epoch": 0.7258736774607246, + "grad_norm": 0.8150922656059265, + "learning_rate": 7.881633164690944e-05, + "loss": 5.3056, + "num_input_tokens_seen": 556400640, + "step": 4245 + }, + { + "epoch": 0.7263866623917923, + "grad_norm": 0.7670294642448425, + "learning_rate": 7.878849610599295e-05, + "loss": 5.341, + "num_input_tokens_seen": 556793856, + "step": 4248 + }, + { + "epoch": 0.7268996473228599, + "grad_norm": 0.6963903307914734, + "learning_rate": 7.876069003627009e-05, + "loss": 5.3405, + "num_input_tokens_seen": 557187072, + "step": 4251 + }, + { + "epoch": 0.7274126322539275, + "grad_norm": 0.7592700719833374, + "learning_rate": 7.873291338577257e-05, + "loss": 5.3088, + "num_input_tokens_seen": 557580288, + "step": 4254 + }, + { + "epoch": 0.7279256171849952, + "grad_norm": 0.7837212085723877, + "learning_rate": 7.870516610266037e-05, + "loss": 5.3128, + "num_input_tokens_seen": 557973504, + "step": 4257 + }, + { + "epoch": 0.7284386021160628, + "grad_norm": 0.6755237579345703, + "learning_rate": 7.86774481352212e-05, + "loss": 5.321, + "num_input_tokens_seen": 558366720, + "step": 4260 + }, + { + "epoch": 0.7289515870471305, + "grad_norm": 0.7284892201423645, + "learning_rate": 7.864975943187024e-05, + "loss": 5.2798, + "num_input_tokens_seen": 558759936, + "step": 4263 + }, + { + "epoch": 0.7294645719781981, + "grad_norm": 0.7344948053359985, + "learning_rate": 7.862209994114962e-05, + "loss": 5.2867, + "num_input_tokens_seen": 559153152, + "step": 4266 + }, + { + "epoch": 0.7299775569092658, + "grad_norm": 0.7037729024887085, + "learning_rate": 7.859446961172803e-05, + "loss": 5.2836, + "num_input_tokens_seen": 559546368, + "step": 4269 + }, + { + "epoch": 0.7304905418403335, + "grad_norm": 0.7983783483505249, + "learning_rate": 7.856686839240038e-05, + "loss": 5.3308, + "num_input_tokens_seen": 559939584, + "step": 4272 + }, + { + "epoch": 0.7310035267714011, + "grad_norm": 0.7463051080703735, + "learning_rate": 7.853929623208739e-05, + "loss": 5.3328, + "num_input_tokens_seen": 560332800, + "step": 4275 + }, + { + "epoch": 0.7315165117024688, + "grad_norm": 0.7885538339614868, + "learning_rate": 7.851175307983515e-05, + "loss": 5.3424, + "num_input_tokens_seen": 560726016, + "step": 4278 + }, + { + "epoch": 0.7320294966335363, + "grad_norm": 0.810090959072113, + "learning_rate": 7.84842388848147e-05, + "loss": 5.2868, + "num_input_tokens_seen": 561119232, + "step": 4281 + }, + { + "epoch": 0.732542481564604, + "grad_norm": 0.7771869897842407, + "learning_rate": 7.845675359632176e-05, + "loss": 5.2846, + "num_input_tokens_seen": 561512448, + "step": 4284 + }, + { + "epoch": 0.7330554664956717, + "grad_norm": 0.7376582026481628, + "learning_rate": 7.842929716377623e-05, + "loss": 5.2987, + "num_input_tokens_seen": 561905664, + "step": 4287 + }, + { + "epoch": 0.7335684514267393, + "grad_norm": 0.6871978640556335, + "learning_rate": 7.84018695367218e-05, + "loss": 5.236, + "num_input_tokens_seen": 562298880, + "step": 4290 + }, + { + "epoch": 0.734081436357807, + "grad_norm": 0.8098225593566895, + "learning_rate": 7.837447066482563e-05, + "loss": 5.3488, + "num_input_tokens_seen": 562692096, + "step": 4293 + }, + { + "epoch": 0.7345944212888746, + "grad_norm": 0.8568369150161743, + "learning_rate": 7.834710049787791e-05, + "loss": 5.3218, + "num_input_tokens_seen": 563085312, + "step": 4296 + }, + { + "epoch": 0.7351074062199423, + "grad_norm": 0.7187222242355347, + "learning_rate": 7.831975898579147e-05, + "loss": 5.3066, + "num_input_tokens_seen": 563478528, + "step": 4299 + }, + { + "epoch": 0.73562039115101, + "grad_norm": 0.7189285755157471, + "learning_rate": 7.829244607860141e-05, + "loss": 5.2294, + "num_input_tokens_seen": 563871744, + "step": 4302 + }, + { + "epoch": 0.7361333760820776, + "grad_norm": 1.026648759841919, + "learning_rate": 7.826516172646476e-05, + "loss": 5.2955, + "num_input_tokens_seen": 564264960, + "step": 4305 + }, + { + "epoch": 0.7366463610131453, + "grad_norm": 0.7149240374565125, + "learning_rate": 7.823790587966001e-05, + "loss": 5.3512, + "num_input_tokens_seen": 564658176, + "step": 4308 + }, + { + "epoch": 0.7371593459442128, + "grad_norm": 0.7869164943695068, + "learning_rate": 7.821067848858679e-05, + "loss": 5.2569, + "num_input_tokens_seen": 565051392, + "step": 4311 + }, + { + "epoch": 0.7376723308752805, + "grad_norm": 0.882475733757019, + "learning_rate": 7.818347950376548e-05, + "loss": 5.3148, + "num_input_tokens_seen": 565444608, + "step": 4314 + }, + { + "epoch": 0.7381853158063482, + "grad_norm": 0.818747878074646, + "learning_rate": 7.815630887583679e-05, + "loss": 5.3013, + "num_input_tokens_seen": 565837824, + "step": 4317 + }, + { + "epoch": 0.7386983007374158, + "grad_norm": 0.854594886302948, + "learning_rate": 7.812916655556147e-05, + "loss": 5.3028, + "num_input_tokens_seen": 566231040, + "step": 4320 + }, + { + "epoch": 0.7392112856684835, + "grad_norm": 1.0109381675720215, + "learning_rate": 7.810205249381987e-05, + "loss": 5.3257, + "num_input_tokens_seen": 566624256, + "step": 4323 + }, + { + "epoch": 0.7397242705995511, + "grad_norm": 0.8108871579170227, + "learning_rate": 7.80749666416116e-05, + "loss": 5.2564, + "num_input_tokens_seen": 567017472, + "step": 4326 + }, + { + "epoch": 0.7402372555306188, + "grad_norm": 0.9545855522155762, + "learning_rate": 7.80479089500551e-05, + "loss": 5.3024, + "num_input_tokens_seen": 567410688, + "step": 4329 + }, + { + "epoch": 0.7407502404616865, + "grad_norm": 0.7847074866294861, + "learning_rate": 7.802087937038731e-05, + "loss": 5.2754, + "num_input_tokens_seen": 567803904, + "step": 4332 + }, + { + "epoch": 0.7412632253927541, + "grad_norm": 0.7441365122795105, + "learning_rate": 7.799387785396339e-05, + "loss": 5.2589, + "num_input_tokens_seen": 568197120, + "step": 4335 + }, + { + "epoch": 0.7417762103238217, + "grad_norm": 0.7231647968292236, + "learning_rate": 7.796690435225613e-05, + "loss": 5.3153, + "num_input_tokens_seen": 568590336, + "step": 4338 + }, + { + "epoch": 0.7422891952548893, + "grad_norm": 0.7042517066001892, + "learning_rate": 7.793995881685584e-05, + "loss": 5.2829, + "num_input_tokens_seen": 568983552, + "step": 4341 + }, + { + "epoch": 0.742802180185957, + "grad_norm": 0.6561905741691589, + "learning_rate": 7.791304119946978e-05, + "loss": 5.2513, + "num_input_tokens_seen": 569376768, + "step": 4344 + }, + { + "epoch": 0.7433151651170247, + "grad_norm": 0.7863454818725586, + "learning_rate": 7.788615145192192e-05, + "loss": 5.2902, + "num_input_tokens_seen": 569769984, + "step": 4347 + }, + { + "epoch": 0.7438281500480923, + "grad_norm": 0.8196501135826111, + "learning_rate": 7.785928952615248e-05, + "loss": 5.2557, + "num_input_tokens_seen": 570163200, + "step": 4350 + }, + { + "epoch": 0.74434113497916, + "grad_norm": 0.7091982960700989, + "learning_rate": 7.783245537421777e-05, + "loss": 5.2603, + "num_input_tokens_seen": 570556416, + "step": 4353 + }, + { + "epoch": 0.7448541199102277, + "grad_norm": 0.6906124353408813, + "learning_rate": 7.780564894828949e-05, + "loss": 5.319, + "num_input_tokens_seen": 570949632, + "step": 4356 + }, + { + "epoch": 0.7453671048412953, + "grad_norm": 0.9479339718818665, + "learning_rate": 7.777887020065473e-05, + "loss": 5.3323, + "num_input_tokens_seen": 571342848, + "step": 4359 + }, + { + "epoch": 0.745880089772363, + "grad_norm": 0.6457544565200806, + "learning_rate": 7.775211908371534e-05, + "loss": 5.2699, + "num_input_tokens_seen": 571736064, + "step": 4362 + }, + { + "epoch": 0.7463930747034306, + "grad_norm": 0.8440648913383484, + "learning_rate": 7.772539554998778e-05, + "loss": 5.2753, + "num_input_tokens_seen": 572129280, + "step": 4365 + }, + { + "epoch": 0.7469060596344982, + "grad_norm": 0.8873343467712402, + "learning_rate": 7.76986995521026e-05, + "loss": 5.2944, + "num_input_tokens_seen": 572522496, + "step": 4368 + }, + { + "epoch": 0.7474190445655658, + "grad_norm": 0.7578151822090149, + "learning_rate": 7.767203104280422e-05, + "loss": 5.2523, + "num_input_tokens_seen": 572915712, + "step": 4371 + }, + { + "epoch": 0.7479320294966335, + "grad_norm": 0.7755122780799866, + "learning_rate": 7.764538997495046e-05, + "loss": 5.3015, + "num_input_tokens_seen": 573308928, + "step": 4374 + }, + { + "epoch": 0.7484450144277012, + "grad_norm": 0.8372567296028137, + "learning_rate": 7.761877630151229e-05, + "loss": 5.2548, + "num_input_tokens_seen": 573702144, + "step": 4377 + }, + { + "epoch": 0.7489579993587688, + "grad_norm": 0.8696802854537964, + "learning_rate": 7.759218997557344e-05, + "loss": 5.308, + "num_input_tokens_seen": 574095360, + "step": 4380 + }, + { + "epoch": 0.7494709842898365, + "grad_norm": 0.6701018810272217, + "learning_rate": 7.756563095033e-05, + "loss": 5.2979, + "num_input_tokens_seen": 574488576, + "step": 4383 + }, + { + "epoch": 0.7499839692209042, + "grad_norm": 0.7554642558097839, + "learning_rate": 7.75390991790902e-05, + "loss": 5.3501, + "num_input_tokens_seen": 574881792, + "step": 4386 + }, + { + "epoch": 0.7504969541519718, + "grad_norm": 0.7375338077545166, + "learning_rate": 7.751259461527394e-05, + "loss": 5.2445, + "num_input_tokens_seen": 575275008, + "step": 4389 + }, + { + "epoch": 0.7510099390830395, + "grad_norm": 0.6931892037391663, + "learning_rate": 7.748611721241256e-05, + "loss": 5.2891, + "num_input_tokens_seen": 575668224, + "step": 4392 + }, + { + "epoch": 0.7515229240141071, + "grad_norm": 0.7114129066467285, + "learning_rate": 7.745966692414832e-05, + "loss": 5.2936, + "num_input_tokens_seen": 576061440, + "step": 4395 + }, + { + "epoch": 0.7520359089451747, + "grad_norm": 0.7084842324256897, + "learning_rate": 7.743324370423433e-05, + "loss": 5.3014, + "num_input_tokens_seen": 576454656, + "step": 4398 + }, + { + "epoch": 0.7523778988992198, + "eval_accuracy": 0.18137436899527765, + "eval_loss": 5.756495952606201, + "eval_runtime": 110.1663, + "eval_samples_per_second": 2.723, + "eval_steps_per_second": 1.362, + "num_input_tokens_seen": 576716800, + "step": 4400 + }, + { + "epoch": 0.7525488938762424, + "grad_norm": 0.8616915941238403, + "learning_rate": 7.74068475065339e-05, + "loss": 5.2833, + "num_input_tokens_seen": 576847872, + "step": 4401 + }, + { + "epoch": 0.75306187880731, + "grad_norm": 0.7370516061782837, + "learning_rate": 7.738047828502048e-05, + "loss": 5.2773, + "num_input_tokens_seen": 577241088, + "step": 4404 + }, + { + "epoch": 0.7535748637383777, + "grad_norm": 0.8121903538703918, + "learning_rate": 7.735413599377714e-05, + "loss": 5.2661, + "num_input_tokens_seen": 577634304, + "step": 4407 + }, + { + "epoch": 0.7540878486694453, + "grad_norm": 0.775635302066803, + "learning_rate": 7.732782058699632e-05, + "loss": 5.2528, + "num_input_tokens_seen": 578027520, + "step": 4410 + }, + { + "epoch": 0.754600833600513, + "grad_norm": 0.6981225609779358, + "learning_rate": 7.730153201897945e-05, + "loss": 5.3143, + "num_input_tokens_seen": 578420736, + "step": 4413 + }, + { + "epoch": 0.7551138185315807, + "grad_norm": 0.6825100779533386, + "learning_rate": 7.727527024413663e-05, + "loss": 5.2626, + "num_input_tokens_seen": 578813952, + "step": 4416 + }, + { + "epoch": 0.7556268034626483, + "grad_norm": 0.7106081247329712, + "learning_rate": 7.724903521698631e-05, + "loss": 5.272, + "num_input_tokens_seen": 579207168, + "step": 4419 + }, + { + "epoch": 0.756139788393716, + "grad_norm": 0.7914432287216187, + "learning_rate": 7.722282689215501e-05, + "loss": 5.2469, + "num_input_tokens_seen": 579600384, + "step": 4422 + }, + { + "epoch": 0.7566527733247835, + "grad_norm": 0.8186551928520203, + "learning_rate": 7.719664522437684e-05, + "loss": 5.3293, + "num_input_tokens_seen": 579993600, + "step": 4425 + }, + { + "epoch": 0.7571657582558512, + "grad_norm": 0.7702829837799072, + "learning_rate": 7.717049016849333e-05, + "loss": 5.2875, + "num_input_tokens_seen": 580386816, + "step": 4428 + }, + { + "epoch": 0.7576787431869189, + "grad_norm": 0.7700650095939636, + "learning_rate": 7.714436167945303e-05, + "loss": 5.2823, + "num_input_tokens_seen": 580780032, + "step": 4431 + }, + { + "epoch": 0.7581917281179865, + "grad_norm": 0.8458641767501831, + "learning_rate": 7.71182597123112e-05, + "loss": 5.2691, + "num_input_tokens_seen": 581173248, + "step": 4434 + }, + { + "epoch": 0.7587047130490542, + "grad_norm": 0.6728209853172302, + "learning_rate": 7.709218422222942e-05, + "loss": 5.2768, + "num_input_tokens_seen": 581566464, + "step": 4437 + }, + { + "epoch": 0.7592176979801218, + "grad_norm": 0.7042077779769897, + "learning_rate": 7.706613516447538e-05, + "loss": 5.2504, + "num_input_tokens_seen": 581959680, + "step": 4440 + }, + { + "epoch": 0.7597306829111895, + "grad_norm": 0.7626017332077026, + "learning_rate": 7.704011249442249e-05, + "loss": 5.3261, + "num_input_tokens_seen": 582352896, + "step": 4443 + }, + { + "epoch": 0.7602436678422572, + "grad_norm": 0.7157188057899475, + "learning_rate": 7.70141161675496e-05, + "loss": 5.2466, + "num_input_tokens_seen": 582746112, + "step": 4446 + }, + { + "epoch": 0.7607566527733248, + "grad_norm": 0.6773563623428345, + "learning_rate": 7.69881461394406e-05, + "loss": 5.2757, + "num_input_tokens_seen": 583139328, + "step": 4449 + }, + { + "epoch": 0.7612696377043925, + "grad_norm": 0.7485852241516113, + "learning_rate": 7.696220236578416e-05, + "loss": 5.3204, + "num_input_tokens_seen": 583532544, + "step": 4452 + }, + { + "epoch": 0.76178262263546, + "grad_norm": 0.7189561128616333, + "learning_rate": 7.693628480237344e-05, + "loss": 5.2909, + "num_input_tokens_seen": 583925760, + "step": 4455 + }, + { + "epoch": 0.7622956075665277, + "grad_norm": 0.809196412563324, + "learning_rate": 7.691039340510571e-05, + "loss": 5.313, + "num_input_tokens_seen": 584318976, + "step": 4458 + }, + { + "epoch": 0.7628085924975954, + "grad_norm": 0.8321667909622192, + "learning_rate": 7.688452812998208e-05, + "loss": 5.3164, + "num_input_tokens_seen": 584712192, + "step": 4461 + }, + { + "epoch": 0.763321577428663, + "grad_norm": 0.6554286479949951, + "learning_rate": 7.685868893310715e-05, + "loss": 5.2475, + "num_input_tokens_seen": 585105408, + "step": 4464 + }, + { + "epoch": 0.7638345623597307, + "grad_norm": 0.7397018671035767, + "learning_rate": 7.683287577068874e-05, + "loss": 5.2725, + "num_input_tokens_seen": 585498624, + "step": 4467 + }, + { + "epoch": 0.7643475472907983, + "grad_norm": 0.6564092636108398, + "learning_rate": 7.680708859903753e-05, + "loss": 5.2621, + "num_input_tokens_seen": 585891840, + "step": 4470 + }, + { + "epoch": 0.764860532221866, + "grad_norm": 0.7277629971504211, + "learning_rate": 7.678132737456681e-05, + "loss": 5.2376, + "num_input_tokens_seen": 586285056, + "step": 4473 + }, + { + "epoch": 0.7653735171529337, + "grad_norm": 0.7863820195198059, + "learning_rate": 7.675559205379208e-05, + "loss": 5.2615, + "num_input_tokens_seen": 586678272, + "step": 4476 + }, + { + "epoch": 0.7658865020840013, + "grad_norm": 0.760823130607605, + "learning_rate": 7.672988259333085e-05, + "loss": 5.2611, + "num_input_tokens_seen": 587071488, + "step": 4479 + }, + { + "epoch": 0.7663994870150689, + "grad_norm": 0.8245643377304077, + "learning_rate": 7.670419894990224e-05, + "loss": 5.234, + "num_input_tokens_seen": 587464704, + "step": 4482 + }, + { + "epoch": 0.7669124719461365, + "grad_norm": 0.8459969162940979, + "learning_rate": 7.667854108032676e-05, + "loss": 5.3133, + "num_input_tokens_seen": 587857920, + "step": 4485 + }, + { + "epoch": 0.7674254568772042, + "grad_norm": 0.7487183809280396, + "learning_rate": 7.665290894152588e-05, + "loss": 5.2422, + "num_input_tokens_seen": 588251136, + "step": 4488 + }, + { + "epoch": 0.7679384418082719, + "grad_norm": 0.6825780868530273, + "learning_rate": 7.662730249052193e-05, + "loss": 5.2759, + "num_input_tokens_seen": 588644352, + "step": 4491 + }, + { + "epoch": 0.7684514267393395, + "grad_norm": 0.7450793385505676, + "learning_rate": 7.660172168443752e-05, + "loss": 5.3166, + "num_input_tokens_seen": 589037568, + "step": 4494 + }, + { + "epoch": 0.7689644116704072, + "grad_norm": 0.7206063270568848, + "learning_rate": 7.657616648049552e-05, + "loss": 5.2758, + "num_input_tokens_seen": 589430784, + "step": 4497 + }, + { + "epoch": 0.7694773966014749, + "grad_norm": 0.7094364166259766, + "learning_rate": 7.655063683601855e-05, + "loss": 5.2929, + "num_input_tokens_seen": 589824000, + "step": 4500 + }, + { + "epoch": 0.7699903815325425, + "grad_norm": 0.6987499594688416, + "learning_rate": 7.652513270842879e-05, + "loss": 5.3014, + "num_input_tokens_seen": 590217216, + "step": 4503 + }, + { + "epoch": 0.7705033664636102, + "grad_norm": 0.699375331401825, + "learning_rate": 7.649965405524765e-05, + "loss": 5.2702, + "num_input_tokens_seen": 590610432, + "step": 4506 + }, + { + "epoch": 0.7710163513946778, + "grad_norm": 0.8014332056045532, + "learning_rate": 7.647420083409549e-05, + "loss": 5.1938, + "num_input_tokens_seen": 591003648, + "step": 4509 + }, + { + "epoch": 0.7715293363257454, + "grad_norm": 0.7827511429786682, + "learning_rate": 7.64487730026913e-05, + "loss": 5.2683, + "num_input_tokens_seen": 591396864, + "step": 4512 + }, + { + "epoch": 0.772042321256813, + "grad_norm": 0.7832793593406677, + "learning_rate": 7.642337051885237e-05, + "loss": 5.2913, + "num_input_tokens_seen": 591790080, + "step": 4515 + }, + { + "epoch": 0.7725553061878807, + "grad_norm": 0.7209380269050598, + "learning_rate": 7.639799334049411e-05, + "loss": 5.3039, + "num_input_tokens_seen": 592183296, + "step": 4518 + }, + { + "epoch": 0.7730682911189484, + "grad_norm": 0.7766376733779907, + "learning_rate": 7.637264142562964e-05, + "loss": 5.34, + "num_input_tokens_seen": 592576512, + "step": 4521 + }, + { + "epoch": 0.773581276050016, + "grad_norm": 0.7014556527137756, + "learning_rate": 7.634731473236961e-05, + "loss": 5.2624, + "num_input_tokens_seen": 592969728, + "step": 4524 + }, + { + "epoch": 0.7740942609810837, + "grad_norm": 0.726774275302887, + "learning_rate": 7.632201321892173e-05, + "loss": 5.2637, + "num_input_tokens_seen": 593362944, + "step": 4527 + }, + { + "epoch": 0.7746072459121514, + "grad_norm": 0.7574884295463562, + "learning_rate": 7.62967368435907e-05, + "loss": 5.2565, + "num_input_tokens_seen": 593756160, + "step": 4530 + }, + { + "epoch": 0.775120230843219, + "grad_norm": 0.6125289797782898, + "learning_rate": 7.627148556477777e-05, + "loss": 5.2344, + "num_input_tokens_seen": 594149376, + "step": 4533 + }, + { + "epoch": 0.7756332157742867, + "grad_norm": 0.6760141253471375, + "learning_rate": 7.624625934098054e-05, + "loss": 5.2637, + "num_input_tokens_seen": 594542592, + "step": 4536 + }, + { + "epoch": 0.7761462007053542, + "grad_norm": 0.7754603624343872, + "learning_rate": 7.622105813079257e-05, + "loss": 5.3232, + "num_input_tokens_seen": 594935808, + "step": 4539 + }, + { + "epoch": 0.7766591856364219, + "grad_norm": 0.7657566070556641, + "learning_rate": 7.619588189290318e-05, + "loss": 5.284, + "num_input_tokens_seen": 595329024, + "step": 4542 + }, + { + "epoch": 0.7771721705674896, + "grad_norm": 0.682299017906189, + "learning_rate": 7.617073058609718e-05, + "loss": 5.295, + "num_input_tokens_seen": 595722240, + "step": 4545 + }, + { + "epoch": 0.7776851554985572, + "grad_norm": 0.7046672701835632, + "learning_rate": 7.614560416925451e-05, + "loss": 5.271, + "num_input_tokens_seen": 596115456, + "step": 4548 + }, + { + "epoch": 0.7781981404296249, + "grad_norm": 0.6913005709648132, + "learning_rate": 7.612050260135002e-05, + "loss": 5.2684, + "num_input_tokens_seen": 596508672, + "step": 4551 + }, + { + "epoch": 0.7787111253606925, + "grad_norm": 0.7652671933174133, + "learning_rate": 7.609542584145313e-05, + "loss": 5.2268, + "num_input_tokens_seen": 596901888, + "step": 4554 + }, + { + "epoch": 0.7792241102917602, + "grad_norm": 0.6569912433624268, + "learning_rate": 7.607037384872765e-05, + "loss": 5.3011, + "num_input_tokens_seen": 597295104, + "step": 4557 + }, + { + "epoch": 0.7797370952228279, + "grad_norm": 0.6658585667610168, + "learning_rate": 7.604534658243135e-05, + "loss": 5.296, + "num_input_tokens_seen": 597688320, + "step": 4560 + }, + { + "epoch": 0.7802500801538955, + "grad_norm": 0.8153283596038818, + "learning_rate": 7.602034400191585e-05, + "loss": 5.2633, + "num_input_tokens_seen": 598081536, + "step": 4563 + }, + { + "epoch": 0.7807630650849632, + "grad_norm": 0.7321149110794067, + "learning_rate": 7.599536606662622e-05, + "loss": 5.2841, + "num_input_tokens_seen": 598474752, + "step": 4566 + }, + { + "epoch": 0.7812760500160307, + "grad_norm": 0.9558836221694946, + "learning_rate": 7.597041273610076e-05, + "loss": 5.2623, + "num_input_tokens_seen": 598867968, + "step": 4569 + }, + { + "epoch": 0.7817890349470984, + "grad_norm": 0.7963069081306458, + "learning_rate": 7.594548396997066e-05, + "loss": 5.3027, + "num_input_tokens_seen": 599261184, + "step": 4572 + }, + { + "epoch": 0.782302019878166, + "grad_norm": 0.7501682043075562, + "learning_rate": 7.592057972795984e-05, + "loss": 5.2485, + "num_input_tokens_seen": 599654400, + "step": 4575 + }, + { + "epoch": 0.7828150048092337, + "grad_norm": 0.8756089806556702, + "learning_rate": 7.58956999698846e-05, + "loss": 5.2901, + "num_input_tokens_seen": 600047616, + "step": 4578 + }, + { + "epoch": 0.7833279897403014, + "grad_norm": 0.7717293500900269, + "learning_rate": 7.587084465565331e-05, + "loss": 5.3035, + "num_input_tokens_seen": 600440832, + "step": 4581 + }, + { + "epoch": 0.783840974671369, + "grad_norm": 0.874310314655304, + "learning_rate": 7.584601374526627e-05, + "loss": 5.2816, + "num_input_tokens_seen": 600834048, + "step": 4584 + }, + { + "epoch": 0.7843539596024367, + "grad_norm": 0.7195756435394287, + "learning_rate": 7.582120719881527e-05, + "loss": 5.2297, + "num_input_tokens_seen": 601227264, + "step": 4587 + }, + { + "epoch": 0.7848669445335044, + "grad_norm": 0.8340673446655273, + "learning_rate": 7.579642497648347e-05, + "loss": 5.263, + "num_input_tokens_seen": 601620480, + "step": 4590 + }, + { + "epoch": 0.785379929464572, + "grad_norm": 0.6328992247581482, + "learning_rate": 7.577166703854501e-05, + "loss": 5.3334, + "num_input_tokens_seen": 602013696, + "step": 4593 + }, + { + "epoch": 0.7858929143956396, + "grad_norm": 0.7836408615112305, + "learning_rate": 7.574693334536489e-05, + "loss": 5.2894, + "num_input_tokens_seen": 602406912, + "step": 4596 + }, + { + "epoch": 0.7864058993267072, + "grad_norm": 0.7437619566917419, + "learning_rate": 7.572222385739856e-05, + "loss": 5.2906, + "num_input_tokens_seen": 602800128, + "step": 4599 + }, + { + "epoch": 0.7869188842577749, + "grad_norm": 0.665663480758667, + "learning_rate": 7.569753853519169e-05, + "loss": 5.3065, + "num_input_tokens_seen": 603193344, + "step": 4602 + }, + { + "epoch": 0.7874318691888426, + "grad_norm": 0.7104794383049011, + "learning_rate": 7.567287733937997e-05, + "loss": 5.2908, + "num_input_tokens_seen": 603586560, + "step": 4605 + }, + { + "epoch": 0.7879448541199102, + "grad_norm": 0.666928231716156, + "learning_rate": 7.564824023068877e-05, + "loss": 5.3252, + "num_input_tokens_seen": 603979776, + "step": 4608 + }, + { + "epoch": 0.7884578390509779, + "grad_norm": 0.7155689597129822, + "learning_rate": 7.562362716993294e-05, + "loss": 5.2782, + "num_input_tokens_seen": 604372992, + "step": 4611 + }, + { + "epoch": 0.7889708239820455, + "grad_norm": 0.7343229055404663, + "learning_rate": 7.559903811801648e-05, + "loss": 5.2521, + "num_input_tokens_seen": 604766208, + "step": 4614 + }, + { + "epoch": 0.7894838089131132, + "grad_norm": 0.7456091046333313, + "learning_rate": 7.557447303593237e-05, + "loss": 5.311, + "num_input_tokens_seen": 605159424, + "step": 4617 + }, + { + "epoch": 0.7899967938441809, + "grad_norm": 0.7218461036682129, + "learning_rate": 7.55499318847622e-05, + "loss": 5.2997, + "num_input_tokens_seen": 605552640, + "step": 4620 + }, + { + "epoch": 0.7905097787752485, + "grad_norm": 0.783655047416687, + "learning_rate": 7.552541462567598e-05, + "loss": 5.2334, + "num_input_tokens_seen": 605945856, + "step": 4623 + }, + { + "epoch": 0.7910227637063161, + "grad_norm": 0.7241744995117188, + "learning_rate": 7.550092121993191e-05, + "loss": 5.2428, + "num_input_tokens_seen": 606339072, + "step": 4626 + }, + { + "epoch": 0.7915357486373837, + "grad_norm": 0.77818363904953, + "learning_rate": 7.547645162887604e-05, + "loss": 5.2869, + "num_input_tokens_seen": 606732288, + "step": 4629 + }, + { + "epoch": 0.7920487335684514, + "grad_norm": 0.8874224424362183, + "learning_rate": 7.545200581394207e-05, + "loss": 5.253, + "num_input_tokens_seen": 607125504, + "step": 4632 + }, + { + "epoch": 0.7925617184995191, + "grad_norm": 0.8084154725074768, + "learning_rate": 7.542758373665109e-05, + "loss": 5.283, + "num_input_tokens_seen": 607518720, + "step": 4635 + }, + { + "epoch": 0.7930747034305867, + "grad_norm": 0.6735661625862122, + "learning_rate": 7.540318535861131e-05, + "loss": 5.2641, + "num_input_tokens_seen": 607911936, + "step": 4638 + }, + { + "epoch": 0.7935876883616544, + "grad_norm": 0.7534482479095459, + "learning_rate": 7.537881064151782e-05, + "loss": 5.2805, + "num_input_tokens_seen": 608305152, + "step": 4641 + }, + { + "epoch": 0.794100673292722, + "grad_norm": 0.7664649486541748, + "learning_rate": 7.535445954715228e-05, + "loss": 5.2183, + "num_input_tokens_seen": 608698368, + "step": 4644 + }, + { + "epoch": 0.7946136582237897, + "grad_norm": 0.8066388964653015, + "learning_rate": 7.53301320373828e-05, + "loss": 5.3102, + "num_input_tokens_seen": 609091584, + "step": 4647 + }, + { + "epoch": 0.7951266431548574, + "grad_norm": 0.733974277973175, + "learning_rate": 7.530582807416357e-05, + "loss": 5.2926, + "num_input_tokens_seen": 609484800, + "step": 4650 + }, + { + "epoch": 0.7956396280859249, + "grad_norm": 0.7289937138557434, + "learning_rate": 7.528154761953464e-05, + "loss": 5.276, + "num_input_tokens_seen": 609878016, + "step": 4653 + }, + { + "epoch": 0.7961526130169926, + "grad_norm": 0.7927041053771973, + "learning_rate": 7.52572906356217e-05, + "loss": 5.2783, + "num_input_tokens_seen": 610271232, + "step": 4656 + }, + { + "epoch": 0.7966655979480602, + "grad_norm": 0.718377411365509, + "learning_rate": 7.523305708463577e-05, + "loss": 5.2949, + "num_input_tokens_seen": 610664448, + "step": 4659 + }, + { + "epoch": 0.7971785828791279, + "grad_norm": 0.7721161246299744, + "learning_rate": 7.520884692887304e-05, + "loss": 5.3164, + "num_input_tokens_seen": 611057664, + "step": 4662 + }, + { + "epoch": 0.7976915678101956, + "grad_norm": 0.6871947050094604, + "learning_rate": 7.518466013071455e-05, + "loss": 5.2669, + "num_input_tokens_seen": 611450880, + "step": 4665 + }, + { + "epoch": 0.7982045527412632, + "grad_norm": 0.6703811883926392, + "learning_rate": 7.516049665262601e-05, + "loss": 5.2861, + "num_input_tokens_seen": 611844096, + "step": 4668 + }, + { + "epoch": 0.7987175376723309, + "grad_norm": 0.6838527321815491, + "learning_rate": 7.51363564571575e-05, + "loss": 5.2608, + "num_input_tokens_seen": 612237312, + "step": 4671 + }, + { + "epoch": 0.7992305226033986, + "grad_norm": 0.650619626045227, + "learning_rate": 7.511223950694318e-05, + "loss": 5.2961, + "num_input_tokens_seen": 612630528, + "step": 4674 + }, + { + "epoch": 0.7997435075344662, + "grad_norm": 0.6837221384048462, + "learning_rate": 7.508814576470118e-05, + "loss": 5.2384, + "num_input_tokens_seen": 613023744, + "step": 4677 + }, + { + "epoch": 0.8002564924655339, + "grad_norm": 0.8880760669708252, + "learning_rate": 7.50640751932333e-05, + "loss": 5.2994, + "num_input_tokens_seen": 613416960, + "step": 4680 + }, + { + "epoch": 0.8007694773966014, + "grad_norm": 0.7496533989906311, + "learning_rate": 7.504002775542471e-05, + "loss": 5.2399, + "num_input_tokens_seen": 613810176, + "step": 4683 + }, + { + "epoch": 0.8012824623276691, + "grad_norm": 0.8291968703269958, + "learning_rate": 7.50160034142438e-05, + "loss": 5.2665, + "num_input_tokens_seen": 614203392, + "step": 4686 + }, + { + "epoch": 0.8017954472587367, + "grad_norm": 0.6744365692138672, + "learning_rate": 7.499200213274185e-05, + "loss": 5.2685, + "num_input_tokens_seen": 614596608, + "step": 4689 + }, + { + "epoch": 0.8023084321898044, + "grad_norm": 0.8543739914894104, + "learning_rate": 7.496802387405287e-05, + "loss": 5.2792, + "num_input_tokens_seen": 614989824, + "step": 4692 + }, + { + "epoch": 0.8028214171208721, + "grad_norm": 0.8261051177978516, + "learning_rate": 7.494406860139334e-05, + "loss": 5.2683, + "num_input_tokens_seen": 615383040, + "step": 4695 + }, + { + "epoch": 0.8033344020519397, + "grad_norm": 0.7388364672660828, + "learning_rate": 7.492013627806192e-05, + "loss": 5.2577, + "num_input_tokens_seen": 615776256, + "step": 4698 + }, + { + "epoch": 0.8038473869830074, + "grad_norm": 0.7822057008743286, + "learning_rate": 7.489622686743933e-05, + "loss": 5.2975, + "num_input_tokens_seen": 616169472, + "step": 4701 + }, + { + "epoch": 0.8043603719140751, + "grad_norm": 0.866584837436676, + "learning_rate": 7.487234033298796e-05, + "loss": 5.2458, + "num_input_tokens_seen": 616562688, + "step": 4704 + }, + { + "epoch": 0.8048733568451427, + "grad_norm": 0.6807045340538025, + "learning_rate": 7.484847663825176e-05, + "loss": 5.2579, + "num_input_tokens_seen": 616955904, + "step": 4707 + }, + { + "epoch": 0.8053863417762103, + "grad_norm": 0.7374781966209412, + "learning_rate": 7.4824635746856e-05, + "loss": 5.2762, + "num_input_tokens_seen": 617349120, + "step": 4710 + }, + { + "epoch": 0.8058993267072779, + "grad_norm": 0.829736590385437, + "learning_rate": 7.480081762250693e-05, + "loss": 5.3006, + "num_input_tokens_seen": 617742336, + "step": 4713 + }, + { + "epoch": 0.8064123116383456, + "grad_norm": 0.768171489238739, + "learning_rate": 7.477702222899166e-05, + "loss": 5.2574, + "num_input_tokens_seen": 618135552, + "step": 4716 + }, + { + "epoch": 0.8069252965694133, + "grad_norm": 0.7289711833000183, + "learning_rate": 7.47532495301779e-05, + "loss": 5.2361, + "num_input_tokens_seen": 618528768, + "step": 4719 + }, + { + "epoch": 0.8074382815004809, + "grad_norm": 0.9484478831291199, + "learning_rate": 7.472949949001368e-05, + "loss": 5.2584, + "num_input_tokens_seen": 618921984, + "step": 4722 + }, + { + "epoch": 0.8079512664315486, + "grad_norm": 0.7003783583641052, + "learning_rate": 7.470577207252715e-05, + "loss": 5.2355, + "num_input_tokens_seen": 619315200, + "step": 4725 + }, + { + "epoch": 0.8084642513626162, + "grad_norm": 0.7656697034835815, + "learning_rate": 7.468206724182646e-05, + "loss": 5.2752, + "num_input_tokens_seen": 619708416, + "step": 4728 + }, + { + "epoch": 0.8089772362936839, + "grad_norm": 0.7776997089385986, + "learning_rate": 7.465838496209931e-05, + "loss": 5.2699, + "num_input_tokens_seen": 620101632, + "step": 4731 + }, + { + "epoch": 0.8094902212247516, + "grad_norm": 0.7209721207618713, + "learning_rate": 7.463472519761289e-05, + "loss": 5.2341, + "num_input_tokens_seen": 620494848, + "step": 4734 + }, + { + "epoch": 0.8100032061558192, + "grad_norm": 0.7924665212631226, + "learning_rate": 7.461108791271363e-05, + "loss": 5.2278, + "num_input_tokens_seen": 620888064, + "step": 4737 + }, + { + "epoch": 0.8105161910868868, + "grad_norm": 0.673541247844696, + "learning_rate": 7.458747307182692e-05, + "loss": 5.3302, + "num_input_tokens_seen": 621281280, + "step": 4740 + }, + { + "epoch": 0.8110291760179544, + "grad_norm": 0.7629460096359253, + "learning_rate": 7.456388063945693e-05, + "loss": 5.2465, + "num_input_tokens_seen": 621674496, + "step": 4743 + }, + { + "epoch": 0.8115421609490221, + "grad_norm": 0.665367603302002, + "learning_rate": 7.454031058018637e-05, + "loss": 5.3017, + "num_input_tokens_seen": 622067712, + "step": 4746 + }, + { + "epoch": 0.8120551458800898, + "grad_norm": 0.7708544731140137, + "learning_rate": 7.451676285867628e-05, + "loss": 5.2512, + "num_input_tokens_seen": 622460928, + "step": 4749 + }, + { + "epoch": 0.8125681308111574, + "grad_norm": 0.8049509525299072, + "learning_rate": 7.449323743966578e-05, + "loss": 5.2656, + "num_input_tokens_seen": 622854144, + "step": 4752 + }, + { + "epoch": 0.8130811157422251, + "grad_norm": 0.6959990859031677, + "learning_rate": 7.446973428797188e-05, + "loss": 5.2437, + "num_input_tokens_seen": 623247360, + "step": 4755 + }, + { + "epoch": 0.8135941006732927, + "grad_norm": 0.7066569328308105, + "learning_rate": 7.444625336848923e-05, + "loss": 5.2506, + "num_input_tokens_seen": 623640576, + "step": 4758 + }, + { + "epoch": 0.8141070856043604, + "grad_norm": 0.6698566675186157, + "learning_rate": 7.442279464618996e-05, + "loss": 5.2852, + "num_input_tokens_seen": 624033792, + "step": 4761 + }, + { + "epoch": 0.8146200705354281, + "grad_norm": 0.7723349332809448, + "learning_rate": 7.439935808612331e-05, + "loss": 5.3053, + "num_input_tokens_seen": 624427008, + "step": 4764 + }, + { + "epoch": 0.8151330554664957, + "grad_norm": 0.7419276237487793, + "learning_rate": 7.437594365341564e-05, + "loss": 5.2353, + "num_input_tokens_seen": 624820224, + "step": 4767 + }, + { + "epoch": 0.8156460403975633, + "grad_norm": 0.8163347840309143, + "learning_rate": 7.435255131327003e-05, + "loss": 5.2718, + "num_input_tokens_seen": 625213440, + "step": 4770 + }, + { + "epoch": 0.8161590253286309, + "grad_norm": 0.7750483751296997, + "learning_rate": 7.432918103096608e-05, + "loss": 5.2919, + "num_input_tokens_seen": 625606656, + "step": 4773 + }, + { + "epoch": 0.8166720102596986, + "grad_norm": 0.7649890184402466, + "learning_rate": 7.430583277185981e-05, + "loss": 5.2961, + "num_input_tokens_seen": 625999872, + "step": 4776 + }, + { + "epoch": 0.8171849951907663, + "grad_norm": 0.7891272306442261, + "learning_rate": 7.428250650138333e-05, + "loss": 5.255, + "num_input_tokens_seen": 626393088, + "step": 4779 + }, + { + "epoch": 0.8176979801218339, + "grad_norm": 0.7607313990592957, + "learning_rate": 7.425920218504469e-05, + "loss": 5.232, + "num_input_tokens_seen": 626786304, + "step": 4782 + }, + { + "epoch": 0.8182109650529016, + "grad_norm": 0.8134208917617798, + "learning_rate": 7.423591978842759e-05, + "loss": 5.25, + "num_input_tokens_seen": 627179520, + "step": 4785 + }, + { + "epoch": 0.8187239499839692, + "grad_norm": 0.7779147624969482, + "learning_rate": 7.421265927719126e-05, + "loss": 5.2691, + "num_input_tokens_seen": 627572736, + "step": 4788 + }, + { + "epoch": 0.8192369349150369, + "grad_norm": 0.7561124563217163, + "learning_rate": 7.418942061707016e-05, + "loss": 5.247, + "num_input_tokens_seen": 627965952, + "step": 4791 + }, + { + "epoch": 0.8197499198461046, + "grad_norm": 0.7412567138671875, + "learning_rate": 7.416620377387388e-05, + "loss": 5.3103, + "num_input_tokens_seen": 628359168, + "step": 4794 + }, + { + "epoch": 0.8202629047771721, + "grad_norm": 0.720973551273346, + "learning_rate": 7.414300871348681e-05, + "loss": 5.318, + "num_input_tokens_seen": 628752384, + "step": 4797 + }, + { + "epoch": 0.8207758897082398, + "grad_norm": 0.7904423475265503, + "learning_rate": 7.411983540186796e-05, + "loss": 5.2749, + "num_input_tokens_seen": 629145600, + "step": 4800 + }, + { + "epoch": 0.8207758897082398, + "eval_accuracy": 0.1849405634261521, + "eval_loss": 5.730287075042725, + "eval_runtime": 109.3991, + "eval_samples_per_second": 2.742, + "eval_steps_per_second": 1.371, + "num_input_tokens_seen": 629145600, + "step": 4800 + }, + { + "epoch": 0.8212888746393074, + "grad_norm": 0.7457066774368286, + "learning_rate": 7.409668380505084e-05, + "loss": 5.2593, + "num_input_tokens_seen": 629538816, + "step": 4803 + }, + { + "epoch": 0.8218018595703751, + "grad_norm": 0.7565402984619141, + "learning_rate": 7.407355388914312e-05, + "loss": 5.2582, + "num_input_tokens_seen": 629932032, + "step": 4806 + }, + { + "epoch": 0.8223148445014428, + "grad_norm": 0.7606648206710815, + "learning_rate": 7.40504456203265e-05, + "loss": 5.2638, + "num_input_tokens_seen": 630325248, + "step": 4809 + }, + { + "epoch": 0.8228278294325104, + "grad_norm": 0.7904190421104431, + "learning_rate": 7.40273589648565e-05, + "loss": 5.2671, + "num_input_tokens_seen": 630718464, + "step": 4812 + }, + { + "epoch": 0.8233408143635781, + "grad_norm": 0.736111581325531, + "learning_rate": 7.400429388906221e-05, + "loss": 5.271, + "num_input_tokens_seen": 631111680, + "step": 4815 + }, + { + "epoch": 0.8238537992946457, + "grad_norm": 0.7933962345123291, + "learning_rate": 7.398125035934614e-05, + "loss": 5.2411, + "num_input_tokens_seen": 631504896, + "step": 4818 + }, + { + "epoch": 0.8243667842257134, + "grad_norm": 0.8558014631271362, + "learning_rate": 7.395822834218396e-05, + "loss": 5.2648, + "num_input_tokens_seen": 631898112, + "step": 4821 + }, + { + "epoch": 0.8248797691567811, + "grad_norm": 0.7140774130821228, + "learning_rate": 7.393522780412432e-05, + "loss": 5.2415, + "num_input_tokens_seen": 632291328, + "step": 4824 + }, + { + "epoch": 0.8253927540878486, + "grad_norm": 0.7220463156700134, + "learning_rate": 7.391224871178872e-05, + "loss": 5.2234, + "num_input_tokens_seen": 632684544, + "step": 4827 + }, + { + "epoch": 0.8259057390189163, + "grad_norm": 0.6848180890083313, + "learning_rate": 7.388929103187108e-05, + "loss": 5.2656, + "num_input_tokens_seen": 633077760, + "step": 4830 + }, + { + "epoch": 0.826418723949984, + "grad_norm": 0.7403995990753174, + "learning_rate": 7.386635473113787e-05, + "loss": 5.2528, + "num_input_tokens_seen": 633470976, + "step": 4833 + }, + { + "epoch": 0.8269317088810516, + "grad_norm": 0.7096079587936401, + "learning_rate": 7.384343977642759e-05, + "loss": 5.2495, + "num_input_tokens_seen": 633864192, + "step": 4836 + }, + { + "epoch": 0.8274446938121193, + "grad_norm": 0.7836086750030518, + "learning_rate": 7.382054613465076e-05, + "loss": 5.2646, + "num_input_tokens_seen": 634257408, + "step": 4839 + }, + { + "epoch": 0.8279576787431869, + "grad_norm": 0.6579269766807556, + "learning_rate": 7.379767377278969e-05, + "loss": 5.2655, + "num_input_tokens_seen": 634650624, + "step": 4842 + }, + { + "epoch": 0.8284706636742546, + "grad_norm": 0.8051040768623352, + "learning_rate": 7.377482265789823e-05, + "loss": 5.2754, + "num_input_tokens_seen": 635043840, + "step": 4845 + }, + { + "epoch": 0.8289836486053223, + "grad_norm": 0.7173849940299988, + "learning_rate": 7.375199275710157e-05, + "loss": 5.2735, + "num_input_tokens_seen": 635437056, + "step": 4848 + }, + { + "epoch": 0.8294966335363899, + "grad_norm": 0.7025325298309326, + "learning_rate": 7.372918403759613e-05, + "loss": 5.2521, + "num_input_tokens_seen": 635830272, + "step": 4851 + }, + { + "epoch": 0.8300096184674575, + "grad_norm": 0.789348840713501, + "learning_rate": 7.370639646664927e-05, + "loss": 5.2772, + "num_input_tokens_seen": 636223488, + "step": 4854 + }, + { + "epoch": 0.8305226033985251, + "grad_norm": 0.727810263633728, + "learning_rate": 7.368363001159908e-05, + "loss": 5.2853, + "num_input_tokens_seen": 636616704, + "step": 4857 + }, + { + "epoch": 0.8310355883295928, + "grad_norm": 0.7564170956611633, + "learning_rate": 7.366088463985431e-05, + "loss": 5.2352, + "num_input_tokens_seen": 637009920, + "step": 4860 + }, + { + "epoch": 0.8315485732606605, + "grad_norm": 0.7294690012931824, + "learning_rate": 7.363816031889405e-05, + "loss": 5.2667, + "num_input_tokens_seen": 637403136, + "step": 4863 + }, + { + "epoch": 0.8320615581917281, + "grad_norm": 0.688892126083374, + "learning_rate": 7.361545701626754e-05, + "loss": 5.2515, + "num_input_tokens_seen": 637796352, + "step": 4866 + }, + { + "epoch": 0.8325745431227958, + "grad_norm": 0.7049474716186523, + "learning_rate": 7.359277469959405e-05, + "loss": 5.297, + "num_input_tokens_seen": 638189568, + "step": 4869 + }, + { + "epoch": 0.8330875280538634, + "grad_norm": 0.6854651570320129, + "learning_rate": 7.35701133365627e-05, + "loss": 5.1935, + "num_input_tokens_seen": 638582784, + "step": 4872 + }, + { + "epoch": 0.8336005129849311, + "grad_norm": 0.8255504965782166, + "learning_rate": 7.354747289493207e-05, + "loss": 5.2819, + "num_input_tokens_seen": 638976000, + "step": 4875 + }, + { + "epoch": 0.8341134979159988, + "grad_norm": 0.8443409204483032, + "learning_rate": 7.35248533425303e-05, + "loss": 5.2579, + "num_input_tokens_seen": 639369216, + "step": 4878 + }, + { + "epoch": 0.8346264828470664, + "grad_norm": 0.759077787399292, + "learning_rate": 7.350225464725466e-05, + "loss": 5.21, + "num_input_tokens_seen": 639762432, + "step": 4881 + }, + { + "epoch": 0.835139467778134, + "grad_norm": 0.6921237111091614, + "learning_rate": 7.347967677707148e-05, + "loss": 5.2795, + "num_input_tokens_seen": 640155648, + "step": 4884 + }, + { + "epoch": 0.8356524527092016, + "grad_norm": 0.6738846302032471, + "learning_rate": 7.345711970001593e-05, + "loss": 5.262, + "num_input_tokens_seen": 640548864, + "step": 4887 + }, + { + "epoch": 0.8361654376402693, + "grad_norm": 0.6597248315811157, + "learning_rate": 7.343458338419179e-05, + "loss": 5.2218, + "num_input_tokens_seen": 640942080, + "step": 4890 + }, + { + "epoch": 0.836678422571337, + "grad_norm": 0.8394546508789062, + "learning_rate": 7.341206779777132e-05, + "loss": 5.2289, + "num_input_tokens_seen": 641335296, + "step": 4893 + }, + { + "epoch": 0.8371914075024046, + "grad_norm": 0.7097527980804443, + "learning_rate": 7.338957290899508e-05, + "loss": 5.2544, + "num_input_tokens_seen": 641728512, + "step": 4896 + }, + { + "epoch": 0.8377043924334723, + "grad_norm": 0.7356297373771667, + "learning_rate": 7.336709868617169e-05, + "loss": 5.2585, + "num_input_tokens_seen": 642121728, + "step": 4899 + }, + { + "epoch": 0.8382173773645399, + "grad_norm": 0.7718103528022766, + "learning_rate": 7.334464509767758e-05, + "loss": 5.264, + "num_input_tokens_seen": 642514944, + "step": 4902 + }, + { + "epoch": 0.8387303622956076, + "grad_norm": 0.8452854156494141, + "learning_rate": 7.332221211195707e-05, + "loss": 5.2449, + "num_input_tokens_seen": 642908160, + "step": 4905 + }, + { + "epoch": 0.8392433472266753, + "grad_norm": 0.7810956239700317, + "learning_rate": 7.329979969752183e-05, + "loss": 5.264, + "num_input_tokens_seen": 643301376, + "step": 4908 + }, + { + "epoch": 0.8397563321577428, + "grad_norm": 0.7456896305084229, + "learning_rate": 7.327740782295093e-05, + "loss": 5.2423, + "num_input_tokens_seen": 643694592, + "step": 4911 + }, + { + "epoch": 0.8402693170888105, + "grad_norm": 0.6871115565299988, + "learning_rate": 7.325503645689056e-05, + "loss": 5.274, + "num_input_tokens_seen": 644087808, + "step": 4914 + }, + { + "epoch": 0.8407823020198781, + "grad_norm": 0.7335032820701599, + "learning_rate": 7.323268556805394e-05, + "loss": 5.2725, + "num_input_tokens_seen": 644481024, + "step": 4917 + }, + { + "epoch": 0.8412952869509458, + "grad_norm": 0.8171728849411011, + "learning_rate": 7.321035512522102e-05, + "loss": 5.2408, + "num_input_tokens_seen": 644874240, + "step": 4920 + }, + { + "epoch": 0.8418082718820135, + "grad_norm": 0.7723347544670105, + "learning_rate": 7.318804509723834e-05, + "loss": 5.254, + "num_input_tokens_seen": 645267456, + "step": 4923 + }, + { + "epoch": 0.8423212568130811, + "grad_norm": 0.7238738536834717, + "learning_rate": 7.316575545301888e-05, + "loss": 5.2625, + "num_input_tokens_seen": 645660672, + "step": 4926 + }, + { + "epoch": 0.8428342417441488, + "grad_norm": 0.638489842414856, + "learning_rate": 7.314348616154184e-05, + "loss": 5.2686, + "num_input_tokens_seen": 646053888, + "step": 4929 + }, + { + "epoch": 0.8433472266752164, + "grad_norm": 0.6416002511978149, + "learning_rate": 7.31212371918525e-05, + "loss": 5.2102, + "num_input_tokens_seen": 646447104, + "step": 4932 + }, + { + "epoch": 0.8438602116062841, + "grad_norm": 0.7499914169311523, + "learning_rate": 7.309900851306195e-05, + "loss": 5.2976, + "num_input_tokens_seen": 646840320, + "step": 4935 + }, + { + "epoch": 0.8443731965373518, + "grad_norm": 0.7242657542228699, + "learning_rate": 7.307680009434705e-05, + "loss": 5.3085, + "num_input_tokens_seen": 647233536, + "step": 4938 + }, + { + "epoch": 0.8448861814684193, + "grad_norm": 0.6967840790748596, + "learning_rate": 7.30546119049501e-05, + "loss": 5.247, + "num_input_tokens_seen": 647626752, + "step": 4941 + }, + { + "epoch": 0.845399166399487, + "grad_norm": 0.7176790237426758, + "learning_rate": 7.303244391417879e-05, + "loss": 5.2728, + "num_input_tokens_seen": 648019968, + "step": 4944 + }, + { + "epoch": 0.8459121513305546, + "grad_norm": 0.6905176043510437, + "learning_rate": 7.30102960914059e-05, + "loss": 5.2653, + "num_input_tokens_seen": 648413184, + "step": 4947 + }, + { + "epoch": 0.8464251362616223, + "grad_norm": 0.731937050819397, + "learning_rate": 7.298816840606925e-05, + "loss": 5.2449, + "num_input_tokens_seen": 648806400, + "step": 4950 + }, + { + "epoch": 0.84693812119269, + "grad_norm": 0.733711302280426, + "learning_rate": 7.296606082767145e-05, + "loss": 5.2083, + "num_input_tokens_seen": 649199616, + "step": 4953 + }, + { + "epoch": 0.8474511061237576, + "grad_norm": 0.7619096040725708, + "learning_rate": 7.294397332577968e-05, + "loss": 5.2159, + "num_input_tokens_seen": 649592832, + "step": 4956 + }, + { + "epoch": 0.8479640910548253, + "grad_norm": 0.7431106567382812, + "learning_rate": 7.292190587002563e-05, + "loss": 5.2721, + "num_input_tokens_seen": 649986048, + "step": 4959 + }, + { + "epoch": 0.848477075985893, + "grad_norm": 0.7448694705963135, + "learning_rate": 7.28998584301052e-05, + "loss": 5.2766, + "num_input_tokens_seen": 650379264, + "step": 4962 + }, + { + "epoch": 0.8489900609169606, + "grad_norm": 0.6657842397689819, + "learning_rate": 7.287783097577849e-05, + "loss": 5.2348, + "num_input_tokens_seen": 650772480, + "step": 4965 + }, + { + "epoch": 0.8495030458480282, + "grad_norm": 0.7611784934997559, + "learning_rate": 7.28558234768694e-05, + "loss": 5.2712, + "num_input_tokens_seen": 651165696, + "step": 4968 + }, + { + "epoch": 0.8500160307790958, + "grad_norm": 0.678893506526947, + "learning_rate": 7.283383590326562e-05, + "loss": 5.233, + "num_input_tokens_seen": 651558912, + "step": 4971 + }, + { + "epoch": 0.8505290157101635, + "grad_norm": 0.7050936818122864, + "learning_rate": 7.281186822491848e-05, + "loss": 5.2726, + "num_input_tokens_seen": 651952128, + "step": 4974 + }, + { + "epoch": 0.8510420006412311, + "grad_norm": 0.7030792236328125, + "learning_rate": 7.278992041184265e-05, + "loss": 5.2673, + "num_input_tokens_seen": 652345344, + "step": 4977 + }, + { + "epoch": 0.8515549855722988, + "grad_norm": 0.7716869115829468, + "learning_rate": 7.276799243411601e-05, + "loss": 5.2385, + "num_input_tokens_seen": 652738560, + "step": 4980 + }, + { + "epoch": 0.8520679705033665, + "grad_norm": 0.6835726499557495, + "learning_rate": 7.274608426187958e-05, + "loss": 5.2496, + "num_input_tokens_seen": 653131776, + "step": 4983 + }, + { + "epoch": 0.8525809554344341, + "grad_norm": 0.8090899586677551, + "learning_rate": 7.272419586533719e-05, + "loss": 5.2421, + "num_input_tokens_seen": 653524992, + "step": 4986 + }, + { + "epoch": 0.8530939403655018, + "grad_norm": 0.7765442728996277, + "learning_rate": 7.270232721475544e-05, + "loss": 5.2459, + "num_input_tokens_seen": 653918208, + "step": 4989 + }, + { + "epoch": 0.8536069252965695, + "grad_norm": 0.7543562650680542, + "learning_rate": 7.268047828046345e-05, + "loss": 5.2362, + "num_input_tokens_seen": 654311424, + "step": 4992 + }, + { + "epoch": 0.8541199102276371, + "grad_norm": 0.7674638628959656, + "learning_rate": 7.265864903285278e-05, + "loss": 5.1859, + "num_input_tokens_seen": 654704640, + "step": 4995 + }, + { + "epoch": 0.8546328951587047, + "grad_norm": 0.7830629348754883, + "learning_rate": 7.263683944237711e-05, + "loss": 5.2391, + "num_input_tokens_seen": 655097856, + "step": 4998 + }, + { + "epoch": 0.8551458800897723, + "grad_norm": 0.8434771299362183, + "learning_rate": 7.261504947955222e-05, + "loss": 5.2177, + "num_input_tokens_seen": 655491072, + "step": 5001 + }, + { + "epoch": 0.85565886502084, + "grad_norm": 0.6819344162940979, + "learning_rate": 7.259327911495573e-05, + "loss": 5.245, + "num_input_tokens_seen": 655884288, + "step": 5004 + }, + { + "epoch": 0.8561718499519076, + "grad_norm": 0.6312018632888794, + "learning_rate": 7.257152831922706e-05, + "loss": 5.2205, + "num_input_tokens_seen": 656277504, + "step": 5007 + }, + { + "epoch": 0.8566848348829753, + "grad_norm": 0.7521694898605347, + "learning_rate": 7.254979706306706e-05, + "loss": 5.2303, + "num_input_tokens_seen": 656670720, + "step": 5010 + }, + { + "epoch": 0.857197819814043, + "grad_norm": 0.7708764672279358, + "learning_rate": 7.252808531723802e-05, + "loss": 5.2255, + "num_input_tokens_seen": 657063936, + "step": 5013 + }, + { + "epoch": 0.8577108047451106, + "grad_norm": 0.7379157543182373, + "learning_rate": 7.250639305256345e-05, + "loss": 5.2368, + "num_input_tokens_seen": 657457152, + "step": 5016 + }, + { + "epoch": 0.8582237896761783, + "grad_norm": 0.7467489242553711, + "learning_rate": 7.248472023992787e-05, + "loss": 5.2539, + "num_input_tokens_seen": 657850368, + "step": 5019 + }, + { + "epoch": 0.858736774607246, + "grad_norm": 0.7960140109062195, + "learning_rate": 7.24630668502767e-05, + "loss": 5.243, + "num_input_tokens_seen": 658243584, + "step": 5022 + }, + { + "epoch": 0.8592497595383135, + "grad_norm": 0.8107805252075195, + "learning_rate": 7.244143285461608e-05, + "loss": 5.2573, + "num_input_tokens_seen": 658636800, + "step": 5025 + }, + { + "epoch": 0.8597627444693812, + "grad_norm": 0.8108943104743958, + "learning_rate": 7.241981822401273e-05, + "loss": 5.1838, + "num_input_tokens_seen": 659030016, + "step": 5028 + }, + { + "epoch": 0.8602757294004488, + "grad_norm": 0.6969135403633118, + "learning_rate": 7.23982229295937e-05, + "loss": 5.2806, + "num_input_tokens_seen": 659423232, + "step": 5031 + }, + { + "epoch": 0.8607887143315165, + "grad_norm": 0.6890487670898438, + "learning_rate": 7.237664694254637e-05, + "loss": 5.2691, + "num_input_tokens_seen": 659816448, + "step": 5034 + }, + { + "epoch": 0.8613016992625842, + "grad_norm": 0.6614696979522705, + "learning_rate": 7.235509023411809e-05, + "loss": 5.2033, + "num_input_tokens_seen": 660209664, + "step": 5037 + }, + { + "epoch": 0.8618146841936518, + "grad_norm": 0.6726818680763245, + "learning_rate": 7.233355277561621e-05, + "loss": 5.2619, + "num_input_tokens_seen": 660602880, + "step": 5040 + }, + { + "epoch": 0.8623276691247195, + "grad_norm": 0.7320277094841003, + "learning_rate": 7.231203453840776e-05, + "loss": 5.2354, + "num_input_tokens_seen": 660996096, + "step": 5043 + }, + { + "epoch": 0.8628406540557871, + "grad_norm": 0.641864001750946, + "learning_rate": 7.22905354939194e-05, + "loss": 5.2346, + "num_input_tokens_seen": 661389312, + "step": 5046 + }, + { + "epoch": 0.8633536389868548, + "grad_norm": 0.7307341694831848, + "learning_rate": 7.22690556136372e-05, + "loss": 5.2156, + "num_input_tokens_seen": 661782528, + "step": 5049 + }, + { + "epoch": 0.8638666239179225, + "grad_norm": 0.7401044368743896, + "learning_rate": 7.22475948691065e-05, + "loss": 5.275, + "num_input_tokens_seen": 662175744, + "step": 5052 + }, + { + "epoch": 0.86437960884899, + "grad_norm": 0.7346693873405457, + "learning_rate": 7.22261532319318e-05, + "loss": 5.2385, + "num_input_tokens_seen": 662568960, + "step": 5055 + }, + { + "epoch": 0.8648925937800577, + "grad_norm": 0.7968463897705078, + "learning_rate": 7.220473067377648e-05, + "loss": 5.229, + "num_input_tokens_seen": 662962176, + "step": 5058 + }, + { + "epoch": 0.8654055787111253, + "grad_norm": 0.7128798961639404, + "learning_rate": 7.218332716636276e-05, + "loss": 5.2253, + "num_input_tokens_seen": 663355392, + "step": 5061 + }, + { + "epoch": 0.865918563642193, + "grad_norm": 0.6696746945381165, + "learning_rate": 7.216194268147151e-05, + "loss": 5.183, + "num_input_tokens_seen": 663748608, + "step": 5064 + }, + { + "epoch": 0.8664315485732607, + "grad_norm": 0.8338589668273926, + "learning_rate": 7.214057719094208e-05, + "loss": 5.1871, + "num_input_tokens_seen": 664141824, + "step": 5067 + }, + { + "epoch": 0.8669445335043283, + "grad_norm": 0.8238839507102966, + "learning_rate": 7.211923066667213e-05, + "loss": 5.2382, + "num_input_tokens_seen": 664535040, + "step": 5070 + }, + { + "epoch": 0.867457518435396, + "grad_norm": 0.883547306060791, + "learning_rate": 7.20979030806175e-05, + "loss": 5.2703, + "num_input_tokens_seen": 664928256, + "step": 5073 + }, + { + "epoch": 0.8679705033664636, + "grad_norm": 0.8404785394668579, + "learning_rate": 7.207659440479209e-05, + "loss": 5.2057, + "num_input_tokens_seen": 665321472, + "step": 5076 + }, + { + "epoch": 0.8684834882975313, + "grad_norm": 0.8399271368980408, + "learning_rate": 7.20553046112676e-05, + "loss": 5.2319, + "num_input_tokens_seen": 665714688, + "step": 5079 + }, + { + "epoch": 0.868996473228599, + "grad_norm": 0.8033043742179871, + "learning_rate": 7.203403367217348e-05, + "loss": 5.2562, + "num_input_tokens_seen": 666107904, + "step": 5082 + }, + { + "epoch": 0.8695094581596665, + "grad_norm": 0.8439496159553528, + "learning_rate": 7.201278155969676e-05, + "loss": 5.2529, + "num_input_tokens_seen": 666501120, + "step": 5085 + }, + { + "epoch": 0.8700224430907342, + "grad_norm": 0.7626810669898987, + "learning_rate": 7.19915482460818e-05, + "loss": 5.261, + "num_input_tokens_seen": 666894336, + "step": 5088 + }, + { + "epoch": 0.8705354280218018, + "grad_norm": 0.8755491375923157, + "learning_rate": 7.197033370363028e-05, + "loss": 5.2855, + "num_input_tokens_seen": 667287552, + "step": 5091 + }, + { + "epoch": 0.8710484129528695, + "grad_norm": 0.8308954238891602, + "learning_rate": 7.1949137904701e-05, + "loss": 5.234, + "num_input_tokens_seen": 667680768, + "step": 5094 + }, + { + "epoch": 0.8715613978839372, + "grad_norm": 0.748672604560852, + "learning_rate": 7.192796082170961e-05, + "loss": 5.2332, + "num_input_tokens_seen": 668073984, + "step": 5097 + }, + { + "epoch": 0.8720743828150048, + "grad_norm": 0.8005703687667847, + "learning_rate": 7.190680242712868e-05, + "loss": 5.2514, + "num_input_tokens_seen": 668467200, + "step": 5100 + }, + { + "epoch": 0.8725873677460725, + "grad_norm": 0.653659462928772, + "learning_rate": 7.18856626934873e-05, + "loss": 5.2233, + "num_input_tokens_seen": 668860416, + "step": 5103 + }, + { + "epoch": 0.8731003526771401, + "grad_norm": 0.7143151760101318, + "learning_rate": 7.186454159337121e-05, + "loss": 5.2236, + "num_input_tokens_seen": 669253632, + "step": 5106 + }, + { + "epoch": 0.8736133376082078, + "grad_norm": 0.7480143904685974, + "learning_rate": 7.184343909942239e-05, + "loss": 5.1812, + "num_input_tokens_seen": 669646848, + "step": 5109 + }, + { + "epoch": 0.8741263225392754, + "grad_norm": 0.7663584351539612, + "learning_rate": 7.182235518433903e-05, + "loss": 5.211, + "num_input_tokens_seen": 670040064, + "step": 5112 + }, + { + "epoch": 0.874639307470343, + "grad_norm": 0.7375966310501099, + "learning_rate": 7.180128982087541e-05, + "loss": 5.2147, + "num_input_tokens_seen": 670433280, + "step": 5115 + }, + { + "epoch": 0.8751522924014107, + "grad_norm": 0.7116428017616272, + "learning_rate": 7.178024298184173e-05, + "loss": 5.2005, + "num_input_tokens_seen": 670826496, + "step": 5118 + }, + { + "epoch": 0.8756652773324783, + "grad_norm": 0.7579666972160339, + "learning_rate": 7.175921464010388e-05, + "loss": 5.2272, + "num_input_tokens_seen": 671219712, + "step": 5121 + }, + { + "epoch": 0.876178262263546, + "grad_norm": 0.8425345420837402, + "learning_rate": 7.173820476858339e-05, + "loss": 5.2347, + "num_input_tokens_seen": 671612928, + "step": 5124 + }, + { + "epoch": 0.8766912471946137, + "grad_norm": 0.7112694382667542, + "learning_rate": 7.171721334025732e-05, + "loss": 5.2224, + "num_input_tokens_seen": 672006144, + "step": 5127 + }, + { + "epoch": 0.8772042321256813, + "grad_norm": 0.798812747001648, + "learning_rate": 7.169624032815794e-05, + "loss": 5.2049, + "num_input_tokens_seen": 672399360, + "step": 5130 + }, + { + "epoch": 0.877717217056749, + "grad_norm": 0.8146517872810364, + "learning_rate": 7.167528570537277e-05, + "loss": 5.2076, + "num_input_tokens_seen": 672792576, + "step": 5133 + }, + { + "epoch": 0.8782302019878166, + "grad_norm": 0.7291656732559204, + "learning_rate": 7.165434944504431e-05, + "loss": 5.1914, + "num_input_tokens_seen": 673185792, + "step": 5136 + }, + { + "epoch": 0.8787431869188843, + "grad_norm": 0.9471056461334229, + "learning_rate": 7.163343152036998e-05, + "loss": 5.267, + "num_input_tokens_seen": 673579008, + "step": 5139 + }, + { + "epoch": 0.8792561718499519, + "grad_norm": 0.6661033034324646, + "learning_rate": 7.161253190460194e-05, + "loss": 5.2058, + "num_input_tokens_seen": 673972224, + "step": 5142 + }, + { + "epoch": 0.8797691567810195, + "grad_norm": 0.7200859189033508, + "learning_rate": 7.159165057104689e-05, + "loss": 5.1633, + "num_input_tokens_seen": 674365440, + "step": 5145 + }, + { + "epoch": 0.8802821417120872, + "grad_norm": 0.8545052409172058, + "learning_rate": 7.157078749306606e-05, + "loss": 5.2405, + "num_input_tokens_seen": 674758656, + "step": 5148 + }, + { + "epoch": 0.8807951266431548, + "grad_norm": 0.785458505153656, + "learning_rate": 7.154994264407493e-05, + "loss": 5.2005, + "num_input_tokens_seen": 675151872, + "step": 5151 + }, + { + "epoch": 0.8813081115742225, + "grad_norm": 0.7209606170654297, + "learning_rate": 7.152911599754318e-05, + "loss": 5.2581, + "num_input_tokens_seen": 675545088, + "step": 5154 + }, + { + "epoch": 0.8818210965052902, + "grad_norm": 0.688242495059967, + "learning_rate": 7.15083075269945e-05, + "loss": 5.2225, + "num_input_tokens_seen": 675938304, + "step": 5157 + }, + { + "epoch": 0.8823340814363578, + "grad_norm": 0.7452808618545532, + "learning_rate": 7.148751720600645e-05, + "loss": 5.1834, + "num_input_tokens_seen": 676331520, + "step": 5160 + }, + { + "epoch": 0.8828470663674255, + "grad_norm": 0.7830954790115356, + "learning_rate": 7.146674500821039e-05, + "loss": 5.1773, + "num_input_tokens_seen": 676724736, + "step": 5163 + }, + { + "epoch": 0.8833600512984932, + "grad_norm": 0.7248668074607849, + "learning_rate": 7.144599090729122e-05, + "loss": 5.2314, + "num_input_tokens_seen": 677117952, + "step": 5166 + }, + { + "epoch": 0.8838730362295607, + "grad_norm": 0.8058164715766907, + "learning_rate": 7.142525487698731e-05, + "loss": 5.2789, + "num_input_tokens_seen": 677511168, + "step": 5169 + }, + { + "epoch": 0.8843860211606284, + "grad_norm": 0.70136958360672, + "learning_rate": 7.140453689109039e-05, + "loss": 5.272, + "num_input_tokens_seen": 677904384, + "step": 5172 + }, + { + "epoch": 0.884899006091696, + "grad_norm": 0.8600161671638489, + "learning_rate": 7.138383692344537e-05, + "loss": 5.2665, + "num_input_tokens_seen": 678297600, + "step": 5175 + }, + { + "epoch": 0.8854119910227637, + "grad_norm": 0.7736455202102661, + "learning_rate": 7.136315494795016e-05, + "loss": 5.2499, + "num_input_tokens_seen": 678690816, + "step": 5178 + }, + { + "epoch": 0.8859249759538314, + "grad_norm": 0.8731935024261475, + "learning_rate": 7.134249093855563e-05, + "loss": 5.2628, + "num_input_tokens_seen": 679084032, + "step": 5181 + }, + { + "epoch": 0.886437960884899, + "grad_norm": 0.7711248993873596, + "learning_rate": 7.132184486926537e-05, + "loss": 5.2459, + "num_input_tokens_seen": 679477248, + "step": 5184 + }, + { + "epoch": 0.8869509458159667, + "grad_norm": 0.7357232570648193, + "learning_rate": 7.130121671413564e-05, + "loss": 5.239, + "num_input_tokens_seen": 679870464, + "step": 5187 + }, + { + "epoch": 0.8874639307470343, + "grad_norm": 0.7501398324966431, + "learning_rate": 7.128060644727519e-05, + "loss": 5.2204, + "num_input_tokens_seen": 680263680, + "step": 5190 + }, + { + "epoch": 0.887976915678102, + "grad_norm": 0.7075851559638977, + "learning_rate": 7.12600140428451e-05, + "loss": 5.1888, + "num_input_tokens_seen": 680656896, + "step": 5193 + }, + { + "epoch": 0.8884899006091697, + "grad_norm": 0.7382627725601196, + "learning_rate": 7.123943947505872e-05, + "loss": 5.2237, + "num_input_tokens_seen": 681050112, + "step": 5196 + }, + { + "epoch": 0.8890028855402372, + "grad_norm": 0.6421706676483154, + "learning_rate": 7.121888271818144e-05, + "loss": 5.2264, + "num_input_tokens_seen": 681443328, + "step": 5199 + }, + { + "epoch": 0.8891738805172598, + "eval_accuracy": 0.18500407099820876, + "eval_loss": 5.699267387390137, + "eval_runtime": 109.3374, + "eval_samples_per_second": 2.744, + "eval_steps_per_second": 1.372, + "num_input_tokens_seen": 681574400, + "step": 5200 + }, + { + "epoch": 0.8895158704713049, + "grad_norm": 0.7272650599479675, + "learning_rate": 7.11983437465306e-05, + "loss": 5.298, + "num_input_tokens_seen": 681836544, + "step": 5202 + }, + { + "epoch": 0.8900288554023725, + "grad_norm": 0.6727803945541382, + "learning_rate": 7.117782253447543e-05, + "loss": 5.2554, + "num_input_tokens_seen": 682229760, + "step": 5205 + }, + { + "epoch": 0.8905418403334402, + "grad_norm": 0.713238000869751, + "learning_rate": 7.115731905643676e-05, + "loss": 5.2232, + "num_input_tokens_seen": 682622976, + "step": 5208 + }, + { + "epoch": 0.8910548252645079, + "grad_norm": 0.8077900409698486, + "learning_rate": 7.1136833286887e-05, + "loss": 5.188, + "num_input_tokens_seen": 683016192, + "step": 5211 + }, + { + "epoch": 0.8915678101955755, + "grad_norm": 0.694200873374939, + "learning_rate": 7.111636520034998e-05, + "loss": 5.2278, + "num_input_tokens_seen": 683409408, + "step": 5214 + }, + { + "epoch": 0.8920807951266432, + "grad_norm": 0.723655104637146, + "learning_rate": 7.109591477140081e-05, + "loss": 5.2641, + "num_input_tokens_seen": 683802624, + "step": 5217 + }, + { + "epoch": 0.8925937800577108, + "grad_norm": 0.7347647547721863, + "learning_rate": 7.107548197466574e-05, + "loss": 5.2124, + "num_input_tokens_seen": 684195840, + "step": 5220 + }, + { + "epoch": 0.8931067649887785, + "grad_norm": 0.7525767087936401, + "learning_rate": 7.105506678482202e-05, + "loss": 5.255, + "num_input_tokens_seen": 684589056, + "step": 5223 + }, + { + "epoch": 0.893619749919846, + "grad_norm": 0.7006353735923767, + "learning_rate": 7.103466917659785e-05, + "loss": 5.2275, + "num_input_tokens_seen": 684982272, + "step": 5226 + }, + { + "epoch": 0.8941327348509137, + "grad_norm": 0.7175182700157166, + "learning_rate": 7.101428912477212e-05, + "loss": 5.2545, + "num_input_tokens_seen": 685375488, + "step": 5229 + }, + { + "epoch": 0.8946457197819814, + "grad_norm": 0.692328691482544, + "learning_rate": 7.099392660417439e-05, + "loss": 5.1934, + "num_input_tokens_seen": 685768704, + "step": 5232 + }, + { + "epoch": 0.895158704713049, + "grad_norm": 0.6852241158485413, + "learning_rate": 7.097358158968464e-05, + "loss": 5.2326, + "num_input_tokens_seen": 686161920, + "step": 5235 + }, + { + "epoch": 0.8956716896441167, + "grad_norm": 0.7024674415588379, + "learning_rate": 7.095325405623328e-05, + "loss": 5.2226, + "num_input_tokens_seen": 686555136, + "step": 5238 + }, + { + "epoch": 0.8961846745751844, + "grad_norm": 0.6983912587165833, + "learning_rate": 7.093294397880095e-05, + "loss": 5.2614, + "num_input_tokens_seen": 686948352, + "step": 5241 + }, + { + "epoch": 0.896697659506252, + "grad_norm": 0.7533532381057739, + "learning_rate": 7.091265133241835e-05, + "loss": 5.2038, + "num_input_tokens_seen": 687341568, + "step": 5244 + }, + { + "epoch": 0.8972106444373197, + "grad_norm": 0.7239729166030884, + "learning_rate": 7.08923760921662e-05, + "loss": 5.2062, + "num_input_tokens_seen": 687734784, + "step": 5247 + }, + { + "epoch": 0.8977236293683873, + "grad_norm": 0.9106455445289612, + "learning_rate": 7.087211823317505e-05, + "loss": 5.1852, + "num_input_tokens_seen": 688128000, + "step": 5250 + }, + { + "epoch": 0.898236614299455, + "grad_norm": 0.7044593691825867, + "learning_rate": 7.085187773062514e-05, + "loss": 5.2111, + "num_input_tokens_seen": 688521216, + "step": 5253 + }, + { + "epoch": 0.8987495992305226, + "grad_norm": 0.7331400513648987, + "learning_rate": 7.083165455974633e-05, + "loss": 5.212, + "num_input_tokens_seen": 688914432, + "step": 5256 + }, + { + "epoch": 0.8992625841615902, + "grad_norm": 0.7778918743133545, + "learning_rate": 7.081144869581792e-05, + "loss": 5.1951, + "num_input_tokens_seen": 689307648, + "step": 5259 + }, + { + "epoch": 0.8997755690926579, + "grad_norm": 0.7159865498542786, + "learning_rate": 7.079126011416861e-05, + "loss": 5.1574, + "num_input_tokens_seen": 689700864, + "step": 5262 + }, + { + "epoch": 0.9002885540237255, + "grad_norm": 0.7375167012214661, + "learning_rate": 7.077108879017622e-05, + "loss": 5.2498, + "num_input_tokens_seen": 690094080, + "step": 5265 + }, + { + "epoch": 0.9008015389547932, + "grad_norm": 0.7126013040542603, + "learning_rate": 7.075093469926772e-05, + "loss": 5.2121, + "num_input_tokens_seen": 690487296, + "step": 5268 + }, + { + "epoch": 0.9013145238858609, + "grad_norm": 0.7177429795265198, + "learning_rate": 7.073079781691898e-05, + "loss": 5.2539, + "num_input_tokens_seen": 690880512, + "step": 5271 + }, + { + "epoch": 0.9018275088169285, + "grad_norm": 0.7073779106140137, + "learning_rate": 7.071067811865475e-05, + "loss": 5.2696, + "num_input_tokens_seen": 691273728, + "step": 5274 + }, + { + "epoch": 0.9023404937479962, + "grad_norm": 0.8069375157356262, + "learning_rate": 7.069057558004847e-05, + "loss": 5.1789, + "num_input_tokens_seen": 691666944, + "step": 5277 + }, + { + "epoch": 0.9028534786790638, + "grad_norm": 0.7410465478897095, + "learning_rate": 7.067049017672214e-05, + "loss": 5.2048, + "num_input_tokens_seen": 692060160, + "step": 5280 + }, + { + "epoch": 0.9033664636101314, + "grad_norm": 0.9009111523628235, + "learning_rate": 7.065042188434626e-05, + "loss": 5.2126, + "num_input_tokens_seen": 692453376, + "step": 5283 + }, + { + "epoch": 0.9038794485411991, + "grad_norm": 0.670901358127594, + "learning_rate": 7.063037067863967e-05, + "loss": 5.2405, + "num_input_tokens_seen": 692846592, + "step": 5286 + }, + { + "epoch": 0.9043924334722667, + "grad_norm": 0.666654646396637, + "learning_rate": 7.061033653536935e-05, + "loss": 5.2376, + "num_input_tokens_seen": 693239808, + "step": 5289 + }, + { + "epoch": 0.9049054184033344, + "grad_norm": 0.8325378894805908, + "learning_rate": 7.059031943035043e-05, + "loss": 5.175, + "num_input_tokens_seen": 693633024, + "step": 5292 + }, + { + "epoch": 0.905418403334402, + "grad_norm": 0.7429814338684082, + "learning_rate": 7.0570319339446e-05, + "loss": 5.2228, + "num_input_tokens_seen": 694026240, + "step": 5295 + }, + { + "epoch": 0.9059313882654697, + "grad_norm": 0.7891907095909119, + "learning_rate": 7.055033623856699e-05, + "loss": 5.2153, + "num_input_tokens_seen": 694419456, + "step": 5298 + }, + { + "epoch": 0.9064443731965374, + "grad_norm": 0.6809067726135254, + "learning_rate": 7.053037010367201e-05, + "loss": 5.1762, + "num_input_tokens_seen": 694812672, + "step": 5301 + }, + { + "epoch": 0.906957358127605, + "grad_norm": 0.721263587474823, + "learning_rate": 7.051042091076731e-05, + "loss": 5.2516, + "num_input_tokens_seen": 695205888, + "step": 5304 + }, + { + "epoch": 0.9074703430586727, + "grad_norm": 0.7249355912208557, + "learning_rate": 7.049048863590665e-05, + "loss": 5.211, + "num_input_tokens_seen": 695599104, + "step": 5307 + }, + { + "epoch": 0.9079833279897404, + "grad_norm": 0.7684010863304138, + "learning_rate": 7.047057325519109e-05, + "loss": 5.1956, + "num_input_tokens_seen": 695992320, + "step": 5310 + }, + { + "epoch": 0.9084963129208079, + "grad_norm": 0.8806822299957275, + "learning_rate": 7.045067474476893e-05, + "loss": 5.2286, + "num_input_tokens_seen": 696385536, + "step": 5313 + }, + { + "epoch": 0.9090092978518756, + "grad_norm": 0.8653072714805603, + "learning_rate": 7.043079308083562e-05, + "loss": 5.1777, + "num_input_tokens_seen": 696778752, + "step": 5316 + }, + { + "epoch": 0.9095222827829432, + "grad_norm": 0.8760643601417542, + "learning_rate": 7.041092823963362e-05, + "loss": 5.195, + "num_input_tokens_seen": 697171968, + "step": 5319 + }, + { + "epoch": 0.9100352677140109, + "grad_norm": 0.7858259081840515, + "learning_rate": 7.03910801974522e-05, + "loss": 5.2221, + "num_input_tokens_seen": 697565184, + "step": 5322 + }, + { + "epoch": 0.9105482526450785, + "grad_norm": 0.7364184856414795, + "learning_rate": 7.037124893062746e-05, + "loss": 5.1735, + "num_input_tokens_seen": 697958400, + "step": 5325 + }, + { + "epoch": 0.9110612375761462, + "grad_norm": 0.6950361132621765, + "learning_rate": 7.03514344155421e-05, + "loss": 5.2196, + "num_input_tokens_seen": 698351616, + "step": 5328 + }, + { + "epoch": 0.9115742225072139, + "grad_norm": 0.7385085821151733, + "learning_rate": 7.03316366286254e-05, + "loss": 5.2556, + "num_input_tokens_seen": 698744832, + "step": 5331 + }, + { + "epoch": 0.9120872074382815, + "grad_norm": 0.6558942198753357, + "learning_rate": 7.031185554635294e-05, + "loss": 5.2524, + "num_input_tokens_seen": 699138048, + "step": 5334 + }, + { + "epoch": 0.9126001923693492, + "grad_norm": 0.7035802602767944, + "learning_rate": 7.029209114524669e-05, + "loss": 5.2398, + "num_input_tokens_seen": 699531264, + "step": 5337 + }, + { + "epoch": 0.9131131773004167, + "grad_norm": 0.754648745059967, + "learning_rate": 7.027234340187475e-05, + "loss": 5.253, + "num_input_tokens_seen": 699924480, + "step": 5340 + }, + { + "epoch": 0.9136261622314844, + "grad_norm": 0.7020716667175293, + "learning_rate": 7.025261229285127e-05, + "loss": 5.2531, + "num_input_tokens_seen": 700317696, + "step": 5343 + }, + { + "epoch": 0.9141391471625521, + "grad_norm": 0.7005373239517212, + "learning_rate": 7.023289779483637e-05, + "loss": 5.1856, + "num_input_tokens_seen": 700710912, + "step": 5346 + }, + { + "epoch": 0.9146521320936197, + "grad_norm": 0.8192885518074036, + "learning_rate": 7.021319988453594e-05, + "loss": 5.2202, + "num_input_tokens_seen": 701104128, + "step": 5349 + }, + { + "epoch": 0.9151651170246874, + "grad_norm": 0.7134819626808167, + "learning_rate": 7.019351853870163e-05, + "loss": 5.1995, + "num_input_tokens_seen": 701497344, + "step": 5352 + }, + { + "epoch": 0.915678101955755, + "grad_norm": 0.7718667387962341, + "learning_rate": 7.017385373413064e-05, + "loss": 5.2546, + "num_input_tokens_seen": 701890560, + "step": 5355 + }, + { + "epoch": 0.9161910868868227, + "grad_norm": 0.7580474615097046, + "learning_rate": 7.015420544766564e-05, + "loss": 5.223, + "num_input_tokens_seen": 702283776, + "step": 5358 + }, + { + "epoch": 0.9167040718178904, + "grad_norm": 0.761403501033783, + "learning_rate": 7.013457365619473e-05, + "loss": 5.2063, + "num_input_tokens_seen": 702676992, + "step": 5361 + }, + { + "epoch": 0.917217056748958, + "grad_norm": 0.7182543873786926, + "learning_rate": 7.01149583366512e-05, + "loss": 5.2481, + "num_input_tokens_seen": 703070208, + "step": 5364 + }, + { + "epoch": 0.9177300416800257, + "grad_norm": 0.7789894938468933, + "learning_rate": 7.009535946601349e-05, + "loss": 5.2004, + "num_input_tokens_seen": 703463424, + "step": 5367 + }, + { + "epoch": 0.9182430266110932, + "grad_norm": 0.698883593082428, + "learning_rate": 7.007577702130504e-05, + "loss": 5.1961, + "num_input_tokens_seen": 703856640, + "step": 5370 + }, + { + "epoch": 0.9187560115421609, + "grad_norm": 0.7992156744003296, + "learning_rate": 7.005621097959424e-05, + "loss": 5.1863, + "num_input_tokens_seen": 704249856, + "step": 5373 + }, + { + "epoch": 0.9192689964732286, + "grad_norm": 0.7814815044403076, + "learning_rate": 7.003666131799421e-05, + "loss": 5.1962, + "num_input_tokens_seen": 704643072, + "step": 5376 + }, + { + "epoch": 0.9197819814042962, + "grad_norm": 0.7636701464653015, + "learning_rate": 7.001712801366284e-05, + "loss": 5.2015, + "num_input_tokens_seen": 705036288, + "step": 5379 + }, + { + "epoch": 0.9202949663353639, + "grad_norm": 0.7517854571342468, + "learning_rate": 6.999761104380251e-05, + "loss": 5.2346, + "num_input_tokens_seen": 705429504, + "step": 5382 + }, + { + "epoch": 0.9208079512664316, + "grad_norm": 0.7156343460083008, + "learning_rate": 6.99781103856601e-05, + "loss": 5.1884, + "num_input_tokens_seen": 705822720, + "step": 5385 + }, + { + "epoch": 0.9213209361974992, + "grad_norm": 0.7727795243263245, + "learning_rate": 6.995862601652685e-05, + "loss": 5.1847, + "num_input_tokens_seen": 706215936, + "step": 5388 + }, + { + "epoch": 0.9218339211285669, + "grad_norm": 0.790640115737915, + "learning_rate": 6.993915791373815e-05, + "loss": 5.2489, + "num_input_tokens_seen": 706609152, + "step": 5391 + }, + { + "epoch": 0.9223469060596345, + "grad_norm": 0.6869045495986938, + "learning_rate": 6.991970605467365e-05, + "loss": 5.2265, + "num_input_tokens_seen": 707002368, + "step": 5394 + }, + { + "epoch": 0.9228598909907022, + "grad_norm": 0.6860256791114807, + "learning_rate": 6.99002704167569e-05, + "loss": 5.1901, + "num_input_tokens_seen": 707395584, + "step": 5397 + }, + { + "epoch": 0.9233728759217698, + "grad_norm": 0.7133844494819641, + "learning_rate": 6.988085097745543e-05, + "loss": 5.2146, + "num_input_tokens_seen": 707788800, + "step": 5400 + }, + { + "epoch": 0.9238858608528374, + "grad_norm": 0.7109991908073425, + "learning_rate": 6.986144771428049e-05, + "loss": 5.244, + "num_input_tokens_seen": 708182016, + "step": 5403 + }, + { + "epoch": 0.9243988457839051, + "grad_norm": 0.6793102622032166, + "learning_rate": 6.984206060478708e-05, + "loss": 5.2256, + "num_input_tokens_seen": 708575232, + "step": 5406 + }, + { + "epoch": 0.9249118307149727, + "grad_norm": 0.6254820823669434, + "learning_rate": 6.982268962657377e-05, + "loss": 5.2082, + "num_input_tokens_seen": 708968448, + "step": 5409 + }, + { + "epoch": 0.9254248156460404, + "grad_norm": 0.6661543846130371, + "learning_rate": 6.980333475728256e-05, + "loss": 5.2922, + "num_input_tokens_seen": 709361664, + "step": 5412 + }, + { + "epoch": 0.9259378005771081, + "grad_norm": 0.674060583114624, + "learning_rate": 6.978399597459882e-05, + "loss": 5.2425, + "num_input_tokens_seen": 709754880, + "step": 5415 + }, + { + "epoch": 0.9264507855081757, + "grad_norm": 0.7253686189651489, + "learning_rate": 6.976467325625122e-05, + "loss": 5.1947, + "num_input_tokens_seen": 710148096, + "step": 5418 + }, + { + "epoch": 0.9269637704392434, + "grad_norm": 0.6542623043060303, + "learning_rate": 6.974536658001151e-05, + "loss": 5.1991, + "num_input_tokens_seen": 710541312, + "step": 5421 + }, + { + "epoch": 0.927476755370311, + "grad_norm": 0.7277594208717346, + "learning_rate": 6.972607592369453e-05, + "loss": 5.2157, + "num_input_tokens_seen": 710934528, + "step": 5424 + }, + { + "epoch": 0.9279897403013786, + "grad_norm": 0.6952804923057556, + "learning_rate": 6.970680126515798e-05, + "loss": 5.2329, + "num_input_tokens_seen": 711327744, + "step": 5427 + }, + { + "epoch": 0.9285027252324463, + "grad_norm": 0.7163040041923523, + "learning_rate": 6.968754258230246e-05, + "loss": 5.2398, + "num_input_tokens_seen": 711720960, + "step": 5430 + }, + { + "epoch": 0.9290157101635139, + "grad_norm": 0.6704942584037781, + "learning_rate": 6.966829985307124e-05, + "loss": 5.1918, + "num_input_tokens_seen": 712114176, + "step": 5433 + }, + { + "epoch": 0.9295286950945816, + "grad_norm": 0.6688148379325867, + "learning_rate": 6.964907305545022e-05, + "loss": 5.2048, + "num_input_tokens_seen": 712507392, + "step": 5436 + }, + { + "epoch": 0.9300416800256492, + "grad_norm": 0.7081178426742554, + "learning_rate": 6.962986216746778e-05, + "loss": 5.2602, + "num_input_tokens_seen": 712900608, + "step": 5439 + }, + { + "epoch": 0.9305546649567169, + "grad_norm": 0.7875306606292725, + "learning_rate": 6.961066716719472e-05, + "loss": 5.2051, + "num_input_tokens_seen": 713293824, + "step": 5442 + }, + { + "epoch": 0.9310676498877846, + "grad_norm": 0.7292640209197998, + "learning_rate": 6.959148803274413e-05, + "loss": 5.2177, + "num_input_tokens_seen": 713687040, + "step": 5445 + }, + { + "epoch": 0.9315806348188522, + "grad_norm": 0.7992739677429199, + "learning_rate": 6.957232474227127e-05, + "loss": 5.2218, + "num_input_tokens_seen": 714080256, + "step": 5448 + }, + { + "epoch": 0.9320936197499199, + "grad_norm": 0.6518861651420593, + "learning_rate": 6.955317727397353e-05, + "loss": 5.2443, + "num_input_tokens_seen": 714473472, + "step": 5451 + }, + { + "epoch": 0.9326066046809875, + "grad_norm": 0.6771551966667175, + "learning_rate": 6.953404560609022e-05, + "loss": 5.1987, + "num_input_tokens_seen": 714866688, + "step": 5454 + }, + { + "epoch": 0.9331195896120551, + "grad_norm": 0.7463414072990417, + "learning_rate": 6.951492971690257e-05, + "loss": 5.2047, + "num_input_tokens_seen": 715259904, + "step": 5457 + }, + { + "epoch": 0.9336325745431228, + "grad_norm": 0.7123643755912781, + "learning_rate": 6.949582958473356e-05, + "loss": 5.2367, + "num_input_tokens_seen": 715653120, + "step": 5460 + }, + { + "epoch": 0.9341455594741904, + "grad_norm": 0.8397427201271057, + "learning_rate": 6.947674518794787e-05, + "loss": 5.2235, + "num_input_tokens_seen": 716046336, + "step": 5463 + }, + { + "epoch": 0.9346585444052581, + "grad_norm": 0.731139600276947, + "learning_rate": 6.94576765049517e-05, + "loss": 5.2689, + "num_input_tokens_seen": 716439552, + "step": 5466 + }, + { + "epoch": 0.9351715293363257, + "grad_norm": 0.7536250352859497, + "learning_rate": 6.943862351419276e-05, + "loss": 5.1548, + "num_input_tokens_seen": 716832768, + "step": 5469 + }, + { + "epoch": 0.9356845142673934, + "grad_norm": 0.7753055095672607, + "learning_rate": 6.941958619416007e-05, + "loss": 5.2435, + "num_input_tokens_seen": 717225984, + "step": 5472 + }, + { + "epoch": 0.9361974991984611, + "grad_norm": 0.677101731300354, + "learning_rate": 6.940056452338398e-05, + "loss": 5.1863, + "num_input_tokens_seen": 717619200, + "step": 5475 + }, + { + "epoch": 0.9367104841295287, + "grad_norm": 0.730702817440033, + "learning_rate": 6.938155848043593e-05, + "loss": 5.2294, + "num_input_tokens_seen": 718012416, + "step": 5478 + }, + { + "epoch": 0.9372234690605964, + "grad_norm": 0.7470299601554871, + "learning_rate": 6.936256804392845e-05, + "loss": 5.1684, + "num_input_tokens_seen": 718405632, + "step": 5481 + }, + { + "epoch": 0.9377364539916639, + "grad_norm": 0.7415927052497864, + "learning_rate": 6.934359319251501e-05, + "loss": 5.213, + "num_input_tokens_seen": 718798848, + "step": 5484 + }, + { + "epoch": 0.9382494389227316, + "grad_norm": 0.8822851777076721, + "learning_rate": 6.932463390488997e-05, + "loss": 5.2153, + "num_input_tokens_seen": 719192064, + "step": 5487 + }, + { + "epoch": 0.9387624238537993, + "grad_norm": 0.8007544279098511, + "learning_rate": 6.93056901597884e-05, + "loss": 5.2219, + "num_input_tokens_seen": 719585280, + "step": 5490 + }, + { + "epoch": 0.9392754087848669, + "grad_norm": 0.7476817965507507, + "learning_rate": 6.928676193598603e-05, + "loss": 5.2602, + "num_input_tokens_seen": 719978496, + "step": 5493 + }, + { + "epoch": 0.9397883937159346, + "grad_norm": 0.8078888654708862, + "learning_rate": 6.926784921229917e-05, + "loss": 5.2227, + "num_input_tokens_seen": 720371712, + "step": 5496 + }, + { + "epoch": 0.9403013786470023, + "grad_norm": 0.75888991355896, + "learning_rate": 6.924895196758458e-05, + "loss": 5.2416, + "num_input_tokens_seen": 720764928, + "step": 5499 + }, + { + "epoch": 0.9408143635780699, + "grad_norm": 0.6974719762802124, + "learning_rate": 6.923007018073937e-05, + "loss": 5.191, + "num_input_tokens_seen": 721158144, + "step": 5502 + }, + { + "epoch": 0.9413273485091376, + "grad_norm": 0.7393808960914612, + "learning_rate": 6.921120383070087e-05, + "loss": 5.2472, + "num_input_tokens_seen": 721551360, + "step": 5505 + }, + { + "epoch": 0.9418403334402052, + "grad_norm": 0.7024008631706238, + "learning_rate": 6.919235289644663e-05, + "loss": 5.2467, + "num_input_tokens_seen": 721944576, + "step": 5508 + }, + { + "epoch": 0.9423533183712729, + "grad_norm": 0.7399491667747498, + "learning_rate": 6.91735173569942e-05, + "loss": 5.1734, + "num_input_tokens_seen": 722337792, + "step": 5511 + }, + { + "epoch": 0.9428663033023404, + "grad_norm": 0.6762517690658569, + "learning_rate": 6.915469719140114e-05, + "loss": 5.2006, + "num_input_tokens_seen": 722731008, + "step": 5514 + }, + { + "epoch": 0.9433792882334081, + "grad_norm": 0.7834249138832092, + "learning_rate": 6.913589237876484e-05, + "loss": 5.2128, + "num_input_tokens_seen": 723124224, + "step": 5517 + }, + { + "epoch": 0.9438922731644758, + "grad_norm": 0.7544867992401123, + "learning_rate": 6.91171028982225e-05, + "loss": 5.1778, + "num_input_tokens_seen": 723517440, + "step": 5520 + }, + { + "epoch": 0.9444052580955434, + "grad_norm": 0.7604652643203735, + "learning_rate": 6.909832872895093e-05, + "loss": 5.2395, + "num_input_tokens_seen": 723910656, + "step": 5523 + }, + { + "epoch": 0.9449182430266111, + "grad_norm": 0.7239798307418823, + "learning_rate": 6.907956985016653e-05, + "loss": 5.2005, + "num_input_tokens_seen": 724303872, + "step": 5526 + }, + { + "epoch": 0.9454312279576788, + "grad_norm": 0.6755896210670471, + "learning_rate": 6.906082624112522e-05, + "loss": 5.2482, + "num_input_tokens_seen": 724697088, + "step": 5529 + }, + { + "epoch": 0.9459442128887464, + "grad_norm": 0.6451256275177002, + "learning_rate": 6.904209788112224e-05, + "loss": 5.2247, + "num_input_tokens_seen": 725090304, + "step": 5532 + }, + { + "epoch": 0.9464571978198141, + "grad_norm": 0.6745946407318115, + "learning_rate": 6.902338474949212e-05, + "loss": 5.1867, + "num_input_tokens_seen": 725483520, + "step": 5535 + }, + { + "epoch": 0.9469701827508817, + "grad_norm": 0.7276203036308289, + "learning_rate": 6.900468682560861e-05, + "loss": 5.2158, + "num_input_tokens_seen": 725876736, + "step": 5538 + }, + { + "epoch": 0.9474831676819493, + "grad_norm": 0.7208330035209656, + "learning_rate": 6.898600408888455e-05, + "loss": 5.252, + "num_input_tokens_seen": 726269952, + "step": 5541 + }, + { + "epoch": 0.947996152613017, + "grad_norm": 0.7684881687164307, + "learning_rate": 6.896733651877174e-05, + "loss": 5.2424, + "num_input_tokens_seen": 726663168, + "step": 5544 + }, + { + "epoch": 0.9485091375440846, + "grad_norm": 0.6904166340827942, + "learning_rate": 6.894868409476089e-05, + "loss": 5.1711, + "num_input_tokens_seen": 727056384, + "step": 5547 + }, + { + "epoch": 0.9490221224751523, + "grad_norm": 0.6969854235649109, + "learning_rate": 6.893004679638155e-05, + "loss": 5.2098, + "num_input_tokens_seen": 727449600, + "step": 5550 + }, + { + "epoch": 0.9495351074062199, + "grad_norm": 0.6778176426887512, + "learning_rate": 6.891142460320194e-05, + "loss": 5.207, + "num_input_tokens_seen": 727842816, + "step": 5553 + }, + { + "epoch": 0.9500480923372876, + "grad_norm": 0.6988036632537842, + "learning_rate": 6.889281749482896e-05, + "loss": 5.1607, + "num_input_tokens_seen": 728236032, + "step": 5556 + }, + { + "epoch": 0.9505610772683553, + "grad_norm": 0.6899657845497131, + "learning_rate": 6.887422545090792e-05, + "loss": 5.2255, + "num_input_tokens_seen": 728629248, + "step": 5559 + }, + { + "epoch": 0.9510740621994229, + "grad_norm": 0.7000394463539124, + "learning_rate": 6.885564845112269e-05, + "loss": 5.234, + "num_input_tokens_seen": 729022464, + "step": 5562 + }, + { + "epoch": 0.9515870471304906, + "grad_norm": 0.7130152583122253, + "learning_rate": 6.88370864751954e-05, + "loss": 5.2547, + "num_input_tokens_seen": 729415680, + "step": 5565 + }, + { + "epoch": 0.9521000320615582, + "grad_norm": 0.7220883369445801, + "learning_rate": 6.881853950288646e-05, + "loss": 5.1583, + "num_input_tokens_seen": 729808896, + "step": 5568 + }, + { + "epoch": 0.9526130169926258, + "grad_norm": 0.700537919998169, + "learning_rate": 6.88000075139944e-05, + "loss": 5.2015, + "num_input_tokens_seen": 730202112, + "step": 5571 + }, + { + "epoch": 0.9531260019236935, + "grad_norm": 0.8122661113739014, + "learning_rate": 6.878149048835583e-05, + "loss": 5.1881, + "num_input_tokens_seen": 730595328, + "step": 5574 + }, + { + "epoch": 0.9536389868547611, + "grad_norm": 0.6667414307594299, + "learning_rate": 6.876298840584535e-05, + "loss": 5.1502, + "num_input_tokens_seen": 730988544, + "step": 5577 + }, + { + "epoch": 0.9541519717858288, + "grad_norm": 0.7198598980903625, + "learning_rate": 6.874450124637534e-05, + "loss": 5.2023, + "num_input_tokens_seen": 731381760, + "step": 5580 + }, + { + "epoch": 0.9546649567168964, + "grad_norm": 0.7227144837379456, + "learning_rate": 6.872602898989611e-05, + "loss": 5.1746, + "num_input_tokens_seen": 731774976, + "step": 5583 + }, + { + "epoch": 0.9551779416479641, + "grad_norm": 0.7212773561477661, + "learning_rate": 6.870757161639557e-05, + "loss": 5.2028, + "num_input_tokens_seen": 732168192, + "step": 5586 + }, + { + "epoch": 0.9556909265790318, + "grad_norm": 0.7933756113052368, + "learning_rate": 6.868912910589922e-05, + "loss": 5.1419, + "num_input_tokens_seen": 732561408, + "step": 5589 + }, + { + "epoch": 0.9562039115100994, + "grad_norm": 0.8172382712364197, + "learning_rate": 6.867070143847011e-05, + "loss": 5.1742, + "num_input_tokens_seen": 732954624, + "step": 5592 + }, + { + "epoch": 0.9567168964411671, + "grad_norm": 0.7590979337692261, + "learning_rate": 6.86522885942087e-05, + "loss": 5.2606, + "num_input_tokens_seen": 733347840, + "step": 5595 + }, + { + "epoch": 0.9572298813722346, + "grad_norm": 0.8111834526062012, + "learning_rate": 6.86338905532528e-05, + "loss": 5.2107, + "num_input_tokens_seen": 733741056, + "step": 5598 + }, + { + "epoch": 0.9575718713262797, + "eval_accuracy": 0.18844650708353688, + "eval_loss": 5.67447566986084, + "eval_runtime": 111.5849, + "eval_samples_per_second": 2.689, + "eval_steps_per_second": 1.344, + "num_input_tokens_seen": 734003200, + "step": 5600 + }, + { + "epoch": 0.9577428663033023, + "grad_norm": 0.7701963782310486, + "learning_rate": 6.861550729577741e-05, + "loss": 5.2339, + "num_input_tokens_seen": 734134272, + "step": 5601 + }, + { + "epoch": 0.95825585123437, + "grad_norm": 0.7521535754203796, + "learning_rate": 6.85971388019947e-05, + "loss": 5.1737, + "num_input_tokens_seen": 734527488, + "step": 5604 + }, + { + "epoch": 0.9587688361654376, + "grad_norm": 0.8248022794723511, + "learning_rate": 6.857878505215393e-05, + "loss": 5.202, + "num_input_tokens_seen": 734920704, + "step": 5607 + }, + { + "epoch": 0.9592818210965053, + "grad_norm": 0.6702877879142761, + "learning_rate": 6.856044602654132e-05, + "loss": 5.1872, + "num_input_tokens_seen": 735313920, + "step": 5610 + }, + { + "epoch": 0.9597948060275729, + "grad_norm": 0.6633716225624084, + "learning_rate": 6.854212170547997e-05, + "loss": 5.2188, + "num_input_tokens_seen": 735707136, + "step": 5613 + }, + { + "epoch": 0.9603077909586406, + "grad_norm": 0.7093427181243896, + "learning_rate": 6.852381206932974e-05, + "loss": 5.1756, + "num_input_tokens_seen": 736100352, + "step": 5616 + }, + { + "epoch": 0.9608207758897083, + "grad_norm": 0.7626616358757019, + "learning_rate": 6.850551709848722e-05, + "loss": 5.1181, + "num_input_tokens_seen": 736493568, + "step": 5619 + }, + { + "epoch": 0.9613337608207759, + "grad_norm": 0.7711624503135681, + "learning_rate": 6.848723677338564e-05, + "loss": 5.1677, + "num_input_tokens_seen": 736886784, + "step": 5622 + }, + { + "epoch": 0.9618467457518436, + "grad_norm": 0.7787635326385498, + "learning_rate": 6.846897107449475e-05, + "loss": 5.1462, + "num_input_tokens_seen": 737280000, + "step": 5625 + }, + { + "epoch": 0.9623597306829111, + "grad_norm": 0.7611491084098816, + "learning_rate": 6.845071998232071e-05, + "loss": 5.2339, + "num_input_tokens_seen": 737673216, + "step": 5628 + }, + { + "epoch": 0.9628727156139788, + "grad_norm": 0.7201474905014038, + "learning_rate": 6.843248347740607e-05, + "loss": 5.2319, + "num_input_tokens_seen": 738066432, + "step": 5631 + }, + { + "epoch": 0.9633857005450465, + "grad_norm": 0.6865774393081665, + "learning_rate": 6.841426154032964e-05, + "loss": 5.1631, + "num_input_tokens_seen": 738459648, + "step": 5634 + }, + { + "epoch": 0.9638986854761141, + "grad_norm": 0.7156822681427002, + "learning_rate": 6.839605415170637e-05, + "loss": 5.2378, + "num_input_tokens_seen": 738852864, + "step": 5637 + }, + { + "epoch": 0.9644116704071818, + "grad_norm": 0.7458356618881226, + "learning_rate": 6.837786129218738e-05, + "loss": 5.1746, + "num_input_tokens_seen": 739246080, + "step": 5640 + }, + { + "epoch": 0.9649246553382494, + "grad_norm": 0.7389182448387146, + "learning_rate": 6.835968294245973e-05, + "loss": 5.1859, + "num_input_tokens_seen": 739639296, + "step": 5643 + }, + { + "epoch": 0.9654376402693171, + "grad_norm": 0.7108219861984253, + "learning_rate": 6.834151908324644e-05, + "loss": 5.1834, + "num_input_tokens_seen": 740032512, + "step": 5646 + }, + { + "epoch": 0.9659506252003848, + "grad_norm": 0.8195409178733826, + "learning_rate": 6.832336969530635e-05, + "loss": 5.1927, + "num_input_tokens_seen": 740425728, + "step": 5649 + }, + { + "epoch": 0.9664636101314524, + "grad_norm": 0.7212623953819275, + "learning_rate": 6.830523475943408e-05, + "loss": 5.1814, + "num_input_tokens_seen": 740818944, + "step": 5652 + }, + { + "epoch": 0.96697659506252, + "grad_norm": 0.6641290187835693, + "learning_rate": 6.828711425645984e-05, + "loss": 5.16, + "num_input_tokens_seen": 741212160, + "step": 5655 + }, + { + "epoch": 0.9674895799935876, + "grad_norm": 0.7765944004058838, + "learning_rate": 6.826900816724949e-05, + "loss": 5.1787, + "num_input_tokens_seen": 741605376, + "step": 5658 + }, + { + "epoch": 0.9680025649246553, + "grad_norm": 0.6821857690811157, + "learning_rate": 6.825091647270437e-05, + "loss": 5.188, + "num_input_tokens_seen": 741998592, + "step": 5661 + }, + { + "epoch": 0.968515549855723, + "grad_norm": 0.6633355617523193, + "learning_rate": 6.823283915376123e-05, + "loss": 5.2485, + "num_input_tokens_seen": 742391808, + "step": 5664 + }, + { + "epoch": 0.9690285347867906, + "grad_norm": 0.6869776844978333, + "learning_rate": 6.821477619139209e-05, + "loss": 5.222, + "num_input_tokens_seen": 742785024, + "step": 5667 + }, + { + "epoch": 0.9695415197178583, + "grad_norm": 0.7537380456924438, + "learning_rate": 6.819672756660432e-05, + "loss": 5.2192, + "num_input_tokens_seen": 743178240, + "step": 5670 + }, + { + "epoch": 0.970054504648926, + "grad_norm": 0.775252640247345, + "learning_rate": 6.817869326044036e-05, + "loss": 5.15, + "num_input_tokens_seen": 743571456, + "step": 5673 + }, + { + "epoch": 0.9705674895799936, + "grad_norm": 0.7099266648292542, + "learning_rate": 6.816067325397775e-05, + "loss": 5.1486, + "num_input_tokens_seen": 743964672, + "step": 5676 + }, + { + "epoch": 0.9710804745110613, + "grad_norm": 0.7244137525558472, + "learning_rate": 6.814266752832903e-05, + "loss": 5.1545, + "num_input_tokens_seen": 744357888, + "step": 5679 + }, + { + "epoch": 0.9715934594421289, + "grad_norm": 0.6794865727424622, + "learning_rate": 6.812467606464162e-05, + "loss": 5.2089, + "num_input_tokens_seen": 744751104, + "step": 5682 + }, + { + "epoch": 0.9721064443731965, + "grad_norm": 0.7153213024139404, + "learning_rate": 6.81066988440978e-05, + "loss": 5.2345, + "num_input_tokens_seen": 745144320, + "step": 5685 + }, + { + "epoch": 0.9726194293042641, + "grad_norm": 0.7132147550582886, + "learning_rate": 6.808873584791457e-05, + "loss": 5.2006, + "num_input_tokens_seen": 745537536, + "step": 5688 + }, + { + "epoch": 0.9731324142353318, + "grad_norm": 0.6356132626533508, + "learning_rate": 6.807078705734362e-05, + "loss": 5.203, + "num_input_tokens_seen": 745930752, + "step": 5691 + }, + { + "epoch": 0.9736453991663995, + "grad_norm": 0.6981724500656128, + "learning_rate": 6.805285245367116e-05, + "loss": 5.2071, + "num_input_tokens_seen": 746323968, + "step": 5694 + }, + { + "epoch": 0.9741583840974671, + "grad_norm": 0.6996302604675293, + "learning_rate": 6.803493201821794e-05, + "loss": 5.2533, + "num_input_tokens_seen": 746717184, + "step": 5697 + }, + { + "epoch": 0.9746713690285348, + "grad_norm": 0.8056071996688843, + "learning_rate": 6.801702573233913e-05, + "loss": 5.1897, + "num_input_tokens_seen": 747110400, + "step": 5700 + }, + { + "epoch": 0.9751843539596025, + "grad_norm": 0.7028996348381042, + "learning_rate": 6.79991335774242e-05, + "loss": 5.1624, + "num_input_tokens_seen": 747503616, + "step": 5703 + }, + { + "epoch": 0.9756973388906701, + "grad_norm": 0.8844038248062134, + "learning_rate": 6.798125553489686e-05, + "loss": 5.1886, + "num_input_tokens_seen": 747896832, + "step": 5706 + }, + { + "epoch": 0.9762103238217378, + "grad_norm": 0.7125393152236938, + "learning_rate": 6.796339158621506e-05, + "loss": 5.1686, + "num_input_tokens_seen": 748290048, + "step": 5709 + }, + { + "epoch": 0.9767233087528054, + "grad_norm": 0.7706892490386963, + "learning_rate": 6.794554171287077e-05, + "loss": 5.1821, + "num_input_tokens_seen": 748683264, + "step": 5712 + }, + { + "epoch": 0.977236293683873, + "grad_norm": 0.7665553689002991, + "learning_rate": 6.792770589638998e-05, + "loss": 5.1875, + "num_input_tokens_seen": 749076480, + "step": 5715 + }, + { + "epoch": 0.9777492786149407, + "grad_norm": 0.822076141834259, + "learning_rate": 6.790988411833267e-05, + "loss": 5.1802, + "num_input_tokens_seen": 749469696, + "step": 5718 + }, + { + "epoch": 0.9782622635460083, + "grad_norm": 0.734258770942688, + "learning_rate": 6.789207636029258e-05, + "loss": 5.1485, + "num_input_tokens_seen": 749862912, + "step": 5721 + }, + { + "epoch": 0.978775248477076, + "grad_norm": 0.851309597492218, + "learning_rate": 6.787428260389725e-05, + "loss": 5.2038, + "num_input_tokens_seen": 750256128, + "step": 5724 + }, + { + "epoch": 0.9792882334081436, + "grad_norm": 0.8018383979797363, + "learning_rate": 6.785650283080797e-05, + "loss": 5.1956, + "num_input_tokens_seen": 750649344, + "step": 5727 + }, + { + "epoch": 0.9798012183392113, + "grad_norm": 0.8261561393737793, + "learning_rate": 6.78387370227195e-05, + "loss": 5.2167, + "num_input_tokens_seen": 751042560, + "step": 5730 + }, + { + "epoch": 0.980314203270279, + "grad_norm": 0.7677420377731323, + "learning_rate": 6.782098516136031e-05, + "loss": 5.1776, + "num_input_tokens_seen": 751435776, + "step": 5733 + }, + { + "epoch": 0.9808271882013466, + "grad_norm": 0.7804604172706604, + "learning_rate": 6.780324722849218e-05, + "loss": 5.2386, + "num_input_tokens_seen": 751828992, + "step": 5736 + }, + { + "epoch": 0.9813401731324143, + "grad_norm": 0.8051115870475769, + "learning_rate": 6.778552320591031e-05, + "loss": 5.1832, + "num_input_tokens_seen": 752222208, + "step": 5739 + }, + { + "epoch": 0.9818531580634818, + "grad_norm": 0.7745217680931091, + "learning_rate": 6.776781307544323e-05, + "loss": 5.1991, + "num_input_tokens_seen": 752615424, + "step": 5742 + }, + { + "epoch": 0.9823661429945495, + "grad_norm": 0.7418707609176636, + "learning_rate": 6.775011681895259e-05, + "loss": 5.1977, + "num_input_tokens_seen": 753008640, + "step": 5745 + }, + { + "epoch": 0.9828791279256172, + "grad_norm": 0.7361640930175781, + "learning_rate": 6.773243441833328e-05, + "loss": 5.1713, + "num_input_tokens_seen": 753401856, + "step": 5748 + }, + { + "epoch": 0.9833921128566848, + "grad_norm": 0.7707425355911255, + "learning_rate": 6.771476585551323e-05, + "loss": 5.1812, + "num_input_tokens_seen": 753795072, + "step": 5751 + }, + { + "epoch": 0.9839050977877525, + "grad_norm": 0.7603244781494141, + "learning_rate": 6.769711111245329e-05, + "loss": 5.2091, + "num_input_tokens_seen": 754188288, + "step": 5754 + }, + { + "epoch": 0.9844180827188201, + "grad_norm": 0.771886944770813, + "learning_rate": 6.767947017114727e-05, + "loss": 5.1886, + "num_input_tokens_seen": 754581504, + "step": 5757 + }, + { + "epoch": 0.9849310676498878, + "grad_norm": 0.8924115896224976, + "learning_rate": 6.766184301362177e-05, + "loss": 5.1916, + "num_input_tokens_seen": 754974720, + "step": 5760 + }, + { + "epoch": 0.9854440525809555, + "grad_norm": 0.7867364287376404, + "learning_rate": 6.764422962193624e-05, + "loss": 5.1989, + "num_input_tokens_seen": 755367936, + "step": 5763 + }, + { + "epoch": 0.9859570375120231, + "grad_norm": 0.809303343296051, + "learning_rate": 6.762662997818266e-05, + "loss": 5.2264, + "num_input_tokens_seen": 755761152, + "step": 5766 + }, + { + "epoch": 0.9864700224430908, + "grad_norm": 0.7048402428627014, + "learning_rate": 6.760904406448573e-05, + "loss": 5.2237, + "num_input_tokens_seen": 756154368, + "step": 5769 + }, + { + "epoch": 0.9869830073741583, + "grad_norm": 0.7238898277282715, + "learning_rate": 6.759147186300257e-05, + "loss": 5.2013, + "num_input_tokens_seen": 756547584, + "step": 5772 + }, + { + "epoch": 0.987495992305226, + "grad_norm": 0.7172737121582031, + "learning_rate": 6.757391335592282e-05, + "loss": 5.1839, + "num_input_tokens_seen": 756940800, + "step": 5775 + }, + { + "epoch": 0.9880089772362937, + "grad_norm": 0.7212975025177002, + "learning_rate": 6.755636852546848e-05, + "loss": 5.1696, + "num_input_tokens_seen": 757334016, + "step": 5778 + }, + { + "epoch": 0.9885219621673613, + "grad_norm": 0.754808247089386, + "learning_rate": 6.753883735389383e-05, + "loss": 5.2154, + "num_input_tokens_seen": 757727232, + "step": 5781 + }, + { + "epoch": 0.989034947098429, + "grad_norm": 0.782254159450531, + "learning_rate": 6.752131982348533e-05, + "loss": 5.1923, + "num_input_tokens_seen": 758120448, + "step": 5784 + }, + { + "epoch": 0.9895479320294966, + "grad_norm": 0.7205175757408142, + "learning_rate": 6.750381591656167e-05, + "loss": 5.2076, + "num_input_tokens_seen": 758513664, + "step": 5787 + }, + { + "epoch": 0.9900609169605643, + "grad_norm": 0.7384617328643799, + "learning_rate": 6.748632561547353e-05, + "loss": 5.1796, + "num_input_tokens_seen": 758906880, + "step": 5790 + }, + { + "epoch": 0.990573901891632, + "grad_norm": 0.7156171202659607, + "learning_rate": 6.746884890260363e-05, + "loss": 5.1877, + "num_input_tokens_seen": 759300096, + "step": 5793 + }, + { + "epoch": 0.9910868868226996, + "grad_norm": 0.8681517243385315, + "learning_rate": 6.745138576036662e-05, + "loss": 5.1948, + "num_input_tokens_seen": 759693312, + "step": 5796 + }, + { + "epoch": 0.9915998717537672, + "grad_norm": 0.7191133499145508, + "learning_rate": 6.743393617120892e-05, + "loss": 5.2031, + "num_input_tokens_seen": 760086528, + "step": 5799 + }, + { + "epoch": 0.9921128566848348, + "grad_norm": 0.8175661563873291, + "learning_rate": 6.741650011760882e-05, + "loss": 5.1437, + "num_input_tokens_seen": 760479744, + "step": 5802 + }, + { + "epoch": 0.9926258416159025, + "grad_norm": 0.7410142421722412, + "learning_rate": 6.739907758207622e-05, + "loss": 5.1831, + "num_input_tokens_seen": 760872960, + "step": 5805 + }, + { + "epoch": 0.9931388265469702, + "grad_norm": 0.8488388061523438, + "learning_rate": 6.73816685471527e-05, + "loss": 5.1424, + "num_input_tokens_seen": 761266176, + "step": 5808 + }, + { + "epoch": 0.9936518114780378, + "grad_norm": 0.7565668821334839, + "learning_rate": 6.736427299541137e-05, + "loss": 5.1853, + "num_input_tokens_seen": 761659392, + "step": 5811 + }, + { + "epoch": 0.9941647964091055, + "grad_norm": 0.8777074217796326, + "learning_rate": 6.734689090945682e-05, + "loss": 5.167, + "num_input_tokens_seen": 762052608, + "step": 5814 + }, + { + "epoch": 0.9946777813401732, + "grad_norm": 0.8720703125, + "learning_rate": 6.732952227192505e-05, + "loss": 5.1915, + "num_input_tokens_seen": 762445824, + "step": 5817 + }, + { + "epoch": 0.9951907662712408, + "grad_norm": 0.8000132441520691, + "learning_rate": 6.731216706548339e-05, + "loss": 5.1721, + "num_input_tokens_seen": 762839040, + "step": 5820 + }, + { + "epoch": 0.9957037512023085, + "grad_norm": 0.733505368232727, + "learning_rate": 6.729482527283039e-05, + "loss": 5.2092, + "num_input_tokens_seen": 763232256, + "step": 5823 + }, + { + "epoch": 0.9962167361333761, + "grad_norm": 0.7550384402275085, + "learning_rate": 6.727749687669586e-05, + "loss": 5.1846, + "num_input_tokens_seen": 763625472, + "step": 5826 + }, + { + "epoch": 0.9967297210644437, + "grad_norm": 0.8135012984275818, + "learning_rate": 6.726018185984064e-05, + "loss": 5.215, + "num_input_tokens_seen": 764018688, + "step": 5829 + }, + { + "epoch": 0.9972427059955113, + "grad_norm": 0.7134667634963989, + "learning_rate": 6.724288020505667e-05, + "loss": 5.1611, + "num_input_tokens_seen": 764411904, + "step": 5832 + }, + { + "epoch": 0.997755690926579, + "grad_norm": 0.7443891167640686, + "learning_rate": 6.722559189516687e-05, + "loss": 5.1741, + "num_input_tokens_seen": 764805120, + "step": 5835 + }, + { + "epoch": 0.9982686758576467, + "grad_norm": 0.6709545850753784, + "learning_rate": 6.720831691302501e-05, + "loss": 5.1677, + "num_input_tokens_seen": 765198336, + "step": 5838 + }, + { + "epoch": 0.9987816607887143, + "grad_norm": 0.7193747758865356, + "learning_rate": 6.71910552415157e-05, + "loss": 5.2325, + "num_input_tokens_seen": 765591552, + "step": 5841 + }, + { + "epoch": 0.999294645719782, + "grad_norm": 0.7146051526069641, + "learning_rate": 6.71738068635543e-05, + "loss": 5.1577, + "num_input_tokens_seen": 765984768, + "step": 5844 + }, + { + "epoch": 0.9998076306508497, + "grad_norm": 0.6998146772384644, + "learning_rate": 6.715657176208689e-05, + "loss": 5.2431, + "num_input_tokens_seen": 766377984, + "step": 5847 + }, + { + "epoch": 0.9999786256278722, + "num_input_tokens_seen": 766509056, + "step": 5848, + "total_flos": 4.708536848052388e+17, + "train_loss": 5.594264277028972, + "train_runtime": 134120.2101, + "train_samples_per_second": 2.791, + "train_steps_per_second": 0.044 + } + ], + "logging_steps": 3, + "max_steps": 5848, + "num_input_tokens_seen": 766509056, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 4.708536848052388e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}