{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998921074234783, "eval_steps": 400, "global_step": 6661, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015011141081271255, "grad_norm": 0.8901742100715637, "learning_rate": 1.1976047904191619e-06, "loss": 2.5666, "num_input_tokens_seen": 5242880, "step": 10 }, { "epoch": 0.003002228216254251, "grad_norm": 0.8234608769416809, "learning_rate": 2.3952095808383237e-06, "loss": 2.5448, "num_input_tokens_seen": 10485760, "step": 20 }, { "epoch": 0.004503342324381376, "grad_norm": 0.6512044668197632, "learning_rate": 3.592814371257485e-06, "loss": 2.4883, "num_input_tokens_seen": 15728640, "step": 30 }, { "epoch": 0.006004456432508502, "grad_norm": 0.6285302639007568, "learning_rate": 4.7904191616766475e-06, "loss": 2.4675, "num_input_tokens_seen": 20971520, "step": 40 }, { "epoch": 0.007505570540635628, "grad_norm": 0.5729206204414368, "learning_rate": 5.9880239520958085e-06, "loss": 2.4332, "num_input_tokens_seen": 26214400, "step": 50 }, { "epoch": 0.009006684648762753, "grad_norm": 0.54844731092453, "learning_rate": 7.18562874251497e-06, "loss": 2.4374, "num_input_tokens_seen": 31457280, "step": 60 }, { "epoch": 0.01050779875688988, "grad_norm": 0.5901165008544922, "learning_rate": 8.383233532934131e-06, "loss": 2.4057, "num_input_tokens_seen": 36700160, "step": 70 }, { "epoch": 0.012008912865017004, "grad_norm": 0.6103231906890869, "learning_rate": 9.580838323353295e-06, "loss": 2.3765, "num_input_tokens_seen": 41943040, "step": 80 }, { "epoch": 0.013510026973144131, "grad_norm": 0.502941906452179, "learning_rate": 1.0778443113772457e-05, "loss": 2.3877, "num_input_tokens_seen": 47185920, "step": 90 }, { "epoch": 0.015011141081271256, "grad_norm": 0.6203625202178955, "learning_rate": 1.1976047904191617e-05, "loss": 2.3622, "num_input_tokens_seen": 52428800, "step": 100 }, { "epoch": 0.016512255189398382, "grad_norm": 0.658467173576355, "learning_rate": 1.3173652694610779e-05, "loss": 2.3567, "num_input_tokens_seen": 57671680, "step": 110 }, { "epoch": 0.018013369297525506, "grad_norm": 0.5568761229515076, "learning_rate": 1.437125748502994e-05, "loss": 2.3679, "num_input_tokens_seen": 62914560, "step": 120 }, { "epoch": 0.019514483405652632, "grad_norm": 0.6238291263580322, "learning_rate": 1.5568862275449103e-05, "loss": 2.3829, "num_input_tokens_seen": 68157440, "step": 130 }, { "epoch": 0.02101559751377976, "grad_norm": 0.6000885367393494, "learning_rate": 1.6766467065868263e-05, "loss": 2.3472, "num_input_tokens_seen": 73400320, "step": 140 }, { "epoch": 0.022516711621906885, "grad_norm": 0.6424698829650879, "learning_rate": 1.7964071856287426e-05, "loss": 2.3285, "num_input_tokens_seen": 78643200, "step": 150 }, { "epoch": 0.02401782573003401, "grad_norm": 0.6143007874488831, "learning_rate": 1.916167664670659e-05, "loss": 2.3151, "num_input_tokens_seen": 83886080, "step": 160 }, { "epoch": 0.025518939838161135, "grad_norm": 0.6206329464912415, "learning_rate": 2.035928143712575e-05, "loss": 2.3368, "num_input_tokens_seen": 89128960, "step": 170 }, { "epoch": 0.027020053946288262, "grad_norm": 0.6878198385238647, "learning_rate": 2.1556886227544914e-05, "loss": 2.3367, "num_input_tokens_seen": 94371840, "step": 180 }, { "epoch": 0.028521168054415385, "grad_norm": 0.8266871571540833, "learning_rate": 2.2754491017964074e-05, "loss": 2.3286, "num_input_tokens_seen": 99614720, "step": 190 }, { "epoch": 0.03002228216254251, "grad_norm": 0.5854138135910034, "learning_rate": 2.3952095808383234e-05, "loss": 2.303, "num_input_tokens_seen": 104857600, "step": 200 }, { "epoch": 0.031523396270669635, "grad_norm": 0.7110793590545654, "learning_rate": 2.5149700598802394e-05, "loss": 2.2985, "num_input_tokens_seen": 110100480, "step": 210 }, { "epoch": 0.033024510378796765, "grad_norm": 0.6561000943183899, "learning_rate": 2.6347305389221558e-05, "loss": 2.3154, "num_input_tokens_seen": 115343360, "step": 220 }, { "epoch": 0.03452562448692389, "grad_norm": 0.7813587188720703, "learning_rate": 2.754491017964072e-05, "loss": 2.3104, "num_input_tokens_seen": 120586240, "step": 230 }, { "epoch": 0.03602673859505101, "grad_norm": 0.7437533140182495, "learning_rate": 2.874251497005988e-05, "loss": 2.3098, "num_input_tokens_seen": 125829120, "step": 240 }, { "epoch": 0.03752785270317814, "grad_norm": 0.8770821690559387, "learning_rate": 2.9940119760479045e-05, "loss": 2.3061, "num_input_tokens_seen": 131072000, "step": 250 }, { "epoch": 0.039028966811305264, "grad_norm": 0.6238060593605042, "learning_rate": 3.1137724550898205e-05, "loss": 2.3052, "num_input_tokens_seen": 136314880, "step": 260 }, { "epoch": 0.040530080919432394, "grad_norm": 0.678520679473877, "learning_rate": 3.233532934131737e-05, "loss": 2.2934, "num_input_tokens_seen": 141557760, "step": 270 }, { "epoch": 0.04203119502755952, "grad_norm": 0.7275491952896118, "learning_rate": 3.3532934131736525e-05, "loss": 2.2938, "num_input_tokens_seen": 146800640, "step": 280 }, { "epoch": 0.04353230913568664, "grad_norm": 0.6745551824569702, "learning_rate": 3.473053892215569e-05, "loss": 2.3137, "num_input_tokens_seen": 152043520, "step": 290 }, { "epoch": 0.04503342324381377, "grad_norm": 0.8107547163963318, "learning_rate": 3.592814371257485e-05, "loss": 2.2833, "num_input_tokens_seen": 157286400, "step": 300 }, { "epoch": 0.046534537351940894, "grad_norm": 0.9518006443977356, "learning_rate": 3.7125748502994016e-05, "loss": 2.2883, "num_input_tokens_seen": 162529280, "step": 310 }, { "epoch": 0.04803565146006802, "grad_norm": 0.7946741580963135, "learning_rate": 3.832335329341318e-05, "loss": 2.292, "num_input_tokens_seen": 167772160, "step": 320 }, { "epoch": 0.04953676556819515, "grad_norm": 0.8819884657859802, "learning_rate": 3.9520958083832336e-05, "loss": 2.2874, "num_input_tokens_seen": 173015040, "step": 330 }, { "epoch": 0.05103787967632227, "grad_norm": 0.8569954633712769, "learning_rate": 3.9645487837613415e-05, "loss": 2.2701, "num_input_tokens_seen": 178257920, "step": 340 }, { "epoch": 0.05253899378444939, "grad_norm": 0.6152970194816589, "learning_rate": 3.907501942235145e-05, "loss": 2.2995, "num_input_tokens_seen": 183500800, "step": 350 }, { "epoch": 0.054040107892576524, "grad_norm": 0.6690587997436523, "learning_rate": 3.852848873813304e-05, "loss": 2.2663, "num_input_tokens_seen": 188743680, "step": 360 }, { "epoch": 0.05554122200070365, "grad_norm": 0.7796323895454407, "learning_rate": 3.800426718572961e-05, "loss": 2.2597, "num_input_tokens_seen": 193986560, "step": 370 }, { "epoch": 0.05704233610883077, "grad_norm": 0.9035688638687134, "learning_rate": 3.7500877182723136e-05, "loss": 2.2369, "num_input_tokens_seen": 199229440, "step": 380 }, { "epoch": 0.0585434502169579, "grad_norm": 0.7932724356651306, "learning_rate": 3.701697462322401e-05, "loss": 2.2514, "num_input_tokens_seen": 204472320, "step": 390 }, { "epoch": 0.06004456432508502, "grad_norm": 0.6233640909194946, "learning_rate": 3.6551333764994134e-05, "loss": 2.2572, "num_input_tokens_seen": 209715200, "step": 400 }, { "epoch": 0.06004456432508502, "eval_accuracy": 0.5491493691493692, "eval_loss": 2.2461655139923096, "eval_runtime": 95.7686, "eval_samples_per_second": 3.133, "eval_steps_per_second": 0.783, "num_input_tokens_seen": 209715200, "step": 400 }, { "epoch": 0.061545678433212146, "grad_norm": 0.5706452131271362, "learning_rate": 3.610283415670218e-05, "loss": 2.2674, "num_input_tokens_seen": 214958080, "step": 410 }, { "epoch": 0.06304679254133927, "grad_norm": 0.6556965112686157, "learning_rate": 3.567044928762396e-05, "loss": 2.2898, "num_input_tokens_seen": 220200960, "step": 420 }, { "epoch": 0.0645479066494664, "grad_norm": 0.7172439098358154, "learning_rate": 3.5253236697846886e-05, "loss": 2.2644, "num_input_tokens_seen": 225443840, "step": 430 }, { "epoch": 0.06604902075759353, "grad_norm": 0.6364426016807556, "learning_rate": 3.485032933195115e-05, "loss": 2.2551, "num_input_tokens_seen": 230686720, "step": 440 }, { "epoch": 0.06755013486572065, "grad_norm": 0.7587452530860901, "learning_rate": 3.446092795552023e-05, "loss": 2.2576, "num_input_tokens_seen": 235929600, "step": 450 }, { "epoch": 0.06905124897384778, "grad_norm": 0.6086819767951965, "learning_rate": 3.408429448345356e-05, "loss": 2.2768, "num_input_tokens_seen": 241172480, "step": 460 }, { "epoch": 0.0705523630819749, "grad_norm": 0.681690514087677, "learning_rate": 3.371974609328701e-05, "loss": 2.2744, "num_input_tokens_seen": 246415360, "step": 470 }, { "epoch": 0.07205347719010202, "grad_norm": 0.8943272233009338, "learning_rate": 3.336665001664587e-05, "loss": 2.2536, "num_input_tokens_seen": 251658240, "step": 480 }, { "epoch": 0.07355459129822915, "grad_norm": 1.1301355361938477, "learning_rate": 3.3024418918399747e-05, "loss": 2.2329, "num_input_tokens_seen": 256901120, "step": 490 }, { "epoch": 0.07505570540635628, "grad_norm": 0.7338290214538574, "learning_rate": 3.269250678672408e-05, "loss": 2.2592, "num_input_tokens_seen": 262144000, "step": 500 }, { "epoch": 0.07655681951448341, "grad_norm": 0.7819264531135559, "learning_rate": 3.23704052686231e-05, "loss": 2.2453, "num_input_tokens_seen": 267386880, "step": 510 }, { "epoch": 0.07805793362261053, "grad_norm": 0.6750678420066833, "learning_rate": 3.205764039495589e-05, "loss": 2.2514, "num_input_tokens_seen": 272629760, "step": 520 }, { "epoch": 0.07955904773073766, "grad_norm": 0.5943534970283508, "learning_rate": 3.175376964696401e-05, "loss": 2.2346, "num_input_tokens_seen": 277872640, "step": 530 }, { "epoch": 0.08106016183886479, "grad_norm": 0.7100771069526672, "learning_rate": 3.145837932299803e-05, "loss": 2.2316, "num_input_tokens_seen": 283115520, "step": 540 }, { "epoch": 0.0825612759469919, "grad_norm": 0.6773257851600647, "learning_rate": 3.117108216979904e-05, "loss": 2.2343, "num_input_tokens_seen": 288358400, "step": 550 }, { "epoch": 0.08406239005511904, "grad_norm": 0.8613666296005249, "learning_rate": 3.089151524748689e-05, "loss": 2.253, "num_input_tokens_seen": 293601280, "step": 560 }, { "epoch": 0.08556350416324617, "grad_norm": 0.5922672748565674, "learning_rate": 3.0619338001484006e-05, "loss": 2.2032, "num_input_tokens_seen": 298844160, "step": 570 }, { "epoch": 0.08706461827137328, "grad_norm": 0.8399538397789001, "learning_rate": 3.0354230518081458e-05, "loss": 2.2445, "num_input_tokens_seen": 304087040, "step": 580 }, { "epoch": 0.08856573237950041, "grad_norm": 0.8287783265113831, "learning_rate": 3.0095891943326868e-05, "loss": 2.2473, "num_input_tokens_seen": 309329920, "step": 590 }, { "epoch": 0.09006684648762754, "grad_norm": 0.6455395221710205, "learning_rate": 2.9844039047465857e-05, "loss": 2.2276, "num_input_tokens_seen": 314572800, "step": 600 }, { "epoch": 0.09156796059575466, "grad_norm": 0.6312438249588013, "learning_rate": 2.959840491936165e-05, "loss": 2.2383, "num_input_tokens_seen": 319815680, "step": 610 }, { "epoch": 0.09306907470388179, "grad_norm": 0.5632859468460083, "learning_rate": 2.9358737777209838e-05, "loss": 2.2342, "num_input_tokens_seen": 325058560, "step": 620 }, { "epoch": 0.09457018881200892, "grad_norm": 0.6137943863868713, "learning_rate": 2.9124799883500802e-05, "loss": 2.2058, "num_input_tokens_seen": 330301440, "step": 630 }, { "epoch": 0.09607130292013603, "grad_norm": 0.7079706788063049, "learning_rate": 2.8896366553599782e-05, "loss": 2.2235, "num_input_tokens_seen": 335544320, "step": 640 }, { "epoch": 0.09757241702826316, "grad_norm": 0.6246356964111328, "learning_rate": 2.8673225248545833e-05, "loss": 2.2363, "num_input_tokens_seen": 340787200, "step": 650 }, { "epoch": 0.0990735311363903, "grad_norm": 0.7458660006523132, "learning_rate": 2.8455174743743358e-05, "loss": 2.2199, "num_input_tokens_seen": 346030080, "step": 660 }, { "epoch": 0.10057464524451741, "grad_norm": 0.6742898225784302, "learning_rate": 2.8242024366155264e-05, "loss": 2.2293, "num_input_tokens_seen": 351272960, "step": 670 }, { "epoch": 0.10207575935264454, "grad_norm": 0.5833361148834229, "learning_rate": 2.8033593293425236e-05, "loss": 2.2346, "num_input_tokens_seen": 356515840, "step": 680 }, { "epoch": 0.10357687346077167, "grad_norm": 0.6706376075744629, "learning_rate": 2.7829709909073584e-05, "loss": 2.2353, "num_input_tokens_seen": 361758720, "step": 690 }, { "epoch": 0.10507798756889879, "grad_norm": 0.7505273222923279, "learning_rate": 2.763021120854076e-05, "loss": 2.2463, "num_input_tokens_seen": 367001600, "step": 700 }, { "epoch": 0.10657910167702592, "grad_norm": 0.6324784755706787, "learning_rate": 2.743494225140684e-05, "loss": 2.2208, "num_input_tokens_seen": 372244480, "step": 710 }, { "epoch": 0.10808021578515305, "grad_norm": 0.6054283380508423, "learning_rate": 2.7243755655603398e-05, "loss": 2.214, "num_input_tokens_seen": 377487360, "step": 720 }, { "epoch": 0.10958132989328016, "grad_norm": 0.6269551515579224, "learning_rate": 2.7056511129865727e-05, "loss": 2.2426, "num_input_tokens_seen": 382730240, "step": 730 }, { "epoch": 0.1110824440014073, "grad_norm": 0.8448464870452881, "learning_rate": 2.6873075041054796e-05, "loss": 2.2214, "num_input_tokens_seen": 387973120, "step": 740 }, { "epoch": 0.11258355810953442, "grad_norm": 0.7521204948425293, "learning_rate": 2.669332001331669e-05, "loss": 2.2033, "num_input_tokens_seen": 393216000, "step": 750 }, { "epoch": 0.11408467221766154, "grad_norm": 0.5531004071235657, "learning_rate": 2.6517124556347397e-05, "loss": 2.2177, "num_input_tokens_seen": 398458880, "step": 760 }, { "epoch": 0.11558578632578867, "grad_norm": 0.5678554773330688, "learning_rate": 2.6344372720297863e-05, "loss": 2.2527, "num_input_tokens_seen": 403701760, "step": 770 }, { "epoch": 0.1170869004339158, "grad_norm": 0.5761403441429138, "learning_rate": 2.6174953775092044e-05, "loss": 2.1891, "num_input_tokens_seen": 408944640, "step": 780 }, { "epoch": 0.11858801454204292, "grad_norm": 0.614530622959137, "learning_rate": 2.6008761912142832e-05, "loss": 2.2389, "num_input_tokens_seen": 414187520, "step": 790 }, { "epoch": 0.12008912865017005, "grad_norm": 0.8077418208122253, "learning_rate": 2.584569596664017e-05, "loss": 2.2173, "num_input_tokens_seen": 419430400, "step": 800 }, { "epoch": 0.12008912865017005, "eval_accuracy": 0.5564314204314205, "eval_loss": 2.1939218044281006, "eval_runtime": 94.2566, "eval_samples_per_second": 3.183, "eval_steps_per_second": 0.796, "num_input_tokens_seen": 419430400, "step": 800 }, { "epoch": 0.12159024275829718, "grad_norm": 0.5838737487792969, "learning_rate": 2.5685659158755362e-05, "loss": 2.2407, "num_input_tokens_seen": 424673280, "step": 810 }, { "epoch": 0.12309135686642429, "grad_norm": 0.5616093873977661, "learning_rate": 2.5528558852257423e-05, "loss": 2.2387, "num_input_tokens_seen": 429916160, "step": 820 }, { "epoch": 0.12459247097455142, "grad_norm": 0.5916436910629272, "learning_rate": 2.537430632917375e-05, "loss": 2.1901, "num_input_tokens_seen": 435159040, "step": 830 }, { "epoch": 0.12609358508267854, "grad_norm": 0.7512922286987305, "learning_rate": 2.5222816579249755e-05, "loss": 2.2354, "num_input_tokens_seen": 440401920, "step": 840 }, { "epoch": 0.12759469919080568, "grad_norm": 0.6344349384307861, "learning_rate": 2.507400810307242e-05, "loss": 2.18, "num_input_tokens_seen": 445644800, "step": 850 }, { "epoch": 0.1290958132989328, "grad_norm": 0.6193192005157471, "learning_rate": 2.4927802727821985e-05, "loss": 2.2427, "num_input_tokens_seen": 450887680, "step": 860 }, { "epoch": 0.13059692740705992, "grad_norm": 0.5937076807022095, "learning_rate": 2.4784125434705546e-05, "loss": 2.2096, "num_input_tokens_seen": 456130560, "step": 870 }, { "epoch": 0.13209804151518706, "grad_norm": 0.6233764886856079, "learning_rate": 2.4642904197207104e-05, "loss": 2.2124, "num_input_tokens_seen": 461373440, "step": 880 }, { "epoch": 0.13359915562331418, "grad_norm": 0.7363754510879517, "learning_rate": 2.4504069829361966e-05, "loss": 2.1925, "num_input_tokens_seen": 466616320, "step": 890 }, { "epoch": 0.1351002697314413, "grad_norm": 0.5889034867286682, "learning_rate": 2.4367555843329425e-05, "loss": 2.2139, "num_input_tokens_seen": 471859200, "step": 900 }, { "epoch": 0.13660138383956844, "grad_norm": 0.5365704894065857, "learning_rate": 2.4233298315597637e-05, "loss": 2.2096, "num_input_tokens_seen": 477102080, "step": 910 }, { "epoch": 0.13810249794769555, "grad_norm": 0.5648247599601746, "learning_rate": 2.4101235761209246e-05, "loss": 2.1803, "num_input_tokens_seen": 482344960, "step": 920 }, { "epoch": 0.13960361205582267, "grad_norm": 0.6017926931381226, "learning_rate": 2.3971309015445497e-05, "loss": 2.1787, "num_input_tokens_seen": 487587840, "step": 930 }, { "epoch": 0.1411047261639498, "grad_norm": 0.6408767700195312, "learning_rate": 2.384346112245184e-05, "loss": 2.1998, "num_input_tokens_seen": 492830720, "step": 940 }, { "epoch": 0.14260584027207693, "grad_norm": 0.635210394859314, "learning_rate": 2.3717637230328695e-05, "loss": 2.2327, "num_input_tokens_seen": 498073600, "step": 950 }, { "epoch": 0.14410695438020404, "grad_norm": 0.6507686972618103, "learning_rate": 2.3593784492248522e-05, "loss": 2.2162, "num_input_tokens_seen": 503316480, "step": 960 }, { "epoch": 0.1456080684883312, "grad_norm": 0.6332815885543823, "learning_rate": 2.3471851973194328e-05, "loss": 2.2287, "num_input_tokens_seen": 508559360, "step": 970 }, { "epoch": 0.1471091825964583, "grad_norm": 0.6600320339202881, "learning_rate": 2.3351790561945772e-05, "loss": 2.1837, "num_input_tokens_seen": 513802240, "step": 980 }, { "epoch": 0.14861029670458545, "grad_norm": 0.5689282417297363, "learning_rate": 2.3233552887967433e-05, "loss": 2.1926, "num_input_tokens_seen": 519045120, "step": 990 }, { "epoch": 0.15011141081271256, "grad_norm": 0.762301504611969, "learning_rate": 2.3117093242879823e-05, "loss": 2.2092, "num_input_tokens_seen": 524288000, "step": 1000 }, { "epoch": 0.15161252492083968, "grad_norm": 0.5970498919487, "learning_rate": 2.3002367506217465e-05, "loss": 2.2208, "num_input_tokens_seen": 529530880, "step": 1010 }, { "epoch": 0.15311363902896682, "grad_norm": 0.7378530502319336, "learning_rate": 2.288933307520014e-05, "loss": 2.193, "num_input_tokens_seen": 534773760, "step": 1020 }, { "epoch": 0.15461475313709394, "grad_norm": 0.6843045949935913, "learning_rate": 2.2777948798263435e-05, "loss": 2.201, "num_input_tokens_seen": 540016640, "step": 1030 }, { "epoch": 0.15611586724522106, "grad_norm": 0.6419305205345154, "learning_rate": 2.2668174912113104e-05, "loss": 2.2008, "num_input_tokens_seen": 545259520, "step": 1040 }, { "epoch": 0.1576169813533482, "grad_norm": 0.6570454239845276, "learning_rate": 2.2559972982084464e-05, "loss": 2.1878, "num_input_tokens_seen": 550502400, "step": 1050 }, { "epoch": 0.15911809546147532, "grad_norm": 0.6204289197921753, "learning_rate": 2.2453305845603817e-05, "loss": 2.1965, "num_input_tokens_seen": 555745280, "step": 1060 }, { "epoch": 0.16061920956960243, "grad_norm": 0.7957982420921326, "learning_rate": 2.234813755856288e-05, "loss": 2.1886, "num_input_tokens_seen": 560988160, "step": 1070 }, { "epoch": 0.16212032367772958, "grad_norm": 0.6827828288078308, "learning_rate": 2.2244433344430573e-05, "loss": 2.1646, "num_input_tokens_seen": 566231040, "step": 1080 }, { "epoch": 0.1636214377858567, "grad_norm": 0.8159109354019165, "learning_rate": 2.2142159545938563e-05, "loss": 2.1966, "num_input_tokens_seen": 571473920, "step": 1090 }, { "epoch": 0.1651225518939838, "grad_norm": 0.6716334223747253, "learning_rate": 2.2041283579187985e-05, "loss": 2.173, "num_input_tokens_seen": 576716800, "step": 1100 }, { "epoch": 0.16662366600211095, "grad_norm": 0.5386361479759216, "learning_rate": 2.1941773890035455e-05, "loss": 2.1958, "num_input_tokens_seen": 581959680, "step": 1110 }, { "epoch": 0.16812478011023807, "grad_norm": 0.5818248391151428, "learning_rate": 2.1843599912625603e-05, "loss": 2.2007, "num_input_tokens_seen": 587202560, "step": 1120 }, { "epoch": 0.1696258942183652, "grad_norm": 0.6868605613708496, "learning_rate": 2.17467320299465e-05, "loss": 2.2067, "num_input_tokens_seen": 592445440, "step": 1130 }, { "epoch": 0.17112700832649233, "grad_norm": 0.6540196537971497, "learning_rate": 2.1651141536292296e-05, "loss": 2.17, "num_input_tokens_seen": 597688320, "step": 1140 }, { "epoch": 0.17262812243461945, "grad_norm": 0.5799123644828796, "learning_rate": 2.1556800601525106e-05, "loss": 2.1932, "num_input_tokens_seen": 602931200, "step": 1150 }, { "epoch": 0.17412923654274656, "grad_norm": 0.7334414124488831, "learning_rate": 2.1463682237035052e-05, "loss": 2.1795, "num_input_tokens_seen": 608174080, "step": 1160 }, { "epoch": 0.1756303506508737, "grad_norm": 0.5947369337081909, "learning_rate": 2.137176026330393e-05, "loss": 2.1544, "num_input_tokens_seen": 613416960, "step": 1170 }, { "epoch": 0.17713146475900082, "grad_norm": 0.6859150528907776, "learning_rate": 2.128100927898401e-05, "loss": 2.1821, "num_input_tokens_seen": 618659840, "step": 1180 }, { "epoch": 0.17863257886712794, "grad_norm": 0.6155475378036499, "learning_rate": 2.1191404631408953e-05, "loss": 2.1643, "num_input_tokens_seen": 623902720, "step": 1190 }, { "epoch": 0.18013369297525508, "grad_norm": 0.5512512922286987, "learning_rate": 2.110292238845922e-05, "loss": 2.1992, "num_input_tokens_seen": 629145600, "step": 1200 }, { "epoch": 0.18013369297525508, "eval_accuracy": 0.5604013024013024, "eval_loss": 2.1688950061798096, "eval_runtime": 95.4401, "eval_samples_per_second": 3.143, "eval_steps_per_second": 0.786, "num_input_tokens_seen": 629145600, "step": 1200 }, { "epoch": 0.1816348070833822, "grad_norm": 0.6315280795097351, "learning_rate": 2.1015539311708933e-05, "loss": 2.1888, "num_input_tokens_seen": 634388480, "step": 1210 }, { "epoch": 0.18313592119150932, "grad_norm": 0.6135600209236145, "learning_rate": 2.092923283078589e-05, "loss": 2.2094, "num_input_tokens_seen": 639631360, "step": 1220 }, { "epoch": 0.18463703529963646, "grad_norm": 0.7772483229637146, "learning_rate": 2.084398101888042e-05, "loss": 2.1959, "num_input_tokens_seen": 644874240, "step": 1230 }, { "epoch": 0.18613814940776358, "grad_norm": 0.7412639856338501, "learning_rate": 2.0759762569342743e-05, "loss": 2.1783, "num_input_tokens_seen": 650117120, "step": 1240 }, { "epoch": 0.1876392635158907, "grad_norm": 0.8701518177986145, "learning_rate": 2.0676556773312137e-05, "loss": 2.1231, "num_input_tokens_seen": 655360000, "step": 1250 }, { "epoch": 0.18914037762401784, "grad_norm": 0.7169715762138367, "learning_rate": 2.059434349832459e-05, "loss": 2.1964, "num_input_tokens_seen": 660602880, "step": 1260 }, { "epoch": 0.19064149173214495, "grad_norm": 0.5910638570785522, "learning_rate": 2.0513103167848672e-05, "loss": 2.1915, "num_input_tokens_seen": 665845760, "step": 1270 }, { "epoch": 0.19214260584027207, "grad_norm": 0.5827318429946899, "learning_rate": 2.0432816741702553e-05, "loss": 2.2007, "num_input_tokens_seen": 671088640, "step": 1280 }, { "epoch": 0.1936437199483992, "grad_norm": 0.6388292908668518, "learning_rate": 2.0353465697307495e-05, "loss": 2.1615, "num_input_tokens_seen": 676331520, "step": 1290 }, { "epoch": 0.19514483405652633, "grad_norm": 0.6320261359214783, "learning_rate": 2.0275032011736088e-05, "loss": 2.1983, "num_input_tokens_seen": 681574400, "step": 1300 }, { "epoch": 0.19664594816465344, "grad_norm": 0.6461578011512756, "learning_rate": 2.019749814451557e-05, "loss": 2.177, "num_input_tokens_seen": 686817280, "step": 1310 }, { "epoch": 0.1981470622727806, "grad_norm": 0.7476032376289368, "learning_rate": 2.0120847021149106e-05, "loss": 2.1549, "num_input_tokens_seen": 692060160, "step": 1320 }, { "epoch": 0.1996481763809077, "grad_norm": 0.6539933681488037, "learning_rate": 2.004506201731977e-05, "loss": 2.1761, "num_input_tokens_seen": 697303040, "step": 1330 }, { "epoch": 0.20114929048903482, "grad_norm": 0.5645061135292053, "learning_rate": 1.9970126943744094e-05, "loss": 2.1721, "num_input_tokens_seen": 702545920, "step": 1340 }, { "epoch": 0.20265040459716196, "grad_norm": 0.6610363125801086, "learning_rate": 1.9896026031643902e-05, "loss": 2.1748, "num_input_tokens_seen": 707788800, "step": 1350 }, { "epoch": 0.20415151870528908, "grad_norm": 0.6691358089447021, "learning_rate": 1.9822743918806708e-05, "loss": 2.1685, "num_input_tokens_seen": 713031680, "step": 1360 }, { "epoch": 0.2056526328134162, "grad_norm": 0.6636555790901184, "learning_rate": 1.9750265636206772e-05, "loss": 2.1817, "num_input_tokens_seen": 718274560, "step": 1370 }, { "epoch": 0.20715374692154334, "grad_norm": 0.7346318960189819, "learning_rate": 1.9678576595160388e-05, "loss": 2.1833, "num_input_tokens_seen": 723517440, "step": 1380 }, { "epoch": 0.20865486102967046, "grad_norm": 0.5943858623504639, "learning_rate": 1.960766257499033e-05, "loss": 2.2494, "num_input_tokens_seen": 728760320, "step": 1390 }, { "epoch": 0.21015597513779757, "grad_norm": 0.7019902467727661, "learning_rate": 1.9537509711175725e-05, "loss": 2.2019, "num_input_tokens_seen": 734003200, "step": 1400 }, { "epoch": 0.21165708924592472, "grad_norm": 0.5532287955284119, "learning_rate": 1.946810448396509e-05, "loss": 2.1975, "num_input_tokens_seen": 739246080, "step": 1410 }, { "epoch": 0.21315820335405183, "grad_norm": 0.5484923720359802, "learning_rate": 1.939943370743111e-05, "loss": 2.1773, "num_input_tokens_seen": 744488960, "step": 1420 }, { "epoch": 0.21465931746217895, "grad_norm": 0.6313416957855225, "learning_rate": 1.9331484518947162e-05, "loss": 2.1919, "num_input_tokens_seen": 749731840, "step": 1430 }, { "epoch": 0.2161604315703061, "grad_norm": 0.5967773795127869, "learning_rate": 1.926424436906652e-05, "loss": 2.1957, "num_input_tokens_seen": 754974720, "step": 1440 }, { "epoch": 0.2176615456784332, "grad_norm": 0.5972282290458679, "learning_rate": 1.919770101178605e-05, "loss": 2.1647, "num_input_tokens_seen": 760217600, "step": 1450 }, { "epoch": 0.21916265978656033, "grad_norm": 0.5539584755897522, "learning_rate": 1.9131842495177355e-05, "loss": 2.1815, "num_input_tokens_seen": 765460480, "step": 1460 }, { "epoch": 0.22066377389468747, "grad_norm": 0.7532745003700256, "learning_rate": 1.9066657152369064e-05, "loss": 2.2143, "num_input_tokens_seen": 770703360, "step": 1470 }, { "epoch": 0.2221648880028146, "grad_norm": 0.537327229976654, "learning_rate": 1.9002133592864807e-05, "loss": 2.1541, "num_input_tokens_seen": 775946240, "step": 1480 }, { "epoch": 0.2236660021109417, "grad_norm": 0.7467483878135681, "learning_rate": 1.893826069418217e-05, "loss": 2.1638, "num_input_tokens_seen": 781189120, "step": 1490 }, { "epoch": 0.22516711621906885, "grad_norm": 0.5528019070625305, "learning_rate": 1.8875027593798816e-05, "loss": 2.1865, "num_input_tokens_seen": 786432000, "step": 1500 }, { "epoch": 0.22666823032719596, "grad_norm": 0.6948214173316956, "learning_rate": 1.8812423681392357e-05, "loss": 2.166, "num_input_tokens_seen": 791674880, "step": 1510 }, { "epoch": 0.22816934443532308, "grad_norm": 0.6632937788963318, "learning_rate": 1.8750438591361568e-05, "loss": 2.1864, "num_input_tokens_seen": 796917760, "step": 1520 }, { "epoch": 0.22967045854345022, "grad_norm": 0.5619977116584778, "learning_rate": 1.8689062195616828e-05, "loss": 2.192, "num_input_tokens_seen": 802160640, "step": 1530 }, { "epoch": 0.23117157265157734, "grad_norm": 0.6643936038017273, "learning_rate": 1.8628284596628512e-05, "loss": 2.1749, "num_input_tokens_seen": 807403520, "step": 1540 }, { "epoch": 0.23267268675970446, "grad_norm": 0.6980964541435242, "learning_rate": 1.8568096120722424e-05, "loss": 2.1662, "num_input_tokens_seen": 812646400, "step": 1550 }, { "epoch": 0.2341738008678316, "grad_norm": 0.8099634647369385, "learning_rate": 1.8508487311612005e-05, "loss": 2.1778, "num_input_tokens_seen": 817889280, "step": 1560 }, { "epoch": 0.23567491497595872, "grad_norm": 0.5385667085647583, "learning_rate": 1.844944892415748e-05, "loss": 2.1536, "num_input_tokens_seen": 823132160, "step": 1570 }, { "epoch": 0.23717602908408583, "grad_norm": 0.70493483543396, "learning_rate": 1.8390971918342592e-05, "loss": 2.192, "num_input_tokens_seen": 828375040, "step": 1580 }, { "epoch": 0.23867714319221298, "grad_norm": 0.5577861666679382, "learning_rate": 1.833304745346004e-05, "loss": 2.1941, "num_input_tokens_seen": 833617920, "step": 1590 }, { "epoch": 0.2401782573003401, "grad_norm": 0.5603963136672974, "learning_rate": 1.8275666882497067e-05, "loss": 2.1543, "num_input_tokens_seen": 838860800, "step": 1600 }, { "epoch": 0.2401782573003401, "eval_accuracy": 0.5632421652421652, "eval_loss": 2.1520919799804688, "eval_runtime": 94.9668, "eval_samples_per_second": 3.159, "eval_steps_per_second": 0.79, "num_input_tokens_seen": 838860800, "step": 1600 }, { "epoch": 0.2416793714084672, "grad_norm": 0.6568350791931152, "learning_rate": 1.821882174671319e-05, "loss": 2.1729, "num_input_tokens_seen": 844103680, "step": 1610 }, { "epoch": 0.24318048551659435, "grad_norm": 0.6658337712287903, "learning_rate": 1.8162503770402265e-05, "loss": 2.1954, "num_input_tokens_seen": 849346560, "step": 1620 }, { "epoch": 0.24468159962472147, "grad_norm": 0.6133268475532532, "learning_rate": 1.8106704855831622e-05, "loss": 2.1887, "num_input_tokens_seen": 854589440, "step": 1630 }, { "epoch": 0.24618271373284858, "grad_norm": 0.6844907402992249, "learning_rate": 1.805141707835109e-05, "loss": 2.1772, "num_input_tokens_seen": 859832320, "step": 1640 }, { "epoch": 0.24768382784097573, "grad_norm": 0.6765184998512268, "learning_rate": 1.799663268166542e-05, "loss": 2.1631, "num_input_tokens_seen": 865075200, "step": 1650 }, { "epoch": 0.24918494194910285, "grad_norm": 0.7173587083816528, "learning_rate": 1.7942344073263488e-05, "loss": 2.1347, "num_input_tokens_seen": 870318080, "step": 1660 }, { "epoch": 0.25068605605723, "grad_norm": 0.907135546207428, "learning_rate": 1.788854381999832e-05, "loss": 2.1809, "num_input_tokens_seen": 875560960, "step": 1670 }, { "epoch": 0.2521871701653571, "grad_norm": 0.6652698516845703, "learning_rate": 1.783522464381198e-05, "loss": 2.1558, "num_input_tokens_seen": 880803840, "step": 1680 }, { "epoch": 0.2536882842734842, "grad_norm": 0.5732602477073669, "learning_rate": 1.7782379417599864e-05, "loss": 2.1788, "num_input_tokens_seen": 886046720, "step": 1690 }, { "epoch": 0.25518939838161137, "grad_norm": 0.5569264888763428, "learning_rate": 1.7730001161208948e-05, "loss": 2.1375, "num_input_tokens_seen": 891289600, "step": 1700 }, { "epoch": 0.25669051248973845, "grad_norm": 0.6948941946029663, "learning_rate": 1.767808303756493e-05, "loss": 2.148, "num_input_tokens_seen": 896532480, "step": 1710 }, { "epoch": 0.2581916265978656, "grad_norm": 0.5902568101882935, "learning_rate": 1.7626618348923443e-05, "loss": 2.1651, "num_input_tokens_seen": 901775360, "step": 1720 }, { "epoch": 0.25969274070599274, "grad_norm": 0.6689503192901611, "learning_rate": 1.757560053324057e-05, "loss": 2.1761, "num_input_tokens_seen": 907018240, "step": 1730 }, { "epoch": 0.26119385481411983, "grad_norm": 0.6513310074806213, "learning_rate": 1.7525023160658283e-05, "loss": 2.195, "num_input_tokens_seen": 912261120, "step": 1740 }, { "epoch": 0.262694968922247, "grad_norm": 0.5774445533752441, "learning_rate": 1.7474879930100483e-05, "loss": 2.1681, "num_input_tokens_seen": 917504000, "step": 1750 }, { "epoch": 0.2641960830303741, "grad_norm": 0.583354115486145, "learning_rate": 1.7425164665975574e-05, "loss": 2.1619, "num_input_tokens_seen": 922746880, "step": 1760 }, { "epoch": 0.2656971971385012, "grad_norm": 0.6275432109832764, "learning_rate": 1.7375871314981654e-05, "loss": 2.1819, "num_input_tokens_seen": 927989760, "step": 1770 }, { "epoch": 0.26719831124662835, "grad_norm": 0.6457045674324036, "learning_rate": 1.732699394301053e-05, "loss": 2.135, "num_input_tokens_seen": 933232640, "step": 1780 }, { "epoch": 0.2686994253547555, "grad_norm": 0.6066982746124268, "learning_rate": 1.7278526732147035e-05, "loss": 2.1782, "num_input_tokens_seen": 938475520, "step": 1790 }, { "epoch": 0.2702005394628826, "grad_norm": 0.6707212328910828, "learning_rate": 1.7230463977760115e-05, "loss": 2.2126, "num_input_tokens_seen": 943718400, "step": 1800 }, { "epoch": 0.2717016535710097, "grad_norm": 0.6359196305274963, "learning_rate": 1.71828000856825e-05, "loss": 2.1631, "num_input_tokens_seen": 948961280, "step": 1810 }, { "epoch": 0.27320276767913687, "grad_norm": 0.5916318297386169, "learning_rate": 1.713552956947563e-05, "loss": 2.1535, "num_input_tokens_seen": 954204160, "step": 1820 }, { "epoch": 0.27470388178726396, "grad_norm": 0.7036842107772827, "learning_rate": 1.708864704777699e-05, "loss": 2.153, "num_input_tokens_seen": 959447040, "step": 1830 }, { "epoch": 0.2762049958953911, "grad_norm": 0.6982750296592712, "learning_rate": 1.704214724172678e-05, "loss": 2.1806, "num_input_tokens_seen": 964689920, "step": 1840 }, { "epoch": 0.27770611000351825, "grad_norm": 0.6548587083816528, "learning_rate": 1.699602497247121e-05, "loss": 2.1547, "num_input_tokens_seen": 969932800, "step": 1850 }, { "epoch": 0.27920722411164534, "grad_norm": 0.6462974548339844, "learning_rate": 1.6950275158739732e-05, "loss": 2.1697, "num_input_tokens_seen": 975175680, "step": 1860 }, { "epoch": 0.2807083382197725, "grad_norm": 0.5359413623809814, "learning_rate": 1.690489281449364e-05, "loss": 2.158, "num_input_tokens_seen": 980418560, "step": 1870 }, { "epoch": 0.2822094523278996, "grad_norm": 0.6209542155265808, "learning_rate": 1.6859873046643506e-05, "loss": 2.1345, "num_input_tokens_seen": 985661440, "step": 1880 }, { "epoch": 0.2837105664360267, "grad_norm": 0.602796733379364, "learning_rate": 1.681521105283317e-05, "loss": 2.1774, "num_input_tokens_seen": 990904320, "step": 1890 }, { "epoch": 0.28521168054415386, "grad_norm": 0.6523065567016602, "learning_rate": 1.6770902119287942e-05, "loss": 2.2011, "num_input_tokens_seen": 996147200, "step": 1900 }, { "epoch": 0.286712794652281, "grad_norm": 0.5961717963218689, "learning_rate": 1.672694161872488e-05, "loss": 2.1722, "num_input_tokens_seen": 1001390080, "step": 1910 }, { "epoch": 0.2882139087604081, "grad_norm": 0.5899094343185425, "learning_rate": 1.6683325008322934e-05, "loss": 2.1353, "num_input_tokens_seen": 1006632960, "step": 1920 }, { "epoch": 0.28971502286853523, "grad_norm": 0.590152382850647, "learning_rate": 1.6640047827751115e-05, "loss": 2.186, "num_input_tokens_seen": 1011875840, "step": 1930 }, { "epoch": 0.2912161369766624, "grad_norm": 0.6230789422988892, "learning_rate": 1.6597105697252553e-05, "loss": 2.1719, "num_input_tokens_seen": 1017118720, "step": 1940 }, { "epoch": 0.29271725108478946, "grad_norm": 0.6433463096618652, "learning_rate": 1.655449431578271e-05, "loss": 2.1711, "num_input_tokens_seen": 1022361600, "step": 1950 }, { "epoch": 0.2942183651929166, "grad_norm": 0.6224901080131531, "learning_rate": 1.6512209459199873e-05, "loss": 2.1473, "num_input_tokens_seen": 1027604480, "step": 1960 }, { "epoch": 0.29571947930104375, "grad_norm": 0.6015456914901733, "learning_rate": 1.6470246978506274e-05, "loss": 2.1788, "num_input_tokens_seen": 1032847360, "step": 1970 }, { "epoch": 0.2972205934091709, "grad_norm": 0.7380375266075134, "learning_rate": 1.6428602798138068e-05, "loss": 2.1566, "num_input_tokens_seen": 1038090240, "step": 1980 }, { "epoch": 0.298721707517298, "grad_norm": 0.5713395476341248, "learning_rate": 1.638727291430271e-05, "loss": 2.1596, "num_input_tokens_seen": 1043333120, "step": 1990 }, { "epoch": 0.30022282162542513, "grad_norm": 0.6106094717979431, "learning_rate": 1.634625339336204e-05, "loss": 2.1532, "num_input_tokens_seen": 1048576000, "step": 2000 }, { "epoch": 0.30022282162542513, "eval_accuracy": 0.565023199023199, "eval_loss": 2.1400792598724365, "eval_runtime": 95.5183, "eval_samples_per_second": 3.141, "eval_steps_per_second": 0.785, "num_input_tokens_seen": 1048576000, "step": 2000 }, { "epoch": 0.3017239357335523, "grad_norm": 0.6882668733596802, "learning_rate": 1.6305540370259715e-05, "loss": 2.1681, "num_input_tokens_seen": 1053818880, "step": 2010 }, { "epoch": 0.30322504984167936, "grad_norm": 0.5827280282974243, "learning_rate": 1.6265130046991463e-05, "loss": 2.1699, "num_input_tokens_seen": 1059061760, "step": 2020 }, { "epoch": 0.3047261639498065, "grad_norm": 0.5182669162750244, "learning_rate": 1.6225018691116927e-05, "loss": 2.1495, "num_input_tokens_seen": 1064304640, "step": 2030 }, { "epoch": 0.30622727805793365, "grad_norm": 0.8182618618011475, "learning_rate": 1.618520263431155e-05, "loss": 2.1837, "num_input_tokens_seen": 1069547520, "step": 2040 }, { "epoch": 0.30772839216606074, "grad_norm": 0.5096232891082764, "learning_rate": 1.6145678270957475e-05, "loss": 2.1489, "num_input_tokens_seen": 1074790400, "step": 2050 }, { "epoch": 0.3092295062741879, "grad_norm": 0.6253472566604614, "learning_rate": 1.6106442056772048e-05, "loss": 2.1699, "num_input_tokens_seen": 1080033280, "step": 2060 }, { "epoch": 0.310730620382315, "grad_norm": 0.5389710068702698, "learning_rate": 1.6067490507472832e-05, "loss": 2.1715, "num_input_tokens_seen": 1085276160, "step": 2070 }, { "epoch": 0.3122317344904421, "grad_norm": 0.6460773348808289, "learning_rate": 1.6028820197477945e-05, "loss": 2.1601, "num_input_tokens_seen": 1090519040, "step": 2080 }, { "epoch": 0.31373284859856926, "grad_norm": 0.7538013458251953, "learning_rate": 1.5990427758640674e-05, "loss": 2.1677, "num_input_tokens_seen": 1095761920, "step": 2090 }, { "epoch": 0.3152339627066964, "grad_norm": 0.6462552547454834, "learning_rate": 1.5952309879017224e-05, "loss": 2.1805, "num_input_tokens_seen": 1101004800, "step": 2100 }, { "epoch": 0.3167350768148235, "grad_norm": 0.6027917861938477, "learning_rate": 1.5914463301666658e-05, "loss": 2.1415, "num_input_tokens_seen": 1106247680, "step": 2110 }, { "epoch": 0.31823619092295063, "grad_norm": 0.6895632147789001, "learning_rate": 1.5876884823482006e-05, "loss": 2.1587, "num_input_tokens_seen": 1111490560, "step": 2120 }, { "epoch": 0.3197373050310778, "grad_norm": 0.6652107834815979, "learning_rate": 1.5839571294051578e-05, "loss": 2.1444, "num_input_tokens_seen": 1116733440, "step": 2130 }, { "epoch": 0.32123841913920487, "grad_norm": 0.7466081380844116, "learning_rate": 1.580251961454958e-05, "loss": 2.1251, "num_input_tokens_seen": 1121976320, "step": 2140 }, { "epoch": 0.322739533247332, "grad_norm": 0.5937127470970154, "learning_rate": 1.5765726736655174e-05, "loss": 2.1489, "num_input_tokens_seen": 1127219200, "step": 2150 }, { "epoch": 0.32424064735545916, "grad_norm": 0.6133046746253967, "learning_rate": 1.5729189661499015e-05, "loss": 2.1507, "num_input_tokens_seen": 1132462080, "step": 2160 }, { "epoch": 0.32574176146358624, "grad_norm": 0.7476202845573425, "learning_rate": 1.5692905438636622e-05, "loss": 2.1691, "num_input_tokens_seen": 1137704960, "step": 2170 }, { "epoch": 0.3272428755717134, "grad_norm": 0.7822461724281311, "learning_rate": 1.5656871165047605e-05, "loss": 2.1522, "num_input_tokens_seen": 1142947840, "step": 2180 }, { "epoch": 0.32874398967984053, "grad_norm": 0.6776607632637024, "learning_rate": 1.5621083984160086e-05, "loss": 2.142, "num_input_tokens_seen": 1148190720, "step": 2190 }, { "epoch": 0.3302451037879676, "grad_norm": 0.7508281469345093, "learning_rate": 1.558554108489952e-05, "loss": 2.1723, "num_input_tokens_seen": 1153433600, "step": 2200 }, { "epoch": 0.33174621789609476, "grad_norm": 0.7543405890464783, "learning_rate": 1.5550239700761246e-05, "loss": 2.1442, "num_input_tokens_seen": 1158676480, "step": 2210 }, { "epoch": 0.3332473320042219, "grad_norm": 0.9116769433021545, "learning_rate": 1.5515177108906e-05, "loss": 2.1916, "num_input_tokens_seen": 1163919360, "step": 2220 }, { "epoch": 0.334748446112349, "grad_norm": 0.7372170686721802, "learning_rate": 1.5480350629277787e-05, "loss": 2.1546, "num_input_tokens_seen": 1169162240, "step": 2230 }, { "epoch": 0.33624956022047614, "grad_norm": 0.5467498302459717, "learning_rate": 1.5445757623743444e-05, "loss": 2.1919, "num_input_tokens_seen": 1174405120, "step": 2240 }, { "epoch": 0.3377506743286033, "grad_norm": 0.5791755318641663, "learning_rate": 1.5411395495253218e-05, "loss": 2.1782, "num_input_tokens_seen": 1179648000, "step": 2250 }, { "epoch": 0.3392517884367304, "grad_norm": 0.7308974862098694, "learning_rate": 1.5377261687021863e-05, "loss": 2.1682, "num_input_tokens_seen": 1184890880, "step": 2260 }, { "epoch": 0.3407529025448575, "grad_norm": 0.6968861222267151, "learning_rate": 1.5343353681729532e-05, "loss": 2.1724, "num_input_tokens_seen": 1190133760, "step": 2270 }, { "epoch": 0.34225401665298466, "grad_norm": 0.6837549805641174, "learning_rate": 1.5309669000742003e-05, "loss": 2.1783, "num_input_tokens_seen": 1195376640, "step": 2280 }, { "epoch": 0.34375513076111175, "grad_norm": 0.6059392094612122, "learning_rate": 1.5276205203349658e-05, "loss": 2.1454, "num_input_tokens_seen": 1200619520, "step": 2290 }, { "epoch": 0.3452562448692389, "grad_norm": 0.5880239605903625, "learning_rate": 1.5242959886024648e-05, "loss": 2.1428, "num_input_tokens_seen": 1205862400, "step": 2300 }, { "epoch": 0.34675735897736604, "grad_norm": 0.6651424169540405, "learning_rate": 1.5209930681695804e-05, "loss": 2.1518, "num_input_tokens_seen": 1211105280, "step": 2310 }, { "epoch": 0.3482584730854931, "grad_norm": 0.5734732151031494, "learning_rate": 1.5177115259040729e-05, "loss": 2.1327, "num_input_tokens_seen": 1216348160, "step": 2320 }, { "epoch": 0.34975958719362027, "grad_norm": 0.6250523328781128, "learning_rate": 1.514451132179463e-05, "loss": 2.1459, "num_input_tokens_seen": 1221591040, "step": 2330 }, { "epoch": 0.3512607013017474, "grad_norm": 0.6869091391563416, "learning_rate": 1.5112116608075403e-05, "loss": 2.1352, "num_input_tokens_seen": 1226833920, "step": 2340 }, { "epoch": 0.3527618154098745, "grad_norm": 0.8655930161476135, "learning_rate": 1.5079928889724547e-05, "loss": 2.1581, "num_input_tokens_seen": 1232076800, "step": 2350 }, { "epoch": 0.35426292951800165, "grad_norm": 0.6587986946105957, "learning_rate": 1.5047945971663434e-05, "loss": 2.1783, "num_input_tokens_seen": 1237319680, "step": 2360 }, { "epoch": 0.3557640436261288, "grad_norm": 0.6270188689231873, "learning_rate": 1.501616569126455e-05, "loss": 2.1665, "num_input_tokens_seen": 1242562560, "step": 2370 }, { "epoch": 0.3572651577342559, "grad_norm": 0.6337898969650269, "learning_rate": 1.4984585917737282e-05, "loss": 2.177, "num_input_tokens_seen": 1247805440, "step": 2380 }, { "epoch": 0.358766271842383, "grad_norm": 0.6973663568496704, "learning_rate": 1.4953204551527831e-05, "loss": 2.1493, "num_input_tokens_seen": 1253048320, "step": 2390 }, { "epoch": 0.36026738595051017, "grad_norm": 0.5605215430259705, "learning_rate": 1.4922019523732929e-05, "loss": 2.1688, "num_input_tokens_seen": 1258291200, "step": 2400 }, { "epoch": 0.36026738595051017, "eval_accuracy": 0.5663190883190883, "eval_loss": 2.1306827068328857, "eval_runtime": 94.9906, "eval_samples_per_second": 3.158, "eval_steps_per_second": 0.79, "num_input_tokens_seen": 1258291200, "step": 2400 }, { "epoch": 0.36176850005863725, "grad_norm": 0.7090189456939697, "learning_rate": 1.4891028795526912e-05, "loss": 2.1446, "num_input_tokens_seen": 1263534080, "step": 2410 }, { "epoch": 0.3632696141667644, "grad_norm": 0.7129746079444885, "learning_rate": 1.4860230357601855e-05, "loss": 2.178, "num_input_tokens_seen": 1268776960, "step": 2420 }, { "epoch": 0.36477072827489154, "grad_norm": 0.5609286427497864, "learning_rate": 1.4829622229620384e-05, "loss": 2.1846, "num_input_tokens_seen": 1274019840, "step": 2430 }, { "epoch": 0.36627184238301863, "grad_norm": 0.6806447505950928, "learning_rate": 1.4799202459680824e-05, "loss": 2.156, "num_input_tokens_seen": 1279262720, "step": 2440 }, { "epoch": 0.3677729564911458, "grad_norm": 0.5787296295166016, "learning_rate": 1.4768969123794386e-05, "loss": 2.128, "num_input_tokens_seen": 1284505600, "step": 2450 }, { "epoch": 0.3692740705992729, "grad_norm": 0.7626485824584961, "learning_rate": 1.4738920325374027e-05, "loss": 2.1607, "num_input_tokens_seen": 1289748480, "step": 2460 }, { "epoch": 0.3707751847074, "grad_norm": 0.6390326619148254, "learning_rate": 1.4709054194734743e-05, "loss": 2.1821, "num_input_tokens_seen": 1294991360, "step": 2470 }, { "epoch": 0.37227629881552715, "grad_norm": 0.5435773730278015, "learning_rate": 1.4679368888604919e-05, "loss": 2.1762, "num_input_tokens_seen": 1300234240, "step": 2480 }, { "epoch": 0.3737774129236543, "grad_norm": 0.5910453200340271, "learning_rate": 1.464986258964849e-05, "loss": 2.1586, "num_input_tokens_seen": 1305477120, "step": 2490 }, { "epoch": 0.3752785270317814, "grad_norm": 0.5591862797737122, "learning_rate": 1.4620533505997654e-05, "loss": 2.1512, "num_input_tokens_seen": 1310720000, "step": 2500 }, { "epoch": 0.3767796411399085, "grad_norm": 0.6790704727172852, "learning_rate": 1.459137987079579e-05, "loss": 2.1696, "num_input_tokens_seen": 1315962880, "step": 2510 }, { "epoch": 0.37828075524803567, "grad_norm": 0.649533212184906, "learning_rate": 1.4562399941750401e-05, "loss": 2.1357, "num_input_tokens_seen": 1321205760, "step": 2520 }, { "epoch": 0.37978186935616276, "grad_norm": 0.5689984560012817, "learning_rate": 1.453359200069576e-05, "loss": 2.1972, "num_input_tokens_seen": 1326448640, "step": 2530 }, { "epoch": 0.3812829834642899, "grad_norm": 0.7337691187858582, "learning_rate": 1.4504954353165044e-05, "loss": 2.1198, "num_input_tokens_seen": 1331691520, "step": 2540 }, { "epoch": 0.38278409757241705, "grad_norm": 0.550014853477478, "learning_rate": 1.447648532797172e-05, "loss": 2.1267, "num_input_tokens_seen": 1336934400, "step": 2550 }, { "epoch": 0.38428521168054414, "grad_norm": 0.6532608270645142, "learning_rate": 1.4448183276799891e-05, "loss": 2.1735, "num_input_tokens_seen": 1342177280, "step": 2560 }, { "epoch": 0.3857863257886713, "grad_norm": 0.5801606774330139, "learning_rate": 1.4420046573803481e-05, "loss": 2.113, "num_input_tokens_seen": 1347420160, "step": 2570 }, { "epoch": 0.3872874398967984, "grad_norm": 0.6582128405570984, "learning_rate": 1.4392073615213913e-05, "loss": 2.1431, "num_input_tokens_seen": 1352663040, "step": 2580 }, { "epoch": 0.3887885540049255, "grad_norm": 0.6207413077354431, "learning_rate": 1.436426281895616e-05, "loss": 2.1683, "num_input_tokens_seen": 1357905920, "step": 2590 }, { "epoch": 0.39028966811305266, "grad_norm": 0.5679153800010681, "learning_rate": 1.4336612624272917e-05, "loss": 2.1194, "num_input_tokens_seen": 1363148800, "step": 2600 }, { "epoch": 0.3917907822211798, "grad_norm": 0.6869505643844604, "learning_rate": 1.4309121491356698e-05, "loss": 2.1033, "num_input_tokens_seen": 1368391680, "step": 2610 }, { "epoch": 0.3932918963293069, "grad_norm": 0.661869466304779, "learning_rate": 1.4281787900989672e-05, "loss": 2.161, "num_input_tokens_seen": 1373634560, "step": 2620 }, { "epoch": 0.39479301043743403, "grad_norm": 0.6266052722930908, "learning_rate": 1.4254610354191023e-05, "loss": 2.1838, "num_input_tokens_seen": 1378877440, "step": 2630 }, { "epoch": 0.3962941245455612, "grad_norm": 0.6248527765274048, "learning_rate": 1.4227587371871679e-05, "loss": 2.1255, "num_input_tokens_seen": 1384120320, "step": 2640 }, { "epoch": 0.39779523865368827, "grad_norm": 0.7318665385246277, "learning_rate": 1.4200717494496206e-05, "loss": 2.1487, "num_input_tokens_seen": 1389363200, "step": 2650 }, { "epoch": 0.3992963527618154, "grad_norm": 0.7174627780914307, "learning_rate": 1.4173999281751702e-05, "loss": 2.1375, "num_input_tokens_seen": 1394606080, "step": 2660 }, { "epoch": 0.40079746686994255, "grad_norm": 0.6472954154014587, "learning_rate": 1.4147431312223518e-05, "loss": 2.1546, "num_input_tokens_seen": 1399848960, "step": 2670 }, { "epoch": 0.40229858097806964, "grad_norm": 0.588234543800354, "learning_rate": 1.4121012183077632e-05, "loss": 2.1437, "num_input_tokens_seen": 1405091840, "step": 2680 }, { "epoch": 0.4037996950861968, "grad_norm": 0.5872112512588501, "learning_rate": 1.4094740509749542e-05, "loss": 2.1672, "num_input_tokens_seen": 1410334720, "step": 2690 }, { "epoch": 0.40530080919432393, "grad_norm": 0.6200748682022095, "learning_rate": 1.406861492563948e-05, "loss": 2.1434, "num_input_tokens_seen": 1415577600, "step": 2700 }, { "epoch": 0.406801923302451, "grad_norm": 0.7556484937667847, "learning_rate": 1.4042634081813838e-05, "loss": 2.1603, "num_input_tokens_seen": 1420820480, "step": 2710 }, { "epoch": 0.40830303741057816, "grad_norm": 0.6454352736473083, "learning_rate": 1.4016796646712618e-05, "loss": 2.1783, "num_input_tokens_seen": 1426063360, "step": 2720 }, { "epoch": 0.4098041515187053, "grad_norm": 0.5394317507743835, "learning_rate": 1.3991101305862803e-05, "loss": 2.1646, "num_input_tokens_seen": 1431306240, "step": 2730 }, { "epoch": 0.4113052656268324, "grad_norm": 0.6699331998825073, "learning_rate": 1.396554676159745e-05, "loss": 2.1597, "num_input_tokens_seen": 1436549120, "step": 2740 }, { "epoch": 0.41280637973495954, "grad_norm": 0.6491063237190247, "learning_rate": 1.3940131732780461e-05, "loss": 2.1236, "num_input_tokens_seen": 1441792000, "step": 2750 }, { "epoch": 0.4143074938430867, "grad_norm": 0.6439197063446045, "learning_rate": 1.3914854954536792e-05, "loss": 2.164, "num_input_tokens_seen": 1447034880, "step": 2760 }, { "epoch": 0.41580860795121377, "grad_norm": 0.7333915829658508, "learning_rate": 1.3889715177988056e-05, "loss": 2.1417, "num_input_tokens_seen": 1452277760, "step": 2770 }, { "epoch": 0.4173097220593409, "grad_norm": 0.5554391145706177, "learning_rate": 1.386471116999334e-05, "loss": 2.1821, "num_input_tokens_seen": 1457520640, "step": 2780 }, { "epoch": 0.41881083616746806, "grad_norm": 0.6607913374900818, "learning_rate": 1.3839841712895161e-05, "loss": 2.1341, "num_input_tokens_seen": 1462763520, "step": 2790 }, { "epoch": 0.42031195027559515, "grad_norm": 0.6506746411323547, "learning_rate": 1.381510560427038e-05, "loss": 2.1443, "num_input_tokens_seen": 1468006400, "step": 2800 }, { "epoch": 0.42031195027559515, "eval_accuracy": 0.5676076516076516, "eval_loss": 2.1226789951324463, "eval_runtime": 94.7272, "eval_samples_per_second": 3.167, "eval_steps_per_second": 0.792, "num_input_tokens_seen": 1468006400, "step": 2800 }, { "epoch": 0.4218130643837223, "grad_norm": 0.6999175548553467, "learning_rate": 1.3790501656686045e-05, "loss": 2.1222, "num_input_tokens_seen": 1473249280, "step": 2810 }, { "epoch": 0.42331417849184944, "grad_norm": 0.6637996435165405, "learning_rate": 1.3766028697459948e-05, "loss": 2.1752, "num_input_tokens_seen": 1478492160, "step": 2820 }, { "epoch": 0.4248152925999765, "grad_norm": 0.675205409526825, "learning_rate": 1.3741685568425882e-05, "loss": 2.1379, "num_input_tokens_seen": 1483735040, "step": 2830 }, { "epoch": 0.42631640670810367, "grad_norm": 0.7251035571098328, "learning_rate": 1.371747112570342e-05, "loss": 2.1499, "num_input_tokens_seen": 1488977920, "step": 2840 }, { "epoch": 0.4278175208162308, "grad_norm": 0.8489077687263489, "learning_rate": 1.3693384239472163e-05, "loss": 2.1291, "num_input_tokens_seen": 1494220800, "step": 2850 }, { "epoch": 0.4293186349243579, "grad_norm": 0.5617297887802124, "learning_rate": 1.3669423793750302e-05, "loss": 2.145, "num_input_tokens_seen": 1499463680, "step": 2860 }, { "epoch": 0.43081974903248504, "grad_norm": 0.5285384654998779, "learning_rate": 1.3645588686177466e-05, "loss": 2.1492, "num_input_tokens_seen": 1504706560, "step": 2870 }, { "epoch": 0.4323208631406122, "grad_norm": 0.6254436373710632, "learning_rate": 1.3621877827801699e-05, "loss": 2.1196, "num_input_tokens_seen": 1509949440, "step": 2880 }, { "epoch": 0.4338219772487393, "grad_norm": 0.5884484052658081, "learning_rate": 1.3598290142870485e-05, "loss": 2.1484, "num_input_tokens_seen": 1515192320, "step": 2890 }, { "epoch": 0.4353230913568664, "grad_norm": 0.7654128074645996, "learning_rate": 1.357482456862576e-05, "loss": 2.1568, "num_input_tokens_seen": 1520435200, "step": 2900 }, { "epoch": 0.43682420546499356, "grad_norm": 0.839947521686554, "learning_rate": 1.3551480055102794e-05, "loss": 2.1651, "num_input_tokens_seen": 1525678080, "step": 2910 }, { "epoch": 0.43832531957312065, "grad_norm": 0.828296959400177, "learning_rate": 1.3528255564932864e-05, "loss": 2.1342, "num_input_tokens_seen": 1530920960, "step": 2920 }, { "epoch": 0.4398264336812478, "grad_norm": 0.5922654271125793, "learning_rate": 1.3505150073149634e-05, "loss": 2.1632, "num_input_tokens_seen": 1536163840, "step": 2930 }, { "epoch": 0.44132754778937494, "grad_norm": 0.6538131833076477, "learning_rate": 1.3482162566999154e-05, "loss": 2.1633, "num_input_tokens_seen": 1541406720, "step": 2940 }, { "epoch": 0.44282866189750203, "grad_norm": 0.7411973476409912, "learning_rate": 1.3459292045753423e-05, "loss": 2.1349, "num_input_tokens_seen": 1546649600, "step": 2950 }, { "epoch": 0.4443297760056292, "grad_norm": 0.6776240468025208, "learning_rate": 1.3436537520527398e-05, "loss": 2.1368, "num_input_tokens_seen": 1551892480, "step": 2960 }, { "epoch": 0.4458308901137563, "grad_norm": 0.6425411105155945, "learning_rate": 1.3413898014099404e-05, "loss": 2.1358, "num_input_tokens_seen": 1557135360, "step": 2970 }, { "epoch": 0.4473320042218834, "grad_norm": 0.6736078858375549, "learning_rate": 1.3391372560734868e-05, "loss": 2.1154, "num_input_tokens_seen": 1562378240, "step": 2980 }, { "epoch": 0.44883311833001055, "grad_norm": 0.6204555630683899, "learning_rate": 1.3368960206013277e-05, "loss": 2.1744, "num_input_tokens_seen": 1567621120, "step": 2990 }, { "epoch": 0.4503342324381377, "grad_norm": 0.6431263089179993, "learning_rate": 1.3346660006658346e-05, "loss": 2.1463, "num_input_tokens_seen": 1572864000, "step": 3000 }, { "epoch": 0.4518353465462648, "grad_norm": 0.6702350378036499, "learning_rate": 1.332447103037125e-05, "loss": 2.1422, "num_input_tokens_seen": 1578106880, "step": 3010 }, { "epoch": 0.4533364606543919, "grad_norm": 0.5336691737174988, "learning_rate": 1.330239235566693e-05, "loss": 2.1515, "num_input_tokens_seen": 1583349760, "step": 3020 }, { "epoch": 0.45483757476251907, "grad_norm": 0.6110213398933411, "learning_rate": 1.3280423071713355e-05, "loss": 2.1564, "num_input_tokens_seen": 1588592640, "step": 3030 }, { "epoch": 0.45633868887064616, "grad_norm": 0.6357016563415527, "learning_rate": 1.3258562278173699e-05, "loss": 2.1591, "num_input_tokens_seen": 1593835520, "step": 3040 }, { "epoch": 0.4578398029787733, "grad_norm": 0.609986424446106, "learning_rate": 1.3236809085051366e-05, "loss": 2.1862, "num_input_tokens_seen": 1599078400, "step": 3050 }, { "epoch": 0.45934091708690045, "grad_norm": 0.8125211000442505, "learning_rate": 1.3215162612537805e-05, "loss": 2.121, "num_input_tokens_seen": 1604321280, "step": 3060 }, { "epoch": 0.46084203119502753, "grad_norm": 0.5937681198120117, "learning_rate": 1.3193621990863051e-05, "loss": 2.1548, "num_input_tokens_seen": 1609564160, "step": 3070 }, { "epoch": 0.4623431453031547, "grad_norm": 0.6638383865356445, "learning_rate": 1.3172186360148932e-05, "loss": 2.1251, "num_input_tokens_seen": 1614807040, "step": 3080 }, { "epoch": 0.4638442594112818, "grad_norm": 0.6757727861404419, "learning_rate": 1.3150854870264907e-05, "loss": 2.1388, "num_input_tokens_seen": 1620049920, "step": 3090 }, { "epoch": 0.4653453735194089, "grad_norm": 0.5711122155189514, "learning_rate": 1.3129626680686454e-05, "loss": 2.1498, "num_input_tokens_seen": 1625292800, "step": 3100 }, { "epoch": 0.46684648762753606, "grad_norm": 0.6421942114830017, "learning_rate": 1.3108500960355947e-05, "loss": 2.149, "num_input_tokens_seen": 1630535680, "step": 3110 }, { "epoch": 0.4683476017356632, "grad_norm": 0.7078666090965271, "learning_rate": 1.3087476887546022e-05, "loss": 2.148, "num_input_tokens_seen": 1635778560, "step": 3120 }, { "epoch": 0.4698487158437903, "grad_norm": 0.69465172290802, "learning_rate": 1.3066553649725324e-05, "loss": 2.1375, "num_input_tokens_seen": 1641021440, "step": 3130 }, { "epoch": 0.47134982995191743, "grad_norm": 0.6873762011528015, "learning_rate": 1.304573044342661e-05, "loss": 2.1458, "num_input_tokens_seen": 1646264320, "step": 3140 }, { "epoch": 0.4728509440600446, "grad_norm": 0.6284067034721375, "learning_rate": 1.302500647411715e-05, "loss": 2.1442, "num_input_tokens_seen": 1651507200, "step": 3150 }, { "epoch": 0.47435205816817166, "grad_norm": 0.8749895095825195, "learning_rate": 1.3004380956071416e-05, "loss": 2.1304, "num_input_tokens_seen": 1656750080, "step": 3160 }, { "epoch": 0.4758531722762988, "grad_norm": 0.6920912265777588, "learning_rate": 1.298385311224594e-05, "loss": 2.1086, "num_input_tokens_seen": 1661992960, "step": 3170 }, { "epoch": 0.47735428638442595, "grad_norm": 0.6951104402542114, "learning_rate": 1.296342217415636e-05, "loss": 2.1271, "num_input_tokens_seen": 1667235840, "step": 3180 }, { "epoch": 0.47885540049255304, "grad_norm": 0.784321665763855, "learning_rate": 1.2943087381756598e-05, "loss": 2.1074, "num_input_tokens_seen": 1672478720, "step": 3190 }, { "epoch": 0.4803565146006802, "grad_norm": 0.6487578749656677, "learning_rate": 1.2922847983320086e-05, "loss": 2.1105, "num_input_tokens_seen": 1677721600, "step": 3200 }, { "epoch": 0.4803565146006802, "eval_accuracy": 0.5689149369149369, "eval_loss": 2.115828514099121, "eval_runtime": 94.253, "eval_samples_per_second": 3.183, "eval_steps_per_second": 0.796, "num_input_tokens_seen": 1677721600, "step": 3200 }, { "epoch": 0.48185762870880733, "grad_norm": 0.7569764852523804, "learning_rate": 1.2902703235323062e-05, "loss": 2.1195, "num_input_tokens_seen": 1682964480, "step": 3210 }, { "epoch": 0.4833587428169344, "grad_norm": 0.5782780051231384, "learning_rate": 1.2882652402329833e-05, "loss": 2.1186, "num_input_tokens_seen": 1688207360, "step": 3220 }, { "epoch": 0.48485985692506156, "grad_norm": 0.6713874936103821, "learning_rate": 1.286269475688002e-05, "loss": 2.1446, "num_input_tokens_seen": 1693450240, "step": 3230 }, { "epoch": 0.4863609710331887, "grad_norm": 0.6497519016265869, "learning_rate": 1.2842829579377681e-05, "loss": 2.1245, "num_input_tokens_seen": 1698693120, "step": 3240 }, { "epoch": 0.4878620851413158, "grad_norm": 0.6722620129585266, "learning_rate": 1.2823056157982359e-05, "loss": 2.108, "num_input_tokens_seen": 1703936000, "step": 3250 }, { "epoch": 0.48936319924944294, "grad_norm": 0.6298756003379822, "learning_rate": 1.2803373788501927e-05, "loss": 2.1412, "num_input_tokens_seen": 1709178880, "step": 3260 }, { "epoch": 0.4908643133575701, "grad_norm": 0.6180531978607178, "learning_rate": 1.2783781774287272e-05, "loss": 2.16, "num_input_tokens_seen": 1714421760, "step": 3270 }, { "epoch": 0.49236542746569717, "grad_norm": 0.6927182674407959, "learning_rate": 1.2764279426128711e-05, "loss": 2.1604, "num_input_tokens_seen": 1719664640, "step": 3280 }, { "epoch": 0.4938665415738243, "grad_norm": 0.8382894396781921, "learning_rate": 1.2744866062154176e-05, "loss": 2.1509, "num_input_tokens_seen": 1724907520, "step": 3290 }, { "epoch": 0.49536765568195146, "grad_norm": 0.634811282157898, "learning_rate": 1.2725541007729061e-05, "loss": 2.1651, "num_input_tokens_seen": 1730150400, "step": 3300 }, { "epoch": 0.4968687697900786, "grad_norm": 0.5942947268486023, "learning_rate": 1.2706303595357772e-05, "loss": 2.158, "num_input_tokens_seen": 1735393280, "step": 3310 }, { "epoch": 0.4983698838982057, "grad_norm": 0.6313604712486267, "learning_rate": 1.2687153164586875e-05, "loss": 2.1394, "num_input_tokens_seen": 1740636160, "step": 3320 }, { "epoch": 0.49987099800633283, "grad_norm": 0.604307234287262, "learning_rate": 1.2668089061909872e-05, "loss": 2.1611, "num_input_tokens_seen": 1745879040, "step": 3330 }, { "epoch": 0.50137211211446, "grad_norm": 0.6554628610610962, "learning_rate": 1.2649110640673518e-05, "loss": 2.1072, "num_input_tokens_seen": 1751121920, "step": 3340 }, { "epoch": 0.5028732262225871, "grad_norm": 0.7124990820884705, "learning_rate": 1.2630217260985716e-05, "loss": 2.1346, "num_input_tokens_seen": 1756364800, "step": 3350 }, { "epoch": 0.5043743403307142, "grad_norm": 0.784233033657074, "learning_rate": 1.2611408289624877e-05, "loss": 2.1628, "num_input_tokens_seen": 1761607680, "step": 3360 }, { "epoch": 0.5058754544388413, "grad_norm": 0.8534424901008606, "learning_rate": 1.2592683099950795e-05, "loss": 2.1577, "num_input_tokens_seen": 1766850560, "step": 3370 }, { "epoch": 0.5073765685469684, "grad_norm": 0.7110067009925842, "learning_rate": 1.2574041071816953e-05, "loss": 2.1333, "num_input_tokens_seen": 1772093440, "step": 3380 }, { "epoch": 0.5088776826550956, "grad_norm": 0.7778078317642212, "learning_rate": 1.2555481591484266e-05, "loss": 2.1136, "num_input_tokens_seen": 1777336320, "step": 3390 }, { "epoch": 0.5103787967632227, "grad_norm": 0.7516434192657471, "learning_rate": 1.253700405153621e-05, "loss": 2.1456, "num_input_tokens_seen": 1782579200, "step": 3400 }, { "epoch": 0.5118799108713499, "grad_norm": 0.8273611664772034, "learning_rate": 1.2518607850795318e-05, "loss": 2.1022, "num_input_tokens_seen": 1787822080, "step": 3410 }, { "epoch": 0.5133810249794769, "grad_norm": 0.6412326097488403, "learning_rate": 1.2500292394241044e-05, "loss": 2.1175, "num_input_tokens_seen": 1793064960, "step": 3420 }, { "epoch": 0.514882139087604, "grad_norm": 0.6667389273643494, "learning_rate": 1.2482057092928916e-05, "loss": 2.1253, "num_input_tokens_seen": 1798307840, "step": 3430 }, { "epoch": 0.5163832531957312, "grad_norm": 0.8034870624542236, "learning_rate": 1.2463901363910993e-05, "loss": 2.1233, "num_input_tokens_seen": 1803550720, "step": 3440 }, { "epoch": 0.5178843673038583, "grad_norm": 0.646260678768158, "learning_rate": 1.2445824630157606e-05, "loss": 2.0914, "num_input_tokens_seen": 1808793600, "step": 3450 }, { "epoch": 0.5193854814119855, "grad_norm": 0.7674567699432373, "learning_rate": 1.2427826320480309e-05, "loss": 2.0866, "num_input_tokens_seen": 1814036480, "step": 3460 }, { "epoch": 0.5208865955201126, "grad_norm": 0.7650302648544312, "learning_rate": 1.2409905869456093e-05, "loss": 2.1287, "num_input_tokens_seen": 1819279360, "step": 3470 }, { "epoch": 0.5223877096282397, "grad_norm": 0.6644052267074585, "learning_rate": 1.2392062717352773e-05, "loss": 2.122, "num_input_tokens_seen": 1824522240, "step": 3480 }, { "epoch": 0.5238888237363668, "grad_norm": 0.8389644026756287, "learning_rate": 1.2374296310055552e-05, "loss": 2.1453, "num_input_tokens_seen": 1829765120, "step": 3490 }, { "epoch": 0.525389937844494, "grad_norm": 0.7249365448951721, "learning_rate": 1.2356606098994752e-05, "loss": 2.1418, "num_input_tokens_seen": 1835008000, "step": 3500 }, { "epoch": 0.5268910519526211, "grad_norm": 0.7120906710624695, "learning_rate": 1.2338991541074669e-05, "loss": 2.1318, "num_input_tokens_seen": 1840250880, "step": 3510 }, { "epoch": 0.5283921660607482, "grad_norm": 0.7992458343505859, "learning_rate": 1.2321452098603552e-05, "loss": 2.144, "num_input_tokens_seen": 1845493760, "step": 3520 }, { "epoch": 0.5298932801688754, "grad_norm": 0.8479806780815125, "learning_rate": 1.2303987239224644e-05, "loss": 2.1429, "num_input_tokens_seen": 1850736640, "step": 3530 }, { "epoch": 0.5313943942770024, "grad_norm": 0.670669436454773, "learning_rate": 1.2286596435848341e-05, "loss": 2.1395, "num_input_tokens_seen": 1855979520, "step": 3540 }, { "epoch": 0.5328955083851296, "grad_norm": 0.7732242345809937, "learning_rate": 1.2269279166585364e-05, "loss": 2.1563, "num_input_tokens_seen": 1861222400, "step": 3550 }, { "epoch": 0.5343966224932567, "grad_norm": 0.9772087931632996, "learning_rate": 1.2252034914680983e-05, "loss": 2.1375, "num_input_tokens_seen": 1866465280, "step": 3560 }, { "epoch": 0.5358977366013838, "grad_norm": 0.5742955207824707, "learning_rate": 1.2234863168450241e-05, "loss": 2.1077, "num_input_tokens_seen": 1871708160, "step": 3570 }, { "epoch": 0.537398850709511, "grad_norm": 0.7332693934440613, "learning_rate": 1.2217763421214203e-05, "loss": 2.1338, "num_input_tokens_seen": 1876951040, "step": 3580 }, { "epoch": 0.5388999648176381, "grad_norm": 0.6025305390357971, "learning_rate": 1.2200735171237142e-05, "loss": 2.1233, "num_input_tokens_seen": 1882193920, "step": 3590 }, { "epoch": 0.5404010789257652, "grad_norm": 0.6781802773475647, "learning_rate": 1.2183777921664712e-05, "loss": 2.1045, "num_input_tokens_seen": 1887436800, "step": 3600 }, { "epoch": 0.5404010789257652, "eval_accuracy": 0.56999592999593, "eval_loss": 2.108959674835205, "eval_runtime": 94.9286, "eval_samples_per_second": 3.16, "eval_steps_per_second": 0.79, "num_input_tokens_seen": 1887436800, "step": 3600 }, { "epoch": 0.5419021930338923, "grad_norm": 0.6946921944618225, "learning_rate": 1.2166891180463065e-05, "loss": 2.1645, "num_input_tokens_seen": 1892679680, "step": 3610 }, { "epoch": 0.5434033071420195, "grad_norm": 0.7196519374847412, "learning_rate": 1.2150074460358885e-05, "loss": 2.1177, "num_input_tokens_seen": 1897922560, "step": 3620 }, { "epoch": 0.5449044212501466, "grad_norm": 0.6712445616722107, "learning_rate": 1.2133327278780315e-05, "loss": 2.118, "num_input_tokens_seen": 1903165440, "step": 3630 }, { "epoch": 0.5464055353582737, "grad_norm": 0.6647999286651611, "learning_rate": 1.2116649157798819e-05, "loss": 2.1208, "num_input_tokens_seen": 1908408320, "step": 3640 }, { "epoch": 0.5479066494664009, "grad_norm": 0.663698136806488, "learning_rate": 1.2100039624071883e-05, "loss": 2.1387, "num_input_tokens_seen": 1913651200, "step": 3650 }, { "epoch": 0.5494077635745279, "grad_norm": 0.873149037361145, "learning_rate": 1.2083498208786586e-05, "loss": 2.1205, "num_input_tokens_seen": 1918894080, "step": 3660 }, { "epoch": 0.5509088776826551, "grad_norm": 0.6879292726516724, "learning_rate": 1.2067024447604041e-05, "loss": 2.1089, "num_input_tokens_seen": 1924136960, "step": 3670 }, { "epoch": 0.5524099917907822, "grad_norm": 0.7777056694030762, "learning_rate": 1.2050617880604623e-05, "loss": 2.1087, "num_input_tokens_seen": 1929379840, "step": 3680 }, { "epoch": 0.5539111058989094, "grad_norm": 0.7917245030403137, "learning_rate": 1.2034278052234061e-05, "loss": 2.094, "num_input_tokens_seen": 1934622720, "step": 3690 }, { "epoch": 0.5554122200070365, "grad_norm": 0.7892484664916992, "learning_rate": 1.2018004511250296e-05, "loss": 2.111, "num_input_tokens_seen": 1939865600, "step": 3700 }, { "epoch": 0.5569133341151636, "grad_norm": 0.8272449374198914, "learning_rate": 1.2001796810671147e-05, "loss": 2.1574, "num_input_tokens_seen": 1945108480, "step": 3710 }, { "epoch": 0.5584144482232907, "grad_norm": 0.6054387092590332, "learning_rate": 1.1985654507722748e-05, "loss": 2.1226, "num_input_tokens_seen": 1950351360, "step": 3720 }, { "epoch": 0.5599155623314178, "grad_norm": 0.5401285886764526, "learning_rate": 1.1969577163788735e-05, "loss": 2.1093, "num_input_tokens_seen": 1955594240, "step": 3730 }, { "epoch": 0.561416676439545, "grad_norm": 0.6505089998245239, "learning_rate": 1.1953564344360196e-05, "loss": 2.1177, "num_input_tokens_seen": 1960837120, "step": 3740 }, { "epoch": 0.5629177905476721, "grad_norm": 0.5896223783493042, "learning_rate": 1.1937615618986343e-05, "loss": 2.1246, "num_input_tokens_seen": 1966080000, "step": 3750 }, { "epoch": 0.5644189046557992, "grad_norm": 0.9334360361099243, "learning_rate": 1.192173056122592e-05, "loss": 2.0941, "num_input_tokens_seen": 1971322880, "step": 3760 }, { "epoch": 0.5659200187639264, "grad_norm": 0.8824881315231323, "learning_rate": 1.1905908748599307e-05, "loss": 2.135, "num_input_tokens_seen": 1976565760, "step": 3770 }, { "epoch": 0.5674211328720534, "grad_norm": 0.9339269399642944, "learning_rate": 1.1890149762541318e-05, "loss": 2.1136, "num_input_tokens_seen": 1981808640, "step": 3780 }, { "epoch": 0.5689222469801806, "grad_norm": 0.7937804460525513, "learning_rate": 1.1874453188354716e-05, "loss": 2.1191, "num_input_tokens_seen": 1987051520, "step": 3790 }, { "epoch": 0.5704233610883077, "grad_norm": 0.7529720067977905, "learning_rate": 1.1858818615164347e-05, "loss": 2.1283, "num_input_tokens_seen": 1992294400, "step": 3800 }, { "epoch": 0.5719244751964349, "grad_norm": 0.5535932779312134, "learning_rate": 1.1843245635871996e-05, "loss": 2.1311, "num_input_tokens_seen": 1997537280, "step": 3810 }, { "epoch": 0.573425589304562, "grad_norm": 0.6932185888290405, "learning_rate": 1.1827733847111846e-05, "loss": 2.1332, "num_input_tokens_seen": 2002780160, "step": 3820 }, { "epoch": 0.5749267034126891, "grad_norm": 0.7156121730804443, "learning_rate": 1.181228284920661e-05, "loss": 2.1264, "num_input_tokens_seen": 2008023040, "step": 3830 }, { "epoch": 0.5764278175208162, "grad_norm": 0.6193491220474243, "learning_rate": 1.1796892246124261e-05, "loss": 2.1376, "num_input_tokens_seen": 2013265920, "step": 3840 }, { "epoch": 0.5779289316289433, "grad_norm": 0.6163820624351501, "learning_rate": 1.1781561645435414e-05, "loss": 2.0823, "num_input_tokens_seen": 2018508800, "step": 3850 }, { "epoch": 0.5794300457370705, "grad_norm": 0.7165153622627258, "learning_rate": 1.1766290658271293e-05, "loss": 2.1194, "num_input_tokens_seen": 2023751680, "step": 3860 }, { "epoch": 0.5809311598451976, "grad_norm": 0.6811047196388245, "learning_rate": 1.1751078899282295e-05, "loss": 2.1564, "num_input_tokens_seen": 2028994560, "step": 3870 }, { "epoch": 0.5824322739533248, "grad_norm": 0.9540090560913086, "learning_rate": 1.1735925986597164e-05, "loss": 2.1241, "num_input_tokens_seen": 2034237440, "step": 3880 }, { "epoch": 0.5839333880614519, "grad_norm": 0.6102780103683472, "learning_rate": 1.1720831541782706e-05, "loss": 2.0992, "num_input_tokens_seen": 2039480320, "step": 3890 }, { "epoch": 0.5854345021695789, "grad_norm": 0.6293305158615112, "learning_rate": 1.170579518980411e-05, "loss": 2.1285, "num_input_tokens_seen": 2044723200, "step": 3900 }, { "epoch": 0.5869356162777061, "grad_norm": 0.7680484056472778, "learning_rate": 1.169081655898581e-05, "loss": 2.0774, "num_input_tokens_seen": 2049966080, "step": 3910 }, { "epoch": 0.5884367303858332, "grad_norm": 0.7843759059906006, "learning_rate": 1.1675895280972886e-05, "loss": 2.1271, "num_input_tokens_seen": 2055208960, "step": 3920 }, { "epoch": 0.5899378444939604, "grad_norm": 0.8438634276390076, "learning_rate": 1.1661030990693031e-05, "loss": 2.1214, "num_input_tokens_seen": 2060451840, "step": 3930 }, { "epoch": 0.5914389586020875, "grad_norm": 0.7281661033630371, "learning_rate": 1.1646223326319031e-05, "loss": 2.1074, "num_input_tokens_seen": 2065694720, "step": 3940 }, { "epoch": 0.5929400727102146, "grad_norm": 0.7339427471160889, "learning_rate": 1.163147192923176e-05, "loss": 2.1571, "num_input_tokens_seen": 2070937600, "step": 3950 }, { "epoch": 0.5944411868183418, "grad_norm": 0.583240807056427, "learning_rate": 1.1616776443983717e-05, "loss": 2.1111, "num_input_tokens_seen": 2076180480, "step": 3960 }, { "epoch": 0.5959423009264688, "grad_norm": 0.5651365518569946, "learning_rate": 1.1602136518263054e-05, "loss": 2.0996, "num_input_tokens_seen": 2081423360, "step": 3970 }, { "epoch": 0.597443415034596, "grad_norm": 0.6529115438461304, "learning_rate": 1.1587551802858083e-05, "loss": 2.1478, "num_input_tokens_seen": 2086666240, "step": 3980 }, { "epoch": 0.5989445291427231, "grad_norm": 0.8366259932518005, "learning_rate": 1.1573021951622309e-05, "loss": 2.1437, "num_input_tokens_seen": 2091909120, "step": 3990 }, { "epoch": 0.6004456432508503, "grad_norm": 0.5824588537216187, "learning_rate": 1.1558546621439912e-05, "loss": 2.1181, "num_input_tokens_seen": 2097152000, "step": 4000 }, { "epoch": 0.6004456432508503, "eval_accuracy": 0.5708310948310948, "eval_loss": 2.104544162750244, "eval_runtime": 94.4692, "eval_samples_per_second": 3.176, "eval_steps_per_second": 0.794, "num_input_tokens_seen": 2097152000, "step": 4000 }, { "epoch": 0.6019467573589774, "grad_norm": 0.6183150410652161, "learning_rate": 1.1544125472191724e-05, "loss": 2.1217, "num_input_tokens_seen": 2102394880, "step": 4010 }, { "epoch": 0.6034478714671045, "grad_norm": 0.6359286904335022, "learning_rate": 1.152975816672165e-05, "loss": 2.0985, "num_input_tokens_seen": 2107637760, "step": 4020 }, { "epoch": 0.6049489855752316, "grad_norm": 0.6601809859275818, "learning_rate": 1.1515444370803586e-05, "loss": 2.1115, "num_input_tokens_seen": 2112880640, "step": 4030 }, { "epoch": 0.6064500996833587, "grad_norm": 0.6836331486701965, "learning_rate": 1.1501183753108733e-05, "loss": 2.1012, "num_input_tokens_seen": 2118123520, "step": 4040 }, { "epoch": 0.6079512137914859, "grad_norm": 0.6830666661262512, "learning_rate": 1.148697598517341e-05, "loss": 2.1594, "num_input_tokens_seen": 2123366400, "step": 4050 }, { "epoch": 0.609452327899613, "grad_norm": 0.6801722049713135, "learning_rate": 1.147282074136726e-05, "loss": 2.1227, "num_input_tokens_seen": 2128609280, "step": 4060 }, { "epoch": 0.6109534420077402, "grad_norm": 0.764336347579956, "learning_rate": 1.1458717698861917e-05, "loss": 2.1268, "num_input_tokens_seen": 2133852160, "step": 4070 }, { "epoch": 0.6124545561158673, "grad_norm": 1.0238054990768433, "learning_rate": 1.144466653760007e-05, "loss": 2.1217, "num_input_tokens_seen": 2139095040, "step": 4080 }, { "epoch": 0.6139556702239943, "grad_norm": 0.6901516914367676, "learning_rate": 1.1430666940264972e-05, "loss": 2.1304, "num_input_tokens_seen": 2144337920, "step": 4090 }, { "epoch": 0.6154567843321215, "grad_norm": 0.6896687150001526, "learning_rate": 1.141671859225032e-05, "loss": 2.1239, "num_input_tokens_seen": 2149580800, "step": 4100 }, { "epoch": 0.6169578984402486, "grad_norm": 0.8825215101242065, "learning_rate": 1.1402821181630592e-05, "loss": 2.1061, "num_input_tokens_seen": 2154823680, "step": 4110 }, { "epoch": 0.6184590125483758, "grad_norm": 0.7571860551834106, "learning_rate": 1.1388974399131718e-05, "loss": 2.1222, "num_input_tokens_seen": 2160066560, "step": 4120 }, { "epoch": 0.6199601266565029, "grad_norm": 0.6392367482185364, "learning_rate": 1.1375177938102187e-05, "loss": 2.156, "num_input_tokens_seen": 2165309440, "step": 4130 }, { "epoch": 0.62146124076463, "grad_norm": 0.7527381777763367, "learning_rate": 1.136143149448452e-05, "loss": 2.1233, "num_input_tokens_seen": 2170552320, "step": 4140 }, { "epoch": 0.6229623548727571, "grad_norm": 0.7589773535728455, "learning_rate": 1.1347734766787132e-05, "loss": 2.1081, "num_input_tokens_seen": 2175795200, "step": 4150 }, { "epoch": 0.6244634689808842, "grad_norm": 0.8642693161964417, "learning_rate": 1.1334087456056552e-05, "loss": 2.1131, "num_input_tokens_seen": 2181038080, "step": 4160 }, { "epoch": 0.6259645830890114, "grad_norm": 0.7018624544143677, "learning_rate": 1.1320489265850016e-05, "loss": 2.1194, "num_input_tokens_seen": 2186280960, "step": 4170 }, { "epoch": 0.6274656971971385, "grad_norm": 0.632854163646698, "learning_rate": 1.1306939902208426e-05, "loss": 2.1336, "num_input_tokens_seen": 2191523840, "step": 4180 }, { "epoch": 0.6289668113052657, "grad_norm": 0.8238286972045898, "learning_rate": 1.1293439073629661e-05, "loss": 2.1341, "num_input_tokens_seen": 2196766720, "step": 4190 }, { "epoch": 0.6304679254133928, "grad_norm": 0.8351567387580872, "learning_rate": 1.1279986491042232e-05, "loss": 2.0898, "num_input_tokens_seen": 2202009600, "step": 4200 }, { "epoch": 0.6319690395215198, "grad_norm": 0.5938683748245239, "learning_rate": 1.1266581867779284e-05, "loss": 2.1127, "num_input_tokens_seen": 2207252480, "step": 4210 }, { "epoch": 0.633470153629647, "grad_norm": 0.7919456958770752, "learning_rate": 1.1253224919552947e-05, "loss": 2.1014, "num_input_tokens_seen": 2212495360, "step": 4220 }, { "epoch": 0.6349712677377741, "grad_norm": 0.6826079487800598, "learning_rate": 1.1239915364429005e-05, "loss": 2.121, "num_input_tokens_seen": 2217738240, "step": 4230 }, { "epoch": 0.6364723818459013, "grad_norm": 0.7500185966491699, "learning_rate": 1.1226652922801909e-05, "loss": 2.123, "num_input_tokens_seen": 2222981120, "step": 4240 }, { "epoch": 0.6379734959540284, "grad_norm": 0.6904823780059814, "learning_rate": 1.1213437317370097e-05, "loss": 2.1258, "num_input_tokens_seen": 2228224000, "step": 4250 }, { "epoch": 0.6394746100621556, "grad_norm": 0.6400703191757202, "learning_rate": 1.1200268273111648e-05, "loss": 2.1147, "num_input_tokens_seen": 2233466880, "step": 4260 }, { "epoch": 0.6409757241702826, "grad_norm": 0.805190920829773, "learning_rate": 1.1187145517260246e-05, "loss": 2.1117, "num_input_tokens_seen": 2238709760, "step": 4270 }, { "epoch": 0.6424768382784097, "grad_norm": 0.6888982653617859, "learning_rate": 1.117406877928144e-05, "loss": 2.1171, "num_input_tokens_seen": 2243952640, "step": 4280 }, { "epoch": 0.6439779523865369, "grad_norm": 0.7432284355163574, "learning_rate": 1.1161037790849228e-05, "loss": 2.1098, "num_input_tokens_seen": 2249195520, "step": 4290 }, { "epoch": 0.645479066494664, "grad_norm": 0.6752641201019287, "learning_rate": 1.114805228582293e-05, "loss": 2.1208, "num_input_tokens_seen": 2254438400, "step": 4300 }, { "epoch": 0.6469801806027912, "grad_norm": 0.8564686179161072, "learning_rate": 1.1135112000224364e-05, "loss": 2.1125, "num_input_tokens_seen": 2259681280, "step": 4310 }, { "epoch": 0.6484812947109183, "grad_norm": 0.9626083970069885, "learning_rate": 1.1122216672215286e-05, "loss": 2.1215, "num_input_tokens_seen": 2264924160, "step": 4320 }, { "epoch": 0.6499824088190453, "grad_norm": 0.5604343414306641, "learning_rate": 1.1109366042075172e-05, "loss": 2.1667, "num_input_tokens_seen": 2270167040, "step": 4330 }, { "epoch": 0.6514835229271725, "grad_norm": 0.5957362651824951, "learning_rate": 1.1096559852179205e-05, "loss": 2.0962, "num_input_tokens_seen": 2275409920, "step": 4340 }, { "epoch": 0.6529846370352996, "grad_norm": 0.6427695751190186, "learning_rate": 1.1083797846976627e-05, "loss": 2.1203, "num_input_tokens_seen": 2280652800, "step": 4350 }, { "epoch": 0.6544857511434268, "grad_norm": 0.6548131108283997, "learning_rate": 1.1071079772969281e-05, "loss": 2.1172, "num_input_tokens_seen": 2285895680, "step": 4360 }, { "epoch": 0.6559868652515539, "grad_norm": 0.6767981052398682, "learning_rate": 1.1058405378690493e-05, "loss": 2.1267, "num_input_tokens_seen": 2291138560, "step": 4370 }, { "epoch": 0.6574879793596811, "grad_norm": 0.7053226232528687, "learning_rate": 1.1045774414684167e-05, "loss": 2.1183, "num_input_tokens_seen": 2296381440, "step": 4380 }, { "epoch": 0.6589890934678081, "grad_norm": 0.7884171605110168, "learning_rate": 1.1033186633484174e-05, "loss": 2.1103, "num_input_tokens_seen": 2301624320, "step": 4390 }, { "epoch": 0.6604902075759352, "grad_norm": 0.6177472472190857, "learning_rate": 1.1020641789593993e-05, "loss": 2.127, "num_input_tokens_seen": 2306867200, "step": 4400 }, { "epoch": 0.6604902075759352, "eval_accuracy": 0.5715555555555556, "eval_loss": 2.099358081817627, "eval_runtime": 93.9705, "eval_samples_per_second": 3.192, "eval_steps_per_second": 0.798, "num_input_tokens_seen": 2306867200, "step": 4400 }, { "epoch": 0.6619913216840624, "grad_norm": 0.6775453090667725, "learning_rate": 1.1008139639466584e-05, "loss": 2.1329, "num_input_tokens_seen": 2312110080, "step": 4410 }, { "epoch": 0.6634924357921895, "grad_norm": 0.6154870986938477, "learning_rate": 1.0995679941484547e-05, "loss": 2.0883, "num_input_tokens_seen": 2317352960, "step": 4420 }, { "epoch": 0.6649935499003167, "grad_norm": 0.7121848464012146, "learning_rate": 1.0983262455940505e-05, "loss": 2.1364, "num_input_tokens_seen": 2322595840, "step": 4430 }, { "epoch": 0.6664946640084438, "grad_norm": 0.7415539622306824, "learning_rate": 1.0970886945017727e-05, "loss": 2.1281, "num_input_tokens_seen": 2327838720, "step": 4440 }, { "epoch": 0.6679957781165708, "grad_norm": 0.678644597530365, "learning_rate": 1.0958553172771004e-05, "loss": 2.0817, "num_input_tokens_seen": 2333081600, "step": 4450 }, { "epoch": 0.669496892224698, "grad_norm": 0.6753328442573547, "learning_rate": 1.0946260905107762e-05, "loss": 2.1169, "num_input_tokens_seen": 2338324480, "step": 4460 }, { "epoch": 0.6709980063328251, "grad_norm": 0.6162213087081909, "learning_rate": 1.0934009909769385e-05, "loss": 2.1551, "num_input_tokens_seen": 2343567360, "step": 4470 }, { "epoch": 0.6724991204409523, "grad_norm": 0.7211172580718994, "learning_rate": 1.0921799956312801e-05, "loss": 2.1153, "num_input_tokens_seen": 2348810240, "step": 4480 }, { "epoch": 0.6740002345490794, "grad_norm": 0.8724449872970581, "learning_rate": 1.090963081609226e-05, "loss": 2.1171, "num_input_tokens_seen": 2354053120, "step": 4490 }, { "epoch": 0.6755013486572066, "grad_norm": 0.8566068410873413, "learning_rate": 1.0897502262241359e-05, "loss": 2.1138, "num_input_tokens_seen": 2359296000, "step": 4500 }, { "epoch": 0.6770024627653336, "grad_norm": 0.6851250529289246, "learning_rate": 1.0885414069655284e-05, "loss": 2.1129, "num_input_tokens_seen": 2364538880, "step": 4510 }, { "epoch": 0.6785035768734607, "grad_norm": 0.6527068614959717, "learning_rate": 1.087336601497325e-05, "loss": 2.1354, "num_input_tokens_seen": 2369781760, "step": 4520 }, { "epoch": 0.6800046909815879, "grad_norm": 0.6223852038383484, "learning_rate": 1.086135787656117e-05, "loss": 2.1055, "num_input_tokens_seen": 2375024640, "step": 4530 }, { "epoch": 0.681505805089715, "grad_norm": 0.7803609371185303, "learning_rate": 1.0849389434494532e-05, "loss": 2.1206, "num_input_tokens_seen": 2380267520, "step": 4540 }, { "epoch": 0.6830069191978422, "grad_norm": 0.8386520743370056, "learning_rate": 1.0837460470541495e-05, "loss": 2.1396, "num_input_tokens_seen": 2385510400, "step": 4550 }, { "epoch": 0.6845080333059693, "grad_norm": 0.8144930601119995, "learning_rate": 1.0825570768146148e-05, "loss": 2.1149, "num_input_tokens_seen": 2390753280, "step": 4560 }, { "epoch": 0.6860091474140964, "grad_norm": 0.8149564266204834, "learning_rate": 1.081372011241202e-05, "loss": 2.1056, "num_input_tokens_seen": 2395996160, "step": 4570 }, { "epoch": 0.6875102615222235, "grad_norm": 0.5903097987174988, "learning_rate": 1.0801908290085765e-05, "loss": 2.123, "num_input_tokens_seen": 2401239040, "step": 4580 }, { "epoch": 0.6890113756303506, "grad_norm": 0.6024293899536133, "learning_rate": 1.0790135089541035e-05, "loss": 2.1504, "num_input_tokens_seen": 2406481920, "step": 4590 }, { "epoch": 0.6905124897384778, "grad_norm": 1.085522174835205, "learning_rate": 1.0778400300762553e-05, "loss": 2.1158, "num_input_tokens_seen": 2411724800, "step": 4600 }, { "epoch": 0.6920136038466049, "grad_norm": 0.6899899840354919, "learning_rate": 1.0766703715330396e-05, "loss": 2.0817, "num_input_tokens_seen": 2416967680, "step": 4610 }, { "epoch": 0.6935147179547321, "grad_norm": 0.732567310333252, "learning_rate": 1.075504512640443e-05, "loss": 2.1013, "num_input_tokens_seen": 2422210560, "step": 4620 }, { "epoch": 0.6950158320628591, "grad_norm": 0.7568468451499939, "learning_rate": 1.0743424328708955e-05, "loss": 2.1554, "num_input_tokens_seen": 2427453440, "step": 4630 }, { "epoch": 0.6965169461709863, "grad_norm": 0.7178658843040466, "learning_rate": 1.0731841118517526e-05, "loss": 2.0937, "num_input_tokens_seen": 2432696320, "step": 4640 }, { "epoch": 0.6980180602791134, "grad_norm": 0.8591102361679077, "learning_rate": 1.072029529363794e-05, "loss": 2.11, "num_input_tokens_seen": 2437939200, "step": 4650 }, { "epoch": 0.6995191743872405, "grad_norm": 0.776842474937439, "learning_rate": 1.0708786653397427e-05, "loss": 2.1335, "num_input_tokens_seen": 2443182080, "step": 4660 }, { "epoch": 0.7010202884953677, "grad_norm": 0.6280748248100281, "learning_rate": 1.0697314998628e-05, "loss": 2.0915, "num_input_tokens_seen": 2448424960, "step": 4670 }, { "epoch": 0.7025214026034948, "grad_norm": 0.6168721318244934, "learning_rate": 1.0685880131651965e-05, "loss": 2.0978, "num_input_tokens_seen": 2453667840, "step": 4680 }, { "epoch": 0.7040225167116219, "grad_norm": 0.7713338732719421, "learning_rate": 1.067448185626763e-05, "loss": 2.0992, "num_input_tokens_seen": 2458910720, "step": 4690 }, { "epoch": 0.705523630819749, "grad_norm": 0.9560806751251221, "learning_rate": 1.0663119977735152e-05, "loss": 2.1351, "num_input_tokens_seen": 2464153600, "step": 4700 }, { "epoch": 0.7070247449278761, "grad_norm": 0.7966359257698059, "learning_rate": 1.0651794302762573e-05, "loss": 2.1285, "num_input_tokens_seen": 2469396480, "step": 4710 }, { "epoch": 0.7085258590360033, "grad_norm": 0.7075307965278625, "learning_rate": 1.0640504639492005e-05, "loss": 2.1108, "num_input_tokens_seen": 2474639360, "step": 4720 }, { "epoch": 0.7100269731441304, "grad_norm": 0.7893349528312683, "learning_rate": 1.0629250797485977e-05, "loss": 2.1133, "num_input_tokens_seen": 2479882240, "step": 4730 }, { "epoch": 0.7115280872522576, "grad_norm": 0.6469416618347168, "learning_rate": 1.0618032587713944e-05, "loss": 2.1259, "num_input_tokens_seen": 2485125120, "step": 4740 }, { "epoch": 0.7130292013603846, "grad_norm": 0.6949597001075745, "learning_rate": 1.0606849822538959e-05, "loss": 2.1272, "num_input_tokens_seen": 2490368000, "step": 4750 }, { "epoch": 0.7145303154685118, "grad_norm": 0.6256146430969238, "learning_rate": 1.0595702315704477e-05, "loss": 2.1132, "num_input_tokens_seen": 2495610880, "step": 4760 }, { "epoch": 0.7160314295766389, "grad_norm": 0.644726037979126, "learning_rate": 1.0584589882321336e-05, "loss": 2.1304, "num_input_tokens_seen": 2500853760, "step": 4770 }, { "epoch": 0.717532543684766, "grad_norm": 0.9808601140975952, "learning_rate": 1.0573512338854876e-05, "loss": 2.1061, "num_input_tokens_seen": 2506096640, "step": 4780 }, { "epoch": 0.7190336577928932, "grad_norm": 0.829621434211731, "learning_rate": 1.0562469503112205e-05, "loss": 2.1288, "num_input_tokens_seen": 2511339520, "step": 4790 }, { "epoch": 0.7205347719010203, "grad_norm": 1.08700692653656, "learning_rate": 1.055146119422961e-05, "loss": 2.1265, "num_input_tokens_seen": 2516582400, "step": 4800 }, { "epoch": 0.7205347719010203, "eval_accuracy": 0.571921855921856, "eval_loss": 2.095761299133301, "eval_runtime": 95.3789, "eval_samples_per_second": 3.145, "eval_steps_per_second": 0.786, "num_input_tokens_seen": 2516582400, "step": 4800 }, { "epoch": 0.7220358860091474, "grad_norm": 0.7580691576004028, "learning_rate": 1.0540487232660126e-05, "loss": 2.118, "num_input_tokens_seen": 2521825280, "step": 4810 }, { "epoch": 0.7235370001172745, "grad_norm": 0.7149703502655029, "learning_rate": 1.0529547440161225e-05, "loss": 2.0822, "num_input_tokens_seen": 2527068160, "step": 4820 }, { "epoch": 0.7250381142254017, "grad_norm": 0.6624520421028137, "learning_rate": 1.0518641639782667e-05, "loss": 2.1325, "num_input_tokens_seen": 2532311040, "step": 4830 }, { "epoch": 0.7265392283335288, "grad_norm": 0.7207245826721191, "learning_rate": 1.0507769655854467e-05, "loss": 2.1231, "num_input_tokens_seen": 2537553920, "step": 4840 }, { "epoch": 0.7280403424416559, "grad_norm": 0.6605533361434937, "learning_rate": 1.0496931313975018e-05, "loss": 2.0944, "num_input_tokens_seen": 2542796800, "step": 4850 }, { "epoch": 0.7295414565497831, "grad_norm": 0.6779858469963074, "learning_rate": 1.0486126440999343e-05, "loss": 2.1355, "num_input_tokens_seen": 2548039680, "step": 4860 }, { "epoch": 0.7310425706579101, "grad_norm": 0.9669274091720581, "learning_rate": 1.047535486502747e-05, "loss": 2.1198, "num_input_tokens_seen": 2553282560, "step": 4870 }, { "epoch": 0.7325436847660373, "grad_norm": 0.6352229714393616, "learning_rate": 1.0464616415392945e-05, "loss": 2.0968, "num_input_tokens_seen": 2558525440, "step": 4880 }, { "epoch": 0.7340447988741644, "grad_norm": 0.9474151134490967, "learning_rate": 1.0453910922651489e-05, "loss": 2.1213, "num_input_tokens_seen": 2563768320, "step": 4890 }, { "epoch": 0.7355459129822915, "grad_norm": 0.6039556264877319, "learning_rate": 1.0443238218569753e-05, "loss": 2.1092, "num_input_tokens_seen": 2569011200, "step": 4900 }, { "epoch": 0.7370470270904187, "grad_norm": 0.6552874445915222, "learning_rate": 1.0432598136114214e-05, "loss": 2.1292, "num_input_tokens_seen": 2574254080, "step": 4910 }, { "epoch": 0.7385481411985458, "grad_norm": 0.6924867033958435, "learning_rate": 1.042199050944021e-05, "loss": 2.1396, "num_input_tokens_seen": 2579496960, "step": 4920 }, { "epoch": 0.7400492553066729, "grad_norm": 0.7396571636199951, "learning_rate": 1.0411415173881066e-05, "loss": 2.1028, "num_input_tokens_seen": 2584739840, "step": 4930 }, { "epoch": 0.7415503694148, "grad_norm": 0.737847626209259, "learning_rate": 1.0400871965937369e-05, "loss": 2.0786, "num_input_tokens_seen": 2589982720, "step": 4940 }, { "epoch": 0.7430514835229272, "grad_norm": 0.7622154355049133, "learning_rate": 1.0390360723266348e-05, "loss": 2.1172, "num_input_tokens_seen": 2595225600, "step": 4950 }, { "epoch": 0.7445525976310543, "grad_norm": 0.9400896430015564, "learning_rate": 1.0379881284671372e-05, "loss": 2.1024, "num_input_tokens_seen": 2600468480, "step": 4960 }, { "epoch": 0.7460537117391814, "grad_norm": 0.8766177296638489, "learning_rate": 1.0369433490091569e-05, "loss": 2.1389, "num_input_tokens_seen": 2605711360, "step": 4970 }, { "epoch": 0.7475548258473086, "grad_norm": 0.9279113411903381, "learning_rate": 1.0359017180591565e-05, "loss": 2.1417, "num_input_tokens_seen": 2610954240, "step": 4980 }, { "epoch": 0.7490559399554356, "grad_norm": 0.6392689347267151, "learning_rate": 1.0348632198351312e-05, "loss": 2.1454, "num_input_tokens_seen": 2616197120, "step": 4990 }, { "epoch": 0.7505570540635628, "grad_norm": 0.6666054129600525, "learning_rate": 1.0338278386656068e-05, "loss": 2.0914, "num_input_tokens_seen": 2621440000, "step": 5000 }, { "epoch": 0.7520581681716899, "grad_norm": 0.8447917103767395, "learning_rate": 1.0327955589886445e-05, "loss": 2.112, "num_input_tokens_seen": 2626682880, "step": 5010 }, { "epoch": 0.753559282279817, "grad_norm": 0.7507166862487793, "learning_rate": 1.0317663653508592e-05, "loss": 2.0921, "num_input_tokens_seen": 2631925760, "step": 5020 }, { "epoch": 0.7550603963879442, "grad_norm": 0.7831541299819946, "learning_rate": 1.0307402424064481e-05, "loss": 2.1418, "num_input_tokens_seen": 2637168640, "step": 5030 }, { "epoch": 0.7565615104960713, "grad_norm": 0.6328726410865784, "learning_rate": 1.0297171749162294e-05, "loss": 2.1253, "num_input_tokens_seen": 2642411520, "step": 5040 }, { "epoch": 0.7580626246041984, "grad_norm": 0.8324751257896423, "learning_rate": 1.0286971477466913e-05, "loss": 2.1222, "num_input_tokens_seen": 2647654400, "step": 5050 }, { "epoch": 0.7595637387123255, "grad_norm": 0.7374483942985535, "learning_rate": 1.0276801458690536e-05, "loss": 2.1117, "num_input_tokens_seen": 2652897280, "step": 5060 }, { "epoch": 0.7610648528204527, "grad_norm": 0.787074625492096, "learning_rate": 1.0266661543583343e-05, "loss": 2.0947, "num_input_tokens_seen": 2658140160, "step": 5070 }, { "epoch": 0.7625659669285798, "grad_norm": 0.6950966119766235, "learning_rate": 1.0256551583924336e-05, "loss": 2.1237, "num_input_tokens_seen": 2663383040, "step": 5080 }, { "epoch": 0.764067081036707, "grad_norm": 0.7359448671340942, "learning_rate": 1.024647143251221e-05, "loss": 2.1298, "num_input_tokens_seen": 2668625920, "step": 5090 }, { "epoch": 0.7655681951448341, "grad_norm": 0.7270290851593018, "learning_rate": 1.0236420943156362e-05, "loss": 2.1171, "num_input_tokens_seen": 2673868800, "step": 5100 }, { "epoch": 0.7670693092529611, "grad_norm": 0.9731076955795288, "learning_rate": 1.022639997066799e-05, "loss": 2.0728, "num_input_tokens_seen": 2679111680, "step": 5110 }, { "epoch": 0.7685704233610883, "grad_norm": 0.7202324867248535, "learning_rate": 1.0216408370851276e-05, "loss": 2.1083, "num_input_tokens_seen": 2684354560, "step": 5120 }, { "epoch": 0.7700715374692154, "grad_norm": 0.7496753334999084, "learning_rate": 1.020644600049467e-05, "loss": 2.105, "num_input_tokens_seen": 2689597440, "step": 5130 }, { "epoch": 0.7715726515773426, "grad_norm": 0.663766622543335, "learning_rate": 1.0196512717362284e-05, "loss": 2.1095, "num_input_tokens_seen": 2694840320, "step": 5140 }, { "epoch": 0.7730737656854697, "grad_norm": 0.697460949420929, "learning_rate": 1.0186608380185336e-05, "loss": 2.115, "num_input_tokens_seen": 2700083200, "step": 5150 }, { "epoch": 0.7745748797935968, "grad_norm": 0.7035109400749207, "learning_rate": 1.0176732848653748e-05, "loss": 2.1035, "num_input_tokens_seen": 2705326080, "step": 5160 }, { "epoch": 0.7760759939017239, "grad_norm": 0.7405036091804504, "learning_rate": 1.0166885983407759e-05, "loss": 2.1061, "num_input_tokens_seen": 2710568960, "step": 5170 }, { "epoch": 0.777577108009851, "grad_norm": 0.7046319842338562, "learning_rate": 1.0157067646029694e-05, "loss": 2.1098, "num_input_tokens_seen": 2715811840, "step": 5180 }, { "epoch": 0.7790782221179782, "grad_norm": 0.7800982594490051, "learning_rate": 1.0147277699035774e-05, "loss": 2.0871, "num_input_tokens_seen": 2721054720, "step": 5190 }, { "epoch": 0.7805793362261053, "grad_norm": 0.6889191269874573, "learning_rate": 1.0137516005868044e-05, "loss": 2.0951, "num_input_tokens_seen": 2726297600, "step": 5200 }, { "epoch": 0.7805793362261053, "eval_accuracy": 0.5728083028083029, "eval_loss": 2.090898036956787, "eval_runtime": 93.559, "eval_samples_per_second": 3.207, "eval_steps_per_second": 0.802, "num_input_tokens_seen": 2726297600, "step": 5200 }, { "epoch": 0.7820804503342325, "grad_norm": 0.8273455500602722, "learning_rate": 1.0127782430886362e-05, "loss": 2.0874, "num_input_tokens_seen": 2731540480, "step": 5210 }, { "epoch": 0.7835815644423596, "grad_norm": 0.7020549178123474, "learning_rate": 1.0118076839360487e-05, "loss": 2.1437, "num_input_tokens_seen": 2736783360, "step": 5220 }, { "epoch": 0.7850826785504866, "grad_norm": 0.6244192719459534, "learning_rate": 1.0108399097462245e-05, "loss": 2.114, "num_input_tokens_seen": 2742026240, "step": 5230 }, { "epoch": 0.7865837926586138, "grad_norm": 0.6874685883522034, "learning_rate": 1.0098749072257785e-05, "loss": 2.0951, "num_input_tokens_seen": 2747269120, "step": 5240 }, { "epoch": 0.7880849067667409, "grad_norm": 0.9486780762672424, "learning_rate": 1.0089126631699902e-05, "loss": 2.1273, "num_input_tokens_seen": 2752512000, "step": 5250 }, { "epoch": 0.7895860208748681, "grad_norm": 0.8076638579368591, "learning_rate": 1.0079531644620446e-05, "loss": 2.0947, "num_input_tokens_seen": 2757754880, "step": 5260 }, { "epoch": 0.7910871349829952, "grad_norm": 0.7314202785491943, "learning_rate": 1.0069963980722823e-05, "loss": 2.1266, "num_input_tokens_seen": 2762997760, "step": 5270 }, { "epoch": 0.7925882490911224, "grad_norm": 0.6520930528640747, "learning_rate": 1.0060423510574553e-05, "loss": 2.1178, "num_input_tokens_seen": 2768240640, "step": 5280 }, { "epoch": 0.7940893631992495, "grad_norm": 0.7945283651351929, "learning_rate": 1.0050910105599924e-05, "loss": 2.1345, "num_input_tokens_seen": 2773483520, "step": 5290 }, { "epoch": 0.7955904773073765, "grad_norm": 0.9129828810691833, "learning_rate": 1.0041423638072707e-05, "loss": 2.1111, "num_input_tokens_seen": 2778726400, "step": 5300 }, { "epoch": 0.7970915914155037, "grad_norm": 0.7378903031349182, "learning_rate": 1.0031963981108954e-05, "loss": 2.1075, "num_input_tokens_seen": 2783969280, "step": 5310 }, { "epoch": 0.7985927055236308, "grad_norm": 0.8121985197067261, "learning_rate": 1.0022531008659884e-05, "loss": 2.105, "num_input_tokens_seen": 2789212160, "step": 5320 }, { "epoch": 0.800093819631758, "grad_norm": 0.9001929759979248, "learning_rate": 1.0013124595504802e-05, "loss": 2.1389, "num_input_tokens_seen": 2794455040, "step": 5330 }, { "epoch": 0.8015949337398851, "grad_norm": 0.6680558323860168, "learning_rate": 1.0003744617244145e-05, "loss": 2.1057, "num_input_tokens_seen": 2799697920, "step": 5340 }, { "epoch": 0.8030960478480123, "grad_norm": 0.7327044606208801, "learning_rate": 9.994390950292557e-06, "loss": 2.1291, "num_input_tokens_seen": 2804940800, "step": 5350 }, { "epoch": 0.8045971619561393, "grad_norm": 0.8424970507621765, "learning_rate": 9.985063471872047e-06, "loss": 2.1063, "num_input_tokens_seen": 2810183680, "step": 5360 }, { "epoch": 0.8060982760642664, "grad_norm": 0.8721758127212524, "learning_rate": 9.975762060005233e-06, "loss": 2.1093, "num_input_tokens_seen": 2815426560, "step": 5370 }, { "epoch": 0.8075993901723936, "grad_norm": 0.8354282379150391, "learning_rate": 9.966486593508638e-06, "loss": 2.1093, "num_input_tokens_seen": 2820669440, "step": 5380 }, { "epoch": 0.8091005042805207, "grad_norm": 0.7554700970649719, "learning_rate": 9.957236951986044e-06, "loss": 2.1185, "num_input_tokens_seen": 2825912320, "step": 5390 }, { "epoch": 0.8106016183886479, "grad_norm": 0.6626455783843994, "learning_rate": 9.948013015821951e-06, "loss": 2.0802, "num_input_tokens_seen": 2831155200, "step": 5400 }, { "epoch": 0.812102732496775, "grad_norm": 0.7216958403587341, "learning_rate": 9.938814666175072e-06, "loss": 2.0879, "num_input_tokens_seen": 2836398080, "step": 5410 }, { "epoch": 0.813603846604902, "grad_norm": 0.7840401530265808, "learning_rate": 9.929641784971893e-06, "loss": 2.1033, "num_input_tokens_seen": 2841640960, "step": 5420 }, { "epoch": 0.8151049607130292, "grad_norm": 0.8764855265617371, "learning_rate": 9.920494254900313e-06, "loss": 2.0822, "num_input_tokens_seen": 2846883840, "step": 5430 }, { "epoch": 0.8166060748211563, "grad_norm": 0.8843982219696045, "learning_rate": 9.911371959403354e-06, "loss": 2.1132, "num_input_tokens_seen": 2852126720, "step": 5440 }, { "epoch": 0.8181071889292835, "grad_norm": 0.7158339023590088, "learning_rate": 9.902274782672901e-06, "loss": 2.1019, "num_input_tokens_seen": 2857369600, "step": 5450 }, { "epoch": 0.8196083030374106, "grad_norm": 0.6980221271514893, "learning_rate": 9.893202609643547e-06, "loss": 2.0935, "num_input_tokens_seen": 2862612480, "step": 5460 }, { "epoch": 0.8211094171455378, "grad_norm": 0.6455651521682739, "learning_rate": 9.88415532598647e-06, "loss": 2.1212, "num_input_tokens_seen": 2867855360, "step": 5470 }, { "epoch": 0.8226105312536648, "grad_norm": 0.6872067451477051, "learning_rate": 9.875132818103386e-06, "loss": 2.0931, "num_input_tokens_seen": 2873098240, "step": 5480 }, { "epoch": 0.8241116453617919, "grad_norm": 0.8125963807106018, "learning_rate": 9.86613497312055e-06, "loss": 2.1199, "num_input_tokens_seen": 2878341120, "step": 5490 }, { "epoch": 0.8256127594699191, "grad_norm": 0.840162992477417, "learning_rate": 9.857161678882842e-06, "loss": 2.131, "num_input_tokens_seen": 2883584000, "step": 5500 }, { "epoch": 0.8271138735780462, "grad_norm": 0.774443507194519, "learning_rate": 9.848212823947872e-06, "loss": 2.0972, "num_input_tokens_seen": 2888826880, "step": 5510 }, { "epoch": 0.8286149876861734, "grad_norm": 0.7709413766860962, "learning_rate": 9.839288297580194e-06, "loss": 2.1215, "num_input_tokens_seen": 2894069760, "step": 5520 }, { "epoch": 0.8301161017943005, "grad_norm": 0.738002598285675, "learning_rate": 9.830387989745525e-06, "loss": 2.0852, "num_input_tokens_seen": 2899312640, "step": 5530 }, { "epoch": 0.8316172159024275, "grad_norm": 0.803095817565918, "learning_rate": 9.821511791105069e-06, "loss": 2.1179, "num_input_tokens_seen": 2904555520, "step": 5540 }, { "epoch": 0.8331183300105547, "grad_norm": 0.808690071105957, "learning_rate": 9.812659593009853e-06, "loss": 2.1157, "num_input_tokens_seen": 2909798400, "step": 5550 }, { "epoch": 0.8346194441186818, "grad_norm": 0.7011695504188538, "learning_rate": 9.803831287495165e-06, "loss": 2.1104, "num_input_tokens_seen": 2915041280, "step": 5560 }, { "epoch": 0.836120558226809, "grad_norm": 0.8309350609779358, "learning_rate": 9.795026767275002e-06, "loss": 2.0879, "num_input_tokens_seen": 2920284160, "step": 5570 }, { "epoch": 0.8376216723349361, "grad_norm": 0.7645972371101379, "learning_rate": 9.78624592573661e-06, "loss": 2.083, "num_input_tokens_seen": 2925527040, "step": 5580 }, { "epoch": 0.8391227864430633, "grad_norm": 0.770995557308197, "learning_rate": 9.777488656935059e-06, "loss": 2.1311, "num_input_tokens_seen": 2930769920, "step": 5590 }, { "epoch": 0.8406239005511903, "grad_norm": 0.7518852949142456, "learning_rate": 9.768754855587863e-06, "loss": 2.0951, "num_input_tokens_seen": 2936012800, "step": 5600 }, { "epoch": 0.8406239005511903, "eval_accuracy": 0.5732901912901913, "eval_loss": 2.0876457691192627, "eval_runtime": 94.3441, "eval_samples_per_second": 3.18, "eval_steps_per_second": 0.795, "num_input_tokens_seen": 2936012800, "step": 5600 }, { "epoch": 0.8421250146593174, "grad_norm": 0.8328472375869751, "learning_rate": 9.760044417069675e-06, "loss": 2.1032, "num_input_tokens_seen": 2941255680, "step": 5610 }, { "epoch": 0.8436261287674446, "grad_norm": 0.803054690361023, "learning_rate": 9.75135723740702e-06, "loss": 2.0893, "num_input_tokens_seen": 2946498560, "step": 5620 }, { "epoch": 0.8451272428755717, "grad_norm": 1.0215765237808228, "learning_rate": 9.742693213273084e-06, "loss": 2.091, "num_input_tokens_seen": 2951741440, "step": 5630 }, { "epoch": 0.8466283569836989, "grad_norm": 0.669141948223114, "learning_rate": 9.734052241982545e-06, "loss": 2.1025, "num_input_tokens_seen": 2956984320, "step": 5640 }, { "epoch": 0.848129471091826, "grad_norm": 0.7541179656982422, "learning_rate": 9.725434221486473e-06, "loss": 2.1037, "num_input_tokens_seen": 2962227200, "step": 5650 }, { "epoch": 0.849630585199953, "grad_norm": 0.8103038668632507, "learning_rate": 9.716839050367259e-06, "loss": 2.1022, "num_input_tokens_seen": 2967470080, "step": 5660 }, { "epoch": 0.8511316993080802, "grad_norm": 0.7163017988204956, "learning_rate": 9.7082666278336e-06, "loss": 2.1019, "num_input_tokens_seen": 2972712960, "step": 5670 }, { "epoch": 0.8526328134162073, "grad_norm": 0.7300000786781311, "learning_rate": 9.699716853715554e-06, "loss": 2.118, "num_input_tokens_seen": 2977955840, "step": 5680 }, { "epoch": 0.8541339275243345, "grad_norm": 0.7246140241622925, "learning_rate": 9.691189628459591e-06, "loss": 2.1528, "num_input_tokens_seen": 2983198720, "step": 5690 }, { "epoch": 0.8556350416324616, "grad_norm": 0.8242561221122742, "learning_rate": 9.682684853123759e-06, "loss": 2.1482, "num_input_tokens_seen": 2988441600, "step": 5700 }, { "epoch": 0.8571361557405888, "grad_norm": 0.7430049777030945, "learning_rate": 9.674202429372843e-06, "loss": 2.0871, "num_input_tokens_seen": 2993684480, "step": 5710 }, { "epoch": 0.8586372698487158, "grad_norm": 0.7430338263511658, "learning_rate": 9.665742259473581e-06, "loss": 2.0706, "num_input_tokens_seen": 2998927360, "step": 5720 }, { "epoch": 0.860138383956843, "grad_norm": 0.7350336313247681, "learning_rate": 9.657304246289963e-06, "loss": 2.1035, "num_input_tokens_seen": 3004170240, "step": 5730 }, { "epoch": 0.8616394980649701, "grad_norm": 0.5950428247451782, "learning_rate": 9.64888829327852e-06, "loss": 2.1055, "num_input_tokens_seen": 3009413120, "step": 5740 }, { "epoch": 0.8631406121730972, "grad_norm": 0.7408620715141296, "learning_rate": 9.640494304483698e-06, "loss": 2.1078, "num_input_tokens_seen": 3014656000, "step": 5750 }, { "epoch": 0.8646417262812244, "grad_norm": 0.8998221158981323, "learning_rate": 9.63212218453326e-06, "loss": 2.1249, "num_input_tokens_seen": 3019898880, "step": 5760 }, { "epoch": 0.8661428403893515, "grad_norm": 0.8028159141540527, "learning_rate": 9.623771838633735e-06, "loss": 2.0808, "num_input_tokens_seen": 3025141760, "step": 5770 }, { "epoch": 0.8676439544974786, "grad_norm": 0.7718355655670166, "learning_rate": 9.615443172565908e-06, "loss": 2.1335, "num_input_tokens_seen": 3030384640, "step": 5780 }, { "epoch": 0.8691450686056057, "grad_norm": 0.6967669129371643, "learning_rate": 9.607136092680355e-06, "loss": 2.1056, "num_input_tokens_seen": 3035627520, "step": 5790 }, { "epoch": 0.8706461827137328, "grad_norm": 0.8609460592269897, "learning_rate": 9.598850505893025e-06, "loss": 2.1219, "num_input_tokens_seen": 3040870400, "step": 5800 }, { "epoch": 0.87214729682186, "grad_norm": 0.7492549419403076, "learning_rate": 9.590586319680857e-06, "loss": 2.0746, "num_input_tokens_seen": 3046113280, "step": 5810 }, { "epoch": 0.8736484109299871, "grad_norm": 0.7335068583488464, "learning_rate": 9.582343442077434e-06, "loss": 2.1379, "num_input_tokens_seen": 3051356160, "step": 5820 }, { "epoch": 0.8751495250381143, "grad_norm": 0.6927693486213684, "learning_rate": 9.574121781668698e-06, "loss": 2.1052, "num_input_tokens_seen": 3056599040, "step": 5830 }, { "epoch": 0.8766506391462413, "grad_norm": 0.7068751454353333, "learning_rate": 9.565921247588678e-06, "loss": 2.0933, "num_input_tokens_seen": 3061841920, "step": 5840 }, { "epoch": 0.8781517532543685, "grad_norm": 0.7502044439315796, "learning_rate": 9.557741749515278e-06, "loss": 2.0825, "num_input_tokens_seen": 3067084800, "step": 5850 }, { "epoch": 0.8796528673624956, "grad_norm": 0.6573644876480103, "learning_rate": 9.549583197666103e-06, "loss": 2.1222, "num_input_tokens_seen": 3072327680, "step": 5860 }, { "epoch": 0.8811539814706227, "grad_norm": 0.8025186657905579, "learning_rate": 9.541445502794315e-06, "loss": 2.0886, "num_input_tokens_seen": 3077570560, "step": 5870 }, { "epoch": 0.8826550955787499, "grad_norm": 0.8089122176170349, "learning_rate": 9.533328576184532e-06, "loss": 2.1055, "num_input_tokens_seen": 3082813440, "step": 5880 }, { "epoch": 0.884156209686877, "grad_norm": 0.8042004704475403, "learning_rate": 9.525232329648768e-06, "loss": 2.1178, "num_input_tokens_seen": 3088056320, "step": 5890 }, { "epoch": 0.8856573237950041, "grad_norm": 0.8820722699165344, "learning_rate": 9.517156675522405e-06, "loss": 2.0989, "num_input_tokens_seen": 3093299200, "step": 5900 }, { "epoch": 0.8871584379031312, "grad_norm": 0.731745719909668, "learning_rate": 9.509101526660216e-06, "loss": 2.1112, "num_input_tokens_seen": 3098542080, "step": 5910 }, { "epoch": 0.8886595520112583, "grad_norm": 0.8687134981155396, "learning_rate": 9.501066796432403e-06, "loss": 2.0962, "num_input_tokens_seen": 3103784960, "step": 5920 }, { "epoch": 0.8901606661193855, "grad_norm": 0.7933871150016785, "learning_rate": 9.493052398720693e-06, "loss": 2.1376, "num_input_tokens_seen": 3109027840, "step": 5930 }, { "epoch": 0.8916617802275126, "grad_norm": 0.8090782165527344, "learning_rate": 9.485058247914453e-06, "loss": 2.1105, "num_input_tokens_seen": 3114270720, "step": 5940 }, { "epoch": 0.8931628943356398, "grad_norm": 0.7016152739524841, "learning_rate": 9.477084258906861e-06, "loss": 2.0717, "num_input_tokens_seen": 3119513600, "step": 5950 }, { "epoch": 0.8946640084437668, "grad_norm": 0.7488442659378052, "learning_rate": 9.469130347091085e-06, "loss": 2.1083, "num_input_tokens_seen": 3124756480, "step": 5960 }, { "epoch": 0.896165122551894, "grad_norm": 0.8162315487861633, "learning_rate": 9.461196428356533e-06, "loss": 2.0822, "num_input_tokens_seen": 3129999360, "step": 5970 }, { "epoch": 0.8976662366600211, "grad_norm": 0.6994791030883789, "learning_rate": 9.453282419085091e-06, "loss": 2.1665, "num_input_tokens_seen": 3135242240, "step": 5980 }, { "epoch": 0.8991673507681482, "grad_norm": 0.7622292041778564, "learning_rate": 9.445388236147448e-06, "loss": 2.0986, "num_input_tokens_seen": 3140485120, "step": 5990 }, { "epoch": 0.9006684648762754, "grad_norm": 0.7731397747993469, "learning_rate": 9.437513796899408e-06, "loss": 2.1335, "num_input_tokens_seen": 3145728000, "step": 6000 }, { "epoch": 0.9006684648762754, "eval_accuracy": 0.5739422059422059, "eval_loss": 2.0838024616241455, "eval_runtime": 93.3731, "eval_samples_per_second": 3.213, "eval_steps_per_second": 0.803, "num_input_tokens_seen": 3145728000, "step": 6000 }, { "epoch": 0.9021695789844025, "grad_norm": 0.7499359250068665, "learning_rate": 9.429659019178268e-06, "loss": 2.1091, "num_input_tokens_seen": 3150970880, "step": 6010 }, { "epoch": 0.9036706930925296, "grad_norm": 1.0717604160308838, "learning_rate": 9.421823821299214e-06, "loss": 2.0924, "num_input_tokens_seen": 3156213760, "step": 6020 }, { "epoch": 0.9051718072006567, "grad_norm": 0.8998821973800659, "learning_rate": 9.414008122051756e-06, "loss": 2.108, "num_input_tokens_seen": 3161456640, "step": 6030 }, { "epoch": 0.9066729213087839, "grad_norm": 0.9980584383010864, "learning_rate": 9.406211840696178e-06, "loss": 2.0913, "num_input_tokens_seen": 3166699520, "step": 6040 }, { "epoch": 0.908174035416911, "grad_norm": 0.8425881862640381, "learning_rate": 9.398434896960062e-06, "loss": 2.1174, "num_input_tokens_seen": 3171942400, "step": 6050 }, { "epoch": 0.9096751495250381, "grad_norm": 0.7694622874259949, "learning_rate": 9.390677211034795e-06, "loss": 2.1214, "num_input_tokens_seen": 3177185280, "step": 6060 }, { "epoch": 0.9111762636331653, "grad_norm": 0.8509136438369751, "learning_rate": 9.382938703572126e-06, "loss": 2.1062, "num_input_tokens_seen": 3182428160, "step": 6070 }, { "epoch": 0.9126773777412923, "grad_norm": 0.688194990158081, "learning_rate": 9.375219295680784e-06, "loss": 2.0959, "num_input_tokens_seen": 3187671040, "step": 6080 }, { "epoch": 0.9141784918494195, "grad_norm": 0.9447828531265259, "learning_rate": 9.367518908923069e-06, "loss": 2.1215, "num_input_tokens_seen": 3192913920, "step": 6090 }, { "epoch": 0.9156796059575466, "grad_norm": 0.7632228136062622, "learning_rate": 9.35983746531152e-06, "loss": 2.0893, "num_input_tokens_seen": 3198156800, "step": 6100 }, { "epoch": 0.9171807200656737, "grad_norm": 0.6607562899589539, "learning_rate": 9.352174887305604e-06, "loss": 2.1476, "num_input_tokens_seen": 3203399680, "step": 6110 }, { "epoch": 0.9186818341738009, "grad_norm": 0.7273072004318237, "learning_rate": 9.344531097808414e-06, "loss": 2.0986, "num_input_tokens_seen": 3208642560, "step": 6120 }, { "epoch": 0.920182948281928, "grad_norm": 0.7447370290756226, "learning_rate": 9.336906020163414e-06, "loss": 2.1146, "num_input_tokens_seen": 3213885440, "step": 6130 }, { "epoch": 0.9216840623900551, "grad_norm": 0.6748001575469971, "learning_rate": 9.329299578151221e-06, "loss": 2.0587, "num_input_tokens_seen": 3219128320, "step": 6140 }, { "epoch": 0.9231851764981822, "grad_norm": 0.874092698097229, "learning_rate": 9.321711695986389e-06, "loss": 2.1088, "num_input_tokens_seen": 3224371200, "step": 6150 }, { "epoch": 0.9246862906063094, "grad_norm": 0.7109676003456116, "learning_rate": 9.314142298314256e-06, "loss": 2.1308, "num_input_tokens_seen": 3229614080, "step": 6160 }, { "epoch": 0.9261874047144365, "grad_norm": 0.9448009133338928, "learning_rate": 9.306591310207784e-06, "loss": 2.1008, "num_input_tokens_seen": 3234856960, "step": 6170 }, { "epoch": 0.9276885188225636, "grad_norm": 1.0548337697982788, "learning_rate": 9.29905865716445e-06, "loss": 2.1113, "num_input_tokens_seen": 3240099840, "step": 6180 }, { "epoch": 0.9291896329306908, "grad_norm": 0.8548759818077087, "learning_rate": 9.291544265103168e-06, "loss": 2.0949, "num_input_tokens_seen": 3245342720, "step": 6190 }, { "epoch": 0.9306907470388178, "grad_norm": 0.7196878790855408, "learning_rate": 9.284048060361212e-06, "loss": 2.1348, "num_input_tokens_seen": 3250585600, "step": 6200 }, { "epoch": 0.932191861146945, "grad_norm": 0.970755934715271, "learning_rate": 9.276569969691194e-06, "loss": 2.1198, "num_input_tokens_seen": 3255828480, "step": 6210 }, { "epoch": 0.9336929752550721, "grad_norm": 0.7153322100639343, "learning_rate": 9.26910992025806e-06, "loss": 2.1145, "num_input_tokens_seen": 3261071360, "step": 6220 }, { "epoch": 0.9351940893631993, "grad_norm": 0.9428135752677917, "learning_rate": 9.2616678396361e-06, "loss": 2.0888, "num_input_tokens_seen": 3266314240, "step": 6230 }, { "epoch": 0.9366952034713264, "grad_norm": 0.8652244806289673, "learning_rate": 9.254243655806003e-06, "loss": 2.0739, "num_input_tokens_seen": 3271557120, "step": 6240 }, { "epoch": 0.9381963175794535, "grad_norm": 1.0319374799728394, "learning_rate": 9.24683729715193e-06, "loss": 2.127, "num_input_tokens_seen": 3276800000, "step": 6250 }, { "epoch": 0.9396974316875806, "grad_norm": 1.1255468130111694, "learning_rate": 9.239448692458609e-06, "loss": 2.1033, "num_input_tokens_seen": 3282042880, "step": 6260 }, { "epoch": 0.9411985457957077, "grad_norm": 0.7438716888427734, "learning_rate": 9.232077770908458e-06, "loss": 2.1018, "num_input_tokens_seen": 3287285760, "step": 6270 }, { "epoch": 0.9426996599038349, "grad_norm": 0.7200952768325806, "learning_rate": 9.22472446207874e-06, "loss": 2.1389, "num_input_tokens_seen": 3292528640, "step": 6280 }, { "epoch": 0.944200774011962, "grad_norm": 0.6740290522575378, "learning_rate": 9.21738869593873e-06, "loss": 2.0873, "num_input_tokens_seen": 3297771520, "step": 6290 }, { "epoch": 0.9457018881200892, "grad_norm": 0.7916491031646729, "learning_rate": 9.210070402846921e-06, "loss": 2.0943, "num_input_tokens_seen": 3303014400, "step": 6300 }, { "epoch": 0.9472030022282163, "grad_norm": 0.9598779678344727, "learning_rate": 9.202769513548237e-06, "loss": 2.0996, "num_input_tokens_seen": 3308257280, "step": 6310 }, { "epoch": 0.9487041163363433, "grad_norm": 0.8254228234291077, "learning_rate": 9.195485959171296e-06, "loss": 2.0863, "num_input_tokens_seen": 3313500160, "step": 6320 }, { "epoch": 0.9502052304444705, "grad_norm": 0.6629865169525146, "learning_rate": 9.188219671225665e-06, "loss": 2.0822, "num_input_tokens_seen": 3318743040, "step": 6330 }, { "epoch": 0.9517063445525976, "grad_norm": 0.7633962035179138, "learning_rate": 9.180970581599163e-06, "loss": 2.1215, "num_input_tokens_seen": 3323985920, "step": 6340 }, { "epoch": 0.9532074586607248, "grad_norm": 0.7902926206588745, "learning_rate": 9.17373862255518e-06, "loss": 2.1073, "num_input_tokens_seen": 3329228800, "step": 6350 }, { "epoch": 0.9547085727688519, "grad_norm": 0.7041093111038208, "learning_rate": 9.16652372673002e-06, "loss": 2.1416, "num_input_tokens_seen": 3334471680, "step": 6360 }, { "epoch": 0.956209686876979, "grad_norm": 0.7147159576416016, "learning_rate": 9.159325827130255e-06, "loss": 2.0744, "num_input_tokens_seen": 3339714560, "step": 6370 }, { "epoch": 0.9577108009851061, "grad_norm": 0.7527651786804199, "learning_rate": 9.152144857130128e-06, "loss": 2.1002, "num_input_tokens_seen": 3344957440, "step": 6380 }, { "epoch": 0.9592119150932332, "grad_norm": 0.7740973830223083, "learning_rate": 9.144980750468947e-06, "loss": 2.0972, "num_input_tokens_seen": 3350200320, "step": 6390 }, { "epoch": 0.9607130292013604, "grad_norm": 0.8114108443260193, "learning_rate": 9.137833441248534e-06, "loss": 2.0731, "num_input_tokens_seen": 3355443200, "step": 6400 }, { "epoch": 0.9607130292013604, "eval_accuracy": 0.5744249084249085, "eval_loss": 2.080249071121216, "eval_runtime": 93.7101, "eval_samples_per_second": 3.201, "eval_steps_per_second": 0.8, "num_input_tokens_seen": 3355443200, "step": 6400 }, { "epoch": 0.9622141433094875, "grad_norm": 0.7784215211868286, "learning_rate": 9.130702863930661e-06, "loss": 2.0991, "num_input_tokens_seen": 3360686080, "step": 6410 }, { "epoch": 0.9637152574176147, "grad_norm": 0.883834958076477, "learning_rate": 9.123588953334543e-06, "loss": 2.127, "num_input_tokens_seen": 3365928960, "step": 6420 }, { "epoch": 0.9652163715257418, "grad_norm": 0.6606909036636353, "learning_rate": 9.116491644634323e-06, "loss": 2.0898, "num_input_tokens_seen": 3371171840, "step": 6430 }, { "epoch": 0.9667174856338688, "grad_norm": 0.8438331484794617, "learning_rate": 9.109410873356595e-06, "loss": 2.097, "num_input_tokens_seen": 3376414720, "step": 6440 }, { "epoch": 0.968218599741996, "grad_norm": 0.7696127891540527, "learning_rate": 9.102346575377945e-06, "loss": 2.0882, "num_input_tokens_seen": 3381657600, "step": 6450 }, { "epoch": 0.9697197138501231, "grad_norm": 0.7825368642807007, "learning_rate": 9.095298686922513e-06, "loss": 2.099, "num_input_tokens_seen": 3386900480, "step": 6460 }, { "epoch": 0.9712208279582503, "grad_norm": 1.1647248268127441, "learning_rate": 9.088267144559575e-06, "loss": 2.0652, "num_input_tokens_seen": 3392143360, "step": 6470 }, { "epoch": 0.9727219420663774, "grad_norm": 0.8124563694000244, "learning_rate": 9.081251885201133e-06, "loss": 2.0914, "num_input_tokens_seen": 3397386240, "step": 6480 }, { "epoch": 0.9742230561745046, "grad_norm": 0.8535528779029846, "learning_rate": 9.074252846099567e-06, "loss": 2.0706, "num_input_tokens_seen": 3402629120, "step": 6490 }, { "epoch": 0.9757241702826316, "grad_norm": 0.8512585759162903, "learning_rate": 9.067269964845241e-06, "loss": 2.1627, "num_input_tokens_seen": 3407872000, "step": 6500 }, { "epoch": 0.9772252843907587, "grad_norm": 0.7466162443161011, "learning_rate": 9.060303179364195e-06, "loss": 2.0776, "num_input_tokens_seen": 3413114880, "step": 6510 }, { "epoch": 0.9787263984988859, "grad_norm": 0.8283933997154236, "learning_rate": 9.053352427915811e-06, "loss": 2.0799, "num_input_tokens_seen": 3418357760, "step": 6520 }, { "epoch": 0.980227512607013, "grad_norm": 0.6737175583839417, "learning_rate": 9.04641764909052e-06, "loss": 2.0664, "num_input_tokens_seen": 3423600640, "step": 6530 }, { "epoch": 0.9817286267151402, "grad_norm": 0.6306325197219849, "learning_rate": 9.039498781807522e-06, "loss": 2.0973, "num_input_tokens_seen": 3428843520, "step": 6540 }, { "epoch": 0.9832297408232673, "grad_norm": 0.7176161408424377, "learning_rate": 9.032595765312539e-06, "loss": 2.0812, "num_input_tokens_seen": 3434086400, "step": 6550 }, { "epoch": 0.9847308549313943, "grad_norm": 0.7962211966514587, "learning_rate": 9.025708539175545e-06, "loss": 2.084, "num_input_tokens_seen": 3439329280, "step": 6560 }, { "epoch": 0.9862319690395215, "grad_norm": 0.9422264099121094, "learning_rate": 9.018837043288575e-06, "loss": 2.1091, "num_input_tokens_seen": 3444572160, "step": 6570 }, { "epoch": 0.9877330831476486, "grad_norm": 0.7303385734558105, "learning_rate": 9.011981217863507e-06, "loss": 2.113, "num_input_tokens_seen": 3449815040, "step": 6580 }, { "epoch": 0.9892341972557758, "grad_norm": 0.8498770594596863, "learning_rate": 9.005141003429877e-06, "loss": 2.1057, "num_input_tokens_seen": 3455057920, "step": 6590 }, { "epoch": 0.9907353113639029, "grad_norm": 0.945923924446106, "learning_rate": 8.99831634083271e-06, "loss": 2.0949, "num_input_tokens_seen": 3460300800, "step": 6600 }, { "epoch": 0.9922364254720301, "grad_norm": 0.8327839374542236, "learning_rate": 8.991507171230386e-06, "loss": 2.1105, "num_input_tokens_seen": 3465543680, "step": 6610 }, { "epoch": 0.9937375395801572, "grad_norm": 0.8914210200309753, "learning_rate": 8.98471343609249e-06, "loss": 2.1146, "num_input_tokens_seen": 3470786560, "step": 6620 }, { "epoch": 0.9952386536882842, "grad_norm": 0.8601353168487549, "learning_rate": 8.977935077197712e-06, "loss": 2.1059, "num_input_tokens_seen": 3476029440, "step": 6630 }, { "epoch": 0.9967397677964114, "grad_norm": 0.8136289715766907, "learning_rate": 8.971172036631744e-06, "loss": 2.0737, "num_input_tokens_seen": 3481272320, "step": 6640 }, { "epoch": 0.9982408819045385, "grad_norm": 0.980107843875885, "learning_rate": 8.964424256785211e-06, "loss": 2.0816, "num_input_tokens_seen": 3486515200, "step": 6650 }, { "epoch": 0.9997419960126657, "grad_norm": 0.812223494052887, "learning_rate": 8.9576916803516e-06, "loss": 2.0981, "num_input_tokens_seen": 3491758080, "step": 6660 }, { "epoch": 0.9998921074234783, "num_input_tokens_seen": 3492282368, "step": 6661, "total_flos": 4.5723602603621745e+18, "train_loss": 2.154333936901462, "train_runtime": 99522.0612, "train_samples_per_second": 8.568, "train_steps_per_second": 0.067 } ], "logging_steps": 10, "max_steps": 6661, "num_input_tokens_seen": 3492282368, "num_train_epochs": 1, "save_steps": 200, "total_flos": 4.5723602603621745e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }