{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.288412017167382, "eval_steps": 500, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019074868860276585, "grad_norm": 5.96875, "learning_rate": 1.9987282207808727e-05, "loss": 1.8153, "mean_token_accuracy": 0.5708044067025184, "step": 5 }, { "epoch": 0.003814973772055317, "grad_norm": 5.40625, "learning_rate": 1.997456441561745e-05, "loss": 1.5088, "mean_token_accuracy": 0.6002451926469803, "step": 10 }, { "epoch": 0.005722460658082976, "grad_norm": 5.5625, "learning_rate": 1.9961846623426175e-05, "loss": 1.5696, "mean_token_accuracy": 0.5999565117061139, "step": 15 }, { "epoch": 0.007629947544110634, "grad_norm": 6.0, "learning_rate": 1.9949128831234897e-05, "loss": 1.4962, "mean_token_accuracy": 0.6145946934819222, "step": 20 }, { "epoch": 0.009537434430138292, "grad_norm": 5.0625, "learning_rate": 1.9936411039043622e-05, "loss": 1.4145, "mean_token_accuracy": 0.6282299846410752, "step": 25 }, { "epoch": 0.011444921316165951, "grad_norm": 5.75, "learning_rate": 1.9923693246852348e-05, "loss": 1.4615, "mean_token_accuracy": 0.6196133770048619, "step": 30 }, { "epoch": 0.01335240820219361, "grad_norm": 4.84375, "learning_rate": 1.9910975454661073e-05, "loss": 1.4118, "mean_token_accuracy": 0.6272433631122112, "step": 35 }, { "epoch": 0.015259895088221268, "grad_norm": 5.0625, "learning_rate": 1.98982576624698e-05, "loss": 1.3874, "mean_token_accuracy": 0.6295698702335357, "step": 40 }, { "epoch": 0.017167381974248927, "grad_norm": 5.46875, "learning_rate": 1.988553987027852e-05, "loss": 1.4997, "mean_token_accuracy": 0.6131108298897743, "step": 45 }, { "epoch": 0.019074868860276584, "grad_norm": 4.59375, "learning_rate": 1.9872822078087246e-05, "loss": 1.3386, "mean_token_accuracy": 0.6355413243174552, "step": 50 }, { "epoch": 0.020982355746304245, "grad_norm": 4.46875, "learning_rate": 1.986010428589597e-05, "loss": 1.3453, "mean_token_accuracy": 0.6322750248014927, "step": 55 }, { "epoch": 0.022889842632331903, "grad_norm": 5.0625, "learning_rate": 1.9847386493704694e-05, "loss": 1.4213, "mean_token_accuracy": 0.6266717866063118, "step": 60 }, { "epoch": 0.02479732951835956, "grad_norm": 5.5625, "learning_rate": 1.983466870151342e-05, "loss": 1.3913, "mean_token_accuracy": 0.6290012784302235, "step": 65 }, { "epoch": 0.02670481640438722, "grad_norm": 5.0625, "learning_rate": 1.982195090932214e-05, "loss": 1.3871, "mean_token_accuracy": 0.627622963488102, "step": 70 }, { "epoch": 0.02861230329041488, "grad_norm": 4.84375, "learning_rate": 1.9809233117130867e-05, "loss": 1.3232, "mean_token_accuracy": 0.6432097807526589, "step": 75 }, { "epoch": 0.030519790176442536, "grad_norm": 5.96875, "learning_rate": 1.9796515324939593e-05, "loss": 1.3287, "mean_token_accuracy": 0.6405125185847282, "step": 80 }, { "epoch": 0.03242727706247019, "grad_norm": 4.78125, "learning_rate": 1.9783797532748318e-05, "loss": 1.4044, "mean_token_accuracy": 0.6292843967676163, "step": 85 }, { "epoch": 0.034334763948497854, "grad_norm": 4.6875, "learning_rate": 1.977107974055704e-05, "loss": 1.3368, "mean_token_accuracy": 0.6363929770886898, "step": 90 }, { "epoch": 0.036242250834525515, "grad_norm": 4.6875, "learning_rate": 1.9758361948365766e-05, "loss": 1.3156, "mean_token_accuracy": 0.6389718689024448, "step": 95 }, { "epoch": 0.03814973772055317, "grad_norm": 4.84375, "learning_rate": 1.974564415617449e-05, "loss": 1.3213, "mean_token_accuracy": 0.6437985837459564, "step": 100 }, { "epoch": 0.04005722460658083, "grad_norm": 5.34375, "learning_rate": 1.9732926363983213e-05, "loss": 1.3403, "mean_token_accuracy": 0.6304401338100434, "step": 105 }, { "epoch": 0.04196471149260849, "grad_norm": 5.21875, "learning_rate": 1.972020857179194e-05, "loss": 1.4318, "mean_token_accuracy": 0.6314706668257714, "step": 110 }, { "epoch": 0.043872198378636144, "grad_norm": 5.625, "learning_rate": 1.970749077960066e-05, "loss": 1.3123, "mean_token_accuracy": 0.6482092589139938, "step": 115 }, { "epoch": 0.045779685264663805, "grad_norm": 5.03125, "learning_rate": 1.9694772987409387e-05, "loss": 1.2722, "mean_token_accuracy": 0.6475286811590195, "step": 120 }, { "epoch": 0.047687172150691466, "grad_norm": 4.78125, "learning_rate": 1.9682055195218112e-05, "loss": 1.3184, "mean_token_accuracy": 0.636443517357111, "step": 125 }, { "epoch": 0.04959465903671912, "grad_norm": 4.75, "learning_rate": 1.9669337403026834e-05, "loss": 1.3604, "mean_token_accuracy": 0.6383991658687591, "step": 130 }, { "epoch": 0.05150214592274678, "grad_norm": 6.3125, "learning_rate": 1.965661961083556e-05, "loss": 1.3624, "mean_token_accuracy": 0.6359823271632195, "step": 135 }, { "epoch": 0.05340963280877444, "grad_norm": 4.9375, "learning_rate": 1.9643901818644285e-05, "loss": 1.3622, "mean_token_accuracy": 0.6360629022121429, "step": 140 }, { "epoch": 0.055317119694802096, "grad_norm": 5.25, "learning_rate": 1.963118402645301e-05, "loss": 1.3007, "mean_token_accuracy": 0.6486773908138275, "step": 145 }, { "epoch": 0.05722460658082976, "grad_norm": 4.625, "learning_rate": 1.9618466234261733e-05, "loss": 1.259, "mean_token_accuracy": 0.6483664289116859, "step": 150 }, { "epoch": 0.05913209346685742, "grad_norm": 4.4375, "learning_rate": 1.9605748442070458e-05, "loss": 1.2701, "mean_token_accuracy": 0.6497010916471482, "step": 155 }, { "epoch": 0.06103958035288507, "grad_norm": 5.0625, "learning_rate": 1.9593030649879184e-05, "loss": 1.2925, "mean_token_accuracy": 0.645444954931736, "step": 160 }, { "epoch": 0.06294706723891273, "grad_norm": 4.46875, "learning_rate": 1.9580312857687906e-05, "loss": 1.3194, "mean_token_accuracy": 0.6443428501486779, "step": 165 }, { "epoch": 0.06485455412494039, "grad_norm": 5.15625, "learning_rate": 1.956759506549663e-05, "loss": 1.2786, "mean_token_accuracy": 0.6503037214279175, "step": 170 }, { "epoch": 0.06676204101096805, "grad_norm": 5.65625, "learning_rate": 1.9554877273305353e-05, "loss": 1.3482, "mean_token_accuracy": 0.6470891699194908, "step": 175 }, { "epoch": 0.06866952789699571, "grad_norm": 4.21875, "learning_rate": 1.954215948111408e-05, "loss": 1.3022, "mean_token_accuracy": 0.6400970220565796, "step": 180 }, { "epoch": 0.07057701478302336, "grad_norm": 5.03125, "learning_rate": 1.9529441688922804e-05, "loss": 1.2821, "mean_token_accuracy": 0.6461471430957317, "step": 185 }, { "epoch": 0.07248450166905103, "grad_norm": 5.0, "learning_rate": 1.951672389673153e-05, "loss": 1.2824, "mean_token_accuracy": 0.649085208773613, "step": 190 }, { "epoch": 0.07439198855507868, "grad_norm": 4.5, "learning_rate": 1.9504006104540255e-05, "loss": 1.2508, "mean_token_accuracy": 0.6536262959241868, "step": 195 }, { "epoch": 0.07629947544110634, "grad_norm": 4.9375, "learning_rate": 1.9491288312348978e-05, "loss": 1.3044, "mean_token_accuracy": 0.6513546489179134, "step": 200 }, { "epoch": 0.078206962327134, "grad_norm": 5.125, "learning_rate": 1.9478570520157703e-05, "loss": 1.2265, "mean_token_accuracy": 0.6645126178860664, "step": 205 }, { "epoch": 0.08011444921316166, "grad_norm": 5.875, "learning_rate": 1.9465852727966425e-05, "loss": 1.3156, "mean_token_accuracy": 0.6494258716702461, "step": 210 }, { "epoch": 0.08202193609918931, "grad_norm": 5.0, "learning_rate": 1.945313493577515e-05, "loss": 1.1655, "mean_token_accuracy": 0.6783517330884934, "step": 215 }, { "epoch": 0.08392942298521698, "grad_norm": 5.4375, "learning_rate": 1.9440417143583876e-05, "loss": 1.2306, "mean_token_accuracy": 0.6659620314836502, "step": 220 }, { "epoch": 0.08583690987124463, "grad_norm": 4.40625, "learning_rate": 1.9427699351392598e-05, "loss": 1.3798, "mean_token_accuracy": 0.6320724219083786, "step": 225 }, { "epoch": 0.08774439675727229, "grad_norm": 4.75, "learning_rate": 1.9414981559201324e-05, "loss": 1.2437, "mean_token_accuracy": 0.6454930439591408, "step": 230 }, { "epoch": 0.08965188364329996, "grad_norm": 5.90625, "learning_rate": 1.940226376701005e-05, "loss": 1.2523, "mean_token_accuracy": 0.6506562553346157, "step": 235 }, { "epoch": 0.09155937052932761, "grad_norm": 4.75, "learning_rate": 1.938954597481877e-05, "loss": 1.2268, "mean_token_accuracy": 0.6620514318346977, "step": 240 }, { "epoch": 0.09346685741535526, "grad_norm": 4.53125, "learning_rate": 1.9376828182627497e-05, "loss": 1.2215, "mean_token_accuracy": 0.6610720351338386, "step": 245 }, { "epoch": 0.09537434430138293, "grad_norm": 5.15625, "learning_rate": 1.9364110390436222e-05, "loss": 1.2494, "mean_token_accuracy": 0.653127409517765, "step": 250 }, { "epoch": 0.09728183118741059, "grad_norm": 5.1875, "learning_rate": 1.9351392598244948e-05, "loss": 1.3126, "mean_token_accuracy": 0.6540979892015457, "step": 255 }, { "epoch": 0.09918931807343824, "grad_norm": 5.15625, "learning_rate": 1.933867480605367e-05, "loss": 1.2498, "mean_token_accuracy": 0.6507880866527558, "step": 260 }, { "epoch": 0.10109680495946591, "grad_norm": 5.8125, "learning_rate": 1.9325957013862396e-05, "loss": 1.3294, "mean_token_accuracy": 0.6403125211596489, "step": 265 }, { "epoch": 0.10300429184549356, "grad_norm": 6.09375, "learning_rate": 1.9313239221671118e-05, "loss": 1.3099, "mean_token_accuracy": 0.6520273745059967, "step": 270 }, { "epoch": 0.10491177873152122, "grad_norm": 5.40625, "learning_rate": 1.9300521429479843e-05, "loss": 1.2857, "mean_token_accuracy": 0.6506179749965668, "step": 275 }, { "epoch": 0.10681926561754888, "grad_norm": 5.71875, "learning_rate": 1.928780363728857e-05, "loss": 1.3496, "mean_token_accuracy": 0.6408744707703591, "step": 280 }, { "epoch": 0.10872675250357654, "grad_norm": 5.0625, "learning_rate": 1.927508584509729e-05, "loss": 1.2012, "mean_token_accuracy": 0.6639982044696808, "step": 285 }, { "epoch": 0.11063423938960419, "grad_norm": 5.5, "learning_rate": 1.9262368052906016e-05, "loss": 1.2449, "mean_token_accuracy": 0.6594835132360458, "step": 290 }, { "epoch": 0.11254172627563186, "grad_norm": 4.125, "learning_rate": 1.9249650260714742e-05, "loss": 1.1314, "mean_token_accuracy": 0.6807294517755509, "step": 295 }, { "epoch": 0.11444921316165951, "grad_norm": 5.25, "learning_rate": 1.9236932468523467e-05, "loss": 1.2528, "mean_token_accuracy": 0.6543513402342797, "step": 300 }, { "epoch": 0.11635670004768717, "grad_norm": 4.5, "learning_rate": 1.9224214676332193e-05, "loss": 1.201, "mean_token_accuracy": 0.6674724757671356, "step": 305 }, { "epoch": 0.11826418693371483, "grad_norm": 5.09375, "learning_rate": 1.9211496884140915e-05, "loss": 1.2232, "mean_token_accuracy": 0.6661024749279022, "step": 310 }, { "epoch": 0.12017167381974249, "grad_norm": 4.65625, "learning_rate": 1.919877909194964e-05, "loss": 1.2015, "mean_token_accuracy": 0.6703806266188621, "step": 315 }, { "epoch": 0.12207916070577014, "grad_norm": 4.90625, "learning_rate": 1.9186061299758362e-05, "loss": 1.265, "mean_token_accuracy": 0.6571035169064998, "step": 320 }, { "epoch": 0.12398664759179781, "grad_norm": 4.5625, "learning_rate": 1.9173343507567088e-05, "loss": 1.1817, "mean_token_accuracy": 0.6720203042030335, "step": 325 }, { "epoch": 0.12589413447782546, "grad_norm": 5.40625, "learning_rate": 1.916062571537581e-05, "loss": 1.3305, "mean_token_accuracy": 0.6445886738598346, "step": 330 }, { "epoch": 0.12780162136385312, "grad_norm": 5.34375, "learning_rate": 1.9147907923184536e-05, "loss": 1.2302, "mean_token_accuracy": 0.6632393077015877, "step": 335 }, { "epoch": 0.12970910824988077, "grad_norm": 4.40625, "learning_rate": 1.913519013099326e-05, "loss": 1.2463, "mean_token_accuracy": 0.6618235319852829, "step": 340 }, { "epoch": 0.13161659513590845, "grad_norm": 4.84375, "learning_rate": 1.9122472338801987e-05, "loss": 1.1995, "mean_token_accuracy": 0.6734424993395806, "step": 345 }, { "epoch": 0.1335240820219361, "grad_norm": 6.09375, "learning_rate": 1.9109754546610712e-05, "loss": 1.1695, "mean_token_accuracy": 0.6744470730423927, "step": 350 }, { "epoch": 0.13543156890796376, "grad_norm": 4.4375, "learning_rate": 1.9097036754419434e-05, "loss": 1.235, "mean_token_accuracy": 0.6613016352057457, "step": 355 }, { "epoch": 0.13733905579399142, "grad_norm": 5.15625, "learning_rate": 1.908431896222816e-05, "loss": 1.1908, "mean_token_accuracy": 0.6631178431212902, "step": 360 }, { "epoch": 0.13924654268001907, "grad_norm": 6.03125, "learning_rate": 1.9071601170036885e-05, "loss": 1.2538, "mean_token_accuracy": 0.6583275809884072, "step": 365 }, { "epoch": 0.14115402956604672, "grad_norm": 4.875, "learning_rate": 1.9058883377845607e-05, "loss": 1.1481, "mean_token_accuracy": 0.6743377096951008, "step": 370 }, { "epoch": 0.1430615164520744, "grad_norm": 4.65625, "learning_rate": 1.9046165585654333e-05, "loss": 1.1758, "mean_token_accuracy": 0.6755423441529274, "step": 375 }, { "epoch": 0.14496900333810206, "grad_norm": 4.21875, "learning_rate": 1.9033447793463055e-05, "loss": 1.1526, "mean_token_accuracy": 0.6709615409374237, "step": 380 }, { "epoch": 0.1468764902241297, "grad_norm": 5.34375, "learning_rate": 1.902073000127178e-05, "loss": 1.1614, "mean_token_accuracy": 0.6803930580615998, "step": 385 }, { "epoch": 0.14878397711015737, "grad_norm": 5.15625, "learning_rate": 1.9008012209080503e-05, "loss": 1.1651, "mean_token_accuracy": 0.6757953256368637, "step": 390 }, { "epoch": 0.15069146399618502, "grad_norm": 5.65625, "learning_rate": 1.8995294416889228e-05, "loss": 1.2219, "mean_token_accuracy": 0.6725533396005631, "step": 395 }, { "epoch": 0.15259895088221268, "grad_norm": 4.90625, "learning_rate": 1.8982576624697954e-05, "loss": 1.2598, "mean_token_accuracy": 0.6562705941498279, "step": 400 }, { "epoch": 0.15450643776824036, "grad_norm": 4.59375, "learning_rate": 1.896985883250668e-05, "loss": 1.1586, "mean_token_accuracy": 0.6731373474001885, "step": 405 }, { "epoch": 0.156413924654268, "grad_norm": 5.65625, "learning_rate": 1.8957141040315405e-05, "loss": 1.2544, "mean_token_accuracy": 0.6664155155420304, "step": 410 }, { "epoch": 0.15832141154029566, "grad_norm": 5.09375, "learning_rate": 1.8944423248124127e-05, "loss": 1.2045, "mean_token_accuracy": 0.6721395581960679, "step": 415 }, { "epoch": 0.16022889842632332, "grad_norm": 4.65625, "learning_rate": 1.8931705455932852e-05, "loss": 1.1623, "mean_token_accuracy": 0.6825975701212883, "step": 420 }, { "epoch": 0.16213638531235097, "grad_norm": 5.25, "learning_rate": 1.8918987663741578e-05, "loss": 1.2345, "mean_token_accuracy": 0.655138723552227, "step": 425 }, { "epoch": 0.16404387219837863, "grad_norm": 4.46875, "learning_rate": 1.89062698715503e-05, "loss": 1.1583, "mean_token_accuracy": 0.6771410465240478, "step": 430 }, { "epoch": 0.1659513590844063, "grad_norm": 4.34375, "learning_rate": 1.8893552079359025e-05, "loss": 1.2226, "mean_token_accuracy": 0.6566869288682937, "step": 435 }, { "epoch": 0.16785884597043396, "grad_norm": 4.8125, "learning_rate": 1.8880834287167747e-05, "loss": 1.2531, "mean_token_accuracy": 0.6537335075438022, "step": 440 }, { "epoch": 0.16976633285646162, "grad_norm": 5.1875, "learning_rate": 1.8868116494976473e-05, "loss": 1.1736, "mean_token_accuracy": 0.6741863384842872, "step": 445 }, { "epoch": 0.17167381974248927, "grad_norm": 5.0, "learning_rate": 1.88553987027852e-05, "loss": 1.1912, "mean_token_accuracy": 0.6718283355236053, "step": 450 }, { "epoch": 0.17358130662851692, "grad_norm": 5.03125, "learning_rate": 1.8842680910593924e-05, "loss": 1.2016, "mean_token_accuracy": 0.670920492708683, "step": 455 }, { "epoch": 0.17548879351454458, "grad_norm": 5.375, "learning_rate": 1.882996311840265e-05, "loss": 1.1818, "mean_token_accuracy": 0.6705762408673763, "step": 460 }, { "epoch": 0.17739628040057226, "grad_norm": 5.9375, "learning_rate": 1.881724532621137e-05, "loss": 1.3159, "mean_token_accuracy": 0.6546284504234791, "step": 465 }, { "epoch": 0.1793037672865999, "grad_norm": 4.8125, "learning_rate": 1.8804527534020097e-05, "loss": 1.2004, "mean_token_accuracy": 0.6597321718931198, "step": 470 }, { "epoch": 0.18121125417262757, "grad_norm": 6.9375, "learning_rate": 1.879180974182882e-05, "loss": 1.1672, "mean_token_accuracy": 0.6835288152098655, "step": 475 }, { "epoch": 0.18311874105865522, "grad_norm": 5.0625, "learning_rate": 1.8779091949637545e-05, "loss": 1.2113, "mean_token_accuracy": 0.6654989182949066, "step": 480 }, { "epoch": 0.18502622794468288, "grad_norm": 5.09375, "learning_rate": 1.876637415744627e-05, "loss": 1.076, "mean_token_accuracy": 0.6919501051306725, "step": 485 }, { "epoch": 0.18693371483071053, "grad_norm": 4.96875, "learning_rate": 1.8753656365254992e-05, "loss": 1.1946, "mean_token_accuracy": 0.671660166978836, "step": 490 }, { "epoch": 0.1888412017167382, "grad_norm": 5.875, "learning_rate": 1.8740938573063718e-05, "loss": 1.2334, "mean_token_accuracy": 0.6634502306580543, "step": 495 }, { "epoch": 0.19074868860276586, "grad_norm": 5.34375, "learning_rate": 1.872822078087244e-05, "loss": 1.1412, "mean_token_accuracy": 0.6843819186091423, "step": 500 }, { "epoch": 0.19265617548879352, "grad_norm": 4.875, "learning_rate": 1.8715502988681165e-05, "loss": 1.1017, "mean_token_accuracy": 0.6859075799584389, "step": 505 }, { "epoch": 0.19456366237482117, "grad_norm": 5.0, "learning_rate": 1.870278519648989e-05, "loss": 1.099, "mean_token_accuracy": 0.6794175133109093, "step": 510 }, { "epoch": 0.19647114926084883, "grad_norm": 5.0, "learning_rate": 1.8690067404298616e-05, "loss": 1.162, "mean_token_accuracy": 0.6769187614321709, "step": 515 }, { "epoch": 0.19837863614687648, "grad_norm": 4.21875, "learning_rate": 1.8677349612107342e-05, "loss": 1.1241, "mean_token_accuracy": 0.6816845044493676, "step": 520 }, { "epoch": 0.20028612303290416, "grad_norm": 7.09375, "learning_rate": 1.8664631819916064e-05, "loss": 1.1331, "mean_token_accuracy": 0.6768893599510193, "step": 525 }, { "epoch": 0.20219360991893182, "grad_norm": 4.25, "learning_rate": 1.865191402772479e-05, "loss": 1.2657, "mean_token_accuracy": 0.6479664385318756, "step": 530 }, { "epoch": 0.20410109680495947, "grad_norm": 4.5, "learning_rate": 1.863919623553351e-05, "loss": 1.2343, "mean_token_accuracy": 0.6734806634485722, "step": 535 }, { "epoch": 0.20600858369098712, "grad_norm": 12.3125, "learning_rate": 1.8626478443342237e-05, "loss": 1.1263, "mean_token_accuracy": 0.6808793410658837, "step": 540 }, { "epoch": 0.20791607057701478, "grad_norm": 5.9375, "learning_rate": 1.8613760651150963e-05, "loss": 1.0617, "mean_token_accuracy": 0.6977316424250603, "step": 545 }, { "epoch": 0.20982355746304243, "grad_norm": 5.96875, "learning_rate": 1.8601042858959685e-05, "loss": 1.1229, "mean_token_accuracy": 0.6831489652395248, "step": 550 }, { "epoch": 0.2117310443490701, "grad_norm": 4.84375, "learning_rate": 1.858832506676841e-05, "loss": 1.1216, "mean_token_accuracy": 0.6838791735470295, "step": 555 }, { "epoch": 0.21363853123509777, "grad_norm": 4.78125, "learning_rate": 1.8575607274577136e-05, "loss": 1.2059, "mean_token_accuracy": 0.6669023260474205, "step": 560 }, { "epoch": 0.21554601812112542, "grad_norm": 4.875, "learning_rate": 1.856288948238586e-05, "loss": 1.1515, "mean_token_accuracy": 0.6748893111944199, "step": 565 }, { "epoch": 0.21745350500715308, "grad_norm": 4.21875, "learning_rate": 1.8550171690194583e-05, "loss": 1.063, "mean_token_accuracy": 0.7043292924761773, "step": 570 }, { "epoch": 0.21936099189318073, "grad_norm": 5.34375, "learning_rate": 1.853745389800331e-05, "loss": 1.1071, "mean_token_accuracy": 0.6816813468933105, "step": 575 }, { "epoch": 0.22126847877920838, "grad_norm": 7.25, "learning_rate": 1.8524736105812034e-05, "loss": 1.1493, "mean_token_accuracy": 0.6842595711350441, "step": 580 }, { "epoch": 0.22317596566523606, "grad_norm": 6.3125, "learning_rate": 1.8512018313620756e-05, "loss": 1.2036, "mean_token_accuracy": 0.6606533020734787, "step": 585 }, { "epoch": 0.22508345255126372, "grad_norm": 5.1875, "learning_rate": 1.8499300521429482e-05, "loss": 1.0329, "mean_token_accuracy": 0.7044132232666016, "step": 590 }, { "epoch": 0.22699093943729137, "grad_norm": 4.90625, "learning_rate": 1.8486582729238204e-05, "loss": 1.183, "mean_token_accuracy": 0.6742020189762116, "step": 595 }, { "epoch": 0.22889842632331903, "grad_norm": 5.375, "learning_rate": 1.847386493704693e-05, "loss": 1.1212, "mean_token_accuracy": 0.6900173485279083, "step": 600 }, { "epoch": 0.23080591320934668, "grad_norm": 5.5, "learning_rate": 1.8461147144855655e-05, "loss": 1.1654, "mean_token_accuracy": 0.6756381630897522, "step": 605 }, { "epoch": 0.23271340009537433, "grad_norm": 4.8125, "learning_rate": 1.8448429352664377e-05, "loss": 1.1592, "mean_token_accuracy": 0.6697646602988243, "step": 610 }, { "epoch": 0.23462088698140202, "grad_norm": 4.84375, "learning_rate": 1.8435711560473103e-05, "loss": 1.0309, "mean_token_accuracy": 0.6977804109454155, "step": 615 }, { "epoch": 0.23652837386742967, "grad_norm": 5.78125, "learning_rate": 1.8422993768281828e-05, "loss": 1.1705, "mean_token_accuracy": 0.6789061531424523, "step": 620 }, { "epoch": 0.23843586075345732, "grad_norm": 4.84375, "learning_rate": 1.8410275976090554e-05, "loss": 1.1715, "mean_token_accuracy": 0.6736443802714348, "step": 625 }, { "epoch": 0.24034334763948498, "grad_norm": 4.6875, "learning_rate": 1.8397558183899276e-05, "loss": 1.1535, "mean_token_accuracy": 0.6837850168347359, "step": 630 }, { "epoch": 0.24225083452551263, "grad_norm": 5.625, "learning_rate": 1.8384840391708e-05, "loss": 1.1403, "mean_token_accuracy": 0.6860886000096797, "step": 635 }, { "epoch": 0.24415832141154029, "grad_norm": 5.78125, "learning_rate": 1.8372122599516727e-05, "loss": 1.1777, "mean_token_accuracy": 0.6730990558862686, "step": 640 }, { "epoch": 0.24606580829756797, "grad_norm": 4.96875, "learning_rate": 1.835940480732545e-05, "loss": 1.2174, "mean_token_accuracy": 0.6607669338583946, "step": 645 }, { "epoch": 0.24797329518359562, "grad_norm": 5.03125, "learning_rate": 1.8346687015134174e-05, "loss": 1.1306, "mean_token_accuracy": 0.6789732642471791, "step": 650 }, { "epoch": 0.24988078206962328, "grad_norm": 4.5, "learning_rate": 1.8333969222942896e-05, "loss": 1.0958, "mean_token_accuracy": 0.6940956100821495, "step": 655 }, { "epoch": 0.25178826895565093, "grad_norm": 4.53125, "learning_rate": 1.8321251430751622e-05, "loss": 1.1623, "mean_token_accuracy": 0.6806924149394036, "step": 660 }, { "epoch": 0.2536957558416786, "grad_norm": 5.59375, "learning_rate": 1.8308533638560347e-05, "loss": 1.1438, "mean_token_accuracy": 0.6787467435002327, "step": 665 }, { "epoch": 0.25560324272770624, "grad_norm": 4.65625, "learning_rate": 1.8295815846369073e-05, "loss": 1.2902, "mean_token_accuracy": 0.655436672270298, "step": 670 }, { "epoch": 0.2575107296137339, "grad_norm": 4.46875, "learning_rate": 1.82830980541778e-05, "loss": 1.1499, "mean_token_accuracy": 0.6697306737303734, "step": 675 }, { "epoch": 0.25941821649976154, "grad_norm": 5.78125, "learning_rate": 1.827038026198652e-05, "loss": 1.1432, "mean_token_accuracy": 0.6788436755537987, "step": 680 }, { "epoch": 0.2613257033857892, "grad_norm": 6.40625, "learning_rate": 1.8257662469795246e-05, "loss": 1.1457, "mean_token_accuracy": 0.6794642567634582, "step": 685 }, { "epoch": 0.2632331902718169, "grad_norm": 4.5625, "learning_rate": 1.8244944677603968e-05, "loss": 1.1842, "mean_token_accuracy": 0.6739339649677276, "step": 690 }, { "epoch": 0.26514067715784456, "grad_norm": 5.375, "learning_rate": 1.8232226885412694e-05, "loss": 1.1979, "mean_token_accuracy": 0.671060286462307, "step": 695 }, { "epoch": 0.2670481640438722, "grad_norm": 5.9375, "learning_rate": 1.821950909322142e-05, "loss": 1.1926, "mean_token_accuracy": 0.68006531894207, "step": 700 }, { "epoch": 0.26895565092989987, "grad_norm": 4.5, "learning_rate": 1.820679130103014e-05, "loss": 1.1425, "mean_token_accuracy": 0.686011116206646, "step": 705 }, { "epoch": 0.2708631378159275, "grad_norm": 4.125, "learning_rate": 1.8194073508838867e-05, "loss": 1.1241, "mean_token_accuracy": 0.6813527546823025, "step": 710 }, { "epoch": 0.2727706247019552, "grad_norm": 4.59375, "learning_rate": 1.8181355716647592e-05, "loss": 1.0763, "mean_token_accuracy": 0.6978838533163071, "step": 715 }, { "epoch": 0.27467811158798283, "grad_norm": 4.71875, "learning_rate": 1.8168637924456314e-05, "loss": 1.1214, "mean_token_accuracy": 0.6879947543144226, "step": 720 }, { "epoch": 0.2765855984740105, "grad_norm": 4.65625, "learning_rate": 1.815592013226504e-05, "loss": 1.145, "mean_token_accuracy": 0.6740229934453964, "step": 725 }, { "epoch": 0.27849308536003814, "grad_norm": 5.0, "learning_rate": 1.8143202340073765e-05, "loss": 1.0915, "mean_token_accuracy": 0.6966636836528778, "step": 730 }, { "epoch": 0.2804005722460658, "grad_norm": 5.46875, "learning_rate": 1.813048454788249e-05, "loss": 1.1534, "mean_token_accuracy": 0.6881409972906113, "step": 735 }, { "epoch": 0.28230805913209345, "grad_norm": 5.21875, "learning_rate": 1.8117766755691213e-05, "loss": 1.1306, "mean_token_accuracy": 0.6856201700866222, "step": 740 }, { "epoch": 0.2842155460181211, "grad_norm": 4.3125, "learning_rate": 1.810504896349994e-05, "loss": 1.1514, "mean_token_accuracy": 0.6833734557032585, "step": 745 }, { "epoch": 0.2861230329041488, "grad_norm": 5.1875, "learning_rate": 1.809233117130866e-05, "loss": 1.1379, "mean_token_accuracy": 0.6824650421738625, "step": 750 }, { "epoch": 0.28803051979017646, "grad_norm": 4.8125, "learning_rate": 1.8079613379117386e-05, "loss": 1.1685, "mean_token_accuracy": 0.6773202955722809, "step": 755 }, { "epoch": 0.2899380066762041, "grad_norm": 5.09375, "learning_rate": 1.806689558692611e-05, "loss": 1.1683, "mean_token_accuracy": 0.6770651459693908, "step": 760 }, { "epoch": 0.2918454935622318, "grad_norm": 4.875, "learning_rate": 1.8054177794734834e-05, "loss": 1.0853, "mean_token_accuracy": 0.6914259925484657, "step": 765 }, { "epoch": 0.2937529804482594, "grad_norm": 5.0, "learning_rate": 1.804146000254356e-05, "loss": 1.1336, "mean_token_accuracy": 0.6820079162716866, "step": 770 }, { "epoch": 0.2956604673342871, "grad_norm": 4.09375, "learning_rate": 1.8028742210352285e-05, "loss": 1.1233, "mean_token_accuracy": 0.6821878552436829, "step": 775 }, { "epoch": 0.29756795422031473, "grad_norm": 4.71875, "learning_rate": 1.801602441816101e-05, "loss": 1.1204, "mean_token_accuracy": 0.692288200557232, "step": 780 }, { "epoch": 0.2994754411063424, "grad_norm": 4.9375, "learning_rate": 1.8003306625969736e-05, "loss": 1.2294, "mean_token_accuracy": 0.673324004560709, "step": 785 }, { "epoch": 0.30138292799237004, "grad_norm": 4.9375, "learning_rate": 1.7990588833778458e-05, "loss": 1.2059, "mean_token_accuracy": 0.6697646111249924, "step": 790 }, { "epoch": 0.3032904148783977, "grad_norm": 4.5625, "learning_rate": 1.7977871041587183e-05, "loss": 1.1745, "mean_token_accuracy": 0.6823094062507152, "step": 795 }, { "epoch": 0.30519790176442535, "grad_norm": 4.625, "learning_rate": 1.7965153249395905e-05, "loss": 1.1519, "mean_token_accuracy": 0.6778446674346924, "step": 800 }, { "epoch": 0.307105388650453, "grad_norm": 4.53125, "learning_rate": 1.795243545720463e-05, "loss": 1.0918, "mean_token_accuracy": 0.6978467896580696, "step": 805 }, { "epoch": 0.3090128755364807, "grad_norm": 3.921875, "learning_rate": 1.7939717665013353e-05, "loss": 1.1631, "mean_token_accuracy": 0.6788024313747882, "step": 810 }, { "epoch": 0.31092036242250837, "grad_norm": 4.78125, "learning_rate": 1.792699987282208e-05, "loss": 1.1259, "mean_token_accuracy": 0.6834620237350464, "step": 815 }, { "epoch": 0.312827849308536, "grad_norm": 5.21875, "learning_rate": 1.7914282080630804e-05, "loss": 1.0979, "mean_token_accuracy": 0.6804828964173794, "step": 820 }, { "epoch": 0.3147353361945637, "grad_norm": 4.875, "learning_rate": 1.790156428843953e-05, "loss": 1.204, "mean_token_accuracy": 0.6713850289583206, "step": 825 }, { "epoch": 0.31664282308059133, "grad_norm": 4.6875, "learning_rate": 1.788884649624825e-05, "loss": 1.14, "mean_token_accuracy": 0.6814699381589889, "step": 830 }, { "epoch": 0.318550309966619, "grad_norm": 4.3125, "learning_rate": 1.7876128704056977e-05, "loss": 1.1383, "mean_token_accuracy": 0.672491405904293, "step": 835 }, { "epoch": 0.32045779685264664, "grad_norm": 4.875, "learning_rate": 1.7863410911865703e-05, "loss": 1.1993, "mean_token_accuracy": 0.662742418050766, "step": 840 }, { "epoch": 0.3223652837386743, "grad_norm": 4.53125, "learning_rate": 1.7850693119674428e-05, "loss": 1.0142, "mean_token_accuracy": 0.7142936125397682, "step": 845 }, { "epoch": 0.32427277062470194, "grad_norm": 4.6875, "learning_rate": 1.783797532748315e-05, "loss": 1.1083, "mean_token_accuracy": 0.6907312035560608, "step": 850 }, { "epoch": 0.3261802575107296, "grad_norm": 4.40625, "learning_rate": 1.7825257535291876e-05, "loss": 1.0248, "mean_token_accuracy": 0.7127422258257866, "step": 855 }, { "epoch": 0.32808774439675725, "grad_norm": 4.375, "learning_rate": 1.7812539743100598e-05, "loss": 1.1234, "mean_token_accuracy": 0.6854491457343102, "step": 860 }, { "epoch": 0.3299952312827849, "grad_norm": 5.0625, "learning_rate": 1.7799821950909323e-05, "loss": 1.1691, "mean_token_accuracy": 0.6794403240084648, "step": 865 }, { "epoch": 0.3319027181688126, "grad_norm": 5.5625, "learning_rate": 1.7787104158718045e-05, "loss": 1.1286, "mean_token_accuracy": 0.6781167238950729, "step": 870 }, { "epoch": 0.33381020505484027, "grad_norm": 4.8125, "learning_rate": 1.777438636652677e-05, "loss": 1.1314, "mean_token_accuracy": 0.6972567990422249, "step": 875 }, { "epoch": 0.3357176919408679, "grad_norm": 5.34375, "learning_rate": 1.7761668574335496e-05, "loss": 1.0309, "mean_token_accuracy": 0.7109489843249321, "step": 880 }, { "epoch": 0.3376251788268956, "grad_norm": 5.53125, "learning_rate": 1.7748950782144222e-05, "loss": 1.122, "mean_token_accuracy": 0.6877056941390037, "step": 885 }, { "epoch": 0.33953266571292323, "grad_norm": 4.53125, "learning_rate": 1.7736232989952947e-05, "loss": 1.0609, "mean_token_accuracy": 0.7033521652221679, "step": 890 }, { "epoch": 0.3414401525989509, "grad_norm": 4.125, "learning_rate": 1.772351519776167e-05, "loss": 1.0418, "mean_token_accuracy": 0.7021679773926734, "step": 895 }, { "epoch": 0.34334763948497854, "grad_norm": 4.4375, "learning_rate": 1.7710797405570395e-05, "loss": 1.1233, "mean_token_accuracy": 0.696443286538124, "step": 900 }, { "epoch": 0.3452551263710062, "grad_norm": 4.53125, "learning_rate": 1.7698079613379117e-05, "loss": 1.043, "mean_token_accuracy": 0.7000239789485931, "step": 905 }, { "epoch": 0.34716261325703385, "grad_norm": 4.75, "learning_rate": 1.7685361821187843e-05, "loss": 1.0404, "mean_token_accuracy": 0.7076486960053444, "step": 910 }, { "epoch": 0.3490701001430615, "grad_norm": 4.78125, "learning_rate": 1.7672644028996568e-05, "loss": 1.2165, "mean_token_accuracy": 0.6694424465298653, "step": 915 }, { "epoch": 0.35097758702908916, "grad_norm": 5.15625, "learning_rate": 1.765992623680529e-05, "loss": 1.1061, "mean_token_accuracy": 0.689212466776371, "step": 920 }, { "epoch": 0.3528850739151168, "grad_norm": 4.9375, "learning_rate": 1.7647208444614016e-05, "loss": 1.1573, "mean_token_accuracy": 0.6761819615960121, "step": 925 }, { "epoch": 0.3547925608011445, "grad_norm": 4.96875, "learning_rate": 1.763449065242274e-05, "loss": 1.0511, "mean_token_accuracy": 0.7017502933740616, "step": 930 }, { "epoch": 0.3567000476871722, "grad_norm": 5.0625, "learning_rate": 1.7621772860231467e-05, "loss": 1.176, "mean_token_accuracy": 0.6732675984501839, "step": 935 }, { "epoch": 0.3586075345731998, "grad_norm": 4.1875, "learning_rate": 1.760905506804019e-05, "loss": 1.0912, "mean_token_accuracy": 0.6839179575443268, "step": 940 }, { "epoch": 0.3605150214592275, "grad_norm": 5.53125, "learning_rate": 1.7596337275848914e-05, "loss": 1.1048, "mean_token_accuracy": 0.6849195197224617, "step": 945 }, { "epoch": 0.36242250834525513, "grad_norm": 4.5625, "learning_rate": 1.758361948365764e-05, "loss": 1.0162, "mean_token_accuracy": 0.6860754758119583, "step": 950 }, { "epoch": 0.3643299952312828, "grad_norm": 4.59375, "learning_rate": 1.7570901691466362e-05, "loss": 1.0933, "mean_token_accuracy": 0.6905333071947097, "step": 955 }, { "epoch": 0.36623748211731044, "grad_norm": 4.9375, "learning_rate": 1.7558183899275088e-05, "loss": 1.0952, "mean_token_accuracy": 0.6910875916481019, "step": 960 }, { "epoch": 0.3681449690033381, "grad_norm": 5.25, "learning_rate": 1.754546610708381e-05, "loss": 1.2019, "mean_token_accuracy": 0.6728987120091915, "step": 965 }, { "epoch": 0.37005245588936575, "grad_norm": 3.890625, "learning_rate": 1.7532748314892535e-05, "loss": 1.0999, "mean_token_accuracy": 0.6894315019249916, "step": 970 }, { "epoch": 0.3719599427753934, "grad_norm": 4.28125, "learning_rate": 1.752003052270126e-05, "loss": 1.0943, "mean_token_accuracy": 0.6812393218278885, "step": 975 }, { "epoch": 0.37386742966142106, "grad_norm": 4.5, "learning_rate": 1.7507312730509983e-05, "loss": 1.1383, "mean_token_accuracy": 0.6867758512496949, "step": 980 }, { "epoch": 0.3757749165474487, "grad_norm": 4.46875, "learning_rate": 1.7494594938318708e-05, "loss": 1.0795, "mean_token_accuracy": 0.6977563664317131, "step": 985 }, { "epoch": 0.3776824034334764, "grad_norm": 5.46875, "learning_rate": 1.7481877146127434e-05, "loss": 1.0531, "mean_token_accuracy": 0.7000179141759872, "step": 990 }, { "epoch": 0.3795898903195041, "grad_norm": 5.09375, "learning_rate": 1.746915935393616e-05, "loss": 1.1515, "mean_token_accuracy": 0.679880291223526, "step": 995 }, { "epoch": 0.38149737720553173, "grad_norm": 4.96875, "learning_rate": 1.7456441561744885e-05, "loss": 1.139, "mean_token_accuracy": 0.6736231818795204, "step": 1000 }, { "epoch": 0.3834048640915594, "grad_norm": 4.5625, "learning_rate": 1.7443723769553607e-05, "loss": 1.161, "mean_token_accuracy": 0.6775983899831772, "step": 1005 }, { "epoch": 0.38531235097758704, "grad_norm": 5.65625, "learning_rate": 1.7431005977362332e-05, "loss": 1.1337, "mean_token_accuracy": 0.6791507929563523, "step": 1010 }, { "epoch": 0.3872198378636147, "grad_norm": 4.71875, "learning_rate": 1.7418288185171054e-05, "loss": 1.0665, "mean_token_accuracy": 0.6990373253822326, "step": 1015 }, { "epoch": 0.38912732474964234, "grad_norm": 5.34375, "learning_rate": 1.740557039297978e-05, "loss": 0.9875, "mean_token_accuracy": 0.7201899453997612, "step": 1020 }, { "epoch": 0.39103481163567, "grad_norm": 5.15625, "learning_rate": 1.7392852600788502e-05, "loss": 1.1103, "mean_token_accuracy": 0.6919103190302849, "step": 1025 }, { "epoch": 0.39294229852169765, "grad_norm": 4.90625, "learning_rate": 1.7380134808597228e-05, "loss": 1.1621, "mean_token_accuracy": 0.6763632833957672, "step": 1030 }, { "epoch": 0.3948497854077253, "grad_norm": 5.375, "learning_rate": 1.7367417016405953e-05, "loss": 1.0891, "mean_token_accuracy": 0.6873229309916496, "step": 1035 }, { "epoch": 0.39675727229375296, "grad_norm": 4.9375, "learning_rate": 1.735469922421468e-05, "loss": 1.0736, "mean_token_accuracy": 0.6943288549780846, "step": 1040 }, { "epoch": 0.3986647591797806, "grad_norm": 4.84375, "learning_rate": 1.7341981432023404e-05, "loss": 1.1739, "mean_token_accuracy": 0.6772709146142006, "step": 1045 }, { "epoch": 0.4005722460658083, "grad_norm": 4.9375, "learning_rate": 1.7329263639832126e-05, "loss": 1.132, "mean_token_accuracy": 0.6861761540174485, "step": 1050 }, { "epoch": 0.402479732951836, "grad_norm": 5.0, "learning_rate": 1.731654584764085e-05, "loss": 1.0849, "mean_token_accuracy": 0.6876397714018821, "step": 1055 }, { "epoch": 0.40438721983786363, "grad_norm": 5.28125, "learning_rate": 1.7303828055449577e-05, "loss": 1.1119, "mean_token_accuracy": 0.6828581809997558, "step": 1060 }, { "epoch": 0.4062947067238913, "grad_norm": 4.78125, "learning_rate": 1.72911102632583e-05, "loss": 1.1542, "mean_token_accuracy": 0.6845688432455063, "step": 1065 }, { "epoch": 0.40820219360991894, "grad_norm": 4.78125, "learning_rate": 1.7278392471067025e-05, "loss": 1.245, "mean_token_accuracy": 0.6623896270990371, "step": 1070 }, { "epoch": 0.4101096804959466, "grad_norm": 4.40625, "learning_rate": 1.7265674678875747e-05, "loss": 0.9855, "mean_token_accuracy": 0.7169230833649636, "step": 1075 }, { "epoch": 0.41201716738197425, "grad_norm": 5.28125, "learning_rate": 1.7252956886684472e-05, "loss": 1.0003, "mean_token_accuracy": 0.7067764922976494, "step": 1080 }, { "epoch": 0.4139246542680019, "grad_norm": 4.71875, "learning_rate": 1.7240239094493198e-05, "loss": 1.0566, "mean_token_accuracy": 0.7076253116130828, "step": 1085 }, { "epoch": 0.41583214115402956, "grad_norm": 4.8125, "learning_rate": 1.722752130230192e-05, "loss": 1.0692, "mean_token_accuracy": 0.6991090714931488, "step": 1090 }, { "epoch": 0.4177396280400572, "grad_norm": 4.84375, "learning_rate": 1.7214803510110646e-05, "loss": 1.1702, "mean_token_accuracy": 0.6843620404601097, "step": 1095 }, { "epoch": 0.41964711492608486, "grad_norm": 5.34375, "learning_rate": 1.720208571791937e-05, "loss": 1.1131, "mean_token_accuracy": 0.6780217066407204, "step": 1100 }, { "epoch": 0.4215546018121125, "grad_norm": 5.03125, "learning_rate": 1.7189367925728097e-05, "loss": 1.1733, "mean_token_accuracy": 0.6738237209618092, "step": 1105 }, { "epoch": 0.4234620886981402, "grad_norm": 4.15625, "learning_rate": 1.717665013353682e-05, "loss": 1.0971, "mean_token_accuracy": 0.6967151150107384, "step": 1110 }, { "epoch": 0.4253695755841679, "grad_norm": 6.125, "learning_rate": 1.7163932341345544e-05, "loss": 1.0593, "mean_token_accuracy": 0.6975896939635277, "step": 1115 }, { "epoch": 0.42727706247019553, "grad_norm": 4.875, "learning_rate": 1.715121454915427e-05, "loss": 1.045, "mean_token_accuracy": 0.6995080903172493, "step": 1120 }, { "epoch": 0.4291845493562232, "grad_norm": 4.6875, "learning_rate": 1.7138496756962992e-05, "loss": 1.1514, "mean_token_accuracy": 0.6870008051395416, "step": 1125 }, { "epoch": 0.43109203624225084, "grad_norm": 4.9375, "learning_rate": 1.7125778964771717e-05, "loss": 1.147, "mean_token_accuracy": 0.6756279736757278, "step": 1130 }, { "epoch": 0.4329995231282785, "grad_norm": 3.953125, "learning_rate": 1.711306117258044e-05, "loss": 1.0626, "mean_token_accuracy": 0.6975025564432145, "step": 1135 }, { "epoch": 0.43490701001430615, "grad_norm": 4.78125, "learning_rate": 1.7100343380389165e-05, "loss": 1.0843, "mean_token_accuracy": 0.6906091332435608, "step": 1140 }, { "epoch": 0.4368144969003338, "grad_norm": 4.84375, "learning_rate": 1.708762558819789e-05, "loss": 1.1134, "mean_token_accuracy": 0.6875185921788216, "step": 1145 }, { "epoch": 0.43872198378636146, "grad_norm": 5.28125, "learning_rate": 1.7074907796006616e-05, "loss": 1.1802, "mean_token_accuracy": 0.6715085253119468, "step": 1150 }, { "epoch": 0.4406294706723891, "grad_norm": 5.1875, "learning_rate": 1.706219000381534e-05, "loss": 1.0848, "mean_token_accuracy": 0.6938497066497803, "step": 1155 }, { "epoch": 0.44253695755841677, "grad_norm": 4.34375, "learning_rate": 1.7049472211624063e-05, "loss": 0.99, "mean_token_accuracy": 0.7224840387701988, "step": 1160 }, { "epoch": 0.4444444444444444, "grad_norm": 5.0625, "learning_rate": 1.703675441943279e-05, "loss": 1.0878, "mean_token_accuracy": 0.6980762526392936, "step": 1165 }, { "epoch": 0.44635193133047213, "grad_norm": 4.84375, "learning_rate": 1.702403662724151e-05, "loss": 1.0144, "mean_token_accuracy": 0.7007738411426544, "step": 1170 }, { "epoch": 0.4482594182164998, "grad_norm": 4.71875, "learning_rate": 1.7011318835050237e-05, "loss": 1.0875, "mean_token_accuracy": 0.6976178154349327, "step": 1175 }, { "epoch": 0.45016690510252744, "grad_norm": 4.25, "learning_rate": 1.6998601042858962e-05, "loss": 1.1086, "mean_token_accuracy": 0.6936508253216743, "step": 1180 }, { "epoch": 0.4520743919885551, "grad_norm": 4.625, "learning_rate": 1.6985883250667684e-05, "loss": 1.0357, "mean_token_accuracy": 0.7059789746999741, "step": 1185 }, { "epoch": 0.45398187887458274, "grad_norm": 5.84375, "learning_rate": 1.697316545847641e-05, "loss": 1.1516, "mean_token_accuracy": 0.6797626093029976, "step": 1190 }, { "epoch": 0.4558893657606104, "grad_norm": 4.59375, "learning_rate": 1.6960447666285135e-05, "loss": 1.0598, "mean_token_accuracy": 0.6971547856926918, "step": 1195 }, { "epoch": 0.45779685264663805, "grad_norm": 4.34375, "learning_rate": 1.6947729874093857e-05, "loss": 0.9843, "mean_token_accuracy": 0.7089567899703979, "step": 1200 }, { "epoch": 0.4597043395326657, "grad_norm": 5.6875, "learning_rate": 1.6935012081902583e-05, "loss": 1.1058, "mean_token_accuracy": 0.6865063227713109, "step": 1205 }, { "epoch": 0.46161182641869336, "grad_norm": 4.65625, "learning_rate": 1.6922294289711308e-05, "loss": 1.1277, "mean_token_accuracy": 0.6827705770730972, "step": 1210 }, { "epoch": 0.463519313304721, "grad_norm": 4.875, "learning_rate": 1.6909576497520034e-05, "loss": 1.1664, "mean_token_accuracy": 0.6819883540272713, "step": 1215 }, { "epoch": 0.46542680019074867, "grad_norm": 5.09375, "learning_rate": 1.6896858705328756e-05, "loss": 1.0562, "mean_token_accuracy": 0.6979849010705947, "step": 1220 }, { "epoch": 0.4673342870767763, "grad_norm": 4.875, "learning_rate": 1.688414091313748e-05, "loss": 1.1879, "mean_token_accuracy": 0.6789781466126442, "step": 1225 }, { "epoch": 0.46924177396280403, "grad_norm": 4.6875, "learning_rate": 1.6871423120946204e-05, "loss": 1.1305, "mean_token_accuracy": 0.6878492012619972, "step": 1230 }, { "epoch": 0.4711492608488317, "grad_norm": 4.90625, "learning_rate": 1.685870532875493e-05, "loss": 1.1696, "mean_token_accuracy": 0.6815823867917061, "step": 1235 }, { "epoch": 0.47305674773485934, "grad_norm": 4.4375, "learning_rate": 1.6845987536563655e-05, "loss": 0.9887, "mean_token_accuracy": 0.7134695425629616, "step": 1240 }, { "epoch": 0.474964234620887, "grad_norm": 4.625, "learning_rate": 1.6833269744372377e-05, "loss": 1.0254, "mean_token_accuracy": 0.7122278586030006, "step": 1245 }, { "epoch": 0.47687172150691465, "grad_norm": 5.25, "learning_rate": 1.6820551952181102e-05, "loss": 1.123, "mean_token_accuracy": 0.6886913120746613, "step": 1250 }, { "epoch": 0.4787792083929423, "grad_norm": 4.5625, "learning_rate": 1.6807834159989828e-05, "loss": 1.1054, "mean_token_accuracy": 0.6899633683264256, "step": 1255 }, { "epoch": 0.48068669527896996, "grad_norm": 4.5, "learning_rate": 1.6795116367798553e-05, "loss": 0.9764, "mean_token_accuracy": 0.7191405698657036, "step": 1260 }, { "epoch": 0.4825941821649976, "grad_norm": 4.9375, "learning_rate": 1.6782398575607275e-05, "loss": 1.0562, "mean_token_accuracy": 0.7035323694348335, "step": 1265 }, { "epoch": 0.48450166905102526, "grad_norm": 4.25, "learning_rate": 1.6769680783416e-05, "loss": 1.0389, "mean_token_accuracy": 0.708684840798378, "step": 1270 }, { "epoch": 0.4864091559370529, "grad_norm": 4.375, "learning_rate": 1.6756962991224726e-05, "loss": 1.0037, "mean_token_accuracy": 0.7036843597888947, "step": 1275 }, { "epoch": 0.48831664282308057, "grad_norm": 4.46875, "learning_rate": 1.674424519903345e-05, "loss": 0.9991, "mean_token_accuracy": 0.708541002869606, "step": 1280 }, { "epoch": 0.4902241297091082, "grad_norm": 4.53125, "learning_rate": 1.6731527406842174e-05, "loss": 1.0307, "mean_token_accuracy": 0.7065932080149651, "step": 1285 }, { "epoch": 0.49213161659513593, "grad_norm": 5.3125, "learning_rate": 1.6718809614650896e-05, "loss": 1.1893, "mean_token_accuracy": 0.6674706935882568, "step": 1290 }, { "epoch": 0.4940391034811636, "grad_norm": 4.75, "learning_rate": 1.670609182245962e-05, "loss": 1.0691, "mean_token_accuracy": 0.6954927012324333, "step": 1295 }, { "epoch": 0.49594659036719124, "grad_norm": 5.0, "learning_rate": 1.6693374030268347e-05, "loss": 1.1, "mean_token_accuracy": 0.687789686024189, "step": 1300 }, { "epoch": 0.4978540772532189, "grad_norm": 5.4375, "learning_rate": 1.6680656238077072e-05, "loss": 0.995, "mean_token_accuracy": 0.7104071035981179, "step": 1305 }, { "epoch": 0.49976156413924655, "grad_norm": 8.9375, "learning_rate": 1.6667938445885795e-05, "loss": 1.1605, "mean_token_accuracy": 0.6653927579522133, "step": 1310 }, { "epoch": 0.5016690510252741, "grad_norm": 5.28125, "learning_rate": 1.665522065369452e-05, "loss": 1.0855, "mean_token_accuracy": 0.6951776430010795, "step": 1315 }, { "epoch": 0.5035765379113019, "grad_norm": 5.40625, "learning_rate": 1.6642502861503246e-05, "loss": 1.0628, "mean_token_accuracy": 0.6940785989165306, "step": 1320 }, { "epoch": 0.5054840247973296, "grad_norm": 5.34375, "learning_rate": 1.6629785069311968e-05, "loss": 0.988, "mean_token_accuracy": 0.7169103771448135, "step": 1325 }, { "epoch": 0.5073915116833572, "grad_norm": 6.0, "learning_rate": 1.6617067277120693e-05, "loss": 1.1389, "mean_token_accuracy": 0.6772550821304322, "step": 1330 }, { "epoch": 0.5092989985693849, "grad_norm": 5.21875, "learning_rate": 1.660434948492942e-05, "loss": 1.0482, "mean_token_accuracy": 0.7066885620355606, "step": 1335 }, { "epoch": 0.5112064854554125, "grad_norm": 4.46875, "learning_rate": 1.659163169273814e-05, "loss": 1.0581, "mean_token_accuracy": 0.6999363213777542, "step": 1340 }, { "epoch": 0.5131139723414402, "grad_norm": 4.625, "learning_rate": 1.6578913900546866e-05, "loss": 1.0167, "mean_token_accuracy": 0.7108445912599564, "step": 1345 }, { "epoch": 0.5150214592274678, "grad_norm": 4.34375, "learning_rate": 1.656619610835559e-05, "loss": 1.0204, "mean_token_accuracy": 0.7141309767961502, "step": 1350 }, { "epoch": 0.5169289461134955, "grad_norm": 5.0, "learning_rate": 1.6553478316164314e-05, "loss": 1.0762, "mean_token_accuracy": 0.681866991519928, "step": 1355 }, { "epoch": 0.5188364329995231, "grad_norm": 4.75, "learning_rate": 1.654076052397304e-05, "loss": 1.11, "mean_token_accuracy": 0.6910865843296051, "step": 1360 }, { "epoch": 0.5207439198855508, "grad_norm": 4.9375, "learning_rate": 1.6528042731781765e-05, "loss": 1.1043, "mean_token_accuracy": 0.6882475554943085, "step": 1365 }, { "epoch": 0.5226514067715784, "grad_norm": 4.25, "learning_rate": 1.651532493959049e-05, "loss": 1.0554, "mean_token_accuracy": 0.7098303481936454, "step": 1370 }, { "epoch": 0.5245588936576061, "grad_norm": 4.875, "learning_rate": 1.6502607147399213e-05, "loss": 1.0832, "mean_token_accuracy": 0.7008169665932655, "step": 1375 }, { "epoch": 0.5264663805436338, "grad_norm": 4.75, "learning_rate": 1.6489889355207938e-05, "loss": 1.0496, "mean_token_accuracy": 0.7001980841159821, "step": 1380 }, { "epoch": 0.5283738674296614, "grad_norm": 4.96875, "learning_rate": 1.647717156301666e-05, "loss": 1.058, "mean_token_accuracy": 0.6955149456858635, "step": 1385 }, { "epoch": 0.5302813543156891, "grad_norm": 4.5, "learning_rate": 1.6464453770825386e-05, "loss": 1.0637, "mean_token_accuracy": 0.6973527297377586, "step": 1390 }, { "epoch": 0.5321888412017167, "grad_norm": 4.65625, "learning_rate": 1.645173597863411e-05, "loss": 1.0224, "mean_token_accuracy": 0.6985811904072762, "step": 1395 }, { "epoch": 0.5340963280877444, "grad_norm": 5.90625, "learning_rate": 1.6439018186442833e-05, "loss": 0.9889, "mean_token_accuracy": 0.7180039718747139, "step": 1400 }, { "epoch": 0.536003814973772, "grad_norm": 5.65625, "learning_rate": 1.642630039425156e-05, "loss": 1.1035, "mean_token_accuracy": 0.6911762669682503, "step": 1405 }, { "epoch": 0.5379113018597997, "grad_norm": 5.3125, "learning_rate": 1.6413582602060284e-05, "loss": 1.121, "mean_token_accuracy": 0.6914720147848129, "step": 1410 }, { "epoch": 0.5398187887458273, "grad_norm": 4.6875, "learning_rate": 1.640086480986901e-05, "loss": 1.0923, "mean_token_accuracy": 0.7057502642273903, "step": 1415 }, { "epoch": 0.541726275631855, "grad_norm": 4.03125, "learning_rate": 1.6388147017677732e-05, "loss": 1.0645, "mean_token_accuracy": 0.6903545215725899, "step": 1420 }, { "epoch": 0.5436337625178826, "grad_norm": 4.46875, "learning_rate": 1.6375429225486457e-05, "loss": 1.067, "mean_token_accuracy": 0.6956262946128845, "step": 1425 }, { "epoch": 0.5455412494039104, "grad_norm": 5.40625, "learning_rate": 1.6362711433295183e-05, "loss": 1.051, "mean_token_accuracy": 0.7086583986878395, "step": 1430 }, { "epoch": 0.547448736289938, "grad_norm": 5.15625, "learning_rate": 1.6349993641103905e-05, "loss": 1.083, "mean_token_accuracy": 0.6992680087685585, "step": 1435 }, { "epoch": 0.5493562231759657, "grad_norm": 4.875, "learning_rate": 1.633727584891263e-05, "loss": 1.048, "mean_token_accuracy": 0.7062048301100731, "step": 1440 }, { "epoch": 0.5512637100619934, "grad_norm": 4.375, "learning_rate": 1.6324558056721353e-05, "loss": 1.0279, "mean_token_accuracy": 0.6988534897565841, "step": 1445 }, { "epoch": 0.553171196948021, "grad_norm": 5.59375, "learning_rate": 1.6311840264530078e-05, "loss": 1.1266, "mean_token_accuracy": 0.6839306525886059, "step": 1450 }, { "epoch": 0.5550786838340487, "grad_norm": 5.25, "learning_rate": 1.6299122472338804e-05, "loss": 1.0217, "mean_token_accuracy": 0.711203609406948, "step": 1455 }, { "epoch": 0.5569861707200763, "grad_norm": 4.59375, "learning_rate": 1.6286404680147526e-05, "loss": 1.0203, "mean_token_accuracy": 0.7061103895306587, "step": 1460 }, { "epoch": 0.558893657606104, "grad_norm": 4.5, "learning_rate": 1.627368688795625e-05, "loss": 1.1219, "mean_token_accuracy": 0.6869349181652069, "step": 1465 }, { "epoch": 0.5608011444921316, "grad_norm": 5.78125, "learning_rate": 1.6260969095764977e-05, "loss": 1.0224, "mean_token_accuracy": 0.7102344155311584, "step": 1470 }, { "epoch": 0.5627086313781593, "grad_norm": 4.71875, "learning_rate": 1.6248251303573702e-05, "loss": 1.0301, "mean_token_accuracy": 0.7014199420809746, "step": 1475 }, { "epoch": 0.5646161182641869, "grad_norm": 4.84375, "learning_rate": 1.6235533511382428e-05, "loss": 1.0919, "mean_token_accuracy": 0.6958119504153728, "step": 1480 }, { "epoch": 0.5665236051502146, "grad_norm": 3.796875, "learning_rate": 1.622281571919115e-05, "loss": 1.0172, "mean_token_accuracy": 0.7171666666865348, "step": 1485 }, { "epoch": 0.5684310920362422, "grad_norm": 4.9375, "learning_rate": 1.6210097926999875e-05, "loss": 1.0293, "mean_token_accuracy": 0.7110596433281898, "step": 1490 }, { "epoch": 0.5703385789222699, "grad_norm": 5.0, "learning_rate": 1.6197380134808597e-05, "loss": 1.0437, "mean_token_accuracy": 0.7055989898741245, "step": 1495 }, { "epoch": 0.5722460658082976, "grad_norm": 4.5625, "learning_rate": 1.6184662342617323e-05, "loss": 1.1109, "mean_token_accuracy": 0.6983527675271034, "step": 1500 }, { "epoch": 0.5741535526943252, "grad_norm": 5.21875, "learning_rate": 1.6171944550426045e-05, "loss": 1.0585, "mean_token_accuracy": 0.7039476573467255, "step": 1505 }, { "epoch": 0.5760610395803529, "grad_norm": 4.71875, "learning_rate": 1.615922675823477e-05, "loss": 1.0795, "mean_token_accuracy": 0.6893104076385498, "step": 1510 }, { "epoch": 0.5779685264663805, "grad_norm": 4.71875, "learning_rate": 1.6146508966043496e-05, "loss": 0.9975, "mean_token_accuracy": 0.7137760400772095, "step": 1515 }, { "epoch": 0.5798760133524082, "grad_norm": 4.5625, "learning_rate": 1.613379117385222e-05, "loss": 1.0283, "mean_token_accuracy": 0.7071486204862595, "step": 1520 }, { "epoch": 0.5817835002384358, "grad_norm": 4.125, "learning_rate": 1.6121073381660947e-05, "loss": 0.9634, "mean_token_accuracy": 0.7267276033759117, "step": 1525 }, { "epoch": 0.5836909871244635, "grad_norm": 5.28125, "learning_rate": 1.610835558946967e-05, "loss": 1.141, "mean_token_accuracy": 0.6760995179414749, "step": 1530 }, { "epoch": 0.5855984740104911, "grad_norm": 4.3125, "learning_rate": 1.6095637797278395e-05, "loss": 1.0584, "mean_token_accuracy": 0.7001825541257858, "step": 1535 }, { "epoch": 0.5875059608965189, "grad_norm": 4.71875, "learning_rate": 1.608292000508712e-05, "loss": 0.9554, "mean_token_accuracy": 0.7191137507557869, "step": 1540 }, { "epoch": 0.5894134477825465, "grad_norm": 4.59375, "learning_rate": 1.6070202212895842e-05, "loss": 1.0338, "mean_token_accuracy": 0.6991492182016372, "step": 1545 }, { "epoch": 0.5913209346685742, "grad_norm": 4.375, "learning_rate": 1.6057484420704568e-05, "loss": 1.1116, "mean_token_accuracy": 0.6837974414229393, "step": 1550 }, { "epoch": 0.5932284215546018, "grad_norm": 5.0, "learning_rate": 1.604476662851329e-05, "loss": 1.1166, "mean_token_accuracy": 0.6914651602506637, "step": 1555 }, { "epoch": 0.5951359084406295, "grad_norm": 4.53125, "learning_rate": 1.6032048836322015e-05, "loss": 1.092, "mean_token_accuracy": 0.6894845418632031, "step": 1560 }, { "epoch": 0.5970433953266572, "grad_norm": 3.96875, "learning_rate": 1.601933104413074e-05, "loss": 1.0382, "mean_token_accuracy": 0.7044162392616272, "step": 1565 }, { "epoch": 0.5989508822126848, "grad_norm": 5.28125, "learning_rate": 1.6006613251939463e-05, "loss": 1.1431, "mean_token_accuracy": 0.6869931921362877, "step": 1570 }, { "epoch": 0.6008583690987125, "grad_norm": 5.03125, "learning_rate": 1.599389545974819e-05, "loss": 0.9842, "mean_token_accuracy": 0.7128385215997696, "step": 1575 }, { "epoch": 0.6027658559847401, "grad_norm": 4.09375, "learning_rate": 1.5981177667556914e-05, "loss": 0.9261, "mean_token_accuracy": 0.7247092142701149, "step": 1580 }, { "epoch": 0.6046733428707678, "grad_norm": 6.1875, "learning_rate": 1.596845987536564e-05, "loss": 1.1425, "mean_token_accuracy": 0.690905112028122, "step": 1585 }, { "epoch": 0.6065808297567954, "grad_norm": 4.15625, "learning_rate": 1.595574208317436e-05, "loss": 1.0104, "mean_token_accuracy": 0.7160186618566513, "step": 1590 }, { "epoch": 0.6084883166428231, "grad_norm": 4.25, "learning_rate": 1.5943024290983087e-05, "loss": 1.0177, "mean_token_accuracy": 0.7115990072488785, "step": 1595 }, { "epoch": 0.6103958035288507, "grad_norm": 4.125, "learning_rate": 1.5930306498791813e-05, "loss": 0.9649, "mean_token_accuracy": 0.7167477622628212, "step": 1600 }, { "epoch": 0.6123032904148784, "grad_norm": 4.78125, "learning_rate": 1.5917588706600535e-05, "loss": 1.0788, "mean_token_accuracy": 0.6938311874866485, "step": 1605 }, { "epoch": 0.614210777300906, "grad_norm": 4.8125, "learning_rate": 1.590487091440926e-05, "loss": 1.0125, "mean_token_accuracy": 0.7016476511955261, "step": 1610 }, { "epoch": 0.6161182641869337, "grad_norm": 4.875, "learning_rate": 1.5892153122217982e-05, "loss": 1.0339, "mean_token_accuracy": 0.7032722011208534, "step": 1615 }, { "epoch": 0.6180257510729614, "grad_norm": 5.0625, "learning_rate": 1.5879435330026708e-05, "loss": 1.0328, "mean_token_accuracy": 0.7095901571214199, "step": 1620 }, { "epoch": 0.619933237958989, "grad_norm": 5.25, "learning_rate": 1.5866717537835433e-05, "loss": 1.0373, "mean_token_accuracy": 0.7035743817687035, "step": 1625 }, { "epoch": 0.6218407248450167, "grad_norm": 4.53125, "learning_rate": 1.585399974564416e-05, "loss": 0.9541, "mean_token_accuracy": 0.7259443908929825, "step": 1630 }, { "epoch": 0.6237482117310443, "grad_norm": 5.40625, "learning_rate": 1.5841281953452884e-05, "loss": 1.163, "mean_token_accuracy": 0.6807891383767128, "step": 1635 }, { "epoch": 0.625655698617072, "grad_norm": 5.46875, "learning_rate": 1.5828564161261606e-05, "loss": 0.9723, "mean_token_accuracy": 0.7241554304957389, "step": 1640 }, { "epoch": 0.6275631855030996, "grad_norm": 4.5625, "learning_rate": 1.5815846369070332e-05, "loss": 1.0736, "mean_token_accuracy": 0.7066580310463906, "step": 1645 }, { "epoch": 0.6294706723891274, "grad_norm": 5.125, "learning_rate": 1.5803128576879054e-05, "loss": 1.0553, "mean_token_accuracy": 0.7062164053320885, "step": 1650 }, { "epoch": 0.631378159275155, "grad_norm": 4.1875, "learning_rate": 1.579041078468778e-05, "loss": 1.0238, "mean_token_accuracy": 0.7111857526004315, "step": 1655 }, { "epoch": 0.6332856461611827, "grad_norm": 4.59375, "learning_rate": 1.5777692992496505e-05, "loss": 0.9547, "mean_token_accuracy": 0.7140131160616875, "step": 1660 }, { "epoch": 0.6351931330472103, "grad_norm": 4.75, "learning_rate": 1.5764975200305227e-05, "loss": 1.0719, "mean_token_accuracy": 0.6966261744499207, "step": 1665 }, { "epoch": 0.637100619933238, "grad_norm": 4.5625, "learning_rate": 1.5752257408113953e-05, "loss": 1.0385, "mean_token_accuracy": 0.7105199143290519, "step": 1670 }, { "epoch": 0.6390081068192656, "grad_norm": 5.40625, "learning_rate": 1.5739539615922678e-05, "loss": 1.0362, "mean_token_accuracy": 0.7015564523637294, "step": 1675 }, { "epoch": 0.6409155937052933, "grad_norm": 5.3125, "learning_rate": 1.57268218237314e-05, "loss": 0.9366, "mean_token_accuracy": 0.7337832853198052, "step": 1680 }, { "epoch": 0.642823080591321, "grad_norm": 5.4375, "learning_rate": 1.5714104031540126e-05, "loss": 1.1166, "mean_token_accuracy": 0.6753425896167755, "step": 1685 }, { "epoch": 0.6447305674773486, "grad_norm": 4.5625, "learning_rate": 1.570138623934885e-05, "loss": 1.0364, "mean_token_accuracy": 0.7042675077915191, "step": 1690 }, { "epoch": 0.6466380543633763, "grad_norm": 5.1875, "learning_rate": 1.5688668447157577e-05, "loss": 0.9948, "mean_token_accuracy": 0.7185570999979973, "step": 1695 }, { "epoch": 0.6485455412494039, "grad_norm": 5.25, "learning_rate": 1.56759506549663e-05, "loss": 1.0878, "mean_token_accuracy": 0.6928838163614273, "step": 1700 }, { "epoch": 0.6504530281354316, "grad_norm": 5.90625, "learning_rate": 1.5663232862775024e-05, "loss": 1.0671, "mean_token_accuracy": 0.7043869346380234, "step": 1705 }, { "epoch": 0.6523605150214592, "grad_norm": 4.5, "learning_rate": 1.5650515070583746e-05, "loss": 1.0721, "mean_token_accuracy": 0.7013254553079605, "step": 1710 }, { "epoch": 0.6542680019074869, "grad_norm": 4.90625, "learning_rate": 1.5637797278392472e-05, "loss": 1.0065, "mean_token_accuracy": 0.7042224168777466, "step": 1715 }, { "epoch": 0.6561754887935145, "grad_norm": 4.15625, "learning_rate": 1.5625079486201197e-05, "loss": 1.0739, "mean_token_accuracy": 0.6945244207978248, "step": 1720 }, { "epoch": 0.6580829756795422, "grad_norm": 5.5625, "learning_rate": 1.561236169400992e-05, "loss": 1.0859, "mean_token_accuracy": 0.6950926452875137, "step": 1725 }, { "epoch": 0.6599904625655698, "grad_norm": 4.8125, "learning_rate": 1.5599643901818645e-05, "loss": 0.9103, "mean_token_accuracy": 0.7312288954854012, "step": 1730 }, { "epoch": 0.6618979494515975, "grad_norm": 4.375, "learning_rate": 1.558692610962737e-05, "loss": 0.9913, "mean_token_accuracy": 0.7258316233754158, "step": 1735 }, { "epoch": 0.6638054363376252, "grad_norm": 5.84375, "learning_rate": 1.5574208317436096e-05, "loss": 1.1365, "mean_token_accuracy": 0.6847386986017228, "step": 1740 }, { "epoch": 0.6657129232236528, "grad_norm": 4.34375, "learning_rate": 1.5561490525244818e-05, "loss": 0.9208, "mean_token_accuracy": 0.7318013399839401, "step": 1745 }, { "epoch": 0.6676204101096805, "grad_norm": 4.59375, "learning_rate": 1.5548772733053544e-05, "loss": 1.0381, "mean_token_accuracy": 0.7117675840854645, "step": 1750 }, { "epoch": 0.6695278969957081, "grad_norm": 4.0, "learning_rate": 1.553605494086227e-05, "loss": 0.9807, "mean_token_accuracy": 0.7145498290657997, "step": 1755 }, { "epoch": 0.6714353838817358, "grad_norm": 4.4375, "learning_rate": 1.552333714867099e-05, "loss": 0.9746, "mean_token_accuracy": 0.7150619328022003, "step": 1760 }, { "epoch": 0.6733428707677634, "grad_norm": 5.0625, "learning_rate": 1.5510619356479717e-05, "loss": 1.0595, "mean_token_accuracy": 0.7089547023177147, "step": 1765 }, { "epoch": 0.6752503576537912, "grad_norm": 4.59375, "learning_rate": 1.549790156428844e-05, "loss": 1.0631, "mean_token_accuracy": 0.7015356197953224, "step": 1770 }, { "epoch": 0.6771578445398188, "grad_norm": 6.28125, "learning_rate": 1.5485183772097164e-05, "loss": 1.0188, "mean_token_accuracy": 0.7043671816587448, "step": 1775 }, { "epoch": 0.6790653314258465, "grad_norm": 4.34375, "learning_rate": 1.547246597990589e-05, "loss": 1.017, "mean_token_accuracy": 0.707620695233345, "step": 1780 }, { "epoch": 0.6809728183118741, "grad_norm": 5.25, "learning_rate": 1.5459748187714615e-05, "loss": 1.0261, "mean_token_accuracy": 0.7107647344470024, "step": 1785 }, { "epoch": 0.6828803051979018, "grad_norm": 5.1875, "learning_rate": 1.5447030395523338e-05, "loss": 0.976, "mean_token_accuracy": 0.7236540615558624, "step": 1790 }, { "epoch": 0.6847877920839294, "grad_norm": 4.40625, "learning_rate": 1.5434312603332063e-05, "loss": 1.0489, "mean_token_accuracy": 0.7052806288003921, "step": 1795 }, { "epoch": 0.6866952789699571, "grad_norm": 4.53125, "learning_rate": 1.542159481114079e-05, "loss": 1.0648, "mean_token_accuracy": 0.6937704533338547, "step": 1800 }, { "epoch": 0.6886027658559848, "grad_norm": 3.921875, "learning_rate": 1.540887701894951e-05, "loss": 1.0472, "mean_token_accuracy": 0.7000033929944038, "step": 1805 }, { "epoch": 0.6905102527420124, "grad_norm": 4.0625, "learning_rate": 1.5396159226758236e-05, "loss": 0.9398, "mean_token_accuracy": 0.7206048682332039, "step": 1810 }, { "epoch": 0.6924177396280401, "grad_norm": 4.71875, "learning_rate": 1.538344143456696e-05, "loss": 1.078, "mean_token_accuracy": 0.6949393272399902, "step": 1815 }, { "epoch": 0.6943252265140677, "grad_norm": 4.78125, "learning_rate": 1.5370723642375684e-05, "loss": 0.9936, "mean_token_accuracy": 0.716967236995697, "step": 1820 }, { "epoch": 0.6962327134000954, "grad_norm": 5.46875, "learning_rate": 1.535800585018441e-05, "loss": 1.0597, "mean_token_accuracy": 0.7033153355121613, "step": 1825 }, { "epoch": 0.698140200286123, "grad_norm": 5.59375, "learning_rate": 1.534528805799313e-05, "loss": 1.0689, "mean_token_accuracy": 0.6991181001067162, "step": 1830 }, { "epoch": 0.7000476871721507, "grad_norm": 5.59375, "learning_rate": 1.5332570265801857e-05, "loss": 1.0807, "mean_token_accuracy": 0.6999934658408165, "step": 1835 }, { "epoch": 0.7019551740581783, "grad_norm": 5.0625, "learning_rate": 1.5319852473610582e-05, "loss": 0.9682, "mean_token_accuracy": 0.723272667825222, "step": 1840 }, { "epoch": 0.703862660944206, "grad_norm": 4.15625, "learning_rate": 1.5307134681419308e-05, "loss": 1.031, "mean_token_accuracy": 0.6966592162847519, "step": 1845 }, { "epoch": 0.7057701478302336, "grad_norm": 5.15625, "learning_rate": 1.5294416889228033e-05, "loss": 1.0431, "mean_token_accuracy": 0.714342576265335, "step": 1850 }, { "epoch": 0.7076776347162613, "grad_norm": 5.78125, "learning_rate": 1.5281699097036755e-05, "loss": 1.0259, "mean_token_accuracy": 0.7083830907940865, "step": 1855 }, { "epoch": 0.709585121602289, "grad_norm": 4.5625, "learning_rate": 1.526898130484548e-05, "loss": 0.9847, "mean_token_accuracy": 0.7163553655147552, "step": 1860 }, { "epoch": 0.7114926084883166, "grad_norm": 6.28125, "learning_rate": 1.5256263512654203e-05, "loss": 0.9951, "mean_token_accuracy": 0.7172233253717423, "step": 1865 }, { "epoch": 0.7134000953743443, "grad_norm": 5.25, "learning_rate": 1.5243545720462929e-05, "loss": 1.0708, "mean_token_accuracy": 0.6974482744932174, "step": 1870 }, { "epoch": 0.7153075822603719, "grad_norm": 4.625, "learning_rate": 1.5230827928271654e-05, "loss": 1.0573, "mean_token_accuracy": 0.7091765016317367, "step": 1875 }, { "epoch": 0.7172150691463997, "grad_norm": 4.96875, "learning_rate": 1.5218110136080378e-05, "loss": 0.969, "mean_token_accuracy": 0.7144028797745705, "step": 1880 }, { "epoch": 0.7191225560324273, "grad_norm": 4.71875, "learning_rate": 1.5205392343889103e-05, "loss": 1.0841, "mean_token_accuracy": 0.696842522919178, "step": 1885 }, { "epoch": 0.721030042918455, "grad_norm": 4.21875, "learning_rate": 1.5192674551697825e-05, "loss": 0.9193, "mean_token_accuracy": 0.7247643247246742, "step": 1890 }, { "epoch": 0.7229375298044826, "grad_norm": 5.3125, "learning_rate": 1.5179956759506551e-05, "loss": 0.9693, "mean_token_accuracy": 0.7202573090791702, "step": 1895 }, { "epoch": 0.7248450166905103, "grad_norm": 4.59375, "learning_rate": 1.5167238967315275e-05, "loss": 0.9176, "mean_token_accuracy": 0.7282809093594551, "step": 1900 }, { "epoch": 0.7267525035765379, "grad_norm": 4.53125, "learning_rate": 1.5154521175124e-05, "loss": 0.9122, "mean_token_accuracy": 0.7338912770152092, "step": 1905 }, { "epoch": 0.7286599904625656, "grad_norm": 4.96875, "learning_rate": 1.5141803382932724e-05, "loss": 1.0756, "mean_token_accuracy": 0.6949716225266457, "step": 1910 }, { "epoch": 0.7305674773485932, "grad_norm": 4.96875, "learning_rate": 1.5129085590741448e-05, "loss": 1.0789, "mean_token_accuracy": 0.6967040143907071, "step": 1915 }, { "epoch": 0.7324749642346209, "grad_norm": 4.71875, "learning_rate": 1.5116367798550173e-05, "loss": 0.9752, "mean_token_accuracy": 0.7204620942473412, "step": 1920 }, { "epoch": 0.7343824511206486, "grad_norm": 3.8125, "learning_rate": 1.5103650006358897e-05, "loss": 1.0254, "mean_token_accuracy": 0.7050608977675438, "step": 1925 }, { "epoch": 0.7362899380066762, "grad_norm": 4.5625, "learning_rate": 1.5090932214167621e-05, "loss": 0.9845, "mean_token_accuracy": 0.718043963611126, "step": 1930 }, { "epoch": 0.7381974248927039, "grad_norm": 4.75, "learning_rate": 1.5078214421976347e-05, "loss": 0.9798, "mean_token_accuracy": 0.7155537083745003, "step": 1935 }, { "epoch": 0.7401049117787315, "grad_norm": 4.125, "learning_rate": 1.506549662978507e-05, "loss": 0.9766, "mean_token_accuracy": 0.7140335828065872, "step": 1940 }, { "epoch": 0.7420123986647592, "grad_norm": 4.125, "learning_rate": 1.5052778837593796e-05, "loss": 1.048, "mean_token_accuracy": 0.6943504139780998, "step": 1945 }, { "epoch": 0.7439198855507868, "grad_norm": 4.46875, "learning_rate": 1.5040061045402518e-05, "loss": 0.9615, "mean_token_accuracy": 0.7220544546842576, "step": 1950 }, { "epoch": 0.7458273724368145, "grad_norm": 5.0, "learning_rate": 1.5027343253211243e-05, "loss": 0.9978, "mean_token_accuracy": 0.7115991845726967, "step": 1955 }, { "epoch": 0.7477348593228421, "grad_norm": 4.25, "learning_rate": 1.5014625461019967e-05, "loss": 0.9384, "mean_token_accuracy": 0.7263565197587013, "step": 1960 }, { "epoch": 0.7496423462088698, "grad_norm": 4.90625, "learning_rate": 1.5001907668828693e-05, "loss": 1.0709, "mean_token_accuracy": 0.6875549122691155, "step": 1965 }, { "epoch": 0.7515498330948974, "grad_norm": 5.4375, "learning_rate": 1.4989189876637418e-05, "loss": 1.0374, "mean_token_accuracy": 0.7037866428494454, "step": 1970 }, { "epoch": 0.7534573199809251, "grad_norm": 4.4375, "learning_rate": 1.497647208444614e-05, "loss": 0.9627, "mean_token_accuracy": 0.7117268234491348, "step": 1975 }, { "epoch": 0.7553648068669528, "grad_norm": 4.34375, "learning_rate": 1.4963754292254866e-05, "loss": 0.9921, "mean_token_accuracy": 0.7058862507343292, "step": 1980 }, { "epoch": 0.7572722937529804, "grad_norm": 5.09375, "learning_rate": 1.495103650006359e-05, "loss": 0.961, "mean_token_accuracy": 0.7215966627001762, "step": 1985 }, { "epoch": 0.7591797806390082, "grad_norm": 4.375, "learning_rate": 1.4938318707872315e-05, "loss": 0.8637, "mean_token_accuracy": 0.7477660223841667, "step": 1990 }, { "epoch": 0.7610872675250357, "grad_norm": 4.71875, "learning_rate": 1.492560091568104e-05, "loss": 0.9912, "mean_token_accuracy": 0.7125525683164596, "step": 1995 }, { "epoch": 0.7629947544110635, "grad_norm": 4.09375, "learning_rate": 1.4912883123489763e-05, "loss": 0.9836, "mean_token_accuracy": 0.7163909748196602, "step": 2000 }, { "epoch": 0.7649022412970911, "grad_norm": 4.96875, "learning_rate": 1.4900165331298488e-05, "loss": 0.9218, "mean_token_accuracy": 0.718401075899601, "step": 2005 }, { "epoch": 0.7668097281831188, "grad_norm": 5.5625, "learning_rate": 1.4887447539107212e-05, "loss": 1.07, "mean_token_accuracy": 0.7017799213528633, "step": 2010 }, { "epoch": 0.7687172150691464, "grad_norm": 4.96875, "learning_rate": 1.4874729746915938e-05, "loss": 1.0801, "mean_token_accuracy": 0.689344696700573, "step": 2015 }, { "epoch": 0.7706247019551741, "grad_norm": 4.90625, "learning_rate": 1.486201195472466e-05, "loss": 1.0465, "mean_token_accuracy": 0.7119330614805222, "step": 2020 }, { "epoch": 0.7725321888412017, "grad_norm": 4.5625, "learning_rate": 1.4849294162533385e-05, "loss": 0.9299, "mean_token_accuracy": 0.7306962683796883, "step": 2025 }, { "epoch": 0.7744396757272294, "grad_norm": 5.40625, "learning_rate": 1.483657637034211e-05, "loss": 1.0421, "mean_token_accuracy": 0.7032359898090362, "step": 2030 }, { "epoch": 0.776347162613257, "grad_norm": 4.59375, "learning_rate": 1.4823858578150834e-05, "loss": 1.004, "mean_token_accuracy": 0.7154980972409248, "step": 2035 }, { "epoch": 0.7782546494992847, "grad_norm": 4.46875, "learning_rate": 1.4811140785959558e-05, "loss": 0.9309, "mean_token_accuracy": 0.7144239723682404, "step": 2040 }, { "epoch": 0.7801621363853124, "grad_norm": 4.5625, "learning_rate": 1.4798422993768282e-05, "loss": 0.9303, "mean_token_accuracy": 0.7244376420974732, "step": 2045 }, { "epoch": 0.78206962327134, "grad_norm": 4.90625, "learning_rate": 1.4785705201577008e-05, "loss": 1.0593, "mean_token_accuracy": 0.7060457020998001, "step": 2050 }, { "epoch": 0.7839771101573677, "grad_norm": 4.6875, "learning_rate": 1.4772987409385733e-05, "loss": 0.9526, "mean_token_accuracy": 0.724238371104002, "step": 2055 }, { "epoch": 0.7858845970433953, "grad_norm": 6.90625, "learning_rate": 1.4760269617194455e-05, "loss": 1.0683, "mean_token_accuracy": 0.6990483224391937, "step": 2060 }, { "epoch": 0.787792083929423, "grad_norm": 5.0625, "learning_rate": 1.474755182500318e-05, "loss": 0.9964, "mean_token_accuracy": 0.7195842653512955, "step": 2065 }, { "epoch": 0.7896995708154506, "grad_norm": 4.5625, "learning_rate": 1.4734834032811905e-05, "loss": 0.936, "mean_token_accuracy": 0.7289351716637611, "step": 2070 }, { "epoch": 0.7916070577014783, "grad_norm": 6.125, "learning_rate": 1.472211624062063e-05, "loss": 1.0424, "mean_token_accuracy": 0.7045046918094158, "step": 2075 }, { "epoch": 0.7935145445875059, "grad_norm": 4.65625, "learning_rate": 1.4709398448429352e-05, "loss": 0.9917, "mean_token_accuracy": 0.7202805817127228, "step": 2080 }, { "epoch": 0.7954220314735336, "grad_norm": 4.3125, "learning_rate": 1.4696680656238078e-05, "loss": 0.9864, "mean_token_accuracy": 0.7165962055325508, "step": 2085 }, { "epoch": 0.7973295183595612, "grad_norm": 5.09375, "learning_rate": 1.4683962864046803e-05, "loss": 1.003, "mean_token_accuracy": 0.7066600769758224, "step": 2090 }, { "epoch": 0.7992370052455889, "grad_norm": 5.0, "learning_rate": 1.4671245071855527e-05, "loss": 0.9686, "mean_token_accuracy": 0.7235840618610382, "step": 2095 }, { "epoch": 0.8011444921316166, "grad_norm": 4.59375, "learning_rate": 1.4658527279664252e-05, "loss": 1.0147, "mean_token_accuracy": 0.7132967829704284, "step": 2100 }, { "epoch": 0.8030519790176442, "grad_norm": 5.09375, "learning_rate": 1.4645809487472975e-05, "loss": 1.0229, "mean_token_accuracy": 0.7088969826698304, "step": 2105 }, { "epoch": 0.804959465903672, "grad_norm": 4.65625, "learning_rate": 1.46330916952817e-05, "loss": 1.0721, "mean_token_accuracy": 0.6959314554929733, "step": 2110 }, { "epoch": 0.8068669527896996, "grad_norm": 5.375, "learning_rate": 1.4620373903090426e-05, "loss": 1.0519, "mean_token_accuracy": 0.7039619326591492, "step": 2115 }, { "epoch": 0.8087744396757273, "grad_norm": 5.0625, "learning_rate": 1.460765611089915e-05, "loss": 0.9795, "mean_token_accuracy": 0.7203118950128555, "step": 2120 }, { "epoch": 0.8106819265617549, "grad_norm": 5.59375, "learning_rate": 1.4594938318707875e-05, "loss": 0.9688, "mean_token_accuracy": 0.721127749979496, "step": 2125 }, { "epoch": 0.8125894134477826, "grad_norm": 4.375, "learning_rate": 1.4582220526516597e-05, "loss": 1.0835, "mean_token_accuracy": 0.6876327477395534, "step": 2130 }, { "epoch": 0.8144969003338102, "grad_norm": 4.53125, "learning_rate": 1.4569502734325322e-05, "loss": 1.0776, "mean_token_accuracy": 0.6945757657289505, "step": 2135 }, { "epoch": 0.8164043872198379, "grad_norm": 4.46875, "learning_rate": 1.4556784942134046e-05, "loss": 1.0365, "mean_token_accuracy": 0.7062824577093124, "step": 2140 }, { "epoch": 0.8183118741058655, "grad_norm": 4.6875, "learning_rate": 1.4544067149942772e-05, "loss": 1.0154, "mean_token_accuracy": 0.7210542999207974, "step": 2145 }, { "epoch": 0.8202193609918932, "grad_norm": 5.9375, "learning_rate": 1.4531349357751496e-05, "loss": 1.1141, "mean_token_accuracy": 0.6904696643352508, "step": 2150 }, { "epoch": 0.8221268478779208, "grad_norm": 4.28125, "learning_rate": 1.451863156556022e-05, "loss": 0.9764, "mean_token_accuracy": 0.7087605282664299, "step": 2155 }, { "epoch": 0.8240343347639485, "grad_norm": 4.25, "learning_rate": 1.4505913773368945e-05, "loss": 0.9799, "mean_token_accuracy": 0.704703937470913, "step": 2160 }, { "epoch": 0.8259418216499762, "grad_norm": 5.5, "learning_rate": 1.4493195981177669e-05, "loss": 1.001, "mean_token_accuracy": 0.7080218985676765, "step": 2165 }, { "epoch": 0.8278493085360038, "grad_norm": 4.96875, "learning_rate": 1.4480478188986392e-05, "loss": 1.0226, "mean_token_accuracy": 0.7065225437283515, "step": 2170 }, { "epoch": 0.8297567954220315, "grad_norm": 4.4375, "learning_rate": 1.4467760396795118e-05, "loss": 0.9912, "mean_token_accuracy": 0.7132961705327034, "step": 2175 }, { "epoch": 0.8316642823080591, "grad_norm": 4.90625, "learning_rate": 1.4455042604603842e-05, "loss": 0.9855, "mean_token_accuracy": 0.7152161702513695, "step": 2180 }, { "epoch": 0.8335717691940868, "grad_norm": 4.59375, "learning_rate": 1.4442324812412567e-05, "loss": 1.0422, "mean_token_accuracy": 0.7072960436344147, "step": 2185 }, { "epoch": 0.8354792560801144, "grad_norm": 3.96875, "learning_rate": 1.442960702022129e-05, "loss": 0.971, "mean_token_accuracy": 0.7153645426034927, "step": 2190 }, { "epoch": 0.8373867429661421, "grad_norm": 4.78125, "learning_rate": 1.4416889228030015e-05, "loss": 0.993, "mean_token_accuracy": 0.7126340731978417, "step": 2195 }, { "epoch": 0.8392942298521697, "grad_norm": 4.90625, "learning_rate": 1.4404171435838739e-05, "loss": 0.954, "mean_token_accuracy": 0.727930200099945, "step": 2200 }, { "epoch": 0.8412017167381974, "grad_norm": 4.75, "learning_rate": 1.4391453643647464e-05, "loss": 0.9531, "mean_token_accuracy": 0.7239082336425782, "step": 2205 }, { "epoch": 0.843109203624225, "grad_norm": 4.9375, "learning_rate": 1.437873585145619e-05, "loss": 1.0409, "mean_token_accuracy": 0.7055112421512604, "step": 2210 }, { "epoch": 0.8450166905102527, "grad_norm": 5.125, "learning_rate": 1.4366018059264912e-05, "loss": 0.9722, "mean_token_accuracy": 0.7141941577196121, "step": 2215 }, { "epoch": 0.8469241773962805, "grad_norm": 5.5625, "learning_rate": 1.4353300267073637e-05, "loss": 1.0716, "mean_token_accuracy": 0.6948425002396107, "step": 2220 }, { "epoch": 0.848831664282308, "grad_norm": 5.53125, "learning_rate": 1.4340582474882361e-05, "loss": 0.985, "mean_token_accuracy": 0.7037705272436142, "step": 2225 }, { "epoch": 0.8507391511683358, "grad_norm": 4.53125, "learning_rate": 1.4327864682691087e-05, "loss": 1.0183, "mean_token_accuracy": 0.7084992684423923, "step": 2230 }, { "epoch": 0.8526466380543634, "grad_norm": 4.3125, "learning_rate": 1.4315146890499812e-05, "loss": 1.0079, "mean_token_accuracy": 0.7166560679674149, "step": 2235 }, { "epoch": 0.8545541249403911, "grad_norm": 4.5625, "learning_rate": 1.4302429098308534e-05, "loss": 0.9975, "mean_token_accuracy": 0.7056717827916146, "step": 2240 }, { "epoch": 0.8564616118264187, "grad_norm": 4.34375, "learning_rate": 1.428971130611726e-05, "loss": 1.0258, "mean_token_accuracy": 0.7125526055693626, "step": 2245 }, { "epoch": 0.8583690987124464, "grad_norm": 4.71875, "learning_rate": 1.4276993513925984e-05, "loss": 1.0043, "mean_token_accuracy": 0.7112932525575161, "step": 2250 }, { "epoch": 0.860276585598474, "grad_norm": 4.78125, "learning_rate": 1.4264275721734709e-05, "loss": 0.9908, "mean_token_accuracy": 0.7215219050645828, "step": 2255 }, { "epoch": 0.8621840724845017, "grad_norm": 4.96875, "learning_rate": 1.4251557929543431e-05, "loss": 0.9868, "mean_token_accuracy": 0.711732842028141, "step": 2260 }, { "epoch": 0.8640915593705293, "grad_norm": 5.53125, "learning_rate": 1.4238840137352157e-05, "loss": 1.0097, "mean_token_accuracy": 0.6953270882368088, "step": 2265 }, { "epoch": 0.865999046256557, "grad_norm": 5.15625, "learning_rate": 1.4226122345160882e-05, "loss": 0.9891, "mean_token_accuracy": 0.7083195835351944, "step": 2270 }, { "epoch": 0.8679065331425846, "grad_norm": 5.3125, "learning_rate": 1.4213404552969606e-05, "loss": 0.9258, "mean_token_accuracy": 0.7308154091238975, "step": 2275 }, { "epoch": 0.8698140200286123, "grad_norm": 4.71875, "learning_rate": 1.420068676077833e-05, "loss": 1.0778, "mean_token_accuracy": 0.7014591008424759, "step": 2280 }, { "epoch": 0.87172150691464, "grad_norm": 4.84375, "learning_rate": 1.4187968968587054e-05, "loss": 0.983, "mean_token_accuracy": 0.7186863839626312, "step": 2285 }, { "epoch": 0.8736289938006676, "grad_norm": 4.5, "learning_rate": 1.4175251176395779e-05, "loss": 0.9944, "mean_token_accuracy": 0.7090497985482216, "step": 2290 }, { "epoch": 0.8755364806866953, "grad_norm": 5.21875, "learning_rate": 1.4162533384204505e-05, "loss": 1.0525, "mean_token_accuracy": 0.6964134000241756, "step": 2295 }, { "epoch": 0.8774439675727229, "grad_norm": 5.40625, "learning_rate": 1.4149815592013227e-05, "loss": 0.9733, "mean_token_accuracy": 0.7252121046185493, "step": 2300 }, { "epoch": 0.8793514544587506, "grad_norm": 4.96875, "learning_rate": 1.4137097799821952e-05, "loss": 0.959, "mean_token_accuracy": 0.7168088331818581, "step": 2305 }, { "epoch": 0.8812589413447782, "grad_norm": 6.875, "learning_rate": 1.4124380007630676e-05, "loss": 1.049, "mean_token_accuracy": 0.7068444952368736, "step": 2310 }, { "epoch": 0.8831664282308059, "grad_norm": 5.5, "learning_rate": 1.4111662215439401e-05, "loss": 0.9387, "mean_token_accuracy": 0.7282044783234596, "step": 2315 }, { "epoch": 0.8850739151168335, "grad_norm": 5.21875, "learning_rate": 1.4098944423248124e-05, "loss": 1.0492, "mean_token_accuracy": 0.6986595824360847, "step": 2320 }, { "epoch": 0.8869814020028612, "grad_norm": 4.75, "learning_rate": 1.4086226631056849e-05, "loss": 1.0166, "mean_token_accuracy": 0.70717094540596, "step": 2325 }, { "epoch": 0.8888888888888888, "grad_norm": 5.3125, "learning_rate": 1.4073508838865575e-05, "loss": 1.0353, "mean_token_accuracy": 0.7046495825052261, "step": 2330 }, { "epoch": 0.8907963757749165, "grad_norm": 4.78125, "learning_rate": 1.4060791046674298e-05, "loss": 1.004, "mean_token_accuracy": 0.7123620569705963, "step": 2335 }, { "epoch": 0.8927038626609443, "grad_norm": 5.0625, "learning_rate": 1.4048073254483024e-05, "loss": 0.9766, "mean_token_accuracy": 0.7214268952608108, "step": 2340 }, { "epoch": 0.8946113495469719, "grad_norm": 4.84375, "learning_rate": 1.4035355462291746e-05, "loss": 1.0042, "mean_token_accuracy": 0.7193110853433609, "step": 2345 }, { "epoch": 0.8965188364329996, "grad_norm": 4.875, "learning_rate": 1.4022637670100472e-05, "loss": 0.9944, "mean_token_accuracy": 0.7094147495925427, "step": 2350 }, { "epoch": 0.8984263233190272, "grad_norm": 5.0, "learning_rate": 1.4009919877909197e-05, "loss": 0.9404, "mean_token_accuracy": 0.7051554054021836, "step": 2355 }, { "epoch": 0.9003338102050549, "grad_norm": 4.09375, "learning_rate": 1.399720208571792e-05, "loss": 0.9273, "mean_token_accuracy": 0.7224482171237468, "step": 2360 }, { "epoch": 0.9022412970910825, "grad_norm": 5.0625, "learning_rate": 1.3984484293526646e-05, "loss": 0.9338, "mean_token_accuracy": 0.7283406421542168, "step": 2365 }, { "epoch": 0.9041487839771102, "grad_norm": 4.78125, "learning_rate": 1.3971766501335368e-05, "loss": 0.963, "mean_token_accuracy": 0.7229938983917237, "step": 2370 }, { "epoch": 0.9060562708631378, "grad_norm": 4.6875, "learning_rate": 1.3959048709144094e-05, "loss": 0.9619, "mean_token_accuracy": 0.7297323048114777, "step": 2375 }, { "epoch": 0.9079637577491655, "grad_norm": 5.15625, "learning_rate": 1.3946330916952818e-05, "loss": 0.9825, "mean_token_accuracy": 0.7321062237024307, "step": 2380 }, { "epoch": 0.9098712446351931, "grad_norm": 4.0625, "learning_rate": 1.3933613124761543e-05, "loss": 0.9912, "mean_token_accuracy": 0.7135132804512978, "step": 2385 }, { "epoch": 0.9117787315212208, "grad_norm": 4.0, "learning_rate": 1.3920895332570267e-05, "loss": 0.9794, "mean_token_accuracy": 0.7198825925588608, "step": 2390 }, { "epoch": 0.9136862184072484, "grad_norm": 5.5, "learning_rate": 1.3908177540378991e-05, "loss": 1.0248, "mean_token_accuracy": 0.7084431156516076, "step": 2395 }, { "epoch": 0.9155937052932761, "grad_norm": 4.53125, "learning_rate": 1.3895459748187716e-05, "loss": 0.9445, "mean_token_accuracy": 0.7225293383002281, "step": 2400 }, { "epoch": 0.9175011921793038, "grad_norm": 4.40625, "learning_rate": 1.388274195599644e-05, "loss": 0.9686, "mean_token_accuracy": 0.7221581146121026, "step": 2405 }, { "epoch": 0.9194086790653314, "grad_norm": 4.46875, "learning_rate": 1.3870024163805164e-05, "loss": 0.8985, "mean_token_accuracy": 0.73746228069067, "step": 2410 }, { "epoch": 0.9213161659513591, "grad_norm": 5.84375, "learning_rate": 1.385730637161389e-05, "loss": 1.0963, "mean_token_accuracy": 0.6906205818057061, "step": 2415 }, { "epoch": 0.9232236528373867, "grad_norm": 6.15625, "learning_rate": 1.3844588579422613e-05, "loss": 1.0587, "mean_token_accuracy": 0.6952868282794953, "step": 2420 }, { "epoch": 0.9251311397234144, "grad_norm": 4.8125, "learning_rate": 1.3831870787231339e-05, "loss": 1.0485, "mean_token_accuracy": 0.7097712486982346, "step": 2425 }, { "epoch": 0.927038626609442, "grad_norm": 4.53125, "learning_rate": 1.3819152995040061e-05, "loss": 0.9926, "mean_token_accuracy": 0.7256608709692955, "step": 2430 }, { "epoch": 0.9289461134954697, "grad_norm": 4.75, "learning_rate": 1.3806435202848786e-05, "loss": 0.9483, "mean_token_accuracy": 0.7213496834039688, "step": 2435 }, { "epoch": 0.9308536003814973, "grad_norm": 5.875, "learning_rate": 1.379371741065751e-05, "loss": 0.9997, "mean_token_accuracy": 0.71124257594347, "step": 2440 }, { "epoch": 0.932761087267525, "grad_norm": 4.6875, "learning_rate": 1.3780999618466236e-05, "loss": 0.9698, "mean_token_accuracy": 0.7182813182473182, "step": 2445 }, { "epoch": 0.9346685741535526, "grad_norm": 6.0, "learning_rate": 1.3768281826274961e-05, "loss": 1.0038, "mean_token_accuracy": 0.7101509183645248, "step": 2450 }, { "epoch": 0.9365760610395804, "grad_norm": 5.84375, "learning_rate": 1.3755564034083683e-05, "loss": 1.0164, "mean_token_accuracy": 0.7154742404818535, "step": 2455 }, { "epoch": 0.9384835479256081, "grad_norm": 4.96875, "learning_rate": 1.3742846241892409e-05, "loss": 0.9209, "mean_token_accuracy": 0.7325319960713387, "step": 2460 }, { "epoch": 0.9403910348116357, "grad_norm": 4.5625, "learning_rate": 1.3730128449701133e-05, "loss": 0.9261, "mean_token_accuracy": 0.7324376836419105, "step": 2465 }, { "epoch": 0.9422985216976634, "grad_norm": 6.625, "learning_rate": 1.3717410657509858e-05, "loss": 1.0196, "mean_token_accuracy": 0.6997216045856476, "step": 2470 }, { "epoch": 0.944206008583691, "grad_norm": 4.875, "learning_rate": 1.3704692865318584e-05, "loss": 0.9358, "mean_token_accuracy": 0.7200413174927235, "step": 2475 }, { "epoch": 0.9461134954697187, "grad_norm": 5.09375, "learning_rate": 1.3691975073127306e-05, "loss": 0.9271, "mean_token_accuracy": 0.7231632620096207, "step": 2480 }, { "epoch": 0.9480209823557463, "grad_norm": 4.53125, "learning_rate": 1.3679257280936031e-05, "loss": 0.9233, "mean_token_accuracy": 0.7239905461668968, "step": 2485 }, { "epoch": 0.949928469241774, "grad_norm": 4.59375, "learning_rate": 1.3666539488744755e-05, "loss": 0.869, "mean_token_accuracy": 0.7495769336819649, "step": 2490 }, { "epoch": 0.9518359561278016, "grad_norm": 5.46875, "learning_rate": 1.365382169655348e-05, "loss": 1.0349, "mean_token_accuracy": 0.7054173357784748, "step": 2495 }, { "epoch": 0.9537434430138293, "grad_norm": 6.21875, "learning_rate": 1.3641103904362203e-05, "loss": 0.9375, "mean_token_accuracy": 0.7280614987015724, "step": 2500 }, { "epoch": 0.9556509298998569, "grad_norm": 7.09375, "learning_rate": 1.3628386112170928e-05, "loss": 0.9994, "mean_token_accuracy": 0.7150517120957375, "step": 2505 }, { "epoch": 0.9575584167858846, "grad_norm": 5.59375, "learning_rate": 1.3615668319979654e-05, "loss": 0.9513, "mean_token_accuracy": 0.732499985396862, "step": 2510 }, { "epoch": 0.9594659036719122, "grad_norm": 5.0, "learning_rate": 1.3602950527788377e-05, "loss": 0.9393, "mean_token_accuracy": 0.7373679198324681, "step": 2515 }, { "epoch": 0.9613733905579399, "grad_norm": 5.53125, "learning_rate": 1.3590232735597101e-05, "loss": 0.9434, "mean_token_accuracy": 0.7251186773180962, "step": 2520 }, { "epoch": 0.9632808774439676, "grad_norm": 4.875, "learning_rate": 1.3577514943405825e-05, "loss": 1.0031, "mean_token_accuracy": 0.7138618201017379, "step": 2525 }, { "epoch": 0.9651883643299952, "grad_norm": 5.0625, "learning_rate": 1.356479715121455e-05, "loss": 0.9804, "mean_token_accuracy": 0.7263573125004769, "step": 2530 }, { "epoch": 0.9670958512160229, "grad_norm": 5.3125, "learning_rate": 1.3552079359023276e-05, "loss": 0.9581, "mean_token_accuracy": 0.7182695418596268, "step": 2535 }, { "epoch": 0.9690033381020505, "grad_norm": 4.96875, "learning_rate": 1.3539361566831998e-05, "loss": 1.0186, "mean_token_accuracy": 0.7014960646629333, "step": 2540 }, { "epoch": 0.9709108249880782, "grad_norm": 4.71875, "learning_rate": 1.3526643774640724e-05, "loss": 0.9072, "mean_token_accuracy": 0.7355842962861061, "step": 2545 }, { "epoch": 0.9728183118741058, "grad_norm": 4.78125, "learning_rate": 1.3513925982449447e-05, "loss": 0.9532, "mean_token_accuracy": 0.7224856913089752, "step": 2550 }, { "epoch": 0.9747257987601335, "grad_norm": 4.34375, "learning_rate": 1.3501208190258173e-05, "loss": 1.0263, "mean_token_accuracy": 0.7150218620896339, "step": 2555 }, { "epoch": 0.9766332856461611, "grad_norm": 4.0625, "learning_rate": 1.3488490398066895e-05, "loss": 0.979, "mean_token_accuracy": 0.7201207339763641, "step": 2560 }, { "epoch": 0.9785407725321889, "grad_norm": 5.40625, "learning_rate": 1.347577260587562e-05, "loss": 1.0678, "mean_token_accuracy": 0.7005651786923408, "step": 2565 }, { "epoch": 0.9804482594182165, "grad_norm": 4.21875, "learning_rate": 1.3463054813684346e-05, "loss": 0.9428, "mean_token_accuracy": 0.719219633936882, "step": 2570 }, { "epoch": 0.9823557463042442, "grad_norm": 5.125, "learning_rate": 1.345033702149307e-05, "loss": 0.9287, "mean_token_accuracy": 0.7292002603411675, "step": 2575 }, { "epoch": 0.9842632331902719, "grad_norm": 4.9375, "learning_rate": 1.3437619229301795e-05, "loss": 0.8934, "mean_token_accuracy": 0.7359087854623795, "step": 2580 }, { "epoch": 0.9861707200762995, "grad_norm": 4.75, "learning_rate": 1.3424901437110517e-05, "loss": 0.9668, "mean_token_accuracy": 0.7158159494400025, "step": 2585 }, { "epoch": 0.9880782069623272, "grad_norm": 4.8125, "learning_rate": 1.3412183644919243e-05, "loss": 0.9261, "mean_token_accuracy": 0.7298913650214672, "step": 2590 }, { "epoch": 0.9899856938483548, "grad_norm": 4.0625, "learning_rate": 1.3399465852727968e-05, "loss": 0.9487, "mean_token_accuracy": 0.7226217985153198, "step": 2595 }, { "epoch": 0.9918931807343825, "grad_norm": 3.984375, "learning_rate": 1.3386748060536692e-05, "loss": 0.9619, "mean_token_accuracy": 0.7170055121183395, "step": 2600 }, { "epoch": 0.9938006676204101, "grad_norm": 4.84375, "learning_rate": 1.3374030268345418e-05, "loss": 1.1289, "mean_token_accuracy": 0.6791918635368347, "step": 2605 }, { "epoch": 0.9957081545064378, "grad_norm": 4.46875, "learning_rate": 1.336131247615414e-05, "loss": 0.9903, "mean_token_accuracy": 0.7179454803466797, "step": 2610 }, { "epoch": 0.9976156413924654, "grad_norm": 4.65625, "learning_rate": 1.3348594683962865e-05, "loss": 0.9445, "mean_token_accuracy": 0.7281292766332627, "step": 2615 }, { "epoch": 0.9995231282784931, "grad_norm": 6.28125, "learning_rate": 1.333587689177159e-05, "loss": 0.9597, "mean_token_accuracy": 0.7316592544317245, "step": 2620 }, { "epoch": 1.0011444921316166, "grad_norm": 5.25, "learning_rate": 1.3323159099580315e-05, "loss": 0.8169, "mean_token_accuracy": 0.7703618687741897, "step": 2625 }, { "epoch": 1.0030519790176442, "grad_norm": 4.53125, "learning_rate": 1.3310441307389039e-05, "loss": 0.8116, "mean_token_accuracy": 0.7598389968276024, "step": 2630 }, { "epoch": 1.004959465903672, "grad_norm": 5.90625, "learning_rate": 1.3297723515197762e-05, "loss": 0.6731, "mean_token_accuracy": 0.8001046404242516, "step": 2635 }, { "epoch": 1.0068669527896996, "grad_norm": 4.375, "learning_rate": 1.3285005723006488e-05, "loss": 0.7272, "mean_token_accuracy": 0.7857492476701736, "step": 2640 }, { "epoch": 1.0087744396757272, "grad_norm": 4.21875, "learning_rate": 1.3272287930815212e-05, "loss": 0.6981, "mean_token_accuracy": 0.7862312287092209, "step": 2645 }, { "epoch": 1.0106819265617548, "grad_norm": 5.34375, "learning_rate": 1.3259570138623935e-05, "loss": 0.7172, "mean_token_accuracy": 0.7825499027967453, "step": 2650 }, { "epoch": 1.0125894134477826, "grad_norm": 4.6875, "learning_rate": 1.324685234643266e-05, "loss": 0.7289, "mean_token_accuracy": 0.777584858238697, "step": 2655 }, { "epoch": 1.0144969003338102, "grad_norm": 5.21875, "learning_rate": 1.3234134554241385e-05, "loss": 0.7725, "mean_token_accuracy": 0.774682505428791, "step": 2660 }, { "epoch": 1.0164043872198378, "grad_norm": 5.3125, "learning_rate": 1.322141676205011e-05, "loss": 0.714, "mean_token_accuracy": 0.784676456451416, "step": 2665 }, { "epoch": 1.0183118741058654, "grad_norm": 4.65625, "learning_rate": 1.3208698969858832e-05, "loss": 0.701, "mean_token_accuracy": 0.7884877413511276, "step": 2670 }, { "epoch": 1.0202193609918933, "grad_norm": 5.90625, "learning_rate": 1.3195981177667558e-05, "loss": 0.8432, "mean_token_accuracy": 0.7497532099485398, "step": 2675 }, { "epoch": 1.0221268478779209, "grad_norm": 5.1875, "learning_rate": 1.3183263385476282e-05, "loss": 0.7722, "mean_token_accuracy": 0.7718176633119583, "step": 2680 }, { "epoch": 1.0240343347639485, "grad_norm": 4.875, "learning_rate": 1.3170545593285007e-05, "loss": 0.6933, "mean_token_accuracy": 0.7811264783143997, "step": 2685 }, { "epoch": 1.025941821649976, "grad_norm": 5.625, "learning_rate": 1.3157827801093733e-05, "loss": 0.7905, "mean_token_accuracy": 0.7633818849921227, "step": 2690 }, { "epoch": 1.0278493085360039, "grad_norm": 4.78125, "learning_rate": 1.3145110008902455e-05, "loss": 0.6934, "mean_token_accuracy": 0.7909208744764328, "step": 2695 }, { "epoch": 1.0297567954220315, "grad_norm": 6.09375, "learning_rate": 1.313239221671118e-05, "loss": 0.7197, "mean_token_accuracy": 0.7824130252003669, "step": 2700 }, { "epoch": 1.031664282308059, "grad_norm": 5.28125, "learning_rate": 1.3119674424519904e-05, "loss": 0.795, "mean_token_accuracy": 0.7646695077419281, "step": 2705 }, { "epoch": 1.0335717691940869, "grad_norm": 5.59375, "learning_rate": 1.310695663232863e-05, "loss": 0.7501, "mean_token_accuracy": 0.7809911444783211, "step": 2710 }, { "epoch": 1.0354792560801145, "grad_norm": 5.1875, "learning_rate": 1.3094238840137352e-05, "loss": 0.707, "mean_token_accuracy": 0.7913937494158745, "step": 2715 }, { "epoch": 1.037386742966142, "grad_norm": 5.21875, "learning_rate": 1.3081521047946077e-05, "loss": 0.63, "mean_token_accuracy": 0.8109789371490479, "step": 2720 }, { "epoch": 1.0392942298521697, "grad_norm": 7.15625, "learning_rate": 1.3068803255754803e-05, "loss": 0.7498, "mean_token_accuracy": 0.7778886809945107, "step": 2725 }, { "epoch": 1.0412017167381975, "grad_norm": 4.65625, "learning_rate": 1.3056085463563526e-05, "loss": 0.7035, "mean_token_accuracy": 0.7928727805614472, "step": 2730 }, { "epoch": 1.043109203624225, "grad_norm": 5.09375, "learning_rate": 1.3043367671372252e-05, "loss": 0.7275, "mean_token_accuracy": 0.7794766634702682, "step": 2735 }, { "epoch": 1.0450166905102527, "grad_norm": 4.4375, "learning_rate": 1.3030649879180974e-05, "loss": 0.7762, "mean_token_accuracy": 0.770900085568428, "step": 2740 }, { "epoch": 1.0469241773962803, "grad_norm": 4.8125, "learning_rate": 1.30179320869897e-05, "loss": 0.685, "mean_token_accuracy": 0.7873307526111603, "step": 2745 }, { "epoch": 1.0488316642823081, "grad_norm": 5.3125, "learning_rate": 1.3005214294798425e-05, "loss": 0.7436, "mean_token_accuracy": 0.7779841095209121, "step": 2750 }, { "epoch": 1.0507391511683357, "grad_norm": 4.75, "learning_rate": 1.2992496502607149e-05, "loss": 0.7905, "mean_token_accuracy": 0.7763501286506653, "step": 2755 }, { "epoch": 1.0526466380543633, "grad_norm": 5.03125, "learning_rate": 1.2979778710415873e-05, "loss": 0.7612, "mean_token_accuracy": 0.7685739085078239, "step": 2760 }, { "epoch": 1.0545541249403911, "grad_norm": 4.65625, "learning_rate": 1.2967060918224597e-05, "loss": 0.6927, "mean_token_accuracy": 0.7887519240379334, "step": 2765 }, { "epoch": 1.0564616118264187, "grad_norm": 5.5625, "learning_rate": 1.2954343126033322e-05, "loss": 0.7351, "mean_token_accuracy": 0.7762645557522774, "step": 2770 }, { "epoch": 1.0583690987124463, "grad_norm": 5.59375, "learning_rate": 1.2941625333842046e-05, "loss": 0.7275, "mean_token_accuracy": 0.7789256662130356, "step": 2775 }, { "epoch": 1.060276585598474, "grad_norm": 5.5625, "learning_rate": 1.292890754165077e-05, "loss": 0.6901, "mean_token_accuracy": 0.7860920026898384, "step": 2780 }, { "epoch": 1.0621840724845018, "grad_norm": 5.34375, "learning_rate": 1.2916189749459495e-05, "loss": 0.6762, "mean_token_accuracy": 0.7937915295362472, "step": 2785 }, { "epoch": 1.0640915593705293, "grad_norm": 5.46875, "learning_rate": 1.2903471957268219e-05, "loss": 0.7657, "mean_token_accuracy": 0.7764147505164146, "step": 2790 }, { "epoch": 1.065999046256557, "grad_norm": 5.21875, "learning_rate": 1.2890754165076944e-05, "loss": 0.6942, "mean_token_accuracy": 0.7820986464619637, "step": 2795 }, { "epoch": 1.0679065331425845, "grad_norm": 5.5625, "learning_rate": 1.2878036372885667e-05, "loss": 0.7147, "mean_token_accuracy": 0.7832359343767166, "step": 2800 }, { "epoch": 1.0698140200286124, "grad_norm": 5.09375, "learning_rate": 1.2865318580694392e-05, "loss": 0.7236, "mean_token_accuracy": 0.7760476425290108, "step": 2805 }, { "epoch": 1.07172150691464, "grad_norm": 4.0625, "learning_rate": 1.2852600788503118e-05, "loss": 0.6761, "mean_token_accuracy": 0.7974345803260803, "step": 2810 }, { "epoch": 1.0736289938006676, "grad_norm": 5.3125, "learning_rate": 1.2839882996311841e-05, "loss": 0.6539, "mean_token_accuracy": 0.7917591854929924, "step": 2815 }, { "epoch": 1.0755364806866954, "grad_norm": 5.34375, "learning_rate": 1.2827165204120567e-05, "loss": 0.7374, "mean_token_accuracy": 0.7774428620934486, "step": 2820 }, { "epoch": 1.077443967572723, "grad_norm": 4.78125, "learning_rate": 1.2814447411929289e-05, "loss": 0.723, "mean_token_accuracy": 0.7769853323698044, "step": 2825 }, { "epoch": 1.0793514544587506, "grad_norm": 5.84375, "learning_rate": 1.2801729619738014e-05, "loss": 0.7696, "mean_token_accuracy": 0.766608040034771, "step": 2830 }, { "epoch": 1.0812589413447782, "grad_norm": 5.46875, "learning_rate": 1.2789011827546738e-05, "loss": 0.7817, "mean_token_accuracy": 0.769975657761097, "step": 2835 }, { "epoch": 1.083166428230806, "grad_norm": 5.1875, "learning_rate": 1.2776294035355464e-05, "loss": 0.7443, "mean_token_accuracy": 0.7756567880511284, "step": 2840 }, { "epoch": 1.0850739151168336, "grad_norm": 5.96875, "learning_rate": 1.276357624316419e-05, "loss": 0.7636, "mean_token_accuracy": 0.7678534090518951, "step": 2845 }, { "epoch": 1.0869814020028612, "grad_norm": 5.65625, "learning_rate": 1.2750858450972911e-05, "loss": 0.7688, "mean_token_accuracy": 0.7699078634381294, "step": 2850 }, { "epoch": 1.0888888888888888, "grad_norm": 5.09375, "learning_rate": 1.2738140658781637e-05, "loss": 0.6738, "mean_token_accuracy": 0.7892726883292198, "step": 2855 }, { "epoch": 1.0907963757749166, "grad_norm": 5.8125, "learning_rate": 1.272542286659036e-05, "loss": 0.6766, "mean_token_accuracy": 0.7931236639618874, "step": 2860 }, { "epoch": 1.0927038626609442, "grad_norm": 5.21875, "learning_rate": 1.2712705074399086e-05, "loss": 0.6785, "mean_token_accuracy": 0.7956051483750344, "step": 2865 }, { "epoch": 1.0946113495469718, "grad_norm": 5.40625, "learning_rate": 1.269998728220781e-05, "loss": 0.7536, "mean_token_accuracy": 0.7718766152858734, "step": 2870 }, { "epoch": 1.0965188364329994, "grad_norm": 4.96875, "learning_rate": 1.2687269490016534e-05, "loss": 0.7712, "mean_token_accuracy": 0.7525050655007363, "step": 2875 }, { "epoch": 1.0984263233190272, "grad_norm": 5.28125, "learning_rate": 1.267455169782526e-05, "loss": 0.801, "mean_token_accuracy": 0.7586658120155334, "step": 2880 }, { "epoch": 1.1003338102050548, "grad_norm": 5.1875, "learning_rate": 1.2661833905633983e-05, "loss": 0.7505, "mean_token_accuracy": 0.7702452227473259, "step": 2885 }, { "epoch": 1.1022412970910824, "grad_norm": 5.59375, "learning_rate": 1.2649116113442707e-05, "loss": 0.7451, "mean_token_accuracy": 0.7831007912755013, "step": 2890 }, { "epoch": 1.1041487839771102, "grad_norm": 5.25, "learning_rate": 1.263639832125143e-05, "loss": 0.7195, "mean_token_accuracy": 0.7821832373738289, "step": 2895 }, { "epoch": 1.1060562708631378, "grad_norm": 5.3125, "learning_rate": 1.2623680529060156e-05, "loss": 0.7329, "mean_token_accuracy": 0.7784585759043694, "step": 2900 }, { "epoch": 1.1079637577491654, "grad_norm": 4.96875, "learning_rate": 1.2610962736868882e-05, "loss": 0.7102, "mean_token_accuracy": 0.7830314084887504, "step": 2905 }, { "epoch": 1.109871244635193, "grad_norm": 5.5625, "learning_rate": 1.2598244944677604e-05, "loss": 0.7185, "mean_token_accuracy": 0.7852053195238113, "step": 2910 }, { "epoch": 1.1117787315212209, "grad_norm": 6.1875, "learning_rate": 1.258552715248633e-05, "loss": 0.7364, "mean_token_accuracy": 0.7896745055913925, "step": 2915 }, { "epoch": 1.1136862184072485, "grad_norm": 5.125, "learning_rate": 1.2572809360295053e-05, "loss": 0.7305, "mean_token_accuracy": 0.7816007971763611, "step": 2920 }, { "epoch": 1.115593705293276, "grad_norm": 6.03125, "learning_rate": 1.2560091568103779e-05, "loss": 0.6798, "mean_token_accuracy": 0.7947070822119713, "step": 2925 }, { "epoch": 1.1175011921793039, "grad_norm": 4.875, "learning_rate": 1.2547373775912504e-05, "loss": 0.7053, "mean_token_accuracy": 0.7792468458414078, "step": 2930 }, { "epoch": 1.1194086790653315, "grad_norm": 5.25, "learning_rate": 1.2534655983721226e-05, "loss": 0.7326, "mean_token_accuracy": 0.7784487336874009, "step": 2935 }, { "epoch": 1.121316165951359, "grad_norm": 5.625, "learning_rate": 1.2521938191529952e-05, "loss": 0.7323, "mean_token_accuracy": 0.7857000678777695, "step": 2940 }, { "epoch": 1.1232236528373867, "grad_norm": 5.875, "learning_rate": 1.2509220399338676e-05, "loss": 0.7108, "mean_token_accuracy": 0.774834556877613, "step": 2945 }, { "epoch": 1.1251311397234145, "grad_norm": 5.15625, "learning_rate": 1.2496502607147401e-05, "loss": 0.763, "mean_token_accuracy": 0.7830505579710006, "step": 2950 }, { "epoch": 1.127038626609442, "grad_norm": 4.6875, "learning_rate": 1.2483784814956123e-05, "loss": 0.7907, "mean_token_accuracy": 0.7585530668497086, "step": 2955 }, { "epoch": 1.1289461134954697, "grad_norm": 6.03125, "learning_rate": 1.2471067022764849e-05, "loss": 0.6787, "mean_token_accuracy": 0.8010306641459465, "step": 2960 }, { "epoch": 1.1308536003814973, "grad_norm": 5.71875, "learning_rate": 1.2458349230573574e-05, "loss": 0.7014, "mean_token_accuracy": 0.7869263395667077, "step": 2965 }, { "epoch": 1.1327610872675251, "grad_norm": 5.96875, "learning_rate": 1.2445631438382298e-05, "loss": 0.6731, "mean_token_accuracy": 0.7962010264396667, "step": 2970 }, { "epoch": 1.1346685741535527, "grad_norm": 5.40625, "learning_rate": 1.2432913646191023e-05, "loss": 0.6931, "mean_token_accuracy": 0.7810707330703736, "step": 2975 }, { "epoch": 1.1365760610395803, "grad_norm": 5.28125, "learning_rate": 1.2420195853999746e-05, "loss": 0.6993, "mean_token_accuracy": 0.7831882372498512, "step": 2980 }, { "epoch": 1.138483547925608, "grad_norm": 4.46875, "learning_rate": 1.2407478061808471e-05, "loss": 0.7372, "mean_token_accuracy": 0.7778501972556114, "step": 2985 }, { "epoch": 1.1403910348116357, "grad_norm": 6.15625, "learning_rate": 1.2394760269617197e-05, "loss": 0.7833, "mean_token_accuracy": 0.7658291473984719, "step": 2990 }, { "epoch": 1.1422985216976633, "grad_norm": 5.34375, "learning_rate": 1.238204247742592e-05, "loss": 0.7703, "mean_token_accuracy": 0.7754868105053901, "step": 2995 }, { "epoch": 1.144206008583691, "grad_norm": 4.59375, "learning_rate": 1.2369324685234644e-05, "loss": 0.7596, "mean_token_accuracy": 0.7784150972962379, "step": 3000 }, { "epoch": 1.1461134954697187, "grad_norm": 5.1875, "learning_rate": 1.2356606893043368e-05, "loss": 0.699, "mean_token_accuracy": 0.8043418154120445, "step": 3005 }, { "epoch": 1.1480209823557463, "grad_norm": 5.53125, "learning_rate": 1.2343889100852093e-05, "loss": 0.7489, "mean_token_accuracy": 0.7779771253466606, "step": 3010 }, { "epoch": 1.149928469241774, "grad_norm": 4.8125, "learning_rate": 1.2331171308660817e-05, "loss": 0.7085, "mean_token_accuracy": 0.7849825263023377, "step": 3015 }, { "epoch": 1.1518359561278015, "grad_norm": 5.0625, "learning_rate": 1.2318453516469541e-05, "loss": 0.7101, "mean_token_accuracy": 0.7911660373210907, "step": 3020 }, { "epoch": 1.1537434430138294, "grad_norm": 4.21875, "learning_rate": 1.2305735724278267e-05, "loss": 0.6447, "mean_token_accuracy": 0.8056324139237404, "step": 3025 }, { "epoch": 1.155650929899857, "grad_norm": 5.8125, "learning_rate": 1.229301793208699e-05, "loss": 0.7109, "mean_token_accuracy": 0.7876276299357414, "step": 3030 }, { "epoch": 1.1575584167858846, "grad_norm": 5.34375, "learning_rate": 1.2280300139895716e-05, "loss": 0.8102, "mean_token_accuracy": 0.7673216596245765, "step": 3035 }, { "epoch": 1.1594659036719124, "grad_norm": 5.375, "learning_rate": 1.2267582347704438e-05, "loss": 0.7131, "mean_token_accuracy": 0.7869996011257172, "step": 3040 }, { "epoch": 1.16137339055794, "grad_norm": 5.53125, "learning_rate": 1.2254864555513164e-05, "loss": 0.7036, "mean_token_accuracy": 0.7789271324872971, "step": 3045 }, { "epoch": 1.1632808774439676, "grad_norm": 5.28125, "learning_rate": 1.2242146763321889e-05, "loss": 0.6824, "mean_token_accuracy": 0.7872760623693467, "step": 3050 }, { "epoch": 1.1651883643299952, "grad_norm": 4.46875, "learning_rate": 1.2229428971130613e-05, "loss": 0.7335, "mean_token_accuracy": 0.7788919404149055, "step": 3055 }, { "epoch": 1.1670958512160228, "grad_norm": 5.28125, "learning_rate": 1.2216711178939338e-05, "loss": 0.7988, "mean_token_accuracy": 0.7685728743672371, "step": 3060 }, { "epoch": 1.1690033381020506, "grad_norm": 5.125, "learning_rate": 1.220399338674806e-05, "loss": 0.7423, "mean_token_accuracy": 0.7701018214225769, "step": 3065 }, { "epoch": 1.1709108249880782, "grad_norm": 6.46875, "learning_rate": 1.2191275594556786e-05, "loss": 0.7445, "mean_token_accuracy": 0.7846373036503792, "step": 3070 }, { "epoch": 1.1728183118741058, "grad_norm": 5.6875, "learning_rate": 1.217855780236551e-05, "loss": 0.6958, "mean_token_accuracy": 0.7816796407103539, "step": 3075 }, { "epoch": 1.1747257987601336, "grad_norm": 4.96875, "learning_rate": 1.2165840010174235e-05, "loss": 0.6823, "mean_token_accuracy": 0.7943092510104179, "step": 3080 }, { "epoch": 1.1766332856461612, "grad_norm": 5.28125, "learning_rate": 1.215312221798296e-05, "loss": 0.6901, "mean_token_accuracy": 0.7862473502755165, "step": 3085 }, { "epoch": 1.1785407725321888, "grad_norm": 4.625, "learning_rate": 1.2140404425791683e-05, "loss": 0.6357, "mean_token_accuracy": 0.8068152844905854, "step": 3090 }, { "epoch": 1.1804482594182164, "grad_norm": 4.53125, "learning_rate": 1.2127686633600408e-05, "loss": 0.6244, "mean_token_accuracy": 0.8068492501974106, "step": 3095 }, { "epoch": 1.1823557463042442, "grad_norm": 4.875, "learning_rate": 1.2114968841409132e-05, "loss": 0.7346, "mean_token_accuracy": 0.7738267377018928, "step": 3100 }, { "epoch": 1.1842632331902718, "grad_norm": 5.5, "learning_rate": 1.2102251049217858e-05, "loss": 0.6948, "mean_token_accuracy": 0.7892757222056389, "step": 3105 }, { "epoch": 1.1861707200762994, "grad_norm": 5.4375, "learning_rate": 1.2089533257026581e-05, "loss": 0.676, "mean_token_accuracy": 0.7894925311207771, "step": 3110 }, { "epoch": 1.1880782069623272, "grad_norm": 4.65625, "learning_rate": 1.2076815464835305e-05, "loss": 0.712, "mean_token_accuracy": 0.7807778924703598, "step": 3115 }, { "epoch": 1.1899856938483548, "grad_norm": 4.4375, "learning_rate": 1.206409767264403e-05, "loss": 0.6698, "mean_token_accuracy": 0.7984389364719391, "step": 3120 }, { "epoch": 1.1918931807343824, "grad_norm": 5.0625, "learning_rate": 1.2051379880452755e-05, "loss": 0.6725, "mean_token_accuracy": 0.800166568160057, "step": 3125 }, { "epoch": 1.19380066762041, "grad_norm": 5.875, "learning_rate": 1.2038662088261478e-05, "loss": 0.7385, "mean_token_accuracy": 0.7819548204541207, "step": 3130 }, { "epoch": 1.1957081545064379, "grad_norm": 5.5625, "learning_rate": 1.2025944296070202e-05, "loss": 0.6656, "mean_token_accuracy": 0.7992453247308731, "step": 3135 }, { "epoch": 1.1976156413924655, "grad_norm": 6.59375, "learning_rate": 1.2013226503878928e-05, "loss": 0.6904, "mean_token_accuracy": 0.7905550047755241, "step": 3140 }, { "epoch": 1.199523128278493, "grad_norm": 5.125, "learning_rate": 1.2000508711687653e-05, "loss": 0.7657, "mean_token_accuracy": 0.7733726158738137, "step": 3145 }, { "epoch": 1.2014306151645207, "grad_norm": 5.1875, "learning_rate": 1.1987790919496375e-05, "loss": 0.6699, "mean_token_accuracy": 0.7883818462491036, "step": 3150 }, { "epoch": 1.2033381020505485, "grad_norm": 5.0625, "learning_rate": 1.19750731273051e-05, "loss": 0.7322, "mean_token_accuracy": 0.7788220182061195, "step": 3155 }, { "epoch": 1.205245588936576, "grad_norm": 5.5, "learning_rate": 1.1962355335113825e-05, "loss": 0.6893, "mean_token_accuracy": 0.775249108672142, "step": 3160 }, { "epoch": 1.2071530758226037, "grad_norm": 7.0625, "learning_rate": 1.194963754292255e-05, "loss": 0.7337, "mean_token_accuracy": 0.784166394174099, "step": 3165 }, { "epoch": 1.2090605627086313, "grad_norm": 5.5625, "learning_rate": 1.1936919750731276e-05, "loss": 0.7038, "mean_token_accuracy": 0.7948532626032829, "step": 3170 }, { "epoch": 1.210968049594659, "grad_norm": 8.8125, "learning_rate": 1.1924201958539998e-05, "loss": 0.753, "mean_token_accuracy": 0.7781305342912674, "step": 3175 }, { "epoch": 1.2128755364806867, "grad_norm": 5.15625, "learning_rate": 1.1911484166348723e-05, "loss": 0.6694, "mean_token_accuracy": 0.800914041697979, "step": 3180 }, { "epoch": 1.2147830233667143, "grad_norm": 5.625, "learning_rate": 1.1898766374157447e-05, "loss": 0.6664, "mean_token_accuracy": 0.7857102379202843, "step": 3185 }, { "epoch": 1.216690510252742, "grad_norm": 5.3125, "learning_rate": 1.1886048581966173e-05, "loss": 0.6609, "mean_token_accuracy": 0.7949806705117226, "step": 3190 }, { "epoch": 1.2185979971387697, "grad_norm": 5.1875, "learning_rate": 1.1873330789774895e-05, "loss": 0.7093, "mean_token_accuracy": 0.7808532416820526, "step": 3195 }, { "epoch": 1.2205054840247973, "grad_norm": 5.15625, "learning_rate": 1.186061299758362e-05, "loss": 0.7516, "mean_token_accuracy": 0.7698055237531662, "step": 3200 }, { "epoch": 1.222412970910825, "grad_norm": 4.6875, "learning_rate": 1.1847895205392346e-05, "loss": 0.6743, "mean_token_accuracy": 0.8001621171832085, "step": 3205 }, { "epoch": 1.2243204577968527, "grad_norm": 4.78125, "learning_rate": 1.183517741320107e-05, "loss": 0.7656, "mean_token_accuracy": 0.7759733751416207, "step": 3210 }, { "epoch": 1.2262279446828803, "grad_norm": 5.875, "learning_rate": 1.1822459621009795e-05, "loss": 0.736, "mean_token_accuracy": 0.7686377540230751, "step": 3215 }, { "epoch": 1.228135431568908, "grad_norm": 5.4375, "learning_rate": 1.1809741828818517e-05, "loss": 0.7115, "mean_token_accuracy": 0.7793835371732711, "step": 3220 }, { "epoch": 1.2300429184549357, "grad_norm": 6.0625, "learning_rate": 1.1797024036627243e-05, "loss": 0.7457, "mean_token_accuracy": 0.7806543171405792, "step": 3225 }, { "epoch": 1.2319504053409633, "grad_norm": 4.75, "learning_rate": 1.1784306244435968e-05, "loss": 0.7483, "mean_token_accuracy": 0.7799697473645211, "step": 3230 }, { "epoch": 1.233857892226991, "grad_norm": 6.15625, "learning_rate": 1.1771588452244692e-05, "loss": 0.7715, "mean_token_accuracy": 0.7469990506768227, "step": 3235 }, { "epoch": 1.2357653791130185, "grad_norm": 4.53125, "learning_rate": 1.1758870660053416e-05, "loss": 0.6083, "mean_token_accuracy": 0.8102578550577164, "step": 3240 }, { "epoch": 1.2376728659990461, "grad_norm": 5.34375, "learning_rate": 1.174615286786214e-05, "loss": 0.752, "mean_token_accuracy": 0.7763517677783967, "step": 3245 }, { "epoch": 1.239580352885074, "grad_norm": 4.8125, "learning_rate": 1.1733435075670865e-05, "loss": 0.739, "mean_token_accuracy": 0.7704811751842499, "step": 3250 }, { "epoch": 1.2414878397711016, "grad_norm": 5.34375, "learning_rate": 1.1720717283479589e-05, "loss": 0.693, "mean_token_accuracy": 0.7942864954471588, "step": 3255 }, { "epoch": 1.2433953266571292, "grad_norm": 5.09375, "learning_rate": 1.1707999491288313e-05, "loss": 0.7277, "mean_token_accuracy": 0.7802015751600265, "step": 3260 }, { "epoch": 1.245302813543157, "grad_norm": 5.71875, "learning_rate": 1.1695281699097038e-05, "loss": 0.8018, "mean_token_accuracy": 0.7585472777485848, "step": 3265 }, { "epoch": 1.2472103004291846, "grad_norm": 6.6875, "learning_rate": 1.1682563906905762e-05, "loss": 0.7667, "mean_token_accuracy": 0.773290790617466, "step": 3270 }, { "epoch": 1.2491177873152122, "grad_norm": 5.4375, "learning_rate": 1.1669846114714487e-05, "loss": 0.7078, "mean_token_accuracy": 0.7810175165534019, "step": 3275 }, { "epoch": 1.2510252742012398, "grad_norm": 6.375, "learning_rate": 1.165712832252321e-05, "loss": 0.7716, "mean_token_accuracy": 0.7618390426039696, "step": 3280 }, { "epoch": 1.2529327610872676, "grad_norm": 5.59375, "learning_rate": 1.1644410530331935e-05, "loss": 0.6536, "mean_token_accuracy": 0.794977605342865, "step": 3285 }, { "epoch": 1.2548402479732952, "grad_norm": 5.15625, "learning_rate": 1.163169273814066e-05, "loss": 0.6787, "mean_token_accuracy": 0.7936605170369149, "step": 3290 }, { "epoch": 1.2567477348593228, "grad_norm": 5.71875, "learning_rate": 1.1618974945949384e-05, "loss": 0.7416, "mean_token_accuracy": 0.7848603546619415, "step": 3295 }, { "epoch": 1.2586552217453506, "grad_norm": 5.28125, "learning_rate": 1.160625715375811e-05, "loss": 0.7217, "mean_token_accuracy": 0.7866264447569847, "step": 3300 }, { "epoch": 1.2605627086313782, "grad_norm": 5.5, "learning_rate": 1.1593539361566832e-05, "loss": 0.6698, "mean_token_accuracy": 0.8022088572382927, "step": 3305 }, { "epoch": 1.2624701955174058, "grad_norm": 5.15625, "learning_rate": 1.1580821569375557e-05, "loss": 0.7169, "mean_token_accuracy": 0.7868107482790947, "step": 3310 }, { "epoch": 1.2643776824034334, "grad_norm": 5.0, "learning_rate": 1.1568103777184281e-05, "loss": 0.6532, "mean_token_accuracy": 0.8017155960202217, "step": 3315 }, { "epoch": 1.266285169289461, "grad_norm": 4.78125, "learning_rate": 1.1555385984993007e-05, "loss": 0.7316, "mean_token_accuracy": 0.7813733011484146, "step": 3320 }, { "epoch": 1.2681926561754888, "grad_norm": 4.9375, "learning_rate": 1.1542668192801732e-05, "loss": 0.7266, "mean_token_accuracy": 0.7774209037423134, "step": 3325 }, { "epoch": 1.2701001430615164, "grad_norm": 5.53125, "learning_rate": 1.1529950400610454e-05, "loss": 0.706, "mean_token_accuracy": 0.7916549324989319, "step": 3330 }, { "epoch": 1.2720076299475442, "grad_norm": 5.28125, "learning_rate": 1.151723260841918e-05, "loss": 0.7483, "mean_token_accuracy": 0.7811658516526222, "step": 3335 }, { "epoch": 1.2739151168335718, "grad_norm": 6.1875, "learning_rate": 1.1504514816227904e-05, "loss": 0.7015, "mean_token_accuracy": 0.7907307639718055, "step": 3340 }, { "epoch": 1.2758226037195994, "grad_norm": 4.46875, "learning_rate": 1.1491797024036629e-05, "loss": 0.7421, "mean_token_accuracy": 0.7704378560185432, "step": 3345 }, { "epoch": 1.277730090605627, "grad_norm": 5.8125, "learning_rate": 1.1479079231845353e-05, "loss": 0.6951, "mean_token_accuracy": 0.7920941516757012, "step": 3350 }, { "epoch": 1.2796375774916546, "grad_norm": 5.5, "learning_rate": 1.1466361439654077e-05, "loss": 0.7488, "mean_token_accuracy": 0.7743821710348129, "step": 3355 }, { "epoch": 1.2815450643776825, "grad_norm": 5.53125, "learning_rate": 1.1453643647462802e-05, "loss": 0.7239, "mean_token_accuracy": 0.7782898634672165, "step": 3360 }, { "epoch": 1.28345255126371, "grad_norm": 5.15625, "learning_rate": 1.1440925855271526e-05, "loss": 0.7275, "mean_token_accuracy": 0.7705945268273353, "step": 3365 }, { "epoch": 1.2853600381497376, "grad_norm": 4.84375, "learning_rate": 1.142820806308025e-05, "loss": 0.7028, "mean_token_accuracy": 0.7802629828453064, "step": 3370 }, { "epoch": 1.2872675250357655, "grad_norm": 6.21875, "learning_rate": 1.1415490270888974e-05, "loss": 0.7664, "mean_token_accuracy": 0.7695603922009469, "step": 3375 }, { "epoch": 1.289175011921793, "grad_norm": 5.90625, "learning_rate": 1.14027724786977e-05, "loss": 0.728, "mean_token_accuracy": 0.7842082038521767, "step": 3380 }, { "epoch": 1.2910824988078207, "grad_norm": 4.6875, "learning_rate": 1.1390054686506425e-05, "loss": 0.6729, "mean_token_accuracy": 0.7945865884423255, "step": 3385 }, { "epoch": 1.2929899856938483, "grad_norm": 5.21875, "learning_rate": 1.1377336894315147e-05, "loss": 0.6643, "mean_token_accuracy": 0.7997446030378341, "step": 3390 }, { "epoch": 1.294897472579876, "grad_norm": 4.40625, "learning_rate": 1.1364619102123872e-05, "loss": 0.7113, "mean_token_accuracy": 0.7903872340917587, "step": 3395 }, { "epoch": 1.2968049594659037, "grad_norm": 5.09375, "learning_rate": 1.1351901309932596e-05, "loss": 0.6903, "mean_token_accuracy": 0.7984808310866356, "step": 3400 }, { "epoch": 1.2987124463519313, "grad_norm": 5.5625, "learning_rate": 1.1339183517741322e-05, "loss": 0.7383, "mean_token_accuracy": 0.7875708505511284, "step": 3405 }, { "epoch": 1.300619933237959, "grad_norm": 6.21875, "learning_rate": 1.1326465725550047e-05, "loss": 0.6604, "mean_token_accuracy": 0.7980678513646126, "step": 3410 }, { "epoch": 1.3025274201239867, "grad_norm": 6.875, "learning_rate": 1.131374793335877e-05, "loss": 0.7434, "mean_token_accuracy": 0.7777546659111977, "step": 3415 }, { "epoch": 1.3044349070100143, "grad_norm": 4.9375, "learning_rate": 1.1301030141167495e-05, "loss": 0.6542, "mean_token_accuracy": 0.7975090846419335, "step": 3420 }, { "epoch": 1.306342393896042, "grad_norm": 4.71875, "learning_rate": 1.1288312348976218e-05, "loss": 0.7533, "mean_token_accuracy": 0.7681490883231163, "step": 3425 }, { "epoch": 1.3082498807820695, "grad_norm": 5.65625, "learning_rate": 1.1275594556784944e-05, "loss": 0.7697, "mean_token_accuracy": 0.7731455415487289, "step": 3430 }, { "epoch": 1.3101573676680973, "grad_norm": 6.375, "learning_rate": 1.1262876764593666e-05, "loss": 0.6563, "mean_token_accuracy": 0.8001643344759941, "step": 3435 }, { "epoch": 1.312064854554125, "grad_norm": 5.90625, "learning_rate": 1.1250158972402392e-05, "loss": 0.6893, "mean_token_accuracy": 0.787324196100235, "step": 3440 }, { "epoch": 1.3139723414401527, "grad_norm": 5.40625, "learning_rate": 1.1237441180211117e-05, "loss": 0.6895, "mean_token_accuracy": 0.7794161334633827, "step": 3445 }, { "epoch": 1.3158798283261803, "grad_norm": 6.03125, "learning_rate": 1.1224723388019841e-05, "loss": 0.6712, "mean_token_accuracy": 0.8040449827909469, "step": 3450 }, { "epoch": 1.317787315212208, "grad_norm": 6.28125, "learning_rate": 1.1212005595828566e-05, "loss": 0.7596, "mean_token_accuracy": 0.7752531677484512, "step": 3455 }, { "epoch": 1.3196948020982355, "grad_norm": 6.71875, "learning_rate": 1.1199287803637289e-05, "loss": 0.7522, "mean_token_accuracy": 0.7756654977798462, "step": 3460 }, { "epoch": 1.3216022889842631, "grad_norm": 5.3125, "learning_rate": 1.1186570011446014e-05, "loss": 0.6728, "mean_token_accuracy": 0.7911383360624313, "step": 3465 }, { "epoch": 1.323509775870291, "grad_norm": 5.5625, "learning_rate": 1.117385221925474e-05, "loss": 0.6693, "mean_token_accuracy": 0.7900627195835114, "step": 3470 }, { "epoch": 1.3254172627563185, "grad_norm": 5.3125, "learning_rate": 1.1161134427063463e-05, "loss": 0.8004, "mean_token_accuracy": 0.7619607031345368, "step": 3475 }, { "epoch": 1.3273247496423461, "grad_norm": 5.40625, "learning_rate": 1.1148416634872187e-05, "loss": 0.6342, "mean_token_accuracy": 0.8038983285427094, "step": 3480 }, { "epoch": 1.329232236528374, "grad_norm": 4.84375, "learning_rate": 1.1135698842680911e-05, "loss": 0.6636, "mean_token_accuracy": 0.7987726837396621, "step": 3485 }, { "epoch": 1.3311397234144016, "grad_norm": 5.375, "learning_rate": 1.1122981050489636e-05, "loss": 0.7975, "mean_token_accuracy": 0.7736665666103363, "step": 3490 }, { "epoch": 1.3330472103004292, "grad_norm": 4.375, "learning_rate": 1.1110263258298359e-05, "loss": 0.671, "mean_token_accuracy": 0.7992818504571915, "step": 3495 }, { "epoch": 1.3349546971864568, "grad_norm": 6.28125, "learning_rate": 1.1097545466107084e-05, "loss": 0.7656, "mean_token_accuracy": 0.767841525375843, "step": 3500 }, { "epoch": 1.3368621840724846, "grad_norm": 6.0, "learning_rate": 1.108482767391581e-05, "loss": 0.7973, "mean_token_accuracy": 0.764432281255722, "step": 3505 }, { "epoch": 1.3387696709585122, "grad_norm": 6.0, "learning_rate": 1.1072109881724533e-05, "loss": 0.753, "mean_token_accuracy": 0.7767688825726509, "step": 3510 }, { "epoch": 1.3406771578445398, "grad_norm": 4.5625, "learning_rate": 1.1059392089533259e-05, "loss": 0.7506, "mean_token_accuracy": 0.7745794802904129, "step": 3515 }, { "epoch": 1.3425846447305676, "grad_norm": 5.1875, "learning_rate": 1.1046674297341981e-05, "loss": 0.6841, "mean_token_accuracy": 0.7984428569674492, "step": 3520 }, { "epoch": 1.3444921316165952, "grad_norm": 6.53125, "learning_rate": 1.1033956505150706e-05, "loss": 0.7079, "mean_token_accuracy": 0.7906140476465225, "step": 3525 }, { "epoch": 1.3463996185026228, "grad_norm": 6.125, "learning_rate": 1.102123871295943e-05, "loss": 0.6787, "mean_token_accuracy": 0.7917294785380363, "step": 3530 }, { "epoch": 1.3483071053886504, "grad_norm": 5.40625, "learning_rate": 1.1008520920768156e-05, "loss": 0.7256, "mean_token_accuracy": 0.7792046666145325, "step": 3535 }, { "epoch": 1.350214592274678, "grad_norm": 6.15625, "learning_rate": 1.0995803128576881e-05, "loss": 0.6745, "mean_token_accuracy": 0.794202433526516, "step": 3540 }, { "epoch": 1.3521220791607058, "grad_norm": 5.875, "learning_rate": 1.0983085336385603e-05, "loss": 0.73, "mean_token_accuracy": 0.7751868382096291, "step": 3545 }, { "epoch": 1.3540295660467334, "grad_norm": 4.9375, "learning_rate": 1.0970367544194329e-05, "loss": 0.7783, "mean_token_accuracy": 0.7688572570681572, "step": 3550 }, { "epoch": 1.355937052932761, "grad_norm": 5.125, "learning_rate": 1.0957649752003053e-05, "loss": 0.7878, "mean_token_accuracy": 0.7709324531257152, "step": 3555 }, { "epoch": 1.3578445398187888, "grad_norm": 4.8125, "learning_rate": 1.0944931959811778e-05, "loss": 0.6636, "mean_token_accuracy": 0.799449859559536, "step": 3560 }, { "epoch": 1.3597520267048164, "grad_norm": 5.21875, "learning_rate": 1.0932214167620504e-05, "loss": 0.7111, "mean_token_accuracy": 0.7865828841924667, "step": 3565 }, { "epoch": 1.361659513590844, "grad_norm": 6.6875, "learning_rate": 1.0919496375429226e-05, "loss": 0.7151, "mean_token_accuracy": 0.7823675647377968, "step": 3570 }, { "epoch": 1.3635670004768716, "grad_norm": 4.65625, "learning_rate": 1.0906778583237951e-05, "loss": 0.6872, "mean_token_accuracy": 0.790128941833973, "step": 3575 }, { "epoch": 1.3654744873628994, "grad_norm": 4.75, "learning_rate": 1.0894060791046675e-05, "loss": 0.6401, "mean_token_accuracy": 0.8002132594585418, "step": 3580 }, { "epoch": 1.367381974248927, "grad_norm": 7.5, "learning_rate": 1.08813429988554e-05, "loss": 0.7567, "mean_token_accuracy": 0.77416540235281, "step": 3585 }, { "epoch": 1.3692894611349546, "grad_norm": 5.78125, "learning_rate": 1.0868625206664123e-05, "loss": 0.6764, "mean_token_accuracy": 0.787289160490036, "step": 3590 }, { "epoch": 1.3711969480209825, "grad_norm": 5.1875, "learning_rate": 1.0855907414472848e-05, "loss": 0.7577, "mean_token_accuracy": 0.7855016678571701, "step": 3595 }, { "epoch": 1.37310443490701, "grad_norm": 4.96875, "learning_rate": 1.0843189622281574e-05, "loss": 0.7051, "mean_token_accuracy": 0.7881559386849404, "step": 3600 }, { "epoch": 1.3750119217930377, "grad_norm": 5.28125, "learning_rate": 1.0830471830090298e-05, "loss": 0.7592, "mean_token_accuracy": 0.7797208964824677, "step": 3605 }, { "epoch": 1.3769194086790653, "grad_norm": 4.9375, "learning_rate": 1.0817754037899021e-05, "loss": 0.6396, "mean_token_accuracy": 0.8072021082043648, "step": 3610 }, { "epoch": 1.3788268955650929, "grad_norm": 5.75, "learning_rate": 1.0805036245707745e-05, "loss": 0.7734, "mean_token_accuracy": 0.7690646216273308, "step": 3615 }, { "epoch": 1.3807343824511207, "grad_norm": 5.15625, "learning_rate": 1.079231845351647e-05, "loss": 0.6875, "mean_token_accuracy": 0.7913995578885078, "step": 3620 }, { "epoch": 1.3826418693371483, "grad_norm": 6.03125, "learning_rate": 1.0779600661325196e-05, "loss": 0.6543, "mean_token_accuracy": 0.8033325642347335, "step": 3625 }, { "epoch": 1.384549356223176, "grad_norm": 5.25, "learning_rate": 1.0766882869133918e-05, "loss": 0.7016, "mean_token_accuracy": 0.7871606081724167, "step": 3630 }, { "epoch": 1.3864568431092037, "grad_norm": 5.03125, "learning_rate": 1.0754165076942644e-05, "loss": 0.7866, "mean_token_accuracy": 0.7680317148566246, "step": 3635 }, { "epoch": 1.3883643299952313, "grad_norm": 5.25, "learning_rate": 1.0741447284751368e-05, "loss": 0.6679, "mean_token_accuracy": 0.7980614572763443, "step": 3640 }, { "epoch": 1.390271816881259, "grad_norm": 4.125, "learning_rate": 1.0728729492560093e-05, "loss": 0.7072, "mean_token_accuracy": 0.7797451242804527, "step": 3645 }, { "epoch": 1.3921793037672865, "grad_norm": 6.40625, "learning_rate": 1.0716011700368815e-05, "loss": 0.7171, "mean_token_accuracy": 0.7874438062310218, "step": 3650 }, { "epoch": 1.3940867906533143, "grad_norm": 5.09375, "learning_rate": 1.070329390817754e-05, "loss": 0.7401, "mean_token_accuracy": 0.7841003894805908, "step": 3655 }, { "epoch": 1.395994277539342, "grad_norm": 5.625, "learning_rate": 1.0690576115986266e-05, "loss": 0.6398, "mean_token_accuracy": 0.7971999242901802, "step": 3660 }, { "epoch": 1.3979017644253695, "grad_norm": 5.96875, "learning_rate": 1.067785832379499e-05, "loss": 0.6566, "mean_token_accuracy": 0.8056228026747704, "step": 3665 }, { "epoch": 1.3998092513113973, "grad_norm": 5.78125, "learning_rate": 1.0665140531603715e-05, "loss": 0.6996, "mean_token_accuracy": 0.7968237593770027, "step": 3670 }, { "epoch": 1.401716738197425, "grad_norm": 5.90625, "learning_rate": 1.0652422739412438e-05, "loss": 0.7803, "mean_token_accuracy": 0.7777243912220001, "step": 3675 }, { "epoch": 1.4036242250834525, "grad_norm": 7.125, "learning_rate": 1.0639704947221163e-05, "loss": 0.691, "mean_token_accuracy": 0.7860228613018989, "step": 3680 }, { "epoch": 1.4055317119694801, "grad_norm": 4.6875, "learning_rate": 1.0626987155029889e-05, "loss": 0.6445, "mean_token_accuracy": 0.8084975272417069, "step": 3685 }, { "epoch": 1.407439198855508, "grad_norm": 5.03125, "learning_rate": 1.0614269362838612e-05, "loss": 0.6662, "mean_token_accuracy": 0.7929752454161644, "step": 3690 }, { "epoch": 1.4093466857415355, "grad_norm": 5.90625, "learning_rate": 1.0601551570647338e-05, "loss": 0.7406, "mean_token_accuracy": 0.7814078807830811, "step": 3695 }, { "epoch": 1.4112541726275631, "grad_norm": 6.09375, "learning_rate": 1.058883377845606e-05, "loss": 0.7051, "mean_token_accuracy": 0.7858014822006225, "step": 3700 }, { "epoch": 1.413161659513591, "grad_norm": 4.34375, "learning_rate": 1.0576115986264785e-05, "loss": 0.7126, "mean_token_accuracy": 0.7788995161652565, "step": 3705 }, { "epoch": 1.4150691463996186, "grad_norm": 4.90625, "learning_rate": 1.056339819407351e-05, "loss": 0.7203, "mean_token_accuracy": 0.790349793434143, "step": 3710 }, { "epoch": 1.4169766332856462, "grad_norm": 5.25, "learning_rate": 1.0550680401882235e-05, "loss": 0.7395, "mean_token_accuracy": 0.7782655239105225, "step": 3715 }, { "epoch": 1.4188841201716738, "grad_norm": 4.8125, "learning_rate": 1.0537962609690959e-05, "loss": 0.6044, "mean_token_accuracy": 0.8102027118206024, "step": 3720 }, { "epoch": 1.4207916070577014, "grad_norm": 5.8125, "learning_rate": 1.0525244817499682e-05, "loss": 0.7614, "mean_token_accuracy": 0.7794598892331124, "step": 3725 }, { "epoch": 1.4226990939437292, "grad_norm": 4.78125, "learning_rate": 1.0512527025308408e-05, "loss": 0.694, "mean_token_accuracy": 0.785606586933136, "step": 3730 }, { "epoch": 1.4246065808297568, "grad_norm": 5.9375, "learning_rate": 1.049980923311713e-05, "loss": 0.7178, "mean_token_accuracy": 0.7886698007583618, "step": 3735 }, { "epoch": 1.4265140677157844, "grad_norm": 6.65625, "learning_rate": 1.0487091440925856e-05, "loss": 0.8059, "mean_token_accuracy": 0.7648954957723617, "step": 3740 }, { "epoch": 1.4284215546018122, "grad_norm": 5.0, "learning_rate": 1.0474373648734581e-05, "loss": 0.6423, "mean_token_accuracy": 0.8022066548466682, "step": 3745 }, { "epoch": 1.4303290414878398, "grad_norm": 5.3125, "learning_rate": 1.0461655856543305e-05, "loss": 0.6728, "mean_token_accuracy": 0.7947381287813187, "step": 3750 }, { "epoch": 1.4322365283738674, "grad_norm": 5.875, "learning_rate": 1.044893806435203e-05, "loss": 0.7399, "mean_token_accuracy": 0.7824591249227524, "step": 3755 }, { "epoch": 1.434144015259895, "grad_norm": 5.5625, "learning_rate": 1.0436220272160752e-05, "loss": 0.7446, "mean_token_accuracy": 0.7775179803371429, "step": 3760 }, { "epoch": 1.4360515021459228, "grad_norm": 6.5, "learning_rate": 1.0423502479969478e-05, "loss": 0.763, "mean_token_accuracy": 0.7766611501574516, "step": 3765 }, { "epoch": 1.4379589890319504, "grad_norm": 6.6875, "learning_rate": 1.0410784687778202e-05, "loss": 0.7096, "mean_token_accuracy": 0.7917313784360885, "step": 3770 }, { "epoch": 1.439866475917978, "grad_norm": 5.09375, "learning_rate": 1.0398066895586927e-05, "loss": 0.6619, "mean_token_accuracy": 0.7976467430591583, "step": 3775 }, { "epoch": 1.4417739628040058, "grad_norm": 6.59375, "learning_rate": 1.0385349103395653e-05, "loss": 0.6963, "mean_token_accuracy": 0.7884339362382888, "step": 3780 }, { "epoch": 1.4436814496900334, "grad_norm": 4.59375, "learning_rate": 1.0372631311204375e-05, "loss": 0.7326, "mean_token_accuracy": 0.7857313707470894, "step": 3785 }, { "epoch": 1.445588936576061, "grad_norm": 5.28125, "learning_rate": 1.03599135190131e-05, "loss": 0.6981, "mean_token_accuracy": 0.7860577270388603, "step": 3790 }, { "epoch": 1.4474964234620886, "grad_norm": 5.75, "learning_rate": 1.0347195726821824e-05, "loss": 0.7086, "mean_token_accuracy": 0.7905237257480622, "step": 3795 }, { "epoch": 1.4494039103481162, "grad_norm": 6.375, "learning_rate": 1.033447793463055e-05, "loss": 0.7141, "mean_token_accuracy": 0.7852094903588295, "step": 3800 }, { "epoch": 1.451311397234144, "grad_norm": 5.375, "learning_rate": 1.0321760142439275e-05, "loss": 0.722, "mean_token_accuracy": 0.7839903935790062, "step": 3805 }, { "epoch": 1.4532188841201716, "grad_norm": 5.0625, "learning_rate": 1.0309042350247997e-05, "loss": 0.6321, "mean_token_accuracy": 0.8083162158727646, "step": 3810 }, { "epoch": 1.4551263710061995, "grad_norm": 5.875, "learning_rate": 1.0296324558056723e-05, "loss": 0.7022, "mean_token_accuracy": 0.7952652171254158, "step": 3815 }, { "epoch": 1.457033857892227, "grad_norm": 5.28125, "learning_rate": 1.0283606765865447e-05, "loss": 0.6933, "mean_token_accuracy": 0.7907264307141304, "step": 3820 }, { "epoch": 1.4589413447782547, "grad_norm": 6.8125, "learning_rate": 1.0270888973674172e-05, "loss": 0.7134, "mean_token_accuracy": 0.7855746790766716, "step": 3825 }, { "epoch": 1.4608488316642823, "grad_norm": 5.6875, "learning_rate": 1.0258171181482894e-05, "loss": 0.7339, "mean_token_accuracy": 0.7784282520413399, "step": 3830 }, { "epoch": 1.4627563185503099, "grad_norm": 6.0625, "learning_rate": 1.024545338929162e-05, "loss": 0.6768, "mean_token_accuracy": 0.794599574804306, "step": 3835 }, { "epoch": 1.4646638054363377, "grad_norm": 5.875, "learning_rate": 1.0232735597100345e-05, "loss": 0.7027, "mean_token_accuracy": 0.7912124559283257, "step": 3840 }, { "epoch": 1.4665712923223653, "grad_norm": 6.1875, "learning_rate": 1.0220017804909069e-05, "loss": 0.6917, "mean_token_accuracy": 0.7966108277440072, "step": 3845 }, { "epoch": 1.4684787792083929, "grad_norm": 5.21875, "learning_rate": 1.0207300012717793e-05, "loss": 0.7194, "mean_token_accuracy": 0.7810886323451995, "step": 3850 }, { "epoch": 1.4703862660944207, "grad_norm": 5.4375, "learning_rate": 1.0194582220526517e-05, "loss": 0.7036, "mean_token_accuracy": 0.7848643451929093, "step": 3855 }, { "epoch": 1.4722937529804483, "grad_norm": 5.0625, "learning_rate": 1.0181864428335242e-05, "loss": 0.6481, "mean_token_accuracy": 0.7968376025557518, "step": 3860 }, { "epoch": 1.4742012398664759, "grad_norm": 4.59375, "learning_rate": 1.0169146636143968e-05, "loss": 0.881, "mean_token_accuracy": 0.772949455678463, "step": 3865 }, { "epoch": 1.4761087267525035, "grad_norm": 4.875, "learning_rate": 1.015642884395269e-05, "loss": 0.7054, "mean_token_accuracy": 0.7943061590194702, "step": 3870 }, { "epoch": 1.4780162136385313, "grad_norm": 5.53125, "learning_rate": 1.0143711051761415e-05, "loss": 0.6809, "mean_token_accuracy": 0.7910932347178459, "step": 3875 }, { "epoch": 1.479923700524559, "grad_norm": 4.875, "learning_rate": 1.0130993259570139e-05, "loss": 0.7266, "mean_token_accuracy": 0.7764055386185647, "step": 3880 }, { "epoch": 1.4818311874105865, "grad_norm": 4.53125, "learning_rate": 1.0118275467378865e-05, "loss": 0.6531, "mean_token_accuracy": 0.8028417259454728, "step": 3885 }, { "epoch": 1.4837386742966143, "grad_norm": 5.8125, "learning_rate": 1.0105557675187587e-05, "loss": 0.7847, "mean_token_accuracy": 0.7660407572984695, "step": 3890 }, { "epoch": 1.485646161182642, "grad_norm": 5.1875, "learning_rate": 1.0092839882996312e-05, "loss": 0.6556, "mean_token_accuracy": 0.7957696139812469, "step": 3895 }, { "epoch": 1.4875536480686695, "grad_norm": 5.28125, "learning_rate": 1.0080122090805038e-05, "loss": 0.7379, "mean_token_accuracy": 0.7747633382678032, "step": 3900 }, { "epoch": 1.4894611349546971, "grad_norm": 5.125, "learning_rate": 1.0067404298613761e-05, "loss": 0.6545, "mean_token_accuracy": 0.8000090837478637, "step": 3905 }, { "epoch": 1.4913686218407247, "grad_norm": 5.3125, "learning_rate": 1.0054686506422487e-05, "loss": 0.6919, "mean_token_accuracy": 0.7933750420808792, "step": 3910 }, { "epoch": 1.4932761087267525, "grad_norm": 5.15625, "learning_rate": 1.0041968714231209e-05, "loss": 0.8278, "mean_token_accuracy": 0.754620935022831, "step": 3915 }, { "epoch": 1.4951835956127801, "grad_norm": 4.5, "learning_rate": 1.0029250922039935e-05, "loss": 0.6829, "mean_token_accuracy": 0.7846381962299347, "step": 3920 }, { "epoch": 1.497091082498808, "grad_norm": 5.875, "learning_rate": 1.001653312984866e-05, "loss": 0.729, "mean_token_accuracy": 0.7845353007316589, "step": 3925 }, { "epoch": 1.4989985693848356, "grad_norm": 5.46875, "learning_rate": 1.0003815337657384e-05, "loss": 0.7197, "mean_token_accuracy": 0.7817666932940484, "step": 3930 }, { "epoch": 1.5009060562708632, "grad_norm": 6.59375, "learning_rate": 9.991097545466108e-06, "loss": 0.7118, "mean_token_accuracy": 0.7892649322748184, "step": 3935 }, { "epoch": 1.5028135431568908, "grad_norm": 5.0625, "learning_rate": 9.978379753274833e-06, "loss": 0.6926, "mean_token_accuracy": 0.7978235989809036, "step": 3940 }, { "epoch": 1.5047210300429184, "grad_norm": 5.84375, "learning_rate": 9.965661961083557e-06, "loss": 0.7213, "mean_token_accuracy": 0.781156699359417, "step": 3945 }, { "epoch": 1.506628516928946, "grad_norm": 4.84375, "learning_rate": 9.95294416889228e-06, "loss": 0.7274, "mean_token_accuracy": 0.7924999624490738, "step": 3950 }, { "epoch": 1.5085360038149738, "grad_norm": 4.75, "learning_rate": 9.940226376701006e-06, "loss": 0.7102, "mean_token_accuracy": 0.7911828309297562, "step": 3955 }, { "epoch": 1.5104434907010016, "grad_norm": 7.21875, "learning_rate": 9.92750858450973e-06, "loss": 0.7476, "mean_token_accuracy": 0.7776550844311714, "step": 3960 }, { "epoch": 1.5123509775870292, "grad_norm": 5.75, "learning_rate": 9.914790792318454e-06, "loss": 0.6937, "mean_token_accuracy": 0.7937393441796303, "step": 3965 }, { "epoch": 1.5142584644730568, "grad_norm": 5.75, "learning_rate": 9.90207300012718e-06, "loss": 0.6873, "mean_token_accuracy": 0.7911945432424545, "step": 3970 }, { "epoch": 1.5161659513590844, "grad_norm": 5.15625, "learning_rate": 9.889355207935903e-06, "loss": 0.6954, "mean_token_accuracy": 0.7914523154497146, "step": 3975 }, { "epoch": 1.518073438245112, "grad_norm": 5.71875, "learning_rate": 9.876637415744627e-06, "loss": 0.7153, "mean_token_accuracy": 0.7951628744602204, "step": 3980 }, { "epoch": 1.5199809251311396, "grad_norm": 5.90625, "learning_rate": 9.863919623553352e-06, "loss": 0.7331, "mean_token_accuracy": 0.7814614549279213, "step": 3985 }, { "epoch": 1.5218884120171674, "grad_norm": 6.53125, "learning_rate": 9.851201831362076e-06, "loss": 0.6445, "mean_token_accuracy": 0.7996955350041389, "step": 3990 }, { "epoch": 1.523795898903195, "grad_norm": 5.0625, "learning_rate": 9.8384840391708e-06, "loss": 0.698, "mean_token_accuracy": 0.7938384771347046, "step": 3995 }, { "epoch": 1.5257033857892228, "grad_norm": 4.65625, "learning_rate": 9.825766246979526e-06, "loss": 0.7347, "mean_token_accuracy": 0.7821006596088409, "step": 4000 }, { "epoch": 1.5276108726752504, "grad_norm": 4.71875, "learning_rate": 9.81304845478825e-06, "loss": 0.6667, "mean_token_accuracy": 0.798861163854599, "step": 4005 }, { "epoch": 1.529518359561278, "grad_norm": 5.1875, "learning_rate": 9.800330662596975e-06, "loss": 0.6878, "mean_token_accuracy": 0.7857447057962418, "step": 4010 }, { "epoch": 1.5314258464473056, "grad_norm": 7.0, "learning_rate": 9.787612870405699e-06, "loss": 0.6711, "mean_token_accuracy": 0.7897988051176071, "step": 4015 }, { "epoch": 1.5333333333333332, "grad_norm": 5.25, "learning_rate": 9.774895078214423e-06, "loss": 0.682, "mean_token_accuracy": 0.7865668818354606, "step": 4020 }, { "epoch": 1.535240820219361, "grad_norm": 5.75, "learning_rate": 9.762177286023146e-06, "loss": 0.6479, "mean_token_accuracy": 0.7969919174909592, "step": 4025 }, { "epoch": 1.5371483071053886, "grad_norm": 4.78125, "learning_rate": 9.749459493831872e-06, "loss": 0.6849, "mean_token_accuracy": 0.7866938337683678, "step": 4030 }, { "epoch": 1.5390557939914165, "grad_norm": 6.03125, "learning_rate": 9.736741701640596e-06, "loss": 0.7675, "mean_token_accuracy": 0.7690645560622216, "step": 4035 }, { "epoch": 1.540963280877444, "grad_norm": 5.03125, "learning_rate": 9.724023909449321e-06, "loss": 0.7621, "mean_token_accuracy": 0.7758720055222511, "step": 4040 }, { "epoch": 1.5428707677634717, "grad_norm": 5.5, "learning_rate": 9.711306117258045e-06, "loss": 0.7452, "mean_token_accuracy": 0.783280897140503, "step": 4045 }, { "epoch": 1.5447782546494992, "grad_norm": 5.78125, "learning_rate": 9.698588325066769e-06, "loss": 0.6976, "mean_token_accuracy": 0.7859001785516739, "step": 4050 }, { "epoch": 1.5466857415355268, "grad_norm": 5.34375, "learning_rate": 9.685870532875493e-06, "loss": 0.7175, "mean_token_accuracy": 0.7929377570748329, "step": 4055 }, { "epoch": 1.5485932284215544, "grad_norm": 4.96875, "learning_rate": 9.673152740684218e-06, "loss": 0.6568, "mean_token_accuracy": 0.8060544520616532, "step": 4060 }, { "epoch": 1.5505007153075823, "grad_norm": 6.15625, "learning_rate": 9.660434948492944e-06, "loss": 0.738, "mean_token_accuracy": 0.7754119589924813, "step": 4065 }, { "epoch": 1.5524082021936099, "grad_norm": 5.84375, "learning_rate": 9.647717156301667e-06, "loss": 0.6538, "mean_token_accuracy": 0.7938358634710312, "step": 4070 }, { "epoch": 1.5543156890796377, "grad_norm": 5.9375, "learning_rate": 9.634999364110391e-06, "loss": 0.6854, "mean_token_accuracy": 0.7839797839522362, "step": 4075 }, { "epoch": 1.5562231759656653, "grad_norm": 5.125, "learning_rate": 9.622281571919115e-06, "loss": 0.6888, "mean_token_accuracy": 0.7812078207731247, "step": 4080 }, { "epoch": 1.5581306628516929, "grad_norm": 4.65625, "learning_rate": 9.609563779727839e-06, "loss": 0.7103, "mean_token_accuracy": 0.7829771265387535, "step": 4085 }, { "epoch": 1.5600381497377205, "grad_norm": 6.25, "learning_rate": 9.596845987536564e-06, "loss": 0.6588, "mean_token_accuracy": 0.798148213326931, "step": 4090 }, { "epoch": 1.561945636623748, "grad_norm": 5.375, "learning_rate": 9.58412819534529e-06, "loss": 0.6851, "mean_token_accuracy": 0.7942114874720574, "step": 4095 }, { "epoch": 1.563853123509776, "grad_norm": 6.21875, "learning_rate": 9.571410403154014e-06, "loss": 0.7434, "mean_token_accuracy": 0.7808380603790284, "step": 4100 }, { "epoch": 1.5657606103958035, "grad_norm": 4.90625, "learning_rate": 9.558692610962737e-06, "loss": 0.715, "mean_token_accuracy": 0.785970288515091, "step": 4105 }, { "epoch": 1.5676680972818313, "grad_norm": 5.21875, "learning_rate": 9.545974818771461e-06, "loss": 0.6222, "mean_token_accuracy": 0.8164134413003922, "step": 4110 }, { "epoch": 1.569575584167859, "grad_norm": 4.875, "learning_rate": 9.533257026580187e-06, "loss": 0.7092, "mean_token_accuracy": 0.7927107289433479, "step": 4115 }, { "epoch": 1.5714830710538865, "grad_norm": 5.90625, "learning_rate": 9.520539234388912e-06, "loss": 0.7766, "mean_token_accuracy": 0.7675907790660859, "step": 4120 }, { "epoch": 1.5733905579399141, "grad_norm": 6.125, "learning_rate": 9.507821442197636e-06, "loss": 0.6413, "mean_token_accuracy": 0.7978465214371682, "step": 4125 }, { "epoch": 1.5752980448259417, "grad_norm": 6.3125, "learning_rate": 9.49510365000636e-06, "loss": 0.7322, "mean_token_accuracy": 0.7795029923319816, "step": 4130 }, { "epoch": 1.5772055317119695, "grad_norm": 5.78125, "learning_rate": 9.482385857815084e-06, "loss": 0.721, "mean_token_accuracy": 0.7786308526992798, "step": 4135 }, { "epoch": 1.5791130185979971, "grad_norm": 4.96875, "learning_rate": 9.469668065623809e-06, "loss": 0.6567, "mean_token_accuracy": 0.803253422677517, "step": 4140 }, { "epoch": 1.581020505484025, "grad_norm": 6.71875, "learning_rate": 9.456950273432533e-06, "loss": 0.7418, "mean_token_accuracy": 0.7790598273277283, "step": 4145 }, { "epoch": 1.5829279923700526, "grad_norm": 6.28125, "learning_rate": 9.444232481241258e-06, "loss": 0.6787, "mean_token_accuracy": 0.7986498147249221, "step": 4150 }, { "epoch": 1.5848354792560801, "grad_norm": 5.21875, "learning_rate": 9.431514689049982e-06, "loss": 0.684, "mean_token_accuracy": 0.7936962231993675, "step": 4155 }, { "epoch": 1.5867429661421077, "grad_norm": 5.21875, "learning_rate": 9.418796896858706e-06, "loss": 0.6595, "mean_token_accuracy": 0.8023280754685402, "step": 4160 }, { "epoch": 1.5886504530281353, "grad_norm": 4.4375, "learning_rate": 9.40607910466743e-06, "loss": 0.7351, "mean_token_accuracy": 0.7782548010349274, "step": 4165 }, { "epoch": 1.590557939914163, "grad_norm": 5.875, "learning_rate": 9.393361312476155e-06, "loss": 0.6161, "mean_token_accuracy": 0.7823518723249435, "step": 4170 }, { "epoch": 1.5924654268001908, "grad_norm": 5.40625, "learning_rate": 9.380643520284879e-06, "loss": 0.7431, "mean_token_accuracy": 0.7842708334326745, "step": 4175 }, { "epoch": 1.5943729136862184, "grad_norm": 6.53125, "learning_rate": 9.367925728093605e-06, "loss": 0.6682, "mean_token_accuracy": 0.794320285320282, "step": 4180 }, { "epoch": 1.5962804005722462, "grad_norm": 5.0625, "learning_rate": 9.355207935902328e-06, "loss": 0.6784, "mean_token_accuracy": 0.7968481913208961, "step": 4185 }, { "epoch": 1.5981878874582738, "grad_norm": 5.5, "learning_rate": 9.342490143711052e-06, "loss": 0.6813, "mean_token_accuracy": 0.7989706471562386, "step": 4190 }, { "epoch": 1.6000953743443014, "grad_norm": 6.21875, "learning_rate": 9.329772351519778e-06, "loss": 0.6784, "mean_token_accuracy": 0.7907978564500808, "step": 4195 }, { "epoch": 1.602002861230329, "grad_norm": 7.09375, "learning_rate": 9.317054559328502e-06, "loss": 0.7648, "mean_token_accuracy": 0.7642725974321365, "step": 4200 }, { "epoch": 1.6039103481163566, "grad_norm": 5.84375, "learning_rate": 9.304336767137225e-06, "loss": 0.7262, "mean_token_accuracy": 0.7915614515542984, "step": 4205 }, { "epoch": 1.6058178350023844, "grad_norm": 5.15625, "learning_rate": 9.29161897494595e-06, "loss": 0.6314, "mean_token_accuracy": 0.8063288360834122, "step": 4210 }, { "epoch": 1.607725321888412, "grad_norm": 4.8125, "learning_rate": 9.278901182754675e-06, "loss": 0.6453, "mean_token_accuracy": 0.8019056558609009, "step": 4215 }, { "epoch": 1.6096328087744398, "grad_norm": 5.71875, "learning_rate": 9.266183390563398e-06, "loss": 0.7563, "mean_token_accuracy": 0.7762271568179131, "step": 4220 }, { "epoch": 1.6115402956604674, "grad_norm": 5.0, "learning_rate": 9.253465598372124e-06, "loss": 0.6171, "mean_token_accuracy": 0.8098215907812119, "step": 4225 }, { "epoch": 1.613447782546495, "grad_norm": 6.5, "learning_rate": 9.240747806180848e-06, "loss": 0.7252, "mean_token_accuracy": 0.7729369938373566, "step": 4230 }, { "epoch": 1.6153552694325226, "grad_norm": 5.75, "learning_rate": 9.228030013989572e-06, "loss": 0.6851, "mean_token_accuracy": 0.7906094118952751, "step": 4235 }, { "epoch": 1.6172627563185502, "grad_norm": 5.0, "learning_rate": 9.215312221798297e-06, "loss": 0.6927, "mean_token_accuracy": 0.7865256935358047, "step": 4240 }, { "epoch": 1.6191702432045778, "grad_norm": 4.96875, "learning_rate": 9.202594429607021e-06, "loss": 0.6688, "mean_token_accuracy": 0.7988918453454972, "step": 4245 }, { "epoch": 1.6210777300906056, "grad_norm": 5.625, "learning_rate": 9.189876637415746e-06, "loss": 0.6652, "mean_token_accuracy": 0.7949412643909455, "step": 4250 }, { "epoch": 1.6229852169766334, "grad_norm": 5.1875, "learning_rate": 9.17715884522447e-06, "loss": 0.7182, "mean_token_accuracy": 0.7824036061763764, "step": 4255 }, { "epoch": 1.624892703862661, "grad_norm": 4.8125, "learning_rate": 9.164441053033194e-06, "loss": 0.6984, "mean_token_accuracy": 0.7907288119196891, "step": 4260 }, { "epoch": 1.6268001907486886, "grad_norm": 5.5, "learning_rate": 9.151723260841918e-06, "loss": 0.6665, "mean_token_accuracy": 0.7920968532562256, "step": 4265 }, { "epoch": 1.6287076776347162, "grad_norm": 6.125, "learning_rate": 9.139005468650643e-06, "loss": 0.6316, "mean_token_accuracy": 0.8020718723535538, "step": 4270 }, { "epoch": 1.6306151645207438, "grad_norm": 5.3125, "learning_rate": 9.126287676459367e-06, "loss": 0.6571, "mean_token_accuracy": 0.8011513471603393, "step": 4275 }, { "epoch": 1.6325226514067714, "grad_norm": 6.03125, "learning_rate": 9.113569884268093e-06, "loss": 0.7276, "mean_token_accuracy": 0.7811956480145454, "step": 4280 }, { "epoch": 1.6344301382927993, "grad_norm": 6.28125, "learning_rate": 9.100852092076816e-06, "loss": 0.7329, "mean_token_accuracy": 0.775389178097248, "step": 4285 }, { "epoch": 1.6363376251788269, "grad_norm": 4.71875, "learning_rate": 9.08813429988554e-06, "loss": 0.6592, "mean_token_accuracy": 0.7980861410498619, "step": 4290 }, { "epoch": 1.6382451120648547, "grad_norm": 4.71875, "learning_rate": 9.075416507694264e-06, "loss": 0.6433, "mean_token_accuracy": 0.8049220308661461, "step": 4295 }, { "epoch": 1.6401525989508823, "grad_norm": 5.65625, "learning_rate": 9.06269871550299e-06, "loss": 0.6726, "mean_token_accuracy": 0.7945365980267525, "step": 4300 }, { "epoch": 1.6420600858369099, "grad_norm": 5.75, "learning_rate": 9.049980923311715e-06, "loss": 0.7318, "mean_token_accuracy": 0.7869562119245529, "step": 4305 }, { "epoch": 1.6439675727229375, "grad_norm": 6.5625, "learning_rate": 9.037263131120439e-06, "loss": 0.7299, "mean_token_accuracy": 0.7817317515611648, "step": 4310 }, { "epoch": 1.645875059608965, "grad_norm": 4.59375, "learning_rate": 9.024545338929163e-06, "loss": 0.7227, "mean_token_accuracy": 0.7855249509215355, "step": 4315 }, { "epoch": 1.647782546494993, "grad_norm": 5.40625, "learning_rate": 9.011827546737886e-06, "loss": 0.6917, "mean_token_accuracy": 0.7888121485710144, "step": 4320 }, { "epoch": 1.6496900333810205, "grad_norm": 4.90625, "learning_rate": 8.99910975454661e-06, "loss": 0.6699, "mean_token_accuracy": 0.7999153479933738, "step": 4325 }, { "epoch": 1.6515975202670483, "grad_norm": 5.0, "learning_rate": 8.986391962355336e-06, "loss": 0.7176, "mean_token_accuracy": 0.7791346043348313, "step": 4330 }, { "epoch": 1.653505007153076, "grad_norm": 6.28125, "learning_rate": 8.973674170164061e-06, "loss": 0.759, "mean_token_accuracy": 0.7725300207734108, "step": 4335 }, { "epoch": 1.6554124940391035, "grad_norm": 4.53125, "learning_rate": 8.960956377972785e-06, "loss": 0.6885, "mean_token_accuracy": 0.786538128554821, "step": 4340 }, { "epoch": 1.657319980925131, "grad_norm": 5.0625, "learning_rate": 8.948238585781509e-06, "loss": 0.639, "mean_token_accuracy": 0.8017129465937615, "step": 4345 }, { "epoch": 1.6592274678111587, "grad_norm": 4.84375, "learning_rate": 8.935520793590233e-06, "loss": 0.8396, "mean_token_accuracy": 0.759829866886139, "step": 4350 }, { "epoch": 1.6611349546971863, "grad_norm": 6.6875, "learning_rate": 8.922803001398958e-06, "loss": 0.6782, "mean_token_accuracy": 0.7945697858929635, "step": 4355 }, { "epoch": 1.6630424415832141, "grad_norm": 5.25, "learning_rate": 8.910085209207684e-06, "loss": 0.6431, "mean_token_accuracy": 0.8029188916087151, "step": 4360 }, { "epoch": 1.6649499284692417, "grad_norm": 5.0625, "learning_rate": 8.897367417016407e-06, "loss": 0.7161, "mean_token_accuracy": 0.781055423617363, "step": 4365 }, { "epoch": 1.6668574153552695, "grad_norm": 5.21875, "learning_rate": 8.884649624825131e-06, "loss": 0.6356, "mean_token_accuracy": 0.8030644968152046, "step": 4370 }, { "epoch": 1.6687649022412971, "grad_norm": 5.625, "learning_rate": 8.871931832633855e-06, "loss": 0.699, "mean_token_accuracy": 0.7894296318292617, "step": 4375 }, { "epoch": 1.6706723891273247, "grad_norm": 5.625, "learning_rate": 8.859214040442579e-06, "loss": 0.6767, "mean_token_accuracy": 0.7901316702365875, "step": 4380 }, { "epoch": 1.6725798760133523, "grad_norm": 4.9375, "learning_rate": 8.846496248251304e-06, "loss": 0.7094, "mean_token_accuracy": 0.7896265298128128, "step": 4385 }, { "epoch": 1.67448736289938, "grad_norm": 5.46875, "learning_rate": 8.833778456060028e-06, "loss": 0.7371, "mean_token_accuracy": 0.7754695668816567, "step": 4390 }, { "epoch": 1.6763948497854078, "grad_norm": 5.46875, "learning_rate": 8.821060663868754e-06, "loss": 0.7375, "mean_token_accuracy": 0.7802638679742813, "step": 4395 }, { "epoch": 1.6783023366714354, "grad_norm": 5.46875, "learning_rate": 8.808342871677477e-06, "loss": 0.6407, "mean_token_accuracy": 0.8061942294239998, "step": 4400 }, { "epoch": 1.6802098235574632, "grad_norm": 5.8125, "learning_rate": 8.795625079486201e-06, "loss": 0.7094, "mean_token_accuracy": 0.7912459701299668, "step": 4405 }, { "epoch": 1.6821173104434908, "grad_norm": 5.96875, "learning_rate": 8.782907287294927e-06, "loss": 0.6832, "mean_token_accuracy": 0.7963031128048896, "step": 4410 }, { "epoch": 1.6840247973295184, "grad_norm": 4.875, "learning_rate": 8.77018949510365e-06, "loss": 0.6857, "mean_token_accuracy": 0.8052731156349182, "step": 4415 }, { "epoch": 1.685932284215546, "grad_norm": 5.21875, "learning_rate": 8.757471702912374e-06, "loss": 0.7246, "mean_token_accuracy": 0.7847167909145355, "step": 4420 }, { "epoch": 1.6878397711015736, "grad_norm": 5.25, "learning_rate": 8.7447539107211e-06, "loss": 0.7078, "mean_token_accuracy": 0.7909077003598213, "step": 4425 }, { "epoch": 1.6897472579876012, "grad_norm": 5.09375, "learning_rate": 8.732036118529824e-06, "loss": 0.7117, "mean_token_accuracy": 0.7859697327017784, "step": 4430 }, { "epoch": 1.691654744873629, "grad_norm": 5.09375, "learning_rate": 8.719318326338548e-06, "loss": 0.7079, "mean_token_accuracy": 0.7907924354076385, "step": 4435 }, { "epoch": 1.6935622317596568, "grad_norm": 5.34375, "learning_rate": 8.706600534147273e-06, "loss": 0.7293, "mean_token_accuracy": 0.7734788835048676, "step": 4440 }, { "epoch": 1.6954697186456844, "grad_norm": 5.15625, "learning_rate": 8.693882741955997e-06, "loss": 0.6904, "mean_token_accuracy": 0.7842113867402076, "step": 4445 }, { "epoch": 1.697377205531712, "grad_norm": 4.65625, "learning_rate": 8.68116494976472e-06, "loss": 0.6981, "mean_token_accuracy": 0.7911527335643769, "step": 4450 }, { "epoch": 1.6992846924177396, "grad_norm": 5.625, "learning_rate": 8.668447157573446e-06, "loss": 0.7824, "mean_token_accuracy": 0.7754535049200058, "step": 4455 }, { "epoch": 1.7011921793037672, "grad_norm": 6.90625, "learning_rate": 8.65572936538217e-06, "loss": 0.7218, "mean_token_accuracy": 0.784953074157238, "step": 4460 }, { "epoch": 1.7030996661897948, "grad_norm": 5.8125, "learning_rate": 8.643011573190895e-06, "loss": 0.7838, "mean_token_accuracy": 0.777295109629631, "step": 4465 }, { "epoch": 1.7050071530758226, "grad_norm": 5.0625, "learning_rate": 8.63029378099962e-06, "loss": 0.6843, "mean_token_accuracy": 0.7970955744385719, "step": 4470 }, { "epoch": 1.7069146399618502, "grad_norm": 5.65625, "learning_rate": 8.617575988808343e-06, "loss": 0.6861, "mean_token_accuracy": 0.7880251660943032, "step": 4475 }, { "epoch": 1.708822126847878, "grad_norm": 5.1875, "learning_rate": 8.604858196617067e-06, "loss": 0.6557, "mean_token_accuracy": 0.8084867134690285, "step": 4480 }, { "epoch": 1.7107296137339056, "grad_norm": 6.1875, "learning_rate": 8.592140404425792e-06, "loss": 0.6653, "mean_token_accuracy": 0.7949703514575959, "step": 4485 }, { "epoch": 1.7126371006199332, "grad_norm": 5.5, "learning_rate": 8.579422612234518e-06, "loss": 0.6825, "mean_token_accuracy": 0.7934735119342804, "step": 4490 }, { "epoch": 1.7145445875059608, "grad_norm": 5.625, "learning_rate": 8.566704820043242e-06, "loss": 0.6551, "mean_token_accuracy": 0.8140067383646965, "step": 4495 }, { "epoch": 1.7164520743919884, "grad_norm": 6.0625, "learning_rate": 8.553987027851965e-06, "loss": 0.7623, "mean_token_accuracy": 0.7625874444842339, "step": 4500 }, { "epoch": 1.7183595612780163, "grad_norm": 4.75, "learning_rate": 8.54126923566069e-06, "loss": 0.6702, "mean_token_accuracy": 0.79883813560009, "step": 4505 }, { "epoch": 1.7202670481640439, "grad_norm": 5.6875, "learning_rate": 8.528551443469413e-06, "loss": 0.6767, "mean_token_accuracy": 0.8004112139344215, "step": 4510 }, { "epoch": 1.7221745350500717, "grad_norm": 5.125, "learning_rate": 8.515833651278139e-06, "loss": 0.6816, "mean_token_accuracy": 0.789125694334507, "step": 4515 }, { "epoch": 1.7240820219360993, "grad_norm": 4.8125, "learning_rate": 8.503115859086864e-06, "loss": 0.6976, "mean_token_accuracy": 0.7907531931996346, "step": 4520 }, { "epoch": 1.7259895088221269, "grad_norm": 5.0625, "learning_rate": 8.490398066895588e-06, "loss": 0.7044, "mean_token_accuracy": 0.7968610107898713, "step": 4525 }, { "epoch": 1.7278969957081545, "grad_norm": 5.5625, "learning_rate": 8.477680274704312e-06, "loss": 0.7121, "mean_token_accuracy": 0.7842711061239243, "step": 4530 }, { "epoch": 1.729804482594182, "grad_norm": 6.09375, "learning_rate": 8.464962482513035e-06, "loss": 0.672, "mean_token_accuracy": 0.7987869158387184, "step": 4535 }, { "epoch": 1.7317119694802097, "grad_norm": 6.21875, "learning_rate": 8.452244690321761e-06, "loss": 0.7427, "mean_token_accuracy": 0.7810085907578468, "step": 4540 }, { "epoch": 1.7336194563662375, "grad_norm": 6.375, "learning_rate": 8.439526898130486e-06, "loss": 0.6892, "mean_token_accuracy": 0.7918313056230545, "step": 4545 }, { "epoch": 1.735526943252265, "grad_norm": 5.3125, "learning_rate": 8.42680910593921e-06, "loss": 0.7126, "mean_token_accuracy": 0.7846375346183777, "step": 4550 }, { "epoch": 1.737434430138293, "grad_norm": 4.875, "learning_rate": 8.414091313747934e-06, "loss": 0.6309, "mean_token_accuracy": 0.8014385819435119, "step": 4555 }, { "epoch": 1.7393419170243205, "grad_norm": 6.0, "learning_rate": 8.401373521556658e-06, "loss": 0.7012, "mean_token_accuracy": 0.7848466068506241, "step": 4560 }, { "epoch": 1.741249403910348, "grad_norm": 6.0625, "learning_rate": 8.388655729365382e-06, "loss": 0.6944, "mean_token_accuracy": 0.7874592751264572, "step": 4565 }, { "epoch": 1.7431568907963757, "grad_norm": 5.5, "learning_rate": 8.375937937174107e-06, "loss": 0.7152, "mean_token_accuracy": 0.7913419619202614, "step": 4570 }, { "epoch": 1.7450643776824033, "grad_norm": 5.15625, "learning_rate": 8.363220144982833e-06, "loss": 0.6847, "mean_token_accuracy": 0.7942621618509292, "step": 4575 }, { "epoch": 1.7469718645684311, "grad_norm": 4.75, "learning_rate": 8.350502352791557e-06, "loss": 0.6101, "mean_token_accuracy": 0.8068650275468826, "step": 4580 }, { "epoch": 1.7488793514544587, "grad_norm": 5.53125, "learning_rate": 8.33778456060028e-06, "loss": 0.7196, "mean_token_accuracy": 0.7823515102267266, "step": 4585 }, { "epoch": 1.7507868383404865, "grad_norm": 5.90625, "learning_rate": 8.325066768409004e-06, "loss": 0.6743, "mean_token_accuracy": 0.7903394609689712, "step": 4590 }, { "epoch": 1.7526943252265141, "grad_norm": 5.875, "learning_rate": 8.31234897621773e-06, "loss": 0.6551, "mean_token_accuracy": 0.7948520883917809, "step": 4595 }, { "epoch": 1.7546018121125417, "grad_norm": 6.09375, "learning_rate": 8.299631184026453e-06, "loss": 0.7097, "mean_token_accuracy": 0.7938745290040969, "step": 4600 }, { "epoch": 1.7565092989985693, "grad_norm": 5.65625, "learning_rate": 8.286913391835179e-06, "loss": 0.6904, "mean_token_accuracy": 0.7947997763752938, "step": 4605 }, { "epoch": 1.758416785884597, "grad_norm": 6.375, "learning_rate": 8.274195599643903e-06, "loss": 0.7108, "mean_token_accuracy": 0.7907149016857147, "step": 4610 }, { "epoch": 1.7603242727706248, "grad_norm": 6.0, "learning_rate": 8.261477807452627e-06, "loss": 0.7304, "mean_token_accuracy": 0.777514460682869, "step": 4615 }, { "epoch": 1.7622317596566524, "grad_norm": 6.09375, "learning_rate": 8.24876001526135e-06, "loss": 0.6733, "mean_token_accuracy": 0.7971729338169098, "step": 4620 }, { "epoch": 1.7641392465426802, "grad_norm": 5.1875, "learning_rate": 8.236042223070076e-06, "loss": 0.7006, "mean_token_accuracy": 0.7935417667031288, "step": 4625 }, { "epoch": 1.7660467334287078, "grad_norm": 6.125, "learning_rate": 8.2233244308788e-06, "loss": 0.6794, "mean_token_accuracy": 0.8022807404398918, "step": 4630 }, { "epoch": 1.7679542203147354, "grad_norm": 5.125, "learning_rate": 8.210606638687525e-06, "loss": 0.7911, "mean_token_accuracy": 0.7670699298381806, "step": 4635 }, { "epoch": 1.769861707200763, "grad_norm": 5.46875, "learning_rate": 8.197888846496249e-06, "loss": 0.689, "mean_token_accuracy": 0.78763447701931, "step": 4640 }, { "epoch": 1.7717691940867906, "grad_norm": 6.28125, "learning_rate": 8.185171054304973e-06, "loss": 0.6695, "mean_token_accuracy": 0.8034919664263726, "step": 4645 }, { "epoch": 1.7736766809728182, "grad_norm": 5.0625, "learning_rate": 8.172453262113698e-06, "loss": 0.6928, "mean_token_accuracy": 0.7899195536971092, "step": 4650 }, { "epoch": 1.775584167858846, "grad_norm": 5.59375, "learning_rate": 8.159735469922422e-06, "loss": 0.7525, "mean_token_accuracy": 0.7785638749599457, "step": 4655 }, { "epoch": 1.7774916547448736, "grad_norm": 6.96875, "learning_rate": 8.147017677731146e-06, "loss": 0.5964, "mean_token_accuracy": 0.8081588611006737, "step": 4660 }, { "epoch": 1.7793991416309014, "grad_norm": 5.625, "learning_rate": 8.134299885539871e-06, "loss": 0.7501, "mean_token_accuracy": 0.7743466660380364, "step": 4665 }, { "epoch": 1.781306628516929, "grad_norm": 6.4375, "learning_rate": 8.121582093348595e-06, "loss": 0.75, "mean_token_accuracy": 0.7758030876517296, "step": 4670 }, { "epoch": 1.7832141154029566, "grad_norm": 5.125, "learning_rate": 8.108864301157319e-06, "loss": 0.6643, "mean_token_accuracy": 0.7975737527012825, "step": 4675 }, { "epoch": 1.7851216022889842, "grad_norm": 6.03125, "learning_rate": 8.096146508966044e-06, "loss": 0.7249, "mean_token_accuracy": 0.7859432637691498, "step": 4680 }, { "epoch": 1.7870290891750118, "grad_norm": 6.8125, "learning_rate": 8.083428716774768e-06, "loss": 0.7242, "mean_token_accuracy": 0.7819555312395096, "step": 4685 }, { "epoch": 1.7889365760610396, "grad_norm": 5.28125, "learning_rate": 8.070710924583492e-06, "loss": 0.6617, "mean_token_accuracy": 0.8005429416894912, "step": 4690 }, { "epoch": 1.7908440629470672, "grad_norm": 5.96875, "learning_rate": 8.057993132392218e-06, "loss": 0.6907, "mean_token_accuracy": 0.79127366989851, "step": 4695 }, { "epoch": 1.792751549833095, "grad_norm": 4.84375, "learning_rate": 8.045275340200941e-06, "loss": 0.6406, "mean_token_accuracy": 0.8125458776950836, "step": 4700 }, { "epoch": 1.7946590367191226, "grad_norm": 5.5, "learning_rate": 8.032557548009667e-06, "loss": 0.699, "mean_token_accuracy": 0.7910123407840729, "step": 4705 }, { "epoch": 1.7965665236051502, "grad_norm": 6.8125, "learning_rate": 8.01983975581839e-06, "loss": 0.7152, "mean_token_accuracy": 0.7795601680874824, "step": 4710 }, { "epoch": 1.7984740104911778, "grad_norm": 6.5, "learning_rate": 8.007121963627115e-06, "loss": 0.7939, "mean_token_accuracy": 0.7633196681737899, "step": 4715 }, { "epoch": 1.8003814973772054, "grad_norm": 4.65625, "learning_rate": 7.994404171435838e-06, "loss": 0.697, "mean_token_accuracy": 0.7856346786022186, "step": 4720 }, { "epoch": 1.802288984263233, "grad_norm": 5.3125, "learning_rate": 7.981686379244564e-06, "loss": 0.6621, "mean_token_accuracy": 0.7977853432297707, "step": 4725 }, { "epoch": 1.8041964711492608, "grad_norm": 5.46875, "learning_rate": 7.968968587053288e-06, "loss": 0.69, "mean_token_accuracy": 0.7897114619612694, "step": 4730 }, { "epoch": 1.8061039580352887, "grad_norm": 5.8125, "learning_rate": 7.956250794862013e-06, "loss": 0.6012, "mean_token_accuracy": 0.8110170468688012, "step": 4735 }, { "epoch": 1.8080114449213163, "grad_norm": 6.46875, "learning_rate": 7.943533002670737e-06, "loss": 0.8006, "mean_token_accuracy": 0.7650435447692872, "step": 4740 }, { "epoch": 1.8099189318073439, "grad_norm": 5.25, "learning_rate": 7.93081521047946e-06, "loss": 0.676, "mean_token_accuracy": 0.7938611567020416, "step": 4745 }, { "epoch": 1.8118264186933715, "grad_norm": 5.6875, "learning_rate": 7.918097418288185e-06, "loss": 0.7047, "mean_token_accuracy": 0.7773008272051811, "step": 4750 }, { "epoch": 1.813733905579399, "grad_norm": 5.34375, "learning_rate": 7.90537962609691e-06, "loss": 0.7271, "mean_token_accuracy": 0.7902742803096772, "step": 4755 }, { "epoch": 1.8156413924654267, "grad_norm": 6.5625, "learning_rate": 7.892661833905636e-06, "loss": 0.7035, "mean_token_accuracy": 0.7915276631712913, "step": 4760 }, { "epoch": 1.8175488793514545, "grad_norm": 5.34375, "learning_rate": 7.87994404171436e-06, "loss": 0.6543, "mean_token_accuracy": 0.7968891650438309, "step": 4765 }, { "epoch": 1.819456366237482, "grad_norm": 4.71875, "learning_rate": 7.867226249523083e-06, "loss": 0.6581, "mean_token_accuracy": 0.7915876343846321, "step": 4770 }, { "epoch": 1.82136385312351, "grad_norm": 6.125, "learning_rate": 7.854508457331807e-06, "loss": 0.6903, "mean_token_accuracy": 0.7903998970985413, "step": 4775 }, { "epoch": 1.8232713400095375, "grad_norm": 6.3125, "learning_rate": 7.841790665140532e-06, "loss": 0.7056, "mean_token_accuracy": 0.794343139231205, "step": 4780 }, { "epoch": 1.825178826895565, "grad_norm": 6.75, "learning_rate": 7.829072872949258e-06, "loss": 0.6673, "mean_token_accuracy": 0.8082071855664253, "step": 4785 }, { "epoch": 1.8270863137815927, "grad_norm": 3.8125, "learning_rate": 7.816355080757982e-06, "loss": 0.656, "mean_token_accuracy": 0.796270664036274, "step": 4790 }, { "epoch": 1.8289938006676203, "grad_norm": 5.28125, "learning_rate": 7.803637288566706e-06, "loss": 0.6458, "mean_token_accuracy": 0.8071435138583183, "step": 4795 }, { "epoch": 1.8309012875536481, "grad_norm": 6.0, "learning_rate": 7.79091949637543e-06, "loss": 0.6605, "mean_token_accuracy": 0.8013365402817726, "step": 4800 }, { "epoch": 1.8328087744396757, "grad_norm": 6.0625, "learning_rate": 7.778201704184153e-06, "loss": 0.7275, "mean_token_accuracy": 0.7757177606225014, "step": 4805 }, { "epoch": 1.8347162613257035, "grad_norm": 5.84375, "learning_rate": 7.765483911992879e-06, "loss": 0.6855, "mean_token_accuracy": 0.7862690791487694, "step": 4810 }, { "epoch": 1.8366237482117311, "grad_norm": 5.3125, "learning_rate": 7.752766119801604e-06, "loss": 0.6781, "mean_token_accuracy": 0.7940695360302925, "step": 4815 }, { "epoch": 1.8385312350977587, "grad_norm": 5.90625, "learning_rate": 7.740048327610328e-06, "loss": 0.7035, "mean_token_accuracy": 0.7908565014600754, "step": 4820 }, { "epoch": 1.8404387219837863, "grad_norm": 5.90625, "learning_rate": 7.727330535419052e-06, "loss": 0.7276, "mean_token_accuracy": 0.7879172414541245, "step": 4825 }, { "epoch": 1.842346208869814, "grad_norm": 7.53125, "learning_rate": 7.714612743227776e-06, "loss": 0.7793, "mean_token_accuracy": 0.7688939779996872, "step": 4830 }, { "epoch": 1.8442536957558415, "grad_norm": 4.875, "learning_rate": 7.701894951036501e-06, "loss": 0.6768, "mean_token_accuracy": 0.7987960994243621, "step": 4835 }, { "epoch": 1.8461611826418693, "grad_norm": 4.75, "learning_rate": 7.689177158845225e-06, "loss": 0.6264, "mean_token_accuracy": 0.8106413125991822, "step": 4840 }, { "epoch": 1.848068669527897, "grad_norm": 5.84375, "learning_rate": 7.67645936665395e-06, "loss": 0.7087, "mean_token_accuracy": 0.7887535482645035, "step": 4845 }, { "epoch": 1.8499761564139248, "grad_norm": 6.09375, "learning_rate": 7.663741574462674e-06, "loss": 0.7736, "mean_token_accuracy": 0.76987956315279, "step": 4850 }, { "epoch": 1.8518836432999524, "grad_norm": 5.5, "learning_rate": 7.651023782271398e-06, "loss": 0.6648, "mean_token_accuracy": 0.8052722126245498, "step": 4855 }, { "epoch": 1.85379113018598, "grad_norm": 6.1875, "learning_rate": 7.638305990080122e-06, "loss": 0.7495, "mean_token_accuracy": 0.7745545446872711, "step": 4860 }, { "epoch": 1.8556986170720076, "grad_norm": 5.6875, "learning_rate": 7.6255881978888465e-06, "loss": 0.6601, "mean_token_accuracy": 0.7937394559383393, "step": 4865 }, { "epoch": 1.8576061039580352, "grad_norm": 6.21875, "learning_rate": 7.612870405697571e-06, "loss": 0.7174, "mean_token_accuracy": 0.7859519511461258, "step": 4870 }, { "epoch": 1.859513590844063, "grad_norm": 5.4375, "learning_rate": 7.600152613506297e-06, "loss": 0.6854, "mean_token_accuracy": 0.7899977654218674, "step": 4875 }, { "epoch": 1.8614210777300906, "grad_norm": 5.0625, "learning_rate": 7.5874348213150204e-06, "loss": 0.6791, "mean_token_accuracy": 0.7809333771467208, "step": 4880 }, { "epoch": 1.8633285646161184, "grad_norm": 5.03125, "learning_rate": 7.574717029123745e-06, "loss": 0.6534, "mean_token_accuracy": 0.7959544643759727, "step": 4885 }, { "epoch": 1.865236051502146, "grad_norm": 6.1875, "learning_rate": 7.561999236932469e-06, "loss": 0.7353, "mean_token_accuracy": 0.7807211935520172, "step": 4890 }, { "epoch": 1.8671435383881736, "grad_norm": 6.59375, "learning_rate": 7.5492814447411935e-06, "loss": 0.6726, "mean_token_accuracy": 0.7969186782836915, "step": 4895 }, { "epoch": 1.8690510252742012, "grad_norm": 5.34375, "learning_rate": 7.536563652549917e-06, "loss": 0.6537, "mean_token_accuracy": 0.80834990888834, "step": 4900 }, { "epoch": 1.8709585121602288, "grad_norm": 6.71875, "learning_rate": 7.523845860358643e-06, "loss": 0.7707, "mean_token_accuracy": 0.7760011538863182, "step": 4905 }, { "epoch": 1.8728659990462564, "grad_norm": 4.46875, "learning_rate": 7.511128068167367e-06, "loss": 0.6588, "mean_token_accuracy": 0.7985247880220413, "step": 4910 }, { "epoch": 1.8747734859322842, "grad_norm": 6.53125, "learning_rate": 7.498410275976091e-06, "loss": 0.6502, "mean_token_accuracy": 0.7975252762436866, "step": 4915 }, { "epoch": 1.876680972818312, "grad_norm": 6.3125, "learning_rate": 7.485692483784815e-06, "loss": 0.7668, "mean_token_accuracy": 0.7764229297637939, "step": 4920 }, { "epoch": 1.8785884597043396, "grad_norm": 5.0, "learning_rate": 7.47297469159354e-06, "loss": 0.6923, "mean_token_accuracy": 0.7958945870399475, "step": 4925 }, { "epoch": 1.8804959465903672, "grad_norm": 4.9375, "learning_rate": 7.4602568994022636e-06, "loss": 0.6674, "mean_token_accuracy": 0.7920473888516426, "step": 4930 }, { "epoch": 1.8824034334763948, "grad_norm": 5.25, "learning_rate": 7.447539107210989e-06, "loss": 0.7502, "mean_token_accuracy": 0.777404710650444, "step": 4935 }, { "epoch": 1.8843109203624224, "grad_norm": 5.4375, "learning_rate": 7.434821315019714e-06, "loss": 0.6444, "mean_token_accuracy": 0.8106374859809875, "step": 4940 }, { "epoch": 1.88621840724845, "grad_norm": 6.0, "learning_rate": 7.4221035228284375e-06, "loss": 0.6936, "mean_token_accuracy": 0.7822411820292473, "step": 4945 }, { "epoch": 1.8881258941344778, "grad_norm": 5.90625, "learning_rate": 7.409385730637162e-06, "loss": 0.7101, "mean_token_accuracy": 0.7878482550382614, "step": 4950 }, { "epoch": 1.8900333810205054, "grad_norm": 5.5, "learning_rate": 7.396667938445886e-06, "loss": 0.6767, "mean_token_accuracy": 0.7991128966212273, "step": 4955 }, { "epoch": 1.8919408679065333, "grad_norm": 5.15625, "learning_rate": 7.383950146254611e-06, "loss": 0.6384, "mean_token_accuracy": 0.7992655903100967, "step": 4960 }, { "epoch": 1.8938483547925609, "grad_norm": 6.96875, "learning_rate": 7.371232354063335e-06, "loss": 0.6932, "mean_token_accuracy": 0.7813262164592742, "step": 4965 }, { "epoch": 1.8957558416785885, "grad_norm": 5.4375, "learning_rate": 7.35851456187206e-06, "loss": 0.7186, "mean_token_accuracy": 0.7845756888389588, "step": 4970 }, { "epoch": 1.897663328564616, "grad_norm": 6.3125, "learning_rate": 7.345796769680784e-06, "loss": 0.7017, "mean_token_accuracy": 0.7855490684509278, "step": 4975 }, { "epoch": 1.8995708154506437, "grad_norm": 5.46875, "learning_rate": 7.333078977489508e-06, "loss": 0.7892, "mean_token_accuracy": 0.771816186606884, "step": 4980 }, { "epoch": 1.9014783023366715, "grad_norm": 4.59375, "learning_rate": 7.320361185298232e-06, "loss": 0.5957, "mean_token_accuracy": 0.8149242594838142, "step": 4985 }, { "epoch": 1.903385789222699, "grad_norm": 6.15625, "learning_rate": 7.307643393106957e-06, "loss": 0.6671, "mean_token_accuracy": 0.7942966371774673, "step": 4990 }, { "epoch": 1.905293276108727, "grad_norm": 7.65625, "learning_rate": 7.294925600915682e-06, "loss": 0.7531, "mean_token_accuracy": 0.7853093549609185, "step": 4995 }, { "epoch": 1.9072007629947545, "grad_norm": 5.71875, "learning_rate": 7.282207808724406e-06, "loss": 0.6564, "mean_token_accuracy": 0.7984834283590316, "step": 5000 }, { "epoch": 1.909108249880782, "grad_norm": 5.375, "learning_rate": 7.269490016533131e-06, "loss": 0.736, "mean_token_accuracy": 0.7914832070469856, "step": 5005 }, { "epoch": 1.9110157367668097, "grad_norm": 5.59375, "learning_rate": 7.256772224341855e-06, "loss": 0.7184, "mean_token_accuracy": 0.7816375851631164, "step": 5010 }, { "epoch": 1.9129232236528373, "grad_norm": 4.875, "learning_rate": 7.244054432150579e-06, "loss": 0.7227, "mean_token_accuracy": 0.7908033803105354, "step": 5015 }, { "epoch": 1.9148307105388649, "grad_norm": 7.09375, "learning_rate": 7.231336639959303e-06, "loss": 0.679, "mean_token_accuracy": 0.8013672217726707, "step": 5020 }, { "epoch": 1.9167381974248927, "grad_norm": 7.125, "learning_rate": 7.218618847768029e-06, "loss": 0.699, "mean_token_accuracy": 0.7886776104569435, "step": 5025 }, { "epoch": 1.9186456843109203, "grad_norm": 4.9375, "learning_rate": 7.205901055576752e-06, "loss": 0.6722, "mean_token_accuracy": 0.795051820576191, "step": 5030 }, { "epoch": 1.9205531711969481, "grad_norm": 8.1875, "learning_rate": 7.193183263385477e-06, "loss": 0.6799, "mean_token_accuracy": 0.7945815742015838, "step": 5035 }, { "epoch": 1.9224606580829757, "grad_norm": 4.9375, "learning_rate": 7.180465471194201e-06, "loss": 0.7474, "mean_token_accuracy": 0.7869170516729355, "step": 5040 }, { "epoch": 1.9243681449690033, "grad_norm": 4.53125, "learning_rate": 7.1677476790029255e-06, "loss": 0.6607, "mean_token_accuracy": 0.7978417113423347, "step": 5045 }, { "epoch": 1.926275631855031, "grad_norm": 6.0625, "learning_rate": 7.155029886811649e-06, "loss": 0.7189, "mean_token_accuracy": 0.7840519219636917, "step": 5050 }, { "epoch": 1.9281831187410585, "grad_norm": 5.21875, "learning_rate": 7.142312094620375e-06, "loss": 0.6998, "mean_token_accuracy": 0.7922796562314034, "step": 5055 }, { "epoch": 1.9300906056270863, "grad_norm": 6.625, "learning_rate": 7.1295943024290995e-06, "loss": 0.7552, "mean_token_accuracy": 0.7724163338541985, "step": 5060 }, { "epoch": 1.931998092513114, "grad_norm": 7.34375, "learning_rate": 7.116876510237823e-06, "loss": 0.7753, "mean_token_accuracy": 0.7780322790145874, "step": 5065 }, { "epoch": 1.9339055793991418, "grad_norm": 5.28125, "learning_rate": 7.104158718046548e-06, "loss": 0.7216, "mean_token_accuracy": 0.7845679372549057, "step": 5070 }, { "epoch": 1.9358130662851694, "grad_norm": 5.875, "learning_rate": 7.091440925855272e-06, "loss": 0.7327, "mean_token_accuracy": 0.780812793970108, "step": 5075 }, { "epoch": 1.937720553171197, "grad_norm": 5.21875, "learning_rate": 7.078723133663996e-06, "loss": 0.624, "mean_token_accuracy": 0.8052924752235413, "step": 5080 }, { "epoch": 1.9396280400572246, "grad_norm": 5.53125, "learning_rate": 7.066005341472721e-06, "loss": 0.6668, "mean_token_accuracy": 0.8004230096936226, "step": 5085 }, { "epoch": 1.9415355269432522, "grad_norm": 5.625, "learning_rate": 7.053287549281446e-06, "loss": 0.7192, "mean_token_accuracy": 0.7904253482818604, "step": 5090 }, { "epoch": 1.94344301382928, "grad_norm": 5.4375, "learning_rate": 7.0405697570901695e-06, "loss": 0.7384, "mean_token_accuracy": 0.7813466861844063, "step": 5095 }, { "epoch": 1.9453505007153076, "grad_norm": 5.90625, "learning_rate": 7.027851964898894e-06, "loss": 0.6972, "mean_token_accuracy": 0.7880258679389953, "step": 5100 }, { "epoch": 1.9472579876013354, "grad_norm": 4.40625, "learning_rate": 7.015134172707618e-06, "loss": 0.6732, "mean_token_accuracy": 0.7904098376631736, "step": 5105 }, { "epoch": 1.949165474487363, "grad_norm": 4.71875, "learning_rate": 7.002416380516343e-06, "loss": 0.5973, "mean_token_accuracy": 0.819116373360157, "step": 5110 }, { "epoch": 1.9510729613733906, "grad_norm": 5.6875, "learning_rate": 6.989698588325068e-06, "loss": 0.7033, "mean_token_accuracy": 0.7800509944558144, "step": 5115 }, { "epoch": 1.9529804482594182, "grad_norm": 4.75, "learning_rate": 6.976980796133792e-06, "loss": 0.7044, "mean_token_accuracy": 0.7840970128774643, "step": 5120 }, { "epoch": 1.9548879351454458, "grad_norm": 5.40625, "learning_rate": 6.9642630039425166e-06, "loss": 0.659, "mean_token_accuracy": 0.7920606225728989, "step": 5125 }, { "epoch": 1.9567954220314734, "grad_norm": 6.46875, "learning_rate": 6.95154521175124e-06, "loss": 0.6624, "mean_token_accuracy": 0.8022448986768722, "step": 5130 }, { "epoch": 1.9587029089175012, "grad_norm": 5.59375, "learning_rate": 6.938827419559965e-06, "loss": 0.6575, "mean_token_accuracy": 0.8071569114923477, "step": 5135 }, { "epoch": 1.9606103958035288, "grad_norm": 5.21875, "learning_rate": 6.926109627368689e-06, "loss": 0.7075, "mean_token_accuracy": 0.7822893932461739, "step": 5140 }, { "epoch": 1.9625178826895566, "grad_norm": 5.96875, "learning_rate": 6.913391835177414e-06, "loss": 0.7143, "mean_token_accuracy": 0.7828281372785568, "step": 5145 }, { "epoch": 1.9644253695755842, "grad_norm": 6.40625, "learning_rate": 6.900674042986138e-06, "loss": 0.7189, "mean_token_accuracy": 0.7802597790956497, "step": 5150 }, { "epoch": 1.9663328564616118, "grad_norm": 4.90625, "learning_rate": 6.887956250794863e-06, "loss": 0.6164, "mean_token_accuracy": 0.816970057785511, "step": 5155 }, { "epoch": 1.9682403433476394, "grad_norm": 5.875, "learning_rate": 6.875238458603587e-06, "loss": 0.6906, "mean_token_accuracy": 0.7868256568908691, "step": 5160 }, { "epoch": 1.970147830233667, "grad_norm": 5.28125, "learning_rate": 6.862520666412311e-06, "loss": 0.6821, "mean_token_accuracy": 0.7953496769070625, "step": 5165 }, { "epoch": 1.9720553171196948, "grad_norm": 5.84375, "learning_rate": 6.849802874221035e-06, "loss": 0.7652, "mean_token_accuracy": 0.7688144698739052, "step": 5170 }, { "epoch": 1.9739628040057224, "grad_norm": 5.8125, "learning_rate": 6.8370850820297605e-06, "loss": 0.6357, "mean_token_accuracy": 0.8053975969552993, "step": 5175 }, { "epoch": 1.9758702908917503, "grad_norm": 5.15625, "learning_rate": 6.824367289838485e-06, "loss": 0.7236, "mean_token_accuracy": 0.7858426973223687, "step": 5180 }, { "epoch": 1.9777777777777779, "grad_norm": 6.125, "learning_rate": 6.811649497647209e-06, "loss": 0.7327, "mean_token_accuracy": 0.7863919615745545, "step": 5185 }, { "epoch": 1.9796852646638055, "grad_norm": 5.0, "learning_rate": 6.798931705455934e-06, "loss": 0.653, "mean_token_accuracy": 0.8058089405298233, "step": 5190 }, { "epoch": 1.981592751549833, "grad_norm": 5.03125, "learning_rate": 6.7862139132646575e-06, "loss": 0.6365, "mean_token_accuracy": 0.8101479053497315, "step": 5195 }, { "epoch": 1.9835002384358607, "grad_norm": 5.96875, "learning_rate": 6.773496121073382e-06, "loss": 0.8004, "mean_token_accuracy": 0.763555309176445, "step": 5200 }, { "epoch": 1.9854077253218883, "grad_norm": 5.125, "learning_rate": 6.760778328882107e-06, "loss": 0.6634, "mean_token_accuracy": 0.8034867867827415, "step": 5205 }, { "epoch": 1.987315212207916, "grad_norm": 5.65625, "learning_rate": 6.748060536690831e-06, "loss": 0.7442, "mean_token_accuracy": 0.7539320230484009, "step": 5210 }, { "epoch": 1.989222699093944, "grad_norm": 5.75, "learning_rate": 6.735342744499555e-06, "loss": 0.6592, "mean_token_accuracy": 0.8079338014125824, "step": 5215 }, { "epoch": 1.9911301859799715, "grad_norm": 5.375, "learning_rate": 6.72262495230828e-06, "loss": 0.6299, "mean_token_accuracy": 0.8066416561603547, "step": 5220 }, { "epoch": 1.993037672865999, "grad_norm": 6.0625, "learning_rate": 6.709907160117004e-06, "loss": 0.7661, "mean_token_accuracy": 0.7778510600328445, "step": 5225 }, { "epoch": 1.9949451597520267, "grad_norm": 5.5625, "learning_rate": 6.697189367925728e-06, "loss": 0.6828, "mean_token_accuracy": 0.7868445530533791, "step": 5230 }, { "epoch": 1.9968526466380543, "grad_norm": 4.875, "learning_rate": 6.684471575734454e-06, "loss": 0.6177, "mean_token_accuracy": 0.8107023119926453, "step": 5235 }, { "epoch": 1.9987601335240819, "grad_norm": 5.03125, "learning_rate": 6.671753783543178e-06, "loss": 0.6335, "mean_token_accuracy": 0.810404734313488, "step": 5240 }, { "epoch": 2.0003814973772056, "grad_norm": 5.40625, "learning_rate": 6.659035991351902e-06, "loss": 0.6568, "mean_token_accuracy": 0.8131301490699544, "step": 5245 }, { "epoch": 2.002288984263233, "grad_norm": 5.96875, "learning_rate": 6.646318199160626e-06, "loss": 0.6125, "mean_token_accuracy": 0.8206262648105621, "step": 5250 }, { "epoch": 2.004196471149261, "grad_norm": 5.21875, "learning_rate": 6.633600406969351e-06, "loss": 0.5473, "mean_token_accuracy": 0.8221940651535988, "step": 5255 }, { "epoch": 2.0061039580352884, "grad_norm": 5.6875, "learning_rate": 6.6208826147780746e-06, "loss": 0.6734, "mean_token_accuracy": 0.800679013133049, "step": 5260 }, { "epoch": 2.008011444921316, "grad_norm": 5.125, "learning_rate": 6.608164822586799e-06, "loss": 0.5795, "mean_token_accuracy": 0.8310953453183174, "step": 5265 }, { "epoch": 2.009918931807344, "grad_norm": 7.09375, "learning_rate": 6.595447030395524e-06, "loss": 0.5389, "mean_token_accuracy": 0.8358400642871857, "step": 5270 }, { "epoch": 2.0118264186933716, "grad_norm": 6.375, "learning_rate": 6.5827292382042485e-06, "loss": 0.6451, "mean_token_accuracy": 0.8094501391053199, "step": 5275 }, { "epoch": 2.0137339055793992, "grad_norm": 5.5, "learning_rate": 6.570011446012972e-06, "loss": 0.5821, "mean_token_accuracy": 0.8170616224408149, "step": 5280 }, { "epoch": 2.015641392465427, "grad_norm": 4.40625, "learning_rate": 6.557293653821697e-06, "loss": 0.5454, "mean_token_accuracy": 0.8332473427057266, "step": 5285 }, { "epoch": 2.0175488793514544, "grad_norm": 5.09375, "learning_rate": 6.544575861630421e-06, "loss": 0.5763, "mean_token_accuracy": 0.8229367405176162, "step": 5290 }, { "epoch": 2.019456366237482, "grad_norm": 6.25, "learning_rate": 6.5318580694391454e-06, "loss": 0.5343, "mean_token_accuracy": 0.8258931323885917, "step": 5295 }, { "epoch": 2.0213638531235096, "grad_norm": 5.59375, "learning_rate": 6.519140277247871e-06, "loss": 0.6415, "mean_token_accuracy": 0.8054152816534043, "step": 5300 }, { "epoch": 2.0232713400095372, "grad_norm": 6.03125, "learning_rate": 6.506422485056595e-06, "loss": 0.6372, "mean_token_accuracy": 0.8117892548441887, "step": 5305 }, { "epoch": 2.0251788268955653, "grad_norm": 4.65625, "learning_rate": 6.493704692865319e-06, "loss": 0.5639, "mean_token_accuracy": 0.8290649086236954, "step": 5310 }, { "epoch": 2.027086313781593, "grad_norm": 4.75, "learning_rate": 6.480986900674043e-06, "loss": 0.6007, "mean_token_accuracy": 0.8206225126981735, "step": 5315 }, { "epoch": 2.0289938006676205, "grad_norm": 6.0, "learning_rate": 6.468269108482768e-06, "loss": 0.6263, "mean_token_accuracy": 0.816709017753601, "step": 5320 }, { "epoch": 2.030901287553648, "grad_norm": 6.71875, "learning_rate": 6.455551316291492e-06, "loss": 0.617, "mean_token_accuracy": 0.8126022920012475, "step": 5325 }, { "epoch": 2.0328087744396757, "grad_norm": 6.0625, "learning_rate": 6.442833524100217e-06, "loss": 0.5926, "mean_token_accuracy": 0.8229933097958565, "step": 5330 }, { "epoch": 2.0347162613257033, "grad_norm": 6.34375, "learning_rate": 6.430115731908941e-06, "loss": 0.6458, "mean_token_accuracy": 0.8141643151640892, "step": 5335 }, { "epoch": 2.036623748211731, "grad_norm": 6.0625, "learning_rate": 6.417397939717666e-06, "loss": 0.5839, "mean_token_accuracy": 0.817927660048008, "step": 5340 }, { "epoch": 2.038531235097759, "grad_norm": 5.3125, "learning_rate": 6.404680147526389e-06, "loss": 0.5594, "mean_token_accuracy": 0.8340118303894997, "step": 5345 }, { "epoch": 2.0404387219837865, "grad_norm": 5.21875, "learning_rate": 6.391962355335114e-06, "loss": 0.6271, "mean_token_accuracy": 0.8086557269096375, "step": 5350 }, { "epoch": 2.042346208869814, "grad_norm": 5.28125, "learning_rate": 6.379244563143838e-06, "loss": 0.5853, "mean_token_accuracy": 0.8243370905518532, "step": 5355 }, { "epoch": 2.0442536957558417, "grad_norm": 6.75, "learning_rate": 6.366526770952563e-06, "loss": 0.6226, "mean_token_accuracy": 0.8111834675073624, "step": 5360 }, { "epoch": 2.0461611826418693, "grad_norm": 4.65625, "learning_rate": 6.353808978761288e-06, "loss": 0.6042, "mean_token_accuracy": 0.8196303457021713, "step": 5365 }, { "epoch": 2.048068669527897, "grad_norm": 6.15625, "learning_rate": 6.341091186570012e-06, "loss": 0.6401, "mean_token_accuracy": 0.8110625892877579, "step": 5370 }, { "epoch": 2.0499761564139245, "grad_norm": 5.46875, "learning_rate": 6.3283733943787365e-06, "loss": 0.5267, "mean_token_accuracy": 0.8233473181724549, "step": 5375 }, { "epoch": 2.051883643299952, "grad_norm": 7.40625, "learning_rate": 6.31565560218746e-06, "loss": 0.5688, "mean_token_accuracy": 0.8308075174689293, "step": 5380 }, { "epoch": 2.05379113018598, "grad_norm": 6.21875, "learning_rate": 6.302937809996185e-06, "loss": 0.5778, "mean_token_accuracy": 0.8248735830187798, "step": 5385 }, { "epoch": 2.0556986170720077, "grad_norm": 5.09375, "learning_rate": 6.29022001780491e-06, "loss": 0.5973, "mean_token_accuracy": 0.819294498860836, "step": 5390 }, { "epoch": 2.0576061039580353, "grad_norm": 5.1875, "learning_rate": 6.277502225613634e-06, "loss": 0.6395, "mean_token_accuracy": 0.8058527052402497, "step": 5395 }, { "epoch": 2.059513590844063, "grad_norm": 5.09375, "learning_rate": 6.264784433422358e-06, "loss": 0.5775, "mean_token_accuracy": 0.8277291983366013, "step": 5400 }, { "epoch": 2.0614210777300905, "grad_norm": 5.09375, "learning_rate": 6.252066641231083e-06, "loss": 0.4886, "mean_token_accuracy": 0.8545591816306114, "step": 5405 }, { "epoch": 2.063328564616118, "grad_norm": 5.625, "learning_rate": 6.2393488490398065e-06, "loss": 0.6205, "mean_token_accuracy": 0.8056149169802665, "step": 5410 }, { "epoch": 2.0652360515021457, "grad_norm": 5.78125, "learning_rate": 6.226631056848531e-06, "loss": 0.5835, "mean_token_accuracy": 0.8185458436608315, "step": 5415 }, { "epoch": 2.0671435383881738, "grad_norm": 5.03125, "learning_rate": 6.213913264657257e-06, "loss": 0.5015, "mean_token_accuracy": 0.8459224656224251, "step": 5420 }, { "epoch": 2.0690510252742014, "grad_norm": 4.96875, "learning_rate": 6.2011954724659805e-06, "loss": 0.5933, "mean_token_accuracy": 0.8209754064679146, "step": 5425 }, { "epoch": 2.070958512160229, "grad_norm": 4.875, "learning_rate": 6.188477680274705e-06, "loss": 0.5948, "mean_token_accuracy": 0.8216954946517945, "step": 5430 }, { "epoch": 2.0728659990462566, "grad_norm": 6.03125, "learning_rate": 6.175759888083429e-06, "loss": 0.6031, "mean_token_accuracy": 0.8233641475439072, "step": 5435 }, { "epoch": 2.074773485932284, "grad_norm": 4.96875, "learning_rate": 6.163042095892154e-06, "loss": 0.571, "mean_token_accuracy": 0.8227150589227676, "step": 5440 }, { "epoch": 2.0766809728183118, "grad_norm": 6.84375, "learning_rate": 6.150324303700877e-06, "loss": 0.6048, "mean_token_accuracy": 0.816265507042408, "step": 5445 }, { "epoch": 2.0785884597043394, "grad_norm": 5.21875, "learning_rate": 6.137606511509603e-06, "loss": 0.4735, "mean_token_accuracy": 0.8504796147346496, "step": 5450 }, { "epoch": 2.080495946590367, "grad_norm": 7.8125, "learning_rate": 6.124888719318327e-06, "loss": 0.6131, "mean_token_accuracy": 0.8110155686736107, "step": 5455 }, { "epoch": 2.082403433476395, "grad_norm": 6.625, "learning_rate": 6.112170927127051e-06, "loss": 0.5432, "mean_token_accuracy": 0.8311783730983734, "step": 5460 }, { "epoch": 2.0843109203624226, "grad_norm": 6.3125, "learning_rate": 6.099453134935775e-06, "loss": 0.6218, "mean_token_accuracy": 0.8129143878817559, "step": 5465 }, { "epoch": 2.08621840724845, "grad_norm": 5.9375, "learning_rate": 6.0867353427445e-06, "loss": 0.6302, "mean_token_accuracy": 0.8178365066647529, "step": 5470 }, { "epoch": 2.088125894134478, "grad_norm": 5.1875, "learning_rate": 6.074017550553224e-06, "loss": 0.553, "mean_token_accuracy": 0.8309461057186127, "step": 5475 }, { "epoch": 2.0900333810205054, "grad_norm": 5.25, "learning_rate": 6.061299758361949e-06, "loss": 0.5383, "mean_token_accuracy": 0.8359053865075111, "step": 5480 }, { "epoch": 2.091940867906533, "grad_norm": 5.15625, "learning_rate": 6.048581966170674e-06, "loss": 0.5209, "mean_token_accuracy": 0.8379359632730484, "step": 5485 }, { "epoch": 2.0938483547925606, "grad_norm": 6.15625, "learning_rate": 6.0358641739793976e-06, "loss": 0.5388, "mean_token_accuracy": 0.825338727235794, "step": 5490 }, { "epoch": 2.0957558416785886, "grad_norm": 6.65625, "learning_rate": 6.023146381788122e-06, "loss": 0.6214, "mean_token_accuracy": 0.8112127646803856, "step": 5495 }, { "epoch": 2.0976633285646162, "grad_norm": 6.0625, "learning_rate": 6.010428589596846e-06, "loss": 0.5743, "mean_token_accuracy": 0.8186524271965027, "step": 5500 }, { "epoch": 2.099570815450644, "grad_norm": 5.875, "learning_rate": 5.997710797405571e-06, "loss": 0.5725, "mean_token_accuracy": 0.8223934233188629, "step": 5505 }, { "epoch": 2.1014783023366714, "grad_norm": 5.96875, "learning_rate": 5.984993005214295e-06, "loss": 0.615, "mean_token_accuracy": 0.8216773048043251, "step": 5510 }, { "epoch": 2.103385789222699, "grad_norm": 5.15625, "learning_rate": 5.97227521302302e-06, "loss": 0.5627, "mean_token_accuracy": 0.8287667348980904, "step": 5515 }, { "epoch": 2.1052932761087266, "grad_norm": 6.71875, "learning_rate": 5.959557420831744e-06, "loss": 0.5917, "mean_token_accuracy": 0.8215829640626907, "step": 5520 }, { "epoch": 2.1072007629947542, "grad_norm": 5.21875, "learning_rate": 5.9468396286404684e-06, "loss": 0.5424, "mean_token_accuracy": 0.8323732078075409, "step": 5525 }, { "epoch": 2.1091082498807823, "grad_norm": 5.78125, "learning_rate": 5.934121836449192e-06, "loss": 0.6049, "mean_token_accuracy": 0.8124245926737785, "step": 5530 }, { "epoch": 2.11101573676681, "grad_norm": 5.21875, "learning_rate": 5.921404044257917e-06, "loss": 0.6271, "mean_token_accuracy": 0.8112147480249405, "step": 5535 }, { "epoch": 2.1129232236528375, "grad_norm": 7.28125, "learning_rate": 5.908686252066642e-06, "loss": 0.5675, "mean_token_accuracy": 0.8308483242988587, "step": 5540 }, { "epoch": 2.114830710538865, "grad_norm": 4.59375, "learning_rate": 5.895968459875366e-06, "loss": 0.5689, "mean_token_accuracy": 0.8260648816823959, "step": 5545 }, { "epoch": 2.1167381974248927, "grad_norm": 6.6875, "learning_rate": 5.883250667684091e-06, "loss": 0.5734, "mean_token_accuracy": 0.8327483266592026, "step": 5550 }, { "epoch": 2.1186456843109203, "grad_norm": 5.9375, "learning_rate": 5.870532875492815e-06, "loss": 0.651, "mean_token_accuracy": 0.7964515581727027, "step": 5555 }, { "epoch": 2.120553171196948, "grad_norm": 5.78125, "learning_rate": 5.857815083301539e-06, "loss": 0.5666, "mean_token_accuracy": 0.8266356199979782, "step": 5560 }, { "epoch": 2.122460658082976, "grad_norm": 5.0625, "learning_rate": 5.845097291110263e-06, "loss": 0.5887, "mean_token_accuracy": 0.8189203411340713, "step": 5565 }, { "epoch": 2.1243681449690035, "grad_norm": 5.625, "learning_rate": 5.832379498918989e-06, "loss": 0.5035, "mean_token_accuracy": 0.8415826693177223, "step": 5570 }, { "epoch": 2.126275631855031, "grad_norm": 4.84375, "learning_rate": 5.8196617067277124e-06, "loss": 0.568, "mean_token_accuracy": 0.8224192067980767, "step": 5575 }, { "epoch": 2.1281831187410587, "grad_norm": 4.9375, "learning_rate": 5.806943914536437e-06, "loss": 0.56, "mean_token_accuracy": 0.8251401484012604, "step": 5580 }, { "epoch": 2.1300906056270863, "grad_norm": 7.21875, "learning_rate": 5.794226122345161e-06, "loss": 0.6442, "mean_token_accuracy": 0.8051092073321342, "step": 5585 }, { "epoch": 2.131998092513114, "grad_norm": 5.5625, "learning_rate": 5.7815083301538855e-06, "loss": 0.6001, "mean_token_accuracy": 0.806366941332817, "step": 5590 }, { "epoch": 2.1339055793991415, "grad_norm": 6.0625, "learning_rate": 5.768790537962609e-06, "loss": 0.5502, "mean_token_accuracy": 0.8303710088133812, "step": 5595 }, { "epoch": 2.135813066285169, "grad_norm": 5.65625, "learning_rate": 5.756072745771335e-06, "loss": 0.5632, "mean_token_accuracy": 0.8413136050105094, "step": 5600 }, { "epoch": 2.137720553171197, "grad_norm": 5.0625, "learning_rate": 5.7433549535800595e-06, "loss": 0.4945, "mean_token_accuracy": 0.8427375286817551, "step": 5605 }, { "epoch": 2.1396280400572247, "grad_norm": 5.5625, "learning_rate": 5.730637161388783e-06, "loss": 0.5489, "mean_token_accuracy": 0.8372544094920158, "step": 5610 }, { "epoch": 2.1415355269432523, "grad_norm": 5.6875, "learning_rate": 5.717919369197508e-06, "loss": 0.5756, "mean_token_accuracy": 0.8199098259210587, "step": 5615 }, { "epoch": 2.14344301382928, "grad_norm": 4.65625, "learning_rate": 5.705201577006232e-06, "loss": 0.6147, "mean_token_accuracy": 0.8170333936810493, "step": 5620 }, { "epoch": 2.1453505007153075, "grad_norm": 7.125, "learning_rate": 5.692483784814956e-06, "loss": 0.6754, "mean_token_accuracy": 0.7958659380674362, "step": 5625 }, { "epoch": 2.147257987601335, "grad_norm": 5.75, "learning_rate": 5.679765992623681e-06, "loss": 0.6272, "mean_token_accuracy": 0.8095776528120041, "step": 5630 }, { "epoch": 2.1491654744873627, "grad_norm": 5.3125, "learning_rate": 5.667048200432406e-06, "loss": 0.5503, "mean_token_accuracy": 0.8265523850917816, "step": 5635 }, { "epoch": 2.1510729613733908, "grad_norm": 6.46875, "learning_rate": 5.6543304082411295e-06, "loss": 0.5081, "mean_token_accuracy": 0.8373180747032165, "step": 5640 }, { "epoch": 2.1529804482594184, "grad_norm": 6.375, "learning_rate": 5.641612616049854e-06, "loss": 0.5811, "mean_token_accuracy": 0.8180435076355934, "step": 5645 }, { "epoch": 2.154887935145446, "grad_norm": 5.0, "learning_rate": 5.628894823858578e-06, "loss": 0.5829, "mean_token_accuracy": 0.8106600716710091, "step": 5650 }, { "epoch": 2.1567954220314736, "grad_norm": 5.6875, "learning_rate": 5.616177031667303e-06, "loss": 0.5669, "mean_token_accuracy": 0.8269154459238053, "step": 5655 }, { "epoch": 2.158702908917501, "grad_norm": 7.09375, "learning_rate": 5.603459239476028e-06, "loss": 0.5206, "mean_token_accuracy": 0.8366212740540504, "step": 5660 }, { "epoch": 2.1606103958035288, "grad_norm": 4.34375, "learning_rate": 5.590741447284752e-06, "loss": 0.5518, "mean_token_accuracy": 0.8372129842638969, "step": 5665 }, { "epoch": 2.1625178826895564, "grad_norm": 6.71875, "learning_rate": 5.578023655093477e-06, "loss": 0.5271, "mean_token_accuracy": 0.8334283381700516, "step": 5670 }, { "epoch": 2.164425369575584, "grad_norm": 5.40625, "learning_rate": 5.5653058629022e-06, "loss": 0.5178, "mean_token_accuracy": 0.838632382452488, "step": 5675 }, { "epoch": 2.166332856461612, "grad_norm": 10.9375, "learning_rate": 5.552588070710925e-06, "loss": 0.5825, "mean_token_accuracy": 0.8231983244419098, "step": 5680 }, { "epoch": 2.1682403433476396, "grad_norm": 5.71875, "learning_rate": 5.539870278519649e-06, "loss": 0.5532, "mean_token_accuracy": 0.8285579845309258, "step": 5685 }, { "epoch": 2.170147830233667, "grad_norm": 6.84375, "learning_rate": 5.527152486328374e-06, "loss": 0.576, "mean_token_accuracy": 0.8275662720203399, "step": 5690 }, { "epoch": 2.172055317119695, "grad_norm": 7.40625, "learning_rate": 5.514434694137098e-06, "loss": 0.6426, "mean_token_accuracy": 0.8081822127103806, "step": 5695 }, { "epoch": 2.1739628040057224, "grad_norm": 5.5625, "learning_rate": 5.501716901945823e-06, "loss": 0.5682, "mean_token_accuracy": 0.8216787219047547, "step": 5700 }, { "epoch": 2.17587029089175, "grad_norm": 6.125, "learning_rate": 5.488999109754547e-06, "loss": 0.5919, "mean_token_accuracy": 0.8163464710116386, "step": 5705 }, { "epoch": 2.1777777777777776, "grad_norm": 6.3125, "learning_rate": 5.476281317563271e-06, "loss": 0.6796, "mean_token_accuracy": 0.7983686536550522, "step": 5710 }, { "epoch": 2.1796852646638056, "grad_norm": 4.78125, "learning_rate": 5.463563525371995e-06, "loss": 0.5427, "mean_token_accuracy": 0.8316027849912644, "step": 5715 }, { "epoch": 2.1815927515498332, "grad_norm": 6.6875, "learning_rate": 5.4508457331807206e-06, "loss": 0.5281, "mean_token_accuracy": 0.833437106013298, "step": 5720 }, { "epoch": 2.183500238435861, "grad_norm": 6.375, "learning_rate": 5.438127940989445e-06, "loss": 0.6005, "mean_token_accuracy": 0.8176343321800232, "step": 5725 }, { "epoch": 2.1854077253218884, "grad_norm": 4.90625, "learning_rate": 5.425410148798169e-06, "loss": 0.5578, "mean_token_accuracy": 0.8291604444384575, "step": 5730 }, { "epoch": 2.187315212207916, "grad_norm": 6.9375, "learning_rate": 5.412692356606894e-06, "loss": 0.5583, "mean_token_accuracy": 0.8320027679204941, "step": 5735 }, { "epoch": 2.1892226990939436, "grad_norm": 7.09375, "learning_rate": 5.3999745644156175e-06, "loss": 0.5888, "mean_token_accuracy": 0.8238993391394616, "step": 5740 }, { "epoch": 2.1911301859799712, "grad_norm": 5.375, "learning_rate": 5.387256772224342e-06, "loss": 0.5294, "mean_token_accuracy": 0.8428800523281097, "step": 5745 }, { "epoch": 2.193037672865999, "grad_norm": 6.96875, "learning_rate": 5.374538980033067e-06, "loss": 0.6129, "mean_token_accuracy": 0.8176163211464882, "step": 5750 }, { "epoch": 2.194945159752027, "grad_norm": 5.0625, "learning_rate": 5.3618211878417915e-06, "loss": 0.5742, "mean_token_accuracy": 0.8208988308906555, "step": 5755 }, { "epoch": 2.1968526466380545, "grad_norm": 4.59375, "learning_rate": 5.349103395650515e-06, "loss": 0.5517, "mean_token_accuracy": 0.8300920352339745, "step": 5760 }, { "epoch": 2.198760133524082, "grad_norm": 6.625, "learning_rate": 5.33638560345924e-06, "loss": 0.6091, "mean_token_accuracy": 0.8163532137870788, "step": 5765 }, { "epoch": 2.2006676204101097, "grad_norm": 5.84375, "learning_rate": 5.323667811267964e-06, "loss": 0.5216, "mean_token_accuracy": 0.8351360887289048, "step": 5770 }, { "epoch": 2.2025751072961373, "grad_norm": 5.5, "learning_rate": 5.310950019076688e-06, "loss": 0.6714, "mean_token_accuracy": 0.8043876081705094, "step": 5775 }, { "epoch": 2.204482594182165, "grad_norm": 5.84375, "learning_rate": 5.298232226885414e-06, "loss": 0.5665, "mean_token_accuracy": 0.8233716934919357, "step": 5780 }, { "epoch": 2.2063900810681925, "grad_norm": 5.875, "learning_rate": 5.285514434694138e-06, "loss": 0.5637, "mean_token_accuracy": 0.8181168228387833, "step": 5785 }, { "epoch": 2.2082975679542205, "grad_norm": 6.4375, "learning_rate": 5.272796642502862e-06, "loss": 0.5879, "mean_token_accuracy": 0.8219581961631774, "step": 5790 }, { "epoch": 2.210205054840248, "grad_norm": 6.15625, "learning_rate": 5.260078850311586e-06, "loss": 0.5386, "mean_token_accuracy": 0.8434656947851181, "step": 5795 }, { "epoch": 2.2121125417262757, "grad_norm": 8.125, "learning_rate": 5.247361058120311e-06, "loss": 0.6117, "mean_token_accuracy": 0.8154094561934471, "step": 5800 }, { "epoch": 2.2140200286123033, "grad_norm": 4.53125, "learning_rate": 5.234643265929035e-06, "loss": 0.5065, "mean_token_accuracy": 0.8439598381519318, "step": 5805 }, { "epoch": 2.215927515498331, "grad_norm": 6.9375, "learning_rate": 5.22192547373776e-06, "loss": 0.5865, "mean_token_accuracy": 0.8173124462366104, "step": 5810 }, { "epoch": 2.2178350023843585, "grad_norm": 6.34375, "learning_rate": 5.209207681546484e-06, "loss": 0.5545, "mean_token_accuracy": 0.8253095105290413, "step": 5815 }, { "epoch": 2.219742489270386, "grad_norm": 5.5, "learning_rate": 5.1964898893552085e-06, "loss": 0.5751, "mean_token_accuracy": 0.8209879517555236, "step": 5820 }, { "epoch": 2.221649976156414, "grad_norm": 7.0625, "learning_rate": 5.183772097163932e-06, "loss": 0.6052, "mean_token_accuracy": 0.8192328453063965, "step": 5825 }, { "epoch": 2.2235574630424417, "grad_norm": 5.0625, "learning_rate": 5.171054304972657e-06, "loss": 0.5329, "mean_token_accuracy": 0.8404569014906883, "step": 5830 }, { "epoch": 2.2254649499284693, "grad_norm": 7.34375, "learning_rate": 5.158336512781381e-06, "loss": 0.6871, "mean_token_accuracy": 0.7994562849402428, "step": 5835 }, { "epoch": 2.227372436814497, "grad_norm": 4.8125, "learning_rate": 5.145618720590106e-06, "loss": 0.5697, "mean_token_accuracy": 0.8249041870236397, "step": 5840 }, { "epoch": 2.2292799237005245, "grad_norm": 7.84375, "learning_rate": 5.132900928398831e-06, "loss": 0.6273, "mean_token_accuracy": 0.816638571023941, "step": 5845 }, { "epoch": 2.231187410586552, "grad_norm": 6.25, "learning_rate": 5.120183136207555e-06, "loss": 0.5686, "mean_token_accuracy": 0.8260336622595787, "step": 5850 }, { "epoch": 2.2330948974725797, "grad_norm": 5.75, "learning_rate": 5.107465344016279e-06, "loss": 0.5662, "mean_token_accuracy": 0.8271235138177871, "step": 5855 }, { "epoch": 2.2350023843586078, "grad_norm": 5.65625, "learning_rate": 5.094747551825003e-06, "loss": 0.5731, "mean_token_accuracy": 0.8245992794632911, "step": 5860 }, { "epoch": 2.2369098712446354, "grad_norm": 5.15625, "learning_rate": 5.082029759633728e-06, "loss": 0.4797, "mean_token_accuracy": 0.8532357349991798, "step": 5865 }, { "epoch": 2.238817358130663, "grad_norm": 6.28125, "learning_rate": 5.0693119674424525e-06, "loss": 0.5316, "mean_token_accuracy": 0.8337232992053032, "step": 5870 }, { "epoch": 2.2407248450166906, "grad_norm": 4.8125, "learning_rate": 5.056594175251177e-06, "loss": 0.5555, "mean_token_accuracy": 0.8319388970732688, "step": 5875 }, { "epoch": 2.242632331902718, "grad_norm": 7.09375, "learning_rate": 5.043876383059901e-06, "loss": 0.6325, "mean_token_accuracy": 0.8110457330942153, "step": 5880 }, { "epoch": 2.2445398187887458, "grad_norm": 5.75, "learning_rate": 5.031158590868626e-06, "loss": 0.5656, "mean_token_accuracy": 0.8316217005252838, "step": 5885 }, { "epoch": 2.2464473056747734, "grad_norm": 5.75, "learning_rate": 5.0184407986773495e-06, "loss": 0.6113, "mean_token_accuracy": 0.8179434195160866, "step": 5890 }, { "epoch": 2.248354792560801, "grad_norm": 5.0, "learning_rate": 5.005723006486074e-06, "loss": 0.5971, "mean_token_accuracy": 0.8208650708198547, "step": 5895 }, { "epoch": 2.250262279446829, "grad_norm": 4.59375, "learning_rate": 4.993005214294799e-06, "loss": 0.4879, "mean_token_accuracy": 0.8487023189663887, "step": 5900 }, { "epoch": 2.2521697663328566, "grad_norm": 7.0, "learning_rate": 4.980287422103523e-06, "loss": 0.5308, "mean_token_accuracy": 0.832112543284893, "step": 5905 }, { "epoch": 2.254077253218884, "grad_norm": 4.84375, "learning_rate": 4.967569629912248e-06, "loss": 0.5688, "mean_token_accuracy": 0.8282816842198372, "step": 5910 }, { "epoch": 2.255984740104912, "grad_norm": 6.0625, "learning_rate": 4.954851837720972e-06, "loss": 0.5618, "mean_token_accuracy": 0.8305701389908791, "step": 5915 }, { "epoch": 2.2578922269909394, "grad_norm": 5.5625, "learning_rate": 4.9421340455296965e-06, "loss": 0.6559, "mean_token_accuracy": 0.8106821358203888, "step": 5920 }, { "epoch": 2.259799713876967, "grad_norm": 6.53125, "learning_rate": 4.929416253338421e-06, "loss": 0.5979, "mean_token_accuracy": 0.8282116293907166, "step": 5925 }, { "epoch": 2.2617072007629946, "grad_norm": 5.84375, "learning_rate": 4.916698461147145e-06, "loss": 0.5554, "mean_token_accuracy": 0.8050909072160721, "step": 5930 }, { "epoch": 2.2636146876490226, "grad_norm": 5.90625, "learning_rate": 4.90398066895587e-06, "loss": 0.5725, "mean_token_accuracy": 0.8295422628521919, "step": 5935 }, { "epoch": 2.2655221745350502, "grad_norm": 5.59375, "learning_rate": 4.891262876764594e-06, "loss": 0.5504, "mean_token_accuracy": 0.8295823588967324, "step": 5940 }, { "epoch": 2.267429661421078, "grad_norm": 5.375, "learning_rate": 4.878545084573318e-06, "loss": 0.5663, "mean_token_accuracy": 0.8239354059100151, "step": 5945 }, { "epoch": 2.2693371483071054, "grad_norm": 5.125, "learning_rate": 4.865827292382043e-06, "loss": 0.6131, "mean_token_accuracy": 0.822664175927639, "step": 5950 }, { "epoch": 2.271244635193133, "grad_norm": 6.375, "learning_rate": 4.853109500190767e-06, "loss": 0.6081, "mean_token_accuracy": 0.8190778091549873, "step": 5955 }, { "epoch": 2.2731521220791606, "grad_norm": 6.65625, "learning_rate": 4.840391707999492e-06, "loss": 0.5852, "mean_token_accuracy": 0.8239965006709099, "step": 5960 }, { "epoch": 2.275059608965188, "grad_norm": 5.71875, "learning_rate": 4.827673915808216e-06, "loss": 0.4651, "mean_token_accuracy": 0.8500474870204926, "step": 5965 }, { "epoch": 2.276967095851216, "grad_norm": 5.59375, "learning_rate": 4.8149561236169405e-06, "loss": 0.5041, "mean_token_accuracy": 0.8404566153883934, "step": 5970 }, { "epoch": 2.278874582737244, "grad_norm": 7.0625, "learning_rate": 4.802238331425665e-06, "loss": 0.6538, "mean_token_accuracy": 0.8016729637980461, "step": 5975 }, { "epoch": 2.2807820696232715, "grad_norm": 7.21875, "learning_rate": 4.789520539234389e-06, "loss": 0.5047, "mean_token_accuracy": 0.8432752504944802, "step": 5980 }, { "epoch": 2.282689556509299, "grad_norm": 5.40625, "learning_rate": 4.776802747043114e-06, "loss": 0.5992, "mean_token_accuracy": 0.8168067425489426, "step": 5985 }, { "epoch": 2.2845970433953267, "grad_norm": 7.65625, "learning_rate": 4.764084954851838e-06, "loss": 0.5689, "mean_token_accuracy": 0.8325183004140854, "step": 5990 }, { "epoch": 2.2865045302813543, "grad_norm": 5.15625, "learning_rate": 4.751367162660562e-06, "loss": 0.5437, "mean_token_accuracy": 0.8334845125675201, "step": 5995 }, { "epoch": 2.288412017167382, "grad_norm": 5.78125, "learning_rate": 4.738649370469287e-06, "loss": 0.5149, "mean_token_accuracy": 0.8367941051721572, "step": 6000 } ], "logging_steps": 5, "max_steps": 7863, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5948696808254013e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }