{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998560352096745, "eval_steps": 100, "global_step": 6077, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008226559447175205, "grad_norm": 8.868131691941368, "learning_rate": 1.6447368421052632e-07, "loss": 1.637, "mean_token_accuracy": 0.6517329469323159, "step": 5 }, { "epoch": 0.001645311889435041, "grad_norm": 8.62583386804497, "learning_rate": 3.2894736842105264e-07, "loss": 1.608, "mean_token_accuracy": 0.6586906433105468, "step": 10 }, { "epoch": 0.0024679678341525614, "grad_norm": 8.39519511424222, "learning_rate": 4.93421052631579e-07, "loss": 1.6156, "mean_token_accuracy": 0.6545264750719071, "step": 15 }, { "epoch": 0.003290623778870082, "grad_norm": 7.536251953603491, "learning_rate": 6.578947368421053e-07, "loss": 1.578, "mean_token_accuracy": 0.6607441410422326, "step": 20 }, { "epoch": 0.004113279723587602, "grad_norm": 5.893589868593618, "learning_rate": 8.223684210526317e-07, "loss": 1.5223, "mean_token_accuracy": 0.6652548253536225, "step": 25 }, { "epoch": 0.004935935668305123, "grad_norm": 5.11898211249642, "learning_rate": 9.86842105263158e-07, "loss": 1.4703, "mean_token_accuracy": 0.66515032351017, "step": 30 }, { "epoch": 0.0057585916130226435, "grad_norm": 2.6587254074606514, "learning_rate": 1.1513157894736842e-06, "loss": 1.3811, "mean_token_accuracy": 0.6735523581504822, "step": 35 }, { "epoch": 0.006581247557740164, "grad_norm": 1.9702811047722817, "learning_rate": 1.3157894736842106e-06, "loss": 1.2961, "mean_token_accuracy": 0.6826083660125732, "step": 40 }, { "epoch": 0.007403903502457685, "grad_norm": 1.2387256545862801, "learning_rate": 1.480263157894737e-06, "loss": 1.2522, "mean_token_accuracy": 0.6873041942715645, "step": 45 }, { "epoch": 0.008226559447175204, "grad_norm": 1.152839731046101, "learning_rate": 1.6447368421052635e-06, "loss": 1.2253, "mean_token_accuracy": 0.6880093082785607, "step": 50 }, { "epoch": 0.009049215391892726, "grad_norm": 0.8213656678547654, "learning_rate": 1.8092105263157896e-06, "loss": 1.1485, "mean_token_accuracy": 0.7043157085776329, "step": 55 }, { "epoch": 0.009871871336610246, "grad_norm": 0.7469164615793938, "learning_rate": 1.973684210526316e-06, "loss": 1.157, "mean_token_accuracy": 0.7010814696550369, "step": 60 }, { "epoch": 0.010694527281327767, "grad_norm": 0.6704794221351722, "learning_rate": 2.138157894736842e-06, "loss": 1.1523, "mean_token_accuracy": 0.7000442028045655, "step": 65 }, { "epoch": 0.011517183226045287, "grad_norm": 0.645948541131335, "learning_rate": 2.3026315789473684e-06, "loss": 1.1138, "mean_token_accuracy": 0.7066650003194809, "step": 70 }, { "epoch": 0.012339839170762808, "grad_norm": 0.596527644749606, "learning_rate": 2.4671052631578948e-06, "loss": 1.0884, "mean_token_accuracy": 0.7121440231800079, "step": 75 }, { "epoch": 0.013162495115480328, "grad_norm": 0.5922222520878986, "learning_rate": 2.631578947368421e-06, "loss": 1.0848, "mean_token_accuracy": 0.7105465576052665, "step": 80 }, { "epoch": 0.013985151060197848, "grad_norm": 0.5808760068110808, "learning_rate": 2.796052631578948e-06, "loss": 1.069, "mean_token_accuracy": 0.7149081036448479, "step": 85 }, { "epoch": 0.01480780700491537, "grad_norm": 0.5787829583913998, "learning_rate": 2.960526315789474e-06, "loss": 1.052, "mean_token_accuracy": 0.7179096817970276, "step": 90 }, { "epoch": 0.01563046294963289, "grad_norm": 0.5710219187926011, "learning_rate": 3.125e-06, "loss": 1.0597, "mean_token_accuracy": 0.7161516606807709, "step": 95 }, { "epoch": 0.01645311889435041, "grad_norm": 0.573018657721767, "learning_rate": 3.289473684210527e-06, "loss": 1.0429, "mean_token_accuracy": 0.7194345474243165, "step": 100 }, { "epoch": 0.01727577483906793, "grad_norm": 0.5787220700238181, "learning_rate": 3.453947368421053e-06, "loss": 1.0441, "mean_token_accuracy": 0.7176929205656052, "step": 105 }, { "epoch": 0.018098430783785452, "grad_norm": 0.584885889850401, "learning_rate": 3.618421052631579e-06, "loss": 1.0426, "mean_token_accuracy": 0.7172926932573318, "step": 110 }, { "epoch": 0.018921086728502973, "grad_norm": 0.5865945154090116, "learning_rate": 3.7828947368421055e-06, "loss": 1.0174, "mean_token_accuracy": 0.7244779646396637, "step": 115 }, { "epoch": 0.01974374267322049, "grad_norm": 0.558398825504374, "learning_rate": 3.947368421052632e-06, "loss": 1.0563, "mean_token_accuracy": 0.7138217866420746, "step": 120 }, { "epoch": 0.020566398617938013, "grad_norm": 0.5525891228992657, "learning_rate": 4.111842105263158e-06, "loss": 1.0158, "mean_token_accuracy": 0.7233551248908043, "step": 125 }, { "epoch": 0.021389054562655534, "grad_norm": 0.598765463679675, "learning_rate": 4.276315789473684e-06, "loss": 1.0205, "mean_token_accuracy": 0.7228221029043198, "step": 130 }, { "epoch": 0.022211710507373052, "grad_norm": 0.5566364845753337, "learning_rate": 4.440789473684211e-06, "loss": 1.0126, "mean_token_accuracy": 0.72349284440279, "step": 135 }, { "epoch": 0.023034366452090574, "grad_norm": 0.5507380296058333, "learning_rate": 4.605263157894737e-06, "loss": 1.0123, "mean_token_accuracy": 0.7244027391076088, "step": 140 }, { "epoch": 0.023857022396808095, "grad_norm": 0.5453340035685558, "learning_rate": 4.769736842105264e-06, "loss": 1.0124, "mean_token_accuracy": 0.7230222299695015, "step": 145 }, { "epoch": 0.024679678341525617, "grad_norm": 0.5420850536327707, "learning_rate": 4.9342105263157895e-06, "loss": 1.011, "mean_token_accuracy": 0.7244324311614037, "step": 150 }, { "epoch": 0.025502334286243135, "grad_norm": 0.5415962863518953, "learning_rate": 5.098684210526316e-06, "loss": 1.0006, "mean_token_accuracy": 0.726619029045105, "step": 155 }, { "epoch": 0.026324990230960656, "grad_norm": 0.5315203834631941, "learning_rate": 5.263157894736842e-06, "loss": 0.9857, "mean_token_accuracy": 0.729770778119564, "step": 160 }, { "epoch": 0.027147646175678178, "grad_norm": 0.5274513053785929, "learning_rate": 5.4276315789473686e-06, "loss": 1.0065, "mean_token_accuracy": 0.7238917708396911, "step": 165 }, { "epoch": 0.027970302120395696, "grad_norm": 0.5376247705287334, "learning_rate": 5.592105263157896e-06, "loss": 0.9873, "mean_token_accuracy": 0.729346638917923, "step": 170 }, { "epoch": 0.028792958065113217, "grad_norm": 0.5377046214076631, "learning_rate": 5.756578947368422e-06, "loss": 1.002, "mean_token_accuracy": 0.7242323219776153, "step": 175 }, { "epoch": 0.02961561400983074, "grad_norm": 0.5431096787941521, "learning_rate": 5.921052631578948e-06, "loss": 0.9731, "mean_token_accuracy": 0.7317973271012306, "step": 180 }, { "epoch": 0.03043826995454826, "grad_norm": 0.552030481222372, "learning_rate": 6.085526315789474e-06, "loss": 0.9865, "mean_token_accuracy": 0.7296167179942131, "step": 185 }, { "epoch": 0.03126092589926578, "grad_norm": 0.5343638068177684, "learning_rate": 6.25e-06, "loss": 0.9703, "mean_token_accuracy": 0.7319935217499733, "step": 190 }, { "epoch": 0.0320835818439833, "grad_norm": 0.5364287078239771, "learning_rate": 6.4144736842105275e-06, "loss": 0.9795, "mean_token_accuracy": 0.7296116515994072, "step": 195 }, { "epoch": 0.03290623778870082, "grad_norm": 0.5385585237439168, "learning_rate": 6.578947368421054e-06, "loss": 0.9865, "mean_token_accuracy": 0.7276000782847405, "step": 200 }, { "epoch": 0.03372889373341834, "grad_norm": 0.5434647031096215, "learning_rate": 6.743421052631579e-06, "loss": 0.9681, "mean_token_accuracy": 0.7318137407302856, "step": 205 }, { "epoch": 0.03455154967813586, "grad_norm": 0.5442172403207912, "learning_rate": 6.907894736842106e-06, "loss": 0.9791, "mean_token_accuracy": 0.7292979001998902, "step": 210 }, { "epoch": 0.03537420562285338, "grad_norm": 0.5288039443136836, "learning_rate": 7.072368421052632e-06, "loss": 0.9575, "mean_token_accuracy": 0.7351444363594055, "step": 215 }, { "epoch": 0.036196861567570904, "grad_norm": 0.5401279785969058, "learning_rate": 7.236842105263158e-06, "loss": 0.9508, "mean_token_accuracy": 0.7365437626838685, "step": 220 }, { "epoch": 0.03701951751228842, "grad_norm": 0.543383850495259, "learning_rate": 7.4013157894736856e-06, "loss": 0.9436, "mean_token_accuracy": 0.7370979130268097, "step": 225 }, { "epoch": 0.03784217345700595, "grad_norm": 0.5414933506171394, "learning_rate": 7.565789473684211e-06, "loss": 0.964, "mean_token_accuracy": 0.7331722363829613, "step": 230 }, { "epoch": 0.038664829401723465, "grad_norm": 0.5508921858076679, "learning_rate": 7.730263157894737e-06, "loss": 0.965, "mean_token_accuracy": 0.7336499884724617, "step": 235 }, { "epoch": 0.03948748534644098, "grad_norm": 0.5559169426504186, "learning_rate": 7.894736842105265e-06, "loss": 0.9618, "mean_token_accuracy": 0.7327608689665794, "step": 240 }, { "epoch": 0.04031014129115851, "grad_norm": 0.5658377053423631, "learning_rate": 8.05921052631579e-06, "loss": 0.9539, "mean_token_accuracy": 0.7350834712386132, "step": 245 }, { "epoch": 0.041132797235876026, "grad_norm": 0.5504134873232572, "learning_rate": 8.223684210526316e-06, "loss": 0.9504, "mean_token_accuracy": 0.7352781683206558, "step": 250 }, { "epoch": 0.041955453180593544, "grad_norm": 0.5706949270079168, "learning_rate": 8.388157894736843e-06, "loss": 0.9434, "mean_token_accuracy": 0.7372456714510918, "step": 255 }, { "epoch": 0.04277810912531107, "grad_norm": 0.5552968882532056, "learning_rate": 8.552631578947368e-06, "loss": 0.9524, "mean_token_accuracy": 0.735064473748207, "step": 260 }, { "epoch": 0.04360076507002859, "grad_norm": 0.5680013693494521, "learning_rate": 8.717105263157895e-06, "loss": 0.9433, "mean_token_accuracy": 0.7370804220438003, "step": 265 }, { "epoch": 0.044423421014746105, "grad_norm": 0.556664521624949, "learning_rate": 8.881578947368423e-06, "loss": 0.9444, "mean_token_accuracy": 0.7375777631998062, "step": 270 }, { "epoch": 0.04524607695946363, "grad_norm": 0.5446190706298419, "learning_rate": 9.046052631578948e-06, "loss": 0.9475, "mean_token_accuracy": 0.7359046369791031, "step": 275 }, { "epoch": 0.04606873290418115, "grad_norm": 0.5388340538851397, "learning_rate": 9.210526315789474e-06, "loss": 0.9492, "mean_token_accuracy": 0.7363917216658592, "step": 280 }, { "epoch": 0.04689138884889867, "grad_norm": 0.5509220058564872, "learning_rate": 9.375000000000001e-06, "loss": 0.9523, "mean_token_accuracy": 0.7358365565538406, "step": 285 }, { "epoch": 0.04771404479361619, "grad_norm": 0.5544783093041145, "learning_rate": 9.539473684210528e-06, "loss": 0.9435, "mean_token_accuracy": 0.7366945788264274, "step": 290 }, { "epoch": 0.04853670073833371, "grad_norm": 0.5598670570217699, "learning_rate": 9.703947368421054e-06, "loss": 0.9509, "mean_token_accuracy": 0.7347988456487655, "step": 295 }, { "epoch": 0.049359356683051234, "grad_norm": 0.5512744808384158, "learning_rate": 9.868421052631579e-06, "loss": 0.9228, "mean_token_accuracy": 0.742069736123085, "step": 300 }, { "epoch": 0.05018201262776875, "grad_norm": 0.5341983218407055, "learning_rate": 1.0032894736842106e-05, "loss": 0.9297, "mean_token_accuracy": 0.740013287961483, "step": 305 }, { "epoch": 0.05100466857248627, "grad_norm": 0.5460625963418834, "learning_rate": 1.0197368421052632e-05, "loss": 0.944, "mean_token_accuracy": 0.7365748167037964, "step": 310 }, { "epoch": 0.051827324517203795, "grad_norm": 0.5477696289595027, "learning_rate": 1.0361842105263157e-05, "loss": 0.9277, "mean_token_accuracy": 0.7403417602181435, "step": 315 }, { "epoch": 0.05264998046192131, "grad_norm": 0.5528433926551594, "learning_rate": 1.0526315789473684e-05, "loss": 0.948, "mean_token_accuracy": 0.7348752021789551, "step": 320 }, { "epoch": 0.05347263640663883, "grad_norm": 0.5541848537799827, "learning_rate": 1.0690789473684212e-05, "loss": 0.9504, "mean_token_accuracy": 0.7345694780349732, "step": 325 }, { "epoch": 0.054295292351356356, "grad_norm": 0.5317200475270444, "learning_rate": 1.0855263157894737e-05, "loss": 0.9536, "mean_token_accuracy": 0.734566105902195, "step": 330 }, { "epoch": 0.055117948296073874, "grad_norm": 0.5541764042465507, "learning_rate": 1.1019736842105264e-05, "loss": 0.9445, "mean_token_accuracy": 0.7362502306699753, "step": 335 }, { "epoch": 0.05594060424079139, "grad_norm": 0.542700904879019, "learning_rate": 1.1184210526315792e-05, "loss": 0.9339, "mean_token_accuracy": 0.7383578926324844, "step": 340 }, { "epoch": 0.05676326018550892, "grad_norm": 0.5630009701684998, "learning_rate": 1.1348684210526317e-05, "loss": 0.9221, "mean_token_accuracy": 0.7414110735058784, "step": 345 }, { "epoch": 0.057585916130226435, "grad_norm": 0.5637165685168448, "learning_rate": 1.1513157894736844e-05, "loss": 0.9394, "mean_token_accuracy": 0.7377202868461609, "step": 350 }, { "epoch": 0.05840857207494396, "grad_norm": 0.5851556006438579, "learning_rate": 1.167763157894737e-05, "loss": 0.9339, "mean_token_accuracy": 0.7383405163884162, "step": 355 }, { "epoch": 0.05923122801966148, "grad_norm": 0.5548681236118901, "learning_rate": 1.1842105263157895e-05, "loss": 0.9188, "mean_token_accuracy": 0.7427036046981812, "step": 360 }, { "epoch": 0.060053883964378996, "grad_norm": 0.544317097107654, "learning_rate": 1.200657894736842e-05, "loss": 0.9268, "mean_token_accuracy": 0.7409031853079796, "step": 365 }, { "epoch": 0.06087653990909652, "grad_norm": 0.5288019993765805, "learning_rate": 1.2171052631578948e-05, "loss": 0.9385, "mean_token_accuracy": 0.7382649883627892, "step": 370 }, { "epoch": 0.06169919585381404, "grad_norm": 0.5418545591752311, "learning_rate": 1.2335526315789475e-05, "loss": 0.9376, "mean_token_accuracy": 0.737010209262371, "step": 375 }, { "epoch": 0.06252185179853156, "grad_norm": 0.5421635898523016, "learning_rate": 1.25e-05, "loss": 0.9181, "mean_token_accuracy": 0.7410633578896523, "step": 380 }, { "epoch": 0.06334450774324908, "grad_norm": 0.5281862526525664, "learning_rate": 1.2664473684210528e-05, "loss": 0.9203, "mean_token_accuracy": 0.7419573023915291, "step": 385 }, { "epoch": 0.0641671636879666, "grad_norm": 0.5168316340427207, "learning_rate": 1.2828947368421055e-05, "loss": 0.9018, "mean_token_accuracy": 0.7461319774389267, "step": 390 }, { "epoch": 0.06498981963268412, "grad_norm": 0.5460722185869403, "learning_rate": 1.299342105263158e-05, "loss": 0.921, "mean_token_accuracy": 0.7415965318679809, "step": 395 }, { "epoch": 0.06581247557740164, "grad_norm": 0.5468177485842205, "learning_rate": 1.3157894736842108e-05, "loss": 0.9231, "mean_token_accuracy": 0.7404225617647171, "step": 400 }, { "epoch": 0.06663513152211917, "grad_norm": 0.554365279399498, "learning_rate": 1.3322368421052631e-05, "loss": 0.9173, "mean_token_accuracy": 0.7422191500663757, "step": 405 }, { "epoch": 0.06745778746683669, "grad_norm": 0.5420472750237723, "learning_rate": 1.3486842105263159e-05, "loss": 0.9232, "mean_token_accuracy": 0.7407791033387184, "step": 410 }, { "epoch": 0.0682804434115542, "grad_norm": 0.5491312188223852, "learning_rate": 1.3651315789473684e-05, "loss": 0.9167, "mean_token_accuracy": 0.742340338230133, "step": 415 }, { "epoch": 0.06910309935627172, "grad_norm": 0.5506946662786645, "learning_rate": 1.3815789473684211e-05, "loss": 0.9176, "mean_token_accuracy": 0.7416011452674866, "step": 420 }, { "epoch": 0.06992575530098924, "grad_norm": 0.5392292592405425, "learning_rate": 1.3980263157894739e-05, "loss": 0.9016, "mean_token_accuracy": 0.746722374856472, "step": 425 }, { "epoch": 0.07074841124570676, "grad_norm": 0.5510905744764057, "learning_rate": 1.4144736842105264e-05, "loss": 0.9145, "mean_token_accuracy": 0.7429044425487519, "step": 430 }, { "epoch": 0.07157106719042429, "grad_norm": 0.5444285888806475, "learning_rate": 1.4309210526315791e-05, "loss": 0.9216, "mean_token_accuracy": 0.7397073268890381, "step": 435 }, { "epoch": 0.07239372313514181, "grad_norm": 0.5464240523802012, "learning_rate": 1.4473684210526317e-05, "loss": 0.9191, "mean_token_accuracy": 0.7409022390842438, "step": 440 }, { "epoch": 0.07321637907985933, "grad_norm": 0.5406639789016983, "learning_rate": 1.4638157894736844e-05, "loss": 0.9201, "mean_token_accuracy": 0.7413958802819252, "step": 445 }, { "epoch": 0.07403903502457684, "grad_norm": 0.5528623919452975, "learning_rate": 1.4802631578947371e-05, "loss": 0.9215, "mean_token_accuracy": 0.7393794819712639, "step": 450 }, { "epoch": 0.07486169096929436, "grad_norm": 0.5424634254063502, "learning_rate": 1.4967105263157895e-05, "loss": 0.914, "mean_token_accuracy": 0.7422097861766815, "step": 455 }, { "epoch": 0.0756843469140119, "grad_norm": 0.5635254903930312, "learning_rate": 1.5131578947368422e-05, "loss": 0.922, "mean_token_accuracy": 0.74082200974226, "step": 460 }, { "epoch": 0.07650700285872941, "grad_norm": 0.5595369183205897, "learning_rate": 1.529605263157895e-05, "loss": 0.9046, "mean_token_accuracy": 0.745243152976036, "step": 465 }, { "epoch": 0.07732965880344693, "grad_norm": 0.530529201529439, "learning_rate": 1.5460526315789475e-05, "loss": 0.9093, "mean_token_accuracy": 0.7439425513148308, "step": 470 }, { "epoch": 0.07815231474816445, "grad_norm": 0.5620135332904229, "learning_rate": 1.5625e-05, "loss": 0.9193, "mean_token_accuracy": 0.740068256855011, "step": 475 }, { "epoch": 0.07897497069288197, "grad_norm": 0.5385837098886685, "learning_rate": 1.578947368421053e-05, "loss": 0.8948, "mean_token_accuracy": 0.7465275138616562, "step": 480 }, { "epoch": 0.07979762663759948, "grad_norm": 0.5326438877919084, "learning_rate": 1.5953947368421055e-05, "loss": 0.8995, "mean_token_accuracy": 0.7465934857726098, "step": 485 }, { "epoch": 0.08062028258231702, "grad_norm": 0.5852668220941116, "learning_rate": 1.611842105263158e-05, "loss": 0.9205, "mean_token_accuracy": 0.7400019004940986, "step": 490 }, { "epoch": 0.08144293852703453, "grad_norm": 0.5466176713085582, "learning_rate": 1.628289473684211e-05, "loss": 0.9139, "mean_token_accuracy": 0.7423398822546006, "step": 495 }, { "epoch": 0.08226559447175205, "grad_norm": 0.5433898490851811, "learning_rate": 1.644736842105263e-05, "loss": 0.9083, "mean_token_accuracy": 0.7429317653179168, "step": 500 }, { "epoch": 0.08308825041646957, "grad_norm": 0.5422048906675139, "learning_rate": 1.661184210526316e-05, "loss": 0.8969, "mean_token_accuracy": 0.7470383390784263, "step": 505 }, { "epoch": 0.08391090636118709, "grad_norm": 0.5644795196299811, "learning_rate": 1.6776315789473686e-05, "loss": 0.9111, "mean_token_accuracy": 0.741926047205925, "step": 510 }, { "epoch": 0.08473356230590462, "grad_norm": 0.5529066782682016, "learning_rate": 1.694078947368421e-05, "loss": 0.9036, "mean_token_accuracy": 0.7436578303575516, "step": 515 }, { "epoch": 0.08555621825062214, "grad_norm": 0.525667862173336, "learning_rate": 1.7105263157894737e-05, "loss": 0.8925, "mean_token_accuracy": 0.746988432109356, "step": 520 }, { "epoch": 0.08637887419533966, "grad_norm": 0.5409944700392442, "learning_rate": 1.7269736842105265e-05, "loss": 0.8975, "mean_token_accuracy": 0.7464745938777924, "step": 525 }, { "epoch": 0.08720153014005717, "grad_norm": 0.5312950086150021, "learning_rate": 1.743421052631579e-05, "loss": 0.8841, "mean_token_accuracy": 0.7492008373141289, "step": 530 }, { "epoch": 0.08802418608477469, "grad_norm": 0.5463063267936722, "learning_rate": 1.7598684210526316e-05, "loss": 0.9152, "mean_token_accuracy": 0.7405162453651428, "step": 535 }, { "epoch": 0.08884684202949221, "grad_norm": 0.5288385163759731, "learning_rate": 1.7763157894736845e-05, "loss": 0.9162, "mean_token_accuracy": 0.7416113764047623, "step": 540 }, { "epoch": 0.08966949797420974, "grad_norm": 0.5489098095740034, "learning_rate": 1.792763157894737e-05, "loss": 0.8919, "mean_token_accuracy": 0.7470447152853013, "step": 545 }, { "epoch": 0.09049215391892726, "grad_norm": 0.5348984691264548, "learning_rate": 1.8092105263157896e-05, "loss": 0.8983, "mean_token_accuracy": 0.745751966536045, "step": 550 }, { "epoch": 0.09131480986364478, "grad_norm": 0.5557400796494141, "learning_rate": 1.8256578947368422e-05, "loss": 0.8988, "mean_token_accuracy": 0.7456147596240044, "step": 555 }, { "epoch": 0.0921374658083623, "grad_norm": 0.5455371304260814, "learning_rate": 1.8421052631578947e-05, "loss": 0.8851, "mean_token_accuracy": 0.7483421131968498, "step": 560 }, { "epoch": 0.09296012175307981, "grad_norm": 0.5475015495829477, "learning_rate": 1.8585526315789476e-05, "loss": 0.908, "mean_token_accuracy": 0.7433639690279961, "step": 565 }, { "epoch": 0.09378277769779735, "grad_norm": 0.5594798228947916, "learning_rate": 1.8750000000000002e-05, "loss": 0.8862, "mean_token_accuracy": 0.7483510807156563, "step": 570 }, { "epoch": 0.09460543364251486, "grad_norm": 0.5388047362933702, "learning_rate": 1.8914473684210527e-05, "loss": 0.8995, "mean_token_accuracy": 0.7457010865211486, "step": 575 }, { "epoch": 0.09542808958723238, "grad_norm": 0.5492873742584096, "learning_rate": 1.9078947368421056e-05, "loss": 0.8951, "mean_token_accuracy": 0.7464174717664719, "step": 580 }, { "epoch": 0.0962507455319499, "grad_norm": 0.5356009711104396, "learning_rate": 1.924342105263158e-05, "loss": 0.8918, "mean_token_accuracy": 0.746445195376873, "step": 585 }, { "epoch": 0.09707340147666742, "grad_norm": 0.5270780781231207, "learning_rate": 1.9407894736842107e-05, "loss": 0.8828, "mean_token_accuracy": 0.7489307940006256, "step": 590 }, { "epoch": 0.09789605742138494, "grad_norm": 0.5279403578740814, "learning_rate": 1.9572368421052633e-05, "loss": 0.8902, "mean_token_accuracy": 0.7478219836950302, "step": 595 }, { "epoch": 0.09871871336610247, "grad_norm": 0.5519831212862074, "learning_rate": 1.9736842105263158e-05, "loss": 0.9024, "mean_token_accuracy": 0.7444657281041145, "step": 600 }, { "epoch": 0.09954136931081999, "grad_norm": 0.5609787239404814, "learning_rate": 1.9901315789473684e-05, "loss": 0.897, "mean_token_accuracy": 0.7453935295343399, "step": 605 }, { "epoch": 0.1003640252555375, "grad_norm": 0.5368068215010757, "learning_rate": 1.999999340045725e-05, "loss": 0.8939, "mean_token_accuracy": 0.7461455389857292, "step": 610 }, { "epoch": 0.10118668120025502, "grad_norm": 0.5366441993422637, "learning_rate": 1.999991915570134e-05, "loss": 0.9089, "mean_token_accuracy": 0.7422514289617539, "step": 615 }, { "epoch": 0.10200933714497254, "grad_norm": 0.5374441483455675, "learning_rate": 1.9999762417375587e-05, "loss": 0.8964, "mean_token_accuracy": 0.745026272535324, "step": 620 }, { "epoch": 0.10283199308969006, "grad_norm": 0.5124044529280732, "learning_rate": 1.9999523186773e-05, "loss": 0.8899, "mean_token_accuracy": 0.7464932456612587, "step": 625 }, { "epoch": 0.10365464903440759, "grad_norm": 0.6100725380949276, "learning_rate": 1.9999201465867094e-05, "loss": 0.8951, "mean_token_accuracy": 0.7462658196687698, "step": 630 }, { "epoch": 0.10447730497912511, "grad_norm": 0.8343528315114032, "learning_rate": 1.9998797257311883e-05, "loss": 0.8909, "mean_token_accuracy": 0.7470527231693268, "step": 635 }, { "epoch": 0.10529996092384263, "grad_norm": 0.558356952046808, "learning_rate": 1.999831056444185e-05, "loss": 0.8964, "mean_token_accuracy": 0.7445585608482361, "step": 640 }, { "epoch": 0.10612261686856014, "grad_norm": 0.5431231447370339, "learning_rate": 1.999774139127193e-05, "loss": 0.8956, "mean_token_accuracy": 0.7458587154746056, "step": 645 }, { "epoch": 0.10694527281327766, "grad_norm": 0.5381134640947329, "learning_rate": 1.9997089742497476e-05, "loss": 0.8857, "mean_token_accuracy": 0.748266126215458, "step": 650 }, { "epoch": 0.1077679287579952, "grad_norm": 0.5322447682549837, "learning_rate": 1.9996355623494218e-05, "loss": 0.9021, "mean_token_accuracy": 0.7439694702625275, "step": 655 }, { "epoch": 0.10859058470271271, "grad_norm": 0.5466327025094077, "learning_rate": 1.9995539040318208e-05, "loss": 0.8894, "mean_token_accuracy": 0.7468291491270065, "step": 660 }, { "epoch": 0.10941324064743023, "grad_norm": 0.5336161472816302, "learning_rate": 1.999463999970579e-05, "loss": 0.9057, "mean_token_accuracy": 0.7425391405820847, "step": 665 }, { "epoch": 0.11023589659214775, "grad_norm": 0.5408455468273392, "learning_rate": 1.9993658509073533e-05, "loss": 0.9111, "mean_token_accuracy": 0.7423589944839477, "step": 670 }, { "epoch": 0.11105855253686527, "grad_norm": 0.5267122029736967, "learning_rate": 1.999259457651817e-05, "loss": 0.8959, "mean_token_accuracy": 0.745606929063797, "step": 675 }, { "epoch": 0.11188120848158278, "grad_norm": 0.5397388033419896, "learning_rate": 1.999144821081652e-05, "loss": 0.8993, "mean_token_accuracy": 0.7442831248044968, "step": 680 }, { "epoch": 0.11270386442630032, "grad_norm": 0.5504036166120375, "learning_rate": 1.9990219421425453e-05, "loss": 0.8833, "mean_token_accuracy": 0.7490384519100189, "step": 685 }, { "epoch": 0.11352652037101783, "grad_norm": 0.5182347036559498, "learning_rate": 1.9988908218481767e-05, "loss": 0.8748, "mean_token_accuracy": 0.7514773726463317, "step": 690 }, { "epoch": 0.11434917631573535, "grad_norm": 0.5233832271009863, "learning_rate": 1.9987514612802133e-05, "loss": 0.8873, "mean_token_accuracy": 0.7469314694404602, "step": 695 }, { "epoch": 0.11517183226045287, "grad_norm": 0.5523260071592497, "learning_rate": 1.998603861588299e-05, "loss": 0.8843, "mean_token_accuracy": 0.7485016688704491, "step": 700 }, { "epoch": 0.11599448820517039, "grad_norm": 0.5303091957773997, "learning_rate": 1.9984480239900466e-05, "loss": 0.8914, "mean_token_accuracy": 0.7466285347938537, "step": 705 }, { "epoch": 0.11681714414988792, "grad_norm": 0.507321203523412, "learning_rate": 1.9982839497710266e-05, "loss": 0.875, "mean_token_accuracy": 0.7516857028007508, "step": 710 }, { "epoch": 0.11763980009460544, "grad_norm": 0.5454186217128951, "learning_rate": 1.9981116402847563e-05, "loss": 0.8803, "mean_token_accuracy": 0.7486071839928627, "step": 715 }, { "epoch": 0.11846245603932296, "grad_norm": 0.5227676325761418, "learning_rate": 1.99793109695269e-05, "loss": 0.8736, "mean_token_accuracy": 0.7507883757352829, "step": 720 }, { "epoch": 0.11928511198404047, "grad_norm": 0.5219317784269293, "learning_rate": 1.997742321264206e-05, "loss": 0.8763, "mean_token_accuracy": 0.7492267623543739, "step": 725 }, { "epoch": 0.12010776792875799, "grad_norm": 0.5149947878213966, "learning_rate": 1.997545314776595e-05, "loss": 0.8827, "mean_token_accuracy": 0.748878738284111, "step": 730 }, { "epoch": 0.12093042387347551, "grad_norm": 0.5074984695101156, "learning_rate": 1.997340079115047e-05, "loss": 0.883, "mean_token_accuracy": 0.7492172762751579, "step": 735 }, { "epoch": 0.12175307981819304, "grad_norm": 0.5496130950134909, "learning_rate": 1.997126615972638e-05, "loss": 0.8845, "mean_token_accuracy": 0.7479833096265793, "step": 740 }, { "epoch": 0.12257573576291056, "grad_norm": 0.5210737933049328, "learning_rate": 1.9969049271103153e-05, "loss": 0.8737, "mean_token_accuracy": 0.7510244935750962, "step": 745 }, { "epoch": 0.12339839170762808, "grad_norm": 0.536745408024483, "learning_rate": 1.9966750143568852e-05, "loss": 0.8937, "mean_token_accuracy": 0.7455605834722518, "step": 750 }, { "epoch": 0.1242210476523456, "grad_norm": 0.5300256797775801, "learning_rate": 1.9964368796089947e-05, "loss": 0.8838, "mean_token_accuracy": 0.7487001419067383, "step": 755 }, { "epoch": 0.12504370359706313, "grad_norm": 0.5271533406750634, "learning_rate": 1.9961905248311184e-05, "loss": 0.8769, "mean_token_accuracy": 0.7500181883573532, "step": 760 }, { "epoch": 0.12586635954178063, "grad_norm": 0.5157607228356845, "learning_rate": 1.9959359520555416e-05, "loss": 0.8771, "mean_token_accuracy": 0.7503041535615921, "step": 765 }, { "epoch": 0.12668901548649816, "grad_norm": 0.5341798725462151, "learning_rate": 1.9956731633823424e-05, "loss": 0.8778, "mean_token_accuracy": 0.7497560173273087, "step": 770 }, { "epoch": 0.12751167143121567, "grad_norm": 0.5209553638678202, "learning_rate": 1.995402160979376e-05, "loss": 0.8781, "mean_token_accuracy": 0.7494660302996635, "step": 775 }, { "epoch": 0.1283343273759332, "grad_norm": 0.5221841761998345, "learning_rate": 1.9951229470822565e-05, "loss": 0.8655, "mean_token_accuracy": 0.752608010172844, "step": 780 }, { "epoch": 0.12915698332065073, "grad_norm": 0.5245222958924632, "learning_rate": 1.9948355239943374e-05, "loss": 0.8613, "mean_token_accuracy": 0.7529785528779029, "step": 785 }, { "epoch": 0.12997963926536824, "grad_norm": 0.5265364673097621, "learning_rate": 1.9945398940866937e-05, "loss": 0.8876, "mean_token_accuracy": 0.7471862375736237, "step": 790 }, { "epoch": 0.13080229521008577, "grad_norm": 0.5260907661900732, "learning_rate": 1.9942360597981013e-05, "loss": 0.8583, "mean_token_accuracy": 0.753960582613945, "step": 795 }, { "epoch": 0.13162495115480327, "grad_norm": 0.5743848884381613, "learning_rate": 1.9939240236350178e-05, "loss": 0.8657, "mean_token_accuracy": 0.7516094073653221, "step": 800 }, { "epoch": 0.1324476070995208, "grad_norm": 0.5171324312286166, "learning_rate": 1.9936037881715628e-05, "loss": 0.8767, "mean_token_accuracy": 0.7495281368494033, "step": 805 }, { "epoch": 0.13327026304423834, "grad_norm": 0.5100587281725074, "learning_rate": 1.9932753560494933e-05, "loss": 0.8576, "mean_token_accuracy": 0.7541581332683563, "step": 810 }, { "epoch": 0.13409291898895584, "grad_norm": 0.5141233592960893, "learning_rate": 1.9929387299781854e-05, "loss": 0.875, "mean_token_accuracy": 0.7506367653608322, "step": 815 }, { "epoch": 0.13491557493367337, "grad_norm": 0.527763043999431, "learning_rate": 1.99259391273461e-05, "loss": 0.8673, "mean_token_accuracy": 0.7517456442117691, "step": 820 }, { "epoch": 0.13573823087839088, "grad_norm": 0.5274939146122033, "learning_rate": 1.992240907163311e-05, "loss": 0.8699, "mean_token_accuracy": 0.751110564172268, "step": 825 }, { "epoch": 0.1365608868231084, "grad_norm": 0.5049234942350854, "learning_rate": 1.991879716176381e-05, "loss": 0.8762, "mean_token_accuracy": 0.7497649356722832, "step": 830 }, { "epoch": 0.13738354276782594, "grad_norm": 0.5205008742132803, "learning_rate": 1.9915103427534368e-05, "loss": 0.8832, "mean_token_accuracy": 0.7477307453751564, "step": 835 }, { "epoch": 0.13820619871254344, "grad_norm": 0.5168311375654675, "learning_rate": 1.9911327899415966e-05, "loss": 0.8533, "mean_token_accuracy": 0.7555161505937577, "step": 840 }, { "epoch": 0.13902885465726098, "grad_norm": 0.5348221273327664, "learning_rate": 1.9907470608554534e-05, "loss": 0.8838, "mean_token_accuracy": 0.7472988218069077, "step": 845 }, { "epoch": 0.13985151060197848, "grad_norm": 0.5166138723242221, "learning_rate": 1.9903531586770502e-05, "loss": 0.8611, "mean_token_accuracy": 0.753131790459156, "step": 850 }, { "epoch": 0.140674166546696, "grad_norm": 0.5152139619423953, "learning_rate": 1.989951086655853e-05, "loss": 0.8605, "mean_token_accuracy": 0.7535258084535599, "step": 855 }, { "epoch": 0.14149682249141352, "grad_norm": 0.5120445607008338, "learning_rate": 1.9895408481087237e-05, "loss": 0.8693, "mean_token_accuracy": 0.750883474946022, "step": 860 }, { "epoch": 0.14231947843613105, "grad_norm": 0.5353562200419911, "learning_rate": 1.989122446419894e-05, "loss": 0.8603, "mean_token_accuracy": 0.7536425784230232, "step": 865 }, { "epoch": 0.14314213438084858, "grad_norm": 0.5232052255550174, "learning_rate": 1.9886958850409373e-05, "loss": 0.8489, "mean_token_accuracy": 0.7574348479509354, "step": 870 }, { "epoch": 0.14396479032556608, "grad_norm": 0.5189200962721031, "learning_rate": 1.9882611674907384e-05, "loss": 0.8765, "mean_token_accuracy": 0.7496709749102592, "step": 875 }, { "epoch": 0.14478744627028362, "grad_norm": 0.5211464682561241, "learning_rate": 1.987818297355467e-05, "loss": 0.8714, "mean_token_accuracy": 0.7504607364535332, "step": 880 }, { "epoch": 0.14561010221500112, "grad_norm": 0.5489607475955633, "learning_rate": 1.9873672782885464e-05, "loss": 0.8596, "mean_token_accuracy": 0.7535700961947441, "step": 885 }, { "epoch": 0.14643275815971865, "grad_norm": 0.5160101761749724, "learning_rate": 1.986908114010624e-05, "loss": 0.8732, "mean_token_accuracy": 0.7504155531525611, "step": 890 }, { "epoch": 0.14725541410443618, "grad_norm": 0.5076227357713657, "learning_rate": 1.9864408083095404e-05, "loss": 0.8639, "mean_token_accuracy": 0.753124563395977, "step": 895 }, { "epoch": 0.1480780700491537, "grad_norm": 0.49835036131210797, "learning_rate": 1.985965365040298e-05, "loss": 0.8597, "mean_token_accuracy": 0.7533095374703407, "step": 900 }, { "epoch": 0.14890072599387122, "grad_norm": 0.509710451172513, "learning_rate": 1.9854817881250305e-05, "loss": 0.871, "mean_token_accuracy": 0.750696936249733, "step": 905 }, { "epoch": 0.14972338193858872, "grad_norm": 0.5049608859249515, "learning_rate": 1.984990081552968e-05, "loss": 0.8702, "mean_token_accuracy": 0.7507258713245392, "step": 910 }, { "epoch": 0.15054603788330626, "grad_norm": 0.49909109319021766, "learning_rate": 1.9844902493804066e-05, "loss": 0.8572, "mean_token_accuracy": 0.7539574787020683, "step": 915 }, { "epoch": 0.1513686938280238, "grad_norm": 0.49571925929775623, "learning_rate": 1.9839822957306736e-05, "loss": 0.8605, "mean_token_accuracy": 0.7532083630561829, "step": 920 }, { "epoch": 0.1521913497727413, "grad_norm": 0.5153785020226853, "learning_rate": 1.9834662247940944e-05, "loss": 0.8784, "mean_token_accuracy": 0.7478323370218277, "step": 925 }, { "epoch": 0.15301400571745882, "grad_norm": 0.5398876116548915, "learning_rate": 1.9829420408279562e-05, "loss": 0.868, "mean_token_accuracy": 0.7509521454572677, "step": 930 }, { "epoch": 0.15383666166217633, "grad_norm": 0.48270207148584904, "learning_rate": 1.9824097481564747e-05, "loss": 0.8561, "mean_token_accuracy": 0.7542909920215607, "step": 935 }, { "epoch": 0.15465931760689386, "grad_norm": 0.524867251899091, "learning_rate": 1.981869351170758e-05, "loss": 0.8736, "mean_token_accuracy": 0.7503400355577469, "step": 940 }, { "epoch": 0.1554819735516114, "grad_norm": 0.5100977008298381, "learning_rate": 1.9813208543287695e-05, "loss": 0.8481, "mean_token_accuracy": 0.7575345560908318, "step": 945 }, { "epoch": 0.1563046294963289, "grad_norm": 0.48795404928038255, "learning_rate": 1.9807642621552917e-05, "loss": 0.8572, "mean_token_accuracy": 0.7539456769824028, "step": 950 }, { "epoch": 0.15712728544104643, "grad_norm": 0.5027961664704412, "learning_rate": 1.98019957924189e-05, "loss": 0.8667, "mean_token_accuracy": 0.752139475941658, "step": 955 }, { "epoch": 0.15794994138576393, "grad_norm": 0.5386480081857747, "learning_rate": 1.9796268102468725e-05, "loss": 0.8513, "mean_token_accuracy": 0.7559567838907242, "step": 960 }, { "epoch": 0.15877259733048146, "grad_norm": 0.5175717997997794, "learning_rate": 1.9790459598952536e-05, "loss": 0.8803, "mean_token_accuracy": 0.7479651778936386, "step": 965 }, { "epoch": 0.15959525327519897, "grad_norm": 0.5143868261934511, "learning_rate": 1.978457032978714e-05, "loss": 0.8429, "mean_token_accuracy": 0.7570624753832818, "step": 970 }, { "epoch": 0.1604179092199165, "grad_norm": 0.49533338338098143, "learning_rate": 1.9778600343555605e-05, "loss": 0.8517, "mean_token_accuracy": 0.7554185375571251, "step": 975 }, { "epoch": 0.16124056516463403, "grad_norm": 0.4991984990971242, "learning_rate": 1.9772549689506884e-05, "loss": 0.8543, "mean_token_accuracy": 0.754626490175724, "step": 980 }, { "epoch": 0.16206322110935154, "grad_norm": 0.5083109115077165, "learning_rate": 1.976641841755539e-05, "loss": 0.8478, "mean_token_accuracy": 0.7574628874659538, "step": 985 }, { "epoch": 0.16288587705406907, "grad_norm": 0.5319777208767457, "learning_rate": 1.9760206578280575e-05, "loss": 0.8499, "mean_token_accuracy": 0.7560464650392532, "step": 990 }, { "epoch": 0.16370853299878657, "grad_norm": 0.4951284709970585, "learning_rate": 1.9753914222926535e-05, "loss": 0.8608, "mean_token_accuracy": 0.7531494438648224, "step": 995 }, { "epoch": 0.1645311889435041, "grad_norm": 0.503681838855057, "learning_rate": 1.9747541403401578e-05, "loss": 0.8423, "mean_token_accuracy": 0.7574863955378532, "step": 1000 }, { "epoch": 0.16535384488822163, "grad_norm": 0.49123378419471314, "learning_rate": 1.974108817227779e-05, "loss": 0.8576, "mean_token_accuracy": 0.7528282403945923, "step": 1005 }, { "epoch": 0.16617650083293914, "grad_norm": 0.5000661712630838, "learning_rate": 1.9734554582790608e-05, "loss": 0.8484, "mean_token_accuracy": 0.7563204362988472, "step": 1010 }, { "epoch": 0.16699915677765667, "grad_norm": 0.5153452424338367, "learning_rate": 1.972794068883838e-05, "loss": 0.8778, "mean_token_accuracy": 0.7484072625637055, "step": 1015 }, { "epoch": 0.16782181272237418, "grad_norm": 0.5134902481601968, "learning_rate": 1.9721246544981923e-05, "loss": 0.8525, "mean_token_accuracy": 0.7551195606589317, "step": 1020 }, { "epoch": 0.1686444686670917, "grad_norm": 0.5097104854680992, "learning_rate": 1.971447220644406e-05, "loss": 0.8413, "mean_token_accuracy": 0.7584511294960976, "step": 1025 }, { "epoch": 0.16946712461180924, "grad_norm": 0.4938813841021338, "learning_rate": 1.9707617729109184e-05, "loss": 0.8581, "mean_token_accuracy": 0.7532594665884972, "step": 1030 }, { "epoch": 0.17028978055652674, "grad_norm": 0.5145824607049234, "learning_rate": 1.970068316952278e-05, "loss": 0.8533, "mean_token_accuracy": 0.7537665665149689, "step": 1035 }, { "epoch": 0.17111243650124427, "grad_norm": 0.5341015730254354, "learning_rate": 1.969366858489097e-05, "loss": 0.856, "mean_token_accuracy": 0.7545783296227455, "step": 1040 }, { "epoch": 0.17193509244596178, "grad_norm": 0.4862193099908953, "learning_rate": 1.9686574033080036e-05, "loss": 0.8468, "mean_token_accuracy": 0.7559708222746849, "step": 1045 }, { "epoch": 0.1727577483906793, "grad_norm": 0.4814229405287128, "learning_rate": 1.9679399572615937e-05, "loss": 0.856, "mean_token_accuracy": 0.7539315417408943, "step": 1050 }, { "epoch": 0.17358040433539682, "grad_norm": 0.4944731279093859, "learning_rate": 1.967214526268384e-05, "loss": 0.8726, "mean_token_accuracy": 0.7499770954251289, "step": 1055 }, { "epoch": 0.17440306028011435, "grad_norm": 0.4991505447385332, "learning_rate": 1.966481116312762e-05, "loss": 0.8433, "mean_token_accuracy": 0.7570923194289207, "step": 1060 }, { "epoch": 0.17522571622483188, "grad_norm": 0.5138326811759314, "learning_rate": 1.9657397334449375e-05, "loss": 0.8485, "mean_token_accuracy": 0.7562300756573677, "step": 1065 }, { "epoch": 0.17604837216954938, "grad_norm": 0.5092615717501003, "learning_rate": 1.9649903837808916e-05, "loss": 0.8697, "mean_token_accuracy": 0.7506465047597886, "step": 1070 }, { "epoch": 0.17687102811426691, "grad_norm": 0.5033435400519938, "learning_rate": 1.9642330735023267e-05, "loss": 0.8654, "mean_token_accuracy": 0.7508367002010345, "step": 1075 }, { "epoch": 0.17769368405898442, "grad_norm": 0.5168982193815044, "learning_rate": 1.9634678088566168e-05, "loss": 0.8591, "mean_token_accuracy": 0.7536868438124656, "step": 1080 }, { "epoch": 0.17851634000370195, "grad_norm": 0.5151403842367096, "learning_rate": 1.962694596156754e-05, "loss": 0.8527, "mean_token_accuracy": 0.7550183489918709, "step": 1085 }, { "epoch": 0.17933899594841948, "grad_norm": 0.49649916267660615, "learning_rate": 1.9619134417812973e-05, "loss": 0.8513, "mean_token_accuracy": 0.7541601672768593, "step": 1090 }, { "epoch": 0.180161651893137, "grad_norm": 0.5010837096062933, "learning_rate": 1.9611243521743206e-05, "loss": 0.8363, "mean_token_accuracy": 0.7587568670511246, "step": 1095 }, { "epoch": 0.18098430783785452, "grad_norm": 0.5083723554004187, "learning_rate": 1.9603273338453578e-05, "loss": 0.8464, "mean_token_accuracy": 0.7566246822476387, "step": 1100 }, { "epoch": 0.18180696378257202, "grad_norm": 0.4977874437155869, "learning_rate": 1.9595223933693513e-05, "loss": 0.8383, "mean_token_accuracy": 0.7582577303051948, "step": 1105 }, { "epoch": 0.18262961972728955, "grad_norm": 0.501901395137192, "learning_rate": 1.958709537386595e-05, "loss": 0.8498, "mean_token_accuracy": 0.7560865581035614, "step": 1110 }, { "epoch": 0.1834522756720071, "grad_norm": 0.49458223140299146, "learning_rate": 1.957888772602684e-05, "loss": 0.8618, "mean_token_accuracy": 0.7532573893666268, "step": 1115 }, { "epoch": 0.1842749316167246, "grad_norm": 0.5269418394041435, "learning_rate": 1.9570601057884536e-05, "loss": 0.8571, "mean_token_accuracy": 0.753414836525917, "step": 1120 }, { "epoch": 0.18509758756144212, "grad_norm": 0.5163771321345518, "learning_rate": 1.9562235437799275e-05, "loss": 0.8497, "mean_token_accuracy": 0.7553903549909592, "step": 1125 }, { "epoch": 0.18592024350615963, "grad_norm": 0.502261716436892, "learning_rate": 1.9553790934782604e-05, "loss": 0.8469, "mean_token_accuracy": 0.7560562178492546, "step": 1130 }, { "epoch": 0.18674289945087716, "grad_norm": 0.5209296946706878, "learning_rate": 1.9545267618496804e-05, "loss": 0.8484, "mean_token_accuracy": 0.7563410833477974, "step": 1135 }, { "epoch": 0.1875655553955947, "grad_norm": 0.5207128822913797, "learning_rate": 1.9536665559254326e-05, "loss": 0.8407, "mean_token_accuracy": 0.7571968659758568, "step": 1140 }, { "epoch": 0.1883882113403122, "grad_norm": 0.49972304278342206, "learning_rate": 1.9527984828017196e-05, "loss": 0.8487, "mean_token_accuracy": 0.754766346514225, "step": 1145 }, { "epoch": 0.18921086728502973, "grad_norm": 0.47925742134064225, "learning_rate": 1.9519225496396448e-05, "loss": 0.8537, "mean_token_accuracy": 0.7545133501291275, "step": 1150 }, { "epoch": 0.19003352322974723, "grad_norm": 0.5071536005910057, "learning_rate": 1.951038763665152e-05, "loss": 0.8366, "mean_token_accuracy": 0.7582453057169914, "step": 1155 }, { "epoch": 0.19085617917446476, "grad_norm": 0.4983721931463634, "learning_rate": 1.9501471321689656e-05, "loss": 0.849, "mean_token_accuracy": 0.7552655354142189, "step": 1160 }, { "epoch": 0.19167883511918227, "grad_norm": 0.506485196794648, "learning_rate": 1.949247662506532e-05, "loss": 0.8557, "mean_token_accuracy": 0.7530778959393502, "step": 1165 }, { "epoch": 0.1925014910638998, "grad_norm": 0.5002249577104929, "learning_rate": 1.9483403620979575e-05, "loss": 0.8528, "mean_token_accuracy": 0.7551621690392494, "step": 1170 }, { "epoch": 0.19332414700861733, "grad_norm": 0.496427408373039, "learning_rate": 1.9474252384279474e-05, "loss": 0.8431, "mean_token_accuracy": 0.7569632783532143, "step": 1175 }, { "epoch": 0.19414680295333483, "grad_norm": 0.5018483387612742, "learning_rate": 1.946502299045745e-05, "loss": 0.8528, "mean_token_accuracy": 0.7554535329341888, "step": 1180 }, { "epoch": 0.19496945889805237, "grad_norm": 0.49616796294138404, "learning_rate": 1.945571551565068e-05, "loss": 0.8554, "mean_token_accuracy": 0.7546748697757721, "step": 1185 }, { "epoch": 0.19579211484276987, "grad_norm": 0.5845591747496165, "learning_rate": 1.9446330036640463e-05, "loss": 0.8391, "mean_token_accuracy": 0.7581113472580909, "step": 1190 }, { "epoch": 0.1966147707874874, "grad_norm": 0.4853170219333744, "learning_rate": 1.94368666308516e-05, "loss": 0.846, "mean_token_accuracy": 0.7570951789617538, "step": 1195 }, { "epoch": 0.19743742673220493, "grad_norm": 0.48722657912701345, "learning_rate": 1.9427325376351725e-05, "loss": 0.8303, "mean_token_accuracy": 0.7598640695214272, "step": 1200 }, { "epoch": 0.19826008267692244, "grad_norm": 0.5042027965768325, "learning_rate": 1.9417706351850695e-05, "loss": 0.8583, "mean_token_accuracy": 0.7516935631632805, "step": 1205 }, { "epoch": 0.19908273862163997, "grad_norm": 0.4940368036308394, "learning_rate": 1.9408009636699924e-05, "loss": 0.8335, "mean_token_accuracy": 0.759718143939972, "step": 1210 }, { "epoch": 0.19990539456635747, "grad_norm": 0.47059780202683466, "learning_rate": 1.939823531089171e-05, "loss": 0.8408, "mean_token_accuracy": 0.7574658304452896, "step": 1215 }, { "epoch": 0.200728050511075, "grad_norm": 0.5068429632390564, "learning_rate": 1.9388383455058618e-05, "loss": 0.8517, "mean_token_accuracy": 0.7539377346634865, "step": 1220 }, { "epoch": 0.20155070645579254, "grad_norm": 0.48967734068978747, "learning_rate": 1.937845415047278e-05, "loss": 0.8426, "mean_token_accuracy": 0.756514236330986, "step": 1225 }, { "epoch": 0.20237336240051004, "grad_norm": 0.49532313945861584, "learning_rate": 1.936844747904523e-05, "loss": 0.8427, "mean_token_accuracy": 0.7562633723020553, "step": 1230 }, { "epoch": 0.20319601834522757, "grad_norm": 0.4970609063296475, "learning_rate": 1.9358363523325242e-05, "loss": 0.8486, "mean_token_accuracy": 0.7552574440836907, "step": 1235 }, { "epoch": 0.20401867428994508, "grad_norm": 0.503921044921938, "learning_rate": 1.9348202366499648e-05, "loss": 0.8256, "mean_token_accuracy": 0.761341966688633, "step": 1240 }, { "epoch": 0.2048413302346626, "grad_norm": 0.49051614448767267, "learning_rate": 1.9337964092392126e-05, "loss": 0.8529, "mean_token_accuracy": 0.7548203825950622, "step": 1245 }, { "epoch": 0.20566398617938011, "grad_norm": 0.5090294294636506, "learning_rate": 1.9327648785462538e-05, "loss": 0.8376, "mean_token_accuracy": 0.7575676992535592, "step": 1250 }, { "epoch": 0.20648664212409765, "grad_norm": 0.5075980961396112, "learning_rate": 1.9317256530806222e-05, "loss": 0.8516, "mean_token_accuracy": 0.7538551047444344, "step": 1255 }, { "epoch": 0.20730929806881518, "grad_norm": 0.4900209741359357, "learning_rate": 1.9306787414153298e-05, "loss": 0.831, "mean_token_accuracy": 0.7598697379231453, "step": 1260 }, { "epoch": 0.20813195401353268, "grad_norm": 0.5203238329559228, "learning_rate": 1.9296241521867928e-05, "loss": 0.8494, "mean_token_accuracy": 0.7589497119188309, "step": 1265 }, { "epoch": 0.20895460995825021, "grad_norm": 0.48535675543969, "learning_rate": 1.928561894094766e-05, "loss": 0.8458, "mean_token_accuracy": 0.7570876494050026, "step": 1270 }, { "epoch": 0.20977726590296772, "grad_norm": 0.49396178842702837, "learning_rate": 1.9274919759022656e-05, "loss": 0.8497, "mean_token_accuracy": 0.7565696477890015, "step": 1275 }, { "epoch": 0.21059992184768525, "grad_norm": 0.49216876968453754, "learning_rate": 1.926414406435501e-05, "loss": 0.8446, "mean_token_accuracy": 0.756624485552311, "step": 1280 }, { "epoch": 0.21142257779240278, "grad_norm": 0.5008436281979866, "learning_rate": 1.925329194583798e-05, "loss": 0.8517, "mean_token_accuracy": 0.754089592397213, "step": 1285 }, { "epoch": 0.2122452337371203, "grad_norm": 0.5004999051446964, "learning_rate": 1.92423634929953e-05, "loss": 0.8401, "mean_token_accuracy": 0.7566401407122612, "step": 1290 }, { "epoch": 0.21306788968183782, "grad_norm": 0.48816269661656086, "learning_rate": 1.9231358795980403e-05, "loss": 0.8405, "mean_token_accuracy": 0.756783838570118, "step": 1295 }, { "epoch": 0.21389054562655532, "grad_norm": 0.4919235789237274, "learning_rate": 1.92202779455757e-05, "loss": 0.826, "mean_token_accuracy": 0.7607905507087708, "step": 1300 }, { "epoch": 0.21471320157127285, "grad_norm": 0.7661640222387942, "learning_rate": 1.9209121033191812e-05, "loss": 0.8376, "mean_token_accuracy": 0.7586381390690804, "step": 1305 }, { "epoch": 0.2155358575159904, "grad_norm": 0.4954106658080673, "learning_rate": 1.919788815086685e-05, "loss": 0.8525, "mean_token_accuracy": 0.7540074199438095, "step": 1310 }, { "epoch": 0.2163585134607079, "grad_norm": 0.5021449161441909, "learning_rate": 1.918657939126561e-05, "loss": 0.8448, "mean_token_accuracy": 0.7563427060842514, "step": 1315 }, { "epoch": 0.21718116940542542, "grad_norm": 0.5178178351844417, "learning_rate": 1.9175194847678843e-05, "loss": 0.844, "mean_token_accuracy": 0.7559559151530266, "step": 1320 }, { "epoch": 0.21800382535014293, "grad_norm": 0.477680111537871, "learning_rate": 1.9163734614022468e-05, "loss": 0.8418, "mean_token_accuracy": 0.7570261433720589, "step": 1325 }, { "epoch": 0.21882648129486046, "grad_norm": 0.48956207759482606, "learning_rate": 1.915219878483682e-05, "loss": 0.8338, "mean_token_accuracy": 0.7595634296536445, "step": 1330 }, { "epoch": 0.219649137239578, "grad_norm": 0.47761391117820046, "learning_rate": 1.914058745528583e-05, "loss": 0.8447, "mean_token_accuracy": 0.7561558216810227, "step": 1335 }, { "epoch": 0.2204717931842955, "grad_norm": 0.4947874741327767, "learning_rate": 1.9128900721156278e-05, "loss": 0.8424, "mean_token_accuracy": 0.7564474269747734, "step": 1340 }, { "epoch": 0.22129444912901303, "grad_norm": 0.4899693979055571, "learning_rate": 1.911713867885699e-05, "loss": 0.8359, "mean_token_accuracy": 0.7585081905126572, "step": 1345 }, { "epoch": 0.22211710507373053, "grad_norm": 0.4935517337806646, "learning_rate": 1.9105301425418026e-05, "loss": 0.8415, "mean_token_accuracy": 0.75685965269804, "step": 1350 }, { "epoch": 0.22293976101844806, "grad_norm": 0.5061631276902183, "learning_rate": 1.9093389058489914e-05, "loss": 0.8375, "mean_token_accuracy": 0.758575314283371, "step": 1355 }, { "epoch": 0.22376241696316557, "grad_norm": 0.5078931706022867, "learning_rate": 1.9081401676342818e-05, "loss": 0.8534, "mean_token_accuracy": 0.753215454518795, "step": 1360 }, { "epoch": 0.2245850729078831, "grad_norm": 0.5119571441720575, "learning_rate": 1.906933937786572e-05, "loss": 0.8418, "mean_token_accuracy": 0.7567502170801162, "step": 1365 }, { "epoch": 0.22540772885260063, "grad_norm": 0.47161144137792904, "learning_rate": 1.9057202262565638e-05, "loss": 0.8251, "mean_token_accuracy": 0.7605437263846397, "step": 1370 }, { "epoch": 0.22623038479731813, "grad_norm": 0.5050988560137499, "learning_rate": 1.9044990430566784e-05, "loss": 0.8454, "mean_token_accuracy": 0.756630253791809, "step": 1375 }, { "epoch": 0.22705304074203567, "grad_norm": 0.5013916937652448, "learning_rate": 1.903270398260972e-05, "loss": 0.8353, "mean_token_accuracy": 0.7572322770953178, "step": 1380 }, { "epoch": 0.22787569668675317, "grad_norm": 0.49065160311156825, "learning_rate": 1.9020343020050577e-05, "loss": 0.8355, "mean_token_accuracy": 0.7583986625075341, "step": 1385 }, { "epoch": 0.2286983526314707, "grad_norm": 0.4855691233257167, "learning_rate": 1.9007907644860156e-05, "loss": 0.8405, "mean_token_accuracy": 0.7571400433778763, "step": 1390 }, { "epoch": 0.22952100857618823, "grad_norm": 0.47689381031497585, "learning_rate": 1.8995397959623148e-05, "loss": 0.8239, "mean_token_accuracy": 0.7619229212403298, "step": 1395 }, { "epoch": 0.23034366452090574, "grad_norm": 0.4886918044995876, "learning_rate": 1.898281406753724e-05, "loss": 0.8569, "mean_token_accuracy": 0.754118898510933, "step": 1400 }, { "epoch": 0.23116632046562327, "grad_norm": 0.4895540097786587, "learning_rate": 1.897015607241229e-05, "loss": 0.8295, "mean_token_accuracy": 0.759735481441021, "step": 1405 }, { "epoch": 0.23198897641034077, "grad_norm": 0.49693768771973756, "learning_rate": 1.8957424078669464e-05, "loss": 0.824, "mean_token_accuracy": 0.7615087181329727, "step": 1410 }, { "epoch": 0.2328116323550583, "grad_norm": 0.4909573567443448, "learning_rate": 1.8944618191340373e-05, "loss": 0.861, "mean_token_accuracy": 0.7524025708436965, "step": 1415 }, { "epoch": 0.23363428829977584, "grad_norm": 0.5869553099889377, "learning_rate": 1.8931738516066208e-05, "loss": 0.8316, "mean_token_accuracy": 0.7599302291870117, "step": 1420 }, { "epoch": 0.23445694424449334, "grad_norm": 0.5278660098992382, "learning_rate": 1.8918785159096865e-05, "loss": 0.839, "mean_token_accuracy": 0.7580491125583648, "step": 1425 }, { "epoch": 0.23527960018921087, "grad_norm": 0.5130966076181913, "learning_rate": 1.8905758227290073e-05, "loss": 0.8204, "mean_token_accuracy": 0.7621259197592736, "step": 1430 }, { "epoch": 0.23610225613392838, "grad_norm": 0.49082495981997254, "learning_rate": 1.889265782811051e-05, "loss": 0.8249, "mean_token_accuracy": 0.7610314086079597, "step": 1435 }, { "epoch": 0.2369249120786459, "grad_norm": 0.4899523774572424, "learning_rate": 1.8879484069628928e-05, "loss": 0.8302, "mean_token_accuracy": 0.7597274377942085, "step": 1440 }, { "epoch": 0.23774756802336344, "grad_norm": 0.489952821615517, "learning_rate": 1.8866237060521233e-05, "loss": 0.8411, "mean_token_accuracy": 0.7562792897224426, "step": 1445 }, { "epoch": 0.23857022396808095, "grad_norm": 0.48183897167660766, "learning_rate": 1.8852916910067622e-05, "loss": 0.8265, "mean_token_accuracy": 0.7611740246415138, "step": 1450 }, { "epoch": 0.23939287991279848, "grad_norm": 0.4826376852500673, "learning_rate": 1.8839523728151654e-05, "loss": 0.8372, "mean_token_accuracy": 0.7584284991025925, "step": 1455 }, { "epoch": 0.24021553585751598, "grad_norm": 0.49537094027194484, "learning_rate": 1.882605762525936e-05, "loss": 0.8329, "mean_token_accuracy": 0.7583387970924378, "step": 1460 }, { "epoch": 0.24103819180223351, "grad_norm": 0.5089262778008258, "learning_rate": 1.8812518712478337e-05, "loss": 0.8344, "mean_token_accuracy": 0.7579466328024864, "step": 1465 }, { "epoch": 0.24186084774695102, "grad_norm": 0.4962688126619629, "learning_rate": 1.8798907101496807e-05, "loss": 0.8388, "mean_token_accuracy": 0.7566520869731903, "step": 1470 }, { "epoch": 0.24268350369166855, "grad_norm": 0.49497711239485964, "learning_rate": 1.878522290460271e-05, "loss": 0.8314, "mean_token_accuracy": 0.7599233373999595, "step": 1475 }, { "epoch": 0.24350615963638608, "grad_norm": 0.4852402255610546, "learning_rate": 1.8771466234682792e-05, "loss": 0.828, "mean_token_accuracy": 0.759791623055935, "step": 1480 }, { "epoch": 0.2443288155811036, "grad_norm": 0.48996137021566677, "learning_rate": 1.8757637205221652e-05, "loss": 0.8312, "mean_token_accuracy": 0.7590737655758858, "step": 1485 }, { "epoch": 0.24515147152582112, "grad_norm": 0.48525703688655186, "learning_rate": 1.874373593030081e-05, "loss": 0.8376, "mean_token_accuracy": 0.7568432822823524, "step": 1490 }, { "epoch": 0.24597412747053862, "grad_norm": 0.48035909932105547, "learning_rate": 1.872976252459777e-05, "loss": 0.8275, "mean_token_accuracy": 0.7599972948431969, "step": 1495 }, { "epoch": 0.24679678341525615, "grad_norm": 0.4935297892902831, "learning_rate": 1.871571710338508e-05, "loss": 0.8251, "mean_token_accuracy": 0.7613622069358825, "step": 1500 }, { "epoch": 0.2476194393599737, "grad_norm": 0.5039511397583422, "learning_rate": 1.8701599782529367e-05, "loss": 0.8255, "mean_token_accuracy": 0.760756005346775, "step": 1505 }, { "epoch": 0.2484420953046912, "grad_norm": 0.4820618839221359, "learning_rate": 1.8687410678490396e-05, "loss": 0.8366, "mean_token_accuracy": 0.7574177429080009, "step": 1510 }, { "epoch": 0.24926475124940872, "grad_norm": 0.5054248233602202, "learning_rate": 1.8673149908320094e-05, "loss": 0.8374, "mean_token_accuracy": 0.7572463363409042, "step": 1515 }, { "epoch": 0.25008740719412625, "grad_norm": 0.5036953411809479, "learning_rate": 1.8658817589661598e-05, "loss": 0.8345, "mean_token_accuracy": 0.7582996025681495, "step": 1520 }, { "epoch": 0.25091006313884373, "grad_norm": 0.4785222957510937, "learning_rate": 1.8644413840748277e-05, "loss": 0.8343, "mean_token_accuracy": 0.7591485381126404, "step": 1525 }, { "epoch": 0.25173271908356126, "grad_norm": 0.47636499681590655, "learning_rate": 1.8629938780402757e-05, "loss": 0.8283, "mean_token_accuracy": 0.7591575637459755, "step": 1530 }, { "epoch": 0.2525553750282788, "grad_norm": 0.49855044164271595, "learning_rate": 1.8615392528035948e-05, "loss": 0.8378, "mean_token_accuracy": 0.7565597623586655, "step": 1535 }, { "epoch": 0.2533780309729963, "grad_norm": 0.5028385983552294, "learning_rate": 1.860077520364604e-05, "loss": 0.8382, "mean_token_accuracy": 0.7565027371048927, "step": 1540 }, { "epoch": 0.25420068691771386, "grad_norm": 0.48906663908040204, "learning_rate": 1.8586086927817542e-05, "loss": 0.825, "mean_token_accuracy": 0.7608486101031303, "step": 1545 }, { "epoch": 0.25502334286243133, "grad_norm": 0.48657813370110403, "learning_rate": 1.857132782172026e-05, "loss": 0.845, "mean_token_accuracy": 0.7556301325559616, "step": 1550 }, { "epoch": 0.25584599880714887, "grad_norm": 0.502008115720921, "learning_rate": 1.8556498007108316e-05, "loss": 0.8274, "mean_token_accuracy": 0.7594741612672806, "step": 1555 }, { "epoch": 0.2566686547518664, "grad_norm": 0.48524458163335343, "learning_rate": 1.8541597606319137e-05, "loss": 0.8192, "mean_token_accuracy": 0.7628079876303673, "step": 1560 }, { "epoch": 0.25749131069658393, "grad_norm": 0.48277746426093804, "learning_rate": 1.8526626742272435e-05, "loss": 0.8305, "mean_token_accuracy": 0.7596687346696853, "step": 1565 }, { "epoch": 0.25831396664130146, "grad_norm": 0.4826548917086668, "learning_rate": 1.8511585538469215e-05, "loss": 0.8294, "mean_token_accuracy": 0.7593120083212852, "step": 1570 }, { "epoch": 0.25913662258601894, "grad_norm": 0.4940268554080099, "learning_rate": 1.8496474118990738e-05, "loss": 0.8442, "mean_token_accuracy": 0.7554322421550751, "step": 1575 }, { "epoch": 0.25995927853073647, "grad_norm": 0.4785814194223959, "learning_rate": 1.8481292608497505e-05, "loss": 0.8177, "mean_token_accuracy": 0.7628436267375946, "step": 1580 }, { "epoch": 0.260781934475454, "grad_norm": 0.4960054511304323, "learning_rate": 1.846604113222823e-05, "loss": 0.8202, "mean_token_accuracy": 0.7614924594759941, "step": 1585 }, { "epoch": 0.26160459042017153, "grad_norm": 0.4830590155286334, "learning_rate": 1.84507198159988e-05, "loss": 0.8298, "mean_token_accuracy": 0.7596190094947814, "step": 1590 }, { "epoch": 0.26242724636488907, "grad_norm": 0.466281889346005, "learning_rate": 1.8435328786201246e-05, "loss": 0.8224, "mean_token_accuracy": 0.7611289650201798, "step": 1595 }, { "epoch": 0.26324990230960654, "grad_norm": 0.4864137705632204, "learning_rate": 1.841986816980269e-05, "loss": 0.8228, "mean_token_accuracy": 0.7614953771233559, "step": 1600 }, { "epoch": 0.2640725582543241, "grad_norm": 0.48306596264134816, "learning_rate": 1.8404338094344315e-05, "loss": 0.8296, "mean_token_accuracy": 0.7595503896474838, "step": 1605 }, { "epoch": 0.2648952141990416, "grad_norm": 0.4864344910118435, "learning_rate": 1.8388738687940288e-05, "loss": 0.8283, "mean_token_accuracy": 0.7591427966952324, "step": 1610 }, { "epoch": 0.26571787014375914, "grad_norm": 0.473856232584769, "learning_rate": 1.837307007927672e-05, "loss": 0.8019, "mean_token_accuracy": 0.7659234255552292, "step": 1615 }, { "epoch": 0.26654052608847667, "grad_norm": 0.4718893808266901, "learning_rate": 1.8357332397610603e-05, "loss": 0.8211, "mean_token_accuracy": 0.7615778028964997, "step": 1620 }, { "epoch": 0.26736318203319415, "grad_norm": 0.48657100950519083, "learning_rate": 1.834152577276874e-05, "loss": 0.815, "mean_token_accuracy": 0.7634675204753876, "step": 1625 }, { "epoch": 0.2681858379779117, "grad_norm": 0.48551180146166756, "learning_rate": 1.832565033514668e-05, "loss": 0.8263, "mean_token_accuracy": 0.7597757071256638, "step": 1630 }, { "epoch": 0.2690084939226292, "grad_norm": 0.47726848520537757, "learning_rate": 1.830970621570763e-05, "loss": 0.8077, "mean_token_accuracy": 0.7652326211333275, "step": 1635 }, { "epoch": 0.26983114986734674, "grad_norm": 0.48587886730744795, "learning_rate": 1.8293693545981393e-05, "loss": 0.8345, "mean_token_accuracy": 0.7579408720135689, "step": 1640 }, { "epoch": 0.2706538058120643, "grad_norm": 0.49498961875237013, "learning_rate": 1.8277612458063257e-05, "loss": 0.8256, "mean_token_accuracy": 0.7609028607606888, "step": 1645 }, { "epoch": 0.27147646175678175, "grad_norm": 0.47727697146086057, "learning_rate": 1.826146308461293e-05, "loss": 0.825, "mean_token_accuracy": 0.7600671201944351, "step": 1650 }, { "epoch": 0.2722991177014993, "grad_norm": 0.49187119586685907, "learning_rate": 1.8245245558853446e-05, "loss": 0.8332, "mean_token_accuracy": 0.7588327869772911, "step": 1655 }, { "epoch": 0.2731217736462168, "grad_norm": 0.4876148386803457, "learning_rate": 1.8228960014570036e-05, "loss": 0.8239, "mean_token_accuracy": 0.7604472801089287, "step": 1660 }, { "epoch": 0.27394442959093435, "grad_norm": 0.5029419321775881, "learning_rate": 1.8212606586109055e-05, "loss": 0.8421, "mean_token_accuracy": 0.7568986222147942, "step": 1665 }, { "epoch": 0.2747670855356519, "grad_norm": 0.4772952130460248, "learning_rate": 1.819618540837687e-05, "loss": 0.8273, "mean_token_accuracy": 0.76001528352499, "step": 1670 }, { "epoch": 0.27558974148036935, "grad_norm": 0.4772918047617865, "learning_rate": 1.8179696616838724e-05, "loss": 0.8258, "mean_token_accuracy": 0.7600896343588829, "step": 1675 }, { "epoch": 0.2764123974250869, "grad_norm": 0.48353979592985324, "learning_rate": 1.816314034751765e-05, "loss": 0.8386, "mean_token_accuracy": 0.7586118161678315, "step": 1680 }, { "epoch": 0.2772350533698044, "grad_norm": 0.4761035836839767, "learning_rate": 1.814651673699333e-05, "loss": 0.8008, "mean_token_accuracy": 0.7667428597807884, "step": 1685 }, { "epoch": 0.27805770931452195, "grad_norm": 0.47489548868141307, "learning_rate": 1.8129825922400965e-05, "loss": 0.8254, "mean_token_accuracy": 0.760156872868538, "step": 1690 }, { "epoch": 0.2788803652592395, "grad_norm": 0.5400173021718119, "learning_rate": 1.811306804143016e-05, "loss": 0.8064, "mean_token_accuracy": 0.7659746944904328, "step": 1695 }, { "epoch": 0.27970302120395696, "grad_norm": 0.48644437077698194, "learning_rate": 1.8096243232323772e-05, "loss": 0.8145, "mean_token_accuracy": 0.7621610805392265, "step": 1700 }, { "epoch": 0.2805256771486745, "grad_norm": 0.47656000421328276, "learning_rate": 1.8079351633876782e-05, "loss": 0.8131, "mean_token_accuracy": 0.7628241062164307, "step": 1705 }, { "epoch": 0.281348333093392, "grad_norm": 0.4901155187171902, "learning_rate": 1.806239338543514e-05, "loss": 0.8158, "mean_token_accuracy": 0.7623119562864303, "step": 1710 }, { "epoch": 0.28217098903810955, "grad_norm": 0.48232283252664304, "learning_rate": 1.8045368626894625e-05, "loss": 0.8178, "mean_token_accuracy": 0.762554481625557, "step": 1715 }, { "epoch": 0.28299364498282703, "grad_norm": 0.48422348005069815, "learning_rate": 1.8028277498699682e-05, "loss": 0.8398, "mean_token_accuracy": 0.7566006511449814, "step": 1720 }, { "epoch": 0.28381630092754456, "grad_norm": 0.5097187374169984, "learning_rate": 1.8011120141842266e-05, "loss": 0.8307, "mean_token_accuracy": 0.7587061107158661, "step": 1725 }, { "epoch": 0.2846389568722621, "grad_norm": 0.4857173272020191, "learning_rate": 1.7993896697860687e-05, "loss": 0.8253, "mean_token_accuracy": 0.7601984903216362, "step": 1730 }, { "epoch": 0.2854616128169796, "grad_norm": 0.4830288614922592, "learning_rate": 1.7976607308838427e-05, "loss": 0.8133, "mean_token_accuracy": 0.7634838283061981, "step": 1735 }, { "epoch": 0.28628426876169716, "grad_norm": 0.472472895567438, "learning_rate": 1.7959252117402986e-05, "loss": 0.8089, "mean_token_accuracy": 0.764598785340786, "step": 1740 }, { "epoch": 0.28710692470641463, "grad_norm": 0.498299564724403, "learning_rate": 1.794183126672469e-05, "loss": 0.8112, "mean_token_accuracy": 0.7632102578878402, "step": 1745 }, { "epoch": 0.28792958065113217, "grad_norm": 0.47529253451596165, "learning_rate": 1.792434490051551e-05, "loss": 0.8149, "mean_token_accuracy": 0.7642448052763939, "step": 1750 }, { "epoch": 0.2887522365958497, "grad_norm": 0.48084456914699064, "learning_rate": 1.7906793163027903e-05, "loss": 0.811, "mean_token_accuracy": 0.7637025311589241, "step": 1755 }, { "epoch": 0.28957489254056723, "grad_norm": 0.4693039679991657, "learning_rate": 1.7889176199053575e-05, "loss": 0.8091, "mean_token_accuracy": 0.764451339840889, "step": 1760 }, { "epoch": 0.29039754848528476, "grad_norm": 0.49404601115682545, "learning_rate": 1.787149415392233e-05, "loss": 0.8336, "mean_token_accuracy": 0.7582444727420807, "step": 1765 }, { "epoch": 0.29122020443000224, "grad_norm": 0.5052936455487074, "learning_rate": 1.7853747173500852e-05, "loss": 0.8343, "mean_token_accuracy": 0.7583705633878708, "step": 1770 }, { "epoch": 0.29204286037471977, "grad_norm": 0.48252058273796206, "learning_rate": 1.7835935404191504e-05, "loss": 0.8226, "mean_token_accuracy": 0.7599636286497116, "step": 1775 }, { "epoch": 0.2928655163194373, "grad_norm": 0.4783483550074471, "learning_rate": 1.781805899293111e-05, "loss": 0.8197, "mean_token_accuracy": 0.7615763008594513, "step": 1780 }, { "epoch": 0.29368817226415483, "grad_norm": 0.48679571729139043, "learning_rate": 1.7800118087189762e-05, "loss": 0.8287, "mean_token_accuracy": 0.7588724300265313, "step": 1785 }, { "epoch": 0.29451082820887237, "grad_norm": 2.2675028889206925, "learning_rate": 1.7782112834969595e-05, "loss": 0.8187, "mean_token_accuracy": 0.7631213799118995, "step": 1790 }, { "epoch": 0.29533348415358984, "grad_norm": 0.46921890093205454, "learning_rate": 1.7764043384803564e-05, "loss": 0.8217, "mean_token_accuracy": 0.7612684339284896, "step": 1795 }, { "epoch": 0.2961561400983074, "grad_norm": 0.48439781522278, "learning_rate": 1.7745909885754215e-05, "loss": 0.8331, "mean_token_accuracy": 0.7575021594762802, "step": 1800 }, { "epoch": 0.2969787960430249, "grad_norm": 0.6422025067512073, "learning_rate": 1.772771248741247e-05, "loss": 0.8116, "mean_token_accuracy": 0.7626554921269417, "step": 1805 }, { "epoch": 0.29780145198774244, "grad_norm": 0.4749310610192254, "learning_rate": 1.770945133989637e-05, "loss": 0.8197, "mean_token_accuracy": 0.7608926296234131, "step": 1810 }, { "epoch": 0.29862410793245997, "grad_norm": 0.477928140157227, "learning_rate": 1.7691126593849865e-05, "loss": 0.8044, "mean_token_accuracy": 0.7656101107597351, "step": 1815 }, { "epoch": 0.29944676387717745, "grad_norm": 0.4727758843967269, "learning_rate": 1.7672738400441543e-05, "loss": 0.828, "mean_token_accuracy": 0.7595351278781891, "step": 1820 }, { "epoch": 0.300269419821895, "grad_norm": 0.49347580800097085, "learning_rate": 1.765428691136341e-05, "loss": 0.808, "mean_token_accuracy": 0.7658808439970016, "step": 1825 }, { "epoch": 0.3010920757666125, "grad_norm": 0.4835673456841425, "learning_rate": 1.7635772278829604e-05, "loss": 0.808, "mean_token_accuracy": 0.7649905353784561, "step": 1830 }, { "epoch": 0.30191473171133004, "grad_norm": 0.4784740202595709, "learning_rate": 1.7617194655575188e-05, "loss": 0.8091, "mean_token_accuracy": 0.764254453778267, "step": 1835 }, { "epoch": 0.3027373876560476, "grad_norm": 0.5834717842074425, "learning_rate": 1.7598554194854832e-05, "loss": 0.8113, "mean_token_accuracy": 0.764817337691784, "step": 1840 }, { "epoch": 0.30356004360076505, "grad_norm": 0.46150588508683194, "learning_rate": 1.757985105044161e-05, "loss": 0.8194, "mean_token_accuracy": 0.7605888351798058, "step": 1845 }, { "epoch": 0.3043826995454826, "grad_norm": 0.4846164324039382, "learning_rate": 1.756108537662567e-05, "loss": 0.8103, "mean_token_accuracy": 0.7634046152234077, "step": 1850 }, { "epoch": 0.3052053554902001, "grad_norm": 0.4785080524727792, "learning_rate": 1.7542257328213014e-05, "loss": 0.8291, "mean_token_accuracy": 0.7598793238401413, "step": 1855 }, { "epoch": 0.30602801143491765, "grad_norm": 0.4952731178566638, "learning_rate": 1.752336706052419e-05, "loss": 0.8065, "mean_token_accuracy": 0.7649891525506973, "step": 1860 }, { "epoch": 0.3068506673796352, "grad_norm": 2.895508568015643, "learning_rate": 1.7504414729393017e-05, "loss": 0.8167, "mean_token_accuracy": 0.7627354219555855, "step": 1865 }, { "epoch": 0.30767332332435265, "grad_norm": 0.48056569989535763, "learning_rate": 1.7485400491165302e-05, "loss": 0.8145, "mean_token_accuracy": 0.762684454023838, "step": 1870 }, { "epoch": 0.3084959792690702, "grad_norm": 0.49667781835787295, "learning_rate": 1.746632450269756e-05, "loss": 0.8104, "mean_token_accuracy": 0.7638380706310273, "step": 1875 }, { "epoch": 0.3093186352137877, "grad_norm": 0.47727745920218106, "learning_rate": 1.7447186921355694e-05, "loss": 0.7997, "mean_token_accuracy": 0.7673030987381935, "step": 1880 }, { "epoch": 0.31014129115850525, "grad_norm": 0.48078340055104696, "learning_rate": 1.7427987905013722e-05, "loss": 0.8287, "mean_token_accuracy": 0.7589187994599342, "step": 1885 }, { "epoch": 0.3109639471032228, "grad_norm": 0.49600119836813517, "learning_rate": 1.740872761205247e-05, "loss": 0.8138, "mean_token_accuracy": 0.762779350578785, "step": 1890 }, { "epoch": 0.31178660304794026, "grad_norm": 0.48080424818920114, "learning_rate": 1.7389406201358244e-05, "loss": 0.8253, "mean_token_accuracy": 0.7599517673254013, "step": 1895 }, { "epoch": 0.3126092589926578, "grad_norm": 0.4876527689088522, "learning_rate": 1.7370023832321558e-05, "loss": 0.8163, "mean_token_accuracy": 0.7628432556986808, "step": 1900 }, { "epoch": 0.3134319149373753, "grad_norm": 0.504568953192464, "learning_rate": 1.735058066483578e-05, "loss": 0.817, "mean_token_accuracy": 0.7617260217666626, "step": 1905 }, { "epoch": 0.31425457088209285, "grad_norm": 0.47852364972348044, "learning_rate": 1.7331076859295838e-05, "loss": 0.8281, "mean_token_accuracy": 0.7595314383506775, "step": 1910 }, { "epoch": 0.31507722682681033, "grad_norm": 0.4791709659798099, "learning_rate": 1.7311512576596884e-05, "loss": 0.8228, "mean_token_accuracy": 0.7596703842282295, "step": 1915 }, { "epoch": 0.31589988277152786, "grad_norm": 0.4773298799140304, "learning_rate": 1.7291887978132977e-05, "loss": 0.8299, "mean_token_accuracy": 0.759058165550232, "step": 1920 }, { "epoch": 0.3167225387162454, "grad_norm": 0.5482450592387155, "learning_rate": 1.7272203225795747e-05, "loss": 0.8158, "mean_token_accuracy": 0.7622257485985756, "step": 1925 }, { "epoch": 0.3175451946609629, "grad_norm": 0.4713022667778991, "learning_rate": 1.7252458481973053e-05, "loss": 0.8147, "mean_token_accuracy": 0.7625905066728592, "step": 1930 }, { "epoch": 0.31836785060568046, "grad_norm": 0.4787198197971356, "learning_rate": 1.723265390954765e-05, "loss": 0.8164, "mean_token_accuracy": 0.7617929130792618, "step": 1935 }, { "epoch": 0.31919050655039793, "grad_norm": 0.4846304167380501, "learning_rate": 1.7212789671895853e-05, "loss": 0.8029, "mean_token_accuracy": 0.7662945687770844, "step": 1940 }, { "epoch": 0.32001316249511547, "grad_norm": 0.5050799472976293, "learning_rate": 1.7192865932886167e-05, "loss": 0.8332, "mean_token_accuracy": 0.7565551772713661, "step": 1945 }, { "epoch": 0.320835818439833, "grad_norm": 0.467395507791723, "learning_rate": 1.717288285687796e-05, "loss": 0.8201, "mean_token_accuracy": 0.7614371821284294, "step": 1950 }, { "epoch": 0.32165847438455053, "grad_norm": 0.4746076706136549, "learning_rate": 1.7152840608720094e-05, "loss": 0.8124, "mean_token_accuracy": 0.7635828867554665, "step": 1955 }, { "epoch": 0.32248113032926806, "grad_norm": 0.4786147940673755, "learning_rate": 1.7132739353749564e-05, "loss": 0.8258, "mean_token_accuracy": 0.7601158261299134, "step": 1960 }, { "epoch": 0.32330378627398554, "grad_norm": 0.480144043821985, "learning_rate": 1.711257925779014e-05, "loss": 0.8086, "mean_token_accuracy": 0.7636149317026139, "step": 1965 }, { "epoch": 0.32412644221870307, "grad_norm": 0.5197516814200821, "learning_rate": 1.709236048715099e-05, "loss": 0.8104, "mean_token_accuracy": 0.763784684240818, "step": 1970 }, { "epoch": 0.3249490981634206, "grad_norm": 0.4735180511852434, "learning_rate": 1.7072083208625322e-05, "loss": 0.828, "mean_token_accuracy": 0.7591763481497764, "step": 1975 }, { "epoch": 0.32577175410813813, "grad_norm": 0.5913671045045492, "learning_rate": 1.7051747589489002e-05, "loss": 0.812, "mean_token_accuracy": 0.7638006538152695, "step": 1980 }, { "epoch": 0.32659441005285567, "grad_norm": 0.4780216940882599, "learning_rate": 1.703135379749916e-05, "loss": 0.8237, "mean_token_accuracy": 0.761235861480236, "step": 1985 }, { "epoch": 0.32741706599757314, "grad_norm": 0.47476342175626024, "learning_rate": 1.7010902000892833e-05, "loss": 0.816, "mean_token_accuracy": 0.7617868468165397, "step": 1990 }, { "epoch": 0.3282397219422907, "grad_norm": 0.47034211060030956, "learning_rate": 1.699039236838555e-05, "loss": 0.817, "mean_token_accuracy": 0.762543173134327, "step": 1995 }, { "epoch": 0.3290623778870082, "grad_norm": 0.4679679477005928, "learning_rate": 1.6969825069169967e-05, "loss": 0.8091, "mean_token_accuracy": 0.7643688783049584, "step": 2000 }, { "epoch": 0.32988503383172574, "grad_norm": 0.4887097734560892, "learning_rate": 1.6949200272914443e-05, "loss": 0.8208, "mean_token_accuracy": 0.7606669962406158, "step": 2005 }, { "epoch": 0.33070768977644327, "grad_norm": 0.475185630414847, "learning_rate": 1.6928518149761664e-05, "loss": 0.822, "mean_token_accuracy": 0.7605111807584762, "step": 2010 }, { "epoch": 0.33153034572116075, "grad_norm": 0.47838042297620925, "learning_rate": 1.690777887032722e-05, "loss": 0.801, "mean_token_accuracy": 0.7666290313005447, "step": 2015 }, { "epoch": 0.3323530016658783, "grad_norm": 0.573140010367027, "learning_rate": 1.6886982605698217e-05, "loss": 0.8054, "mean_token_accuracy": 0.7644026547670364, "step": 2020 }, { "epoch": 0.3331756576105958, "grad_norm": 0.4942634468631715, "learning_rate": 1.686612952743186e-05, "loss": 0.8234, "mean_token_accuracy": 0.7596978649497033, "step": 2025 }, { "epoch": 0.33399831355531334, "grad_norm": 0.48128079759926673, "learning_rate": 1.684521980755402e-05, "loss": 0.818, "mean_token_accuracy": 0.7615244269371033, "step": 2030 }, { "epoch": 0.3348209695000309, "grad_norm": 0.48863496893638586, "learning_rate": 1.6824253618557837e-05, "loss": 0.822, "mean_token_accuracy": 0.76096031665802, "step": 2035 }, { "epoch": 0.33564362544474835, "grad_norm": 0.4924226112011605, "learning_rate": 1.6803231133402282e-05, "loss": 0.8084, "mean_token_accuracy": 0.7644886285066604, "step": 2040 }, { "epoch": 0.3364662813894659, "grad_norm": 0.4716928714741004, "learning_rate": 1.6782152525510745e-05, "loss": 0.8118, "mean_token_accuracy": 0.7627069145441056, "step": 2045 }, { "epoch": 0.3372889373341834, "grad_norm": 1.0133572404521058, "learning_rate": 1.6761017968769597e-05, "loss": 0.8135, "mean_token_accuracy": 0.7630855336785316, "step": 2050 }, { "epoch": 0.33811159327890095, "grad_norm": 0.4739418480320512, "learning_rate": 1.6739827637526746e-05, "loss": 0.814, "mean_token_accuracy": 0.7626608163118362, "step": 2055 }, { "epoch": 0.3389342492236185, "grad_norm": 0.4971482055609185, "learning_rate": 1.6718581706590213e-05, "loss": 0.8117, "mean_token_accuracy": 0.7632580801844597, "step": 2060 }, { "epoch": 0.33975690516833595, "grad_norm": 0.48471930643400457, "learning_rate": 1.6697280351226686e-05, "loss": 0.8064, "mean_token_accuracy": 0.764969702064991, "step": 2065 }, { "epoch": 0.3405795611130535, "grad_norm": 0.48480869992522496, "learning_rate": 1.6675923747160062e-05, "loss": 0.7968, "mean_token_accuracy": 0.7671492546796799, "step": 2070 }, { "epoch": 0.341402217057771, "grad_norm": 0.4839367380693257, "learning_rate": 1.6654512070570032e-05, "loss": 0.7974, "mean_token_accuracy": 0.7674106031656265, "step": 2075 }, { "epoch": 0.34222487300248855, "grad_norm": 0.4885070897081242, "learning_rate": 1.6633045498090574e-05, "loss": 0.815, "mean_token_accuracy": 0.7619894102215767, "step": 2080 }, { "epoch": 0.3430475289472061, "grad_norm": 0.4866290029863979, "learning_rate": 1.6611524206808543e-05, "loss": 0.8291, "mean_token_accuracy": 0.7621819108724595, "step": 2085 }, { "epoch": 0.34387018489192356, "grad_norm": 0.4740315502187465, "learning_rate": 1.6589948374262192e-05, "loss": 0.7932, "mean_token_accuracy": 0.768438458442688, "step": 2090 }, { "epoch": 0.3446928408366411, "grad_norm": 0.484593462278472, "learning_rate": 1.65683181784397e-05, "loss": 0.8014, "mean_token_accuracy": 0.7661840379238128, "step": 2095 }, { "epoch": 0.3455154967813586, "grad_norm": 0.46708317184034387, "learning_rate": 1.654663379777772e-05, "loss": 0.8057, "mean_token_accuracy": 0.7643803939223289, "step": 2100 }, { "epoch": 0.34633815272607615, "grad_norm": 0.48259995905622705, "learning_rate": 1.652489541115989e-05, "loss": 0.8082, "mean_token_accuracy": 0.7645273372530937, "step": 2105 }, { "epoch": 0.34716080867079363, "grad_norm": 0.47619905855479333, "learning_rate": 1.6503103197915383e-05, "loss": 0.8029, "mean_token_accuracy": 0.7649444311857223, "step": 2110 }, { "epoch": 0.34798346461551116, "grad_norm": 0.458108543194004, "learning_rate": 1.6481257337817383e-05, "loss": 0.801, "mean_token_accuracy": 0.7658943757414818, "step": 2115 }, { "epoch": 0.3488061205602287, "grad_norm": 0.5136700738678858, "learning_rate": 1.645935801108166e-05, "loss": 0.8099, "mean_token_accuracy": 0.762997205555439, "step": 2120 }, { "epoch": 0.3496287765049462, "grad_norm": 0.5024034991731576, "learning_rate": 1.6437405398365033e-05, "loss": 0.8051, "mean_token_accuracy": 0.7656115055084228, "step": 2125 }, { "epoch": 0.35045143244966376, "grad_norm": 0.4651572761159412, "learning_rate": 1.6415399680763903e-05, "loss": 0.7916, "mean_token_accuracy": 0.768904858827591, "step": 2130 }, { "epoch": 0.35127408839438123, "grad_norm": 0.4637292283975601, "learning_rate": 1.6393341039812754e-05, "loss": 0.8039, "mean_token_accuracy": 0.7651457905769348, "step": 2135 }, { "epoch": 0.35209674433909877, "grad_norm": 0.47582997273469463, "learning_rate": 1.637122965748267e-05, "loss": 0.8058, "mean_token_accuracy": 0.7647428318858147, "step": 2140 }, { "epoch": 0.3529194002838163, "grad_norm": 0.47792927777088867, "learning_rate": 1.6349065716179808e-05, "loss": 0.8182, "mean_token_accuracy": 0.7612769499421119, "step": 2145 }, { "epoch": 0.35374205622853383, "grad_norm": 0.4770938790867725, "learning_rate": 1.6326849398743906e-05, "loss": 0.811, "mean_token_accuracy": 0.7632219076156617, "step": 2150 }, { "epoch": 0.35456471217325136, "grad_norm": 0.47472105712717977, "learning_rate": 1.630458088844679e-05, "loss": 0.804, "mean_token_accuracy": 0.765542957186699, "step": 2155 }, { "epoch": 0.35538736811796884, "grad_norm": 0.4689974319611913, "learning_rate": 1.6282260368990824e-05, "loss": 0.8146, "mean_token_accuracy": 0.7629323855042458, "step": 2160 }, { "epoch": 0.35621002406268637, "grad_norm": 0.4727136536542539, "learning_rate": 1.625988802450744e-05, "loss": 0.7989, "mean_token_accuracy": 0.7670793890953064, "step": 2165 }, { "epoch": 0.3570326800074039, "grad_norm": 0.4811943706676573, "learning_rate": 1.623746403955557e-05, "loss": 0.8126, "mean_token_accuracy": 0.7641498431563377, "step": 2170 }, { "epoch": 0.35785533595212143, "grad_norm": 0.5078753437779725, "learning_rate": 1.6214988599120183e-05, "loss": 0.8058, "mean_token_accuracy": 0.7656649202108383, "step": 2175 }, { "epoch": 0.35867799189683897, "grad_norm": 0.46663546378592297, "learning_rate": 1.619246188861071e-05, "loss": 0.8192, "mean_token_accuracy": 0.7612325087189674, "step": 2180 }, { "epoch": 0.35950064784155644, "grad_norm": 0.4692931858414664, "learning_rate": 1.6169884093859516e-05, "loss": 0.8053, "mean_token_accuracy": 0.7644297704100609, "step": 2185 }, { "epoch": 0.360323303786274, "grad_norm": 0.4791948077876773, "learning_rate": 1.6147255401120408e-05, "loss": 0.8146, "mean_token_accuracy": 0.7636240869760513, "step": 2190 }, { "epoch": 0.3611459597309915, "grad_norm": 0.47948328269584883, "learning_rate": 1.6124575997067053e-05, "loss": 0.8155, "mean_token_accuracy": 0.7616774171590805, "step": 2195 }, { "epoch": 0.36196861567570904, "grad_norm": 0.46712884194394144, "learning_rate": 1.6101846068791464e-05, "loss": 0.8112, "mean_token_accuracy": 0.7635985165834427, "step": 2200 }, { "epoch": 0.36279127162042657, "grad_norm": 0.5896635704885507, "learning_rate": 1.6079065803802453e-05, "loss": 0.81, "mean_token_accuracy": 0.7643268212676049, "step": 2205 }, { "epoch": 0.36361392756514405, "grad_norm": 0.4939022277351784, "learning_rate": 1.605623539002408e-05, "loss": 0.8036, "mean_token_accuracy": 0.7656280279159546, "step": 2210 }, { "epoch": 0.3644365835098616, "grad_norm": 0.4664450126445517, "learning_rate": 1.6033355015794086e-05, "loss": 0.8038, "mean_token_accuracy": 0.7650366142392159, "step": 2215 }, { "epoch": 0.3652592394545791, "grad_norm": 0.46364200176343695, "learning_rate": 1.601042486986239e-05, "loss": 0.7871, "mean_token_accuracy": 0.7699290573596954, "step": 2220 }, { "epoch": 0.36608189539929664, "grad_norm": 0.4709765210335064, "learning_rate": 1.598744514138947e-05, "loss": 0.8042, "mean_token_accuracy": 0.7654130250215531, "step": 2225 }, { "epoch": 0.3669045513440142, "grad_norm": 0.46737671725696045, "learning_rate": 1.596441601994485e-05, "loss": 0.7914, "mean_token_accuracy": 0.769035255908966, "step": 2230 }, { "epoch": 0.36772720728873165, "grad_norm": 0.47601438376263133, "learning_rate": 1.59413376955055e-05, "loss": 0.8055, "mean_token_accuracy": 0.7643353536725044, "step": 2235 }, { "epoch": 0.3685498632334492, "grad_norm": 0.4715574147672739, "learning_rate": 1.59182103584543e-05, "loss": 0.8103, "mean_token_accuracy": 0.76314507573843, "step": 2240 }, { "epoch": 0.3693725191781667, "grad_norm": 0.46889293803518123, "learning_rate": 1.5895034199578458e-05, "loss": 0.7972, "mean_token_accuracy": 0.7672025814652443, "step": 2245 }, { "epoch": 0.37019517512288425, "grad_norm": 0.4795799589227057, "learning_rate": 1.587180941006792e-05, "loss": 0.8194, "mean_token_accuracy": 0.7603564217686654, "step": 2250 }, { "epoch": 0.3710178310676018, "grad_norm": 0.5265892418925828, "learning_rate": 1.5848536181513815e-05, "loss": 0.7917, "mean_token_accuracy": 0.768917427957058, "step": 2255 }, { "epoch": 0.37184048701231925, "grad_norm": 0.4627750342309612, "learning_rate": 1.5825214705906867e-05, "loss": 0.8187, "mean_token_accuracy": 0.7618304193019867, "step": 2260 }, { "epoch": 0.3726631429570368, "grad_norm": 0.48260842221825906, "learning_rate": 1.5801845175635814e-05, "loss": 0.8148, "mean_token_accuracy": 0.7617446586489678, "step": 2265 }, { "epoch": 0.3734857989017543, "grad_norm": 0.48407832790822275, "learning_rate": 1.5778427783485803e-05, "loss": 0.8035, "mean_token_accuracy": 0.764531995356083, "step": 2270 }, { "epoch": 0.37430845484647185, "grad_norm": 0.4957579416712468, "learning_rate": 1.575496272263683e-05, "loss": 0.8092, "mean_token_accuracy": 0.7634660422801971, "step": 2275 }, { "epoch": 0.3751311107911894, "grad_norm": 0.4736760993367138, "learning_rate": 1.573145018666212e-05, "loss": 0.807, "mean_token_accuracy": 0.7642150223255157, "step": 2280 }, { "epoch": 0.37595376673590686, "grad_norm": 0.4631755306215762, "learning_rate": 1.5707890369526548e-05, "loss": 0.8046, "mean_token_accuracy": 0.7647641763091088, "step": 2285 }, { "epoch": 0.3767764226806244, "grad_norm": 0.45666352168107865, "learning_rate": 1.5684283465585024e-05, "loss": 0.8042, "mean_token_accuracy": 0.7654905050992966, "step": 2290 }, { "epoch": 0.3775990786253419, "grad_norm": 0.46614074213997453, "learning_rate": 1.56606296695809e-05, "loss": 0.806, "mean_token_accuracy": 0.7629829660058022, "step": 2295 }, { "epoch": 0.37842173457005945, "grad_norm": 0.48429780343885476, "learning_rate": 1.5636929176644363e-05, "loss": 0.8, "mean_token_accuracy": 0.7665152743458747, "step": 2300 }, { "epoch": 0.37924439051477693, "grad_norm": 0.46463315255415044, "learning_rate": 1.561318218229082e-05, "loss": 0.809, "mean_token_accuracy": 0.7635734781622887, "step": 2305 }, { "epoch": 0.38006704645949446, "grad_norm": 0.48248581229894694, "learning_rate": 1.5589388882419284e-05, "loss": 0.7987, "mean_token_accuracy": 0.7667117416858673, "step": 2310 }, { "epoch": 0.380889702404212, "grad_norm": 0.45845245625465375, "learning_rate": 1.5565549473310775e-05, "loss": 0.7916, "mean_token_accuracy": 0.767702516913414, "step": 2315 }, { "epoch": 0.3817123583489295, "grad_norm": 0.4897638125942071, "learning_rate": 1.554166415162667e-05, "loss": 0.8055, "mean_token_accuracy": 0.7639914020895958, "step": 2320 }, { "epoch": 0.38253501429364706, "grad_norm": 0.4626260447369422, "learning_rate": 1.5517733114407113e-05, "loss": 0.8003, "mean_token_accuracy": 0.7654257893562317, "step": 2325 }, { "epoch": 0.38335767023836453, "grad_norm": 0.4582355230417109, "learning_rate": 1.549375655906937e-05, "loss": 0.7969, "mean_token_accuracy": 0.7671603292226792, "step": 2330 }, { "epoch": 0.38418032618308207, "grad_norm": 0.47856777247790677, "learning_rate": 1.5469734683406197e-05, "loss": 0.794, "mean_token_accuracy": 0.7676752358675003, "step": 2335 }, { "epoch": 0.3850029821277996, "grad_norm": 0.46121477369722225, "learning_rate": 1.544566768558423e-05, "loss": 0.7882, "mean_token_accuracy": 0.7691125229001046, "step": 2340 }, { "epoch": 0.38582563807251713, "grad_norm": 0.46472164494539725, "learning_rate": 1.5421555764142325e-05, "loss": 0.799, "mean_token_accuracy": 0.7655122086405755, "step": 2345 }, { "epoch": 0.38664829401723466, "grad_norm": 0.5490108469162374, "learning_rate": 1.539739911798995e-05, "loss": 0.8124, "mean_token_accuracy": 0.7627492219209671, "step": 2350 }, { "epoch": 0.38747094996195214, "grad_norm": 0.46478972597512763, "learning_rate": 1.53731979464055e-05, "loss": 0.8033, "mean_token_accuracy": 0.7651499047875404, "step": 2355 }, { "epoch": 0.38829360590666967, "grad_norm": 0.4813405195163153, "learning_rate": 1.5348952449034697e-05, "loss": 0.8071, "mean_token_accuracy": 0.7632728815078735, "step": 2360 }, { "epoch": 0.3891162618513872, "grad_norm": 0.4847066690097845, "learning_rate": 1.5324662825888923e-05, "loss": 0.7963, "mean_token_accuracy": 0.7663283303380013, "step": 2365 }, { "epoch": 0.38993891779610473, "grad_norm": 0.495550077325848, "learning_rate": 1.5300329277343574e-05, "loss": 0.7892, "mean_token_accuracy": 0.7683043420314789, "step": 2370 }, { "epoch": 0.39076157374082227, "grad_norm": 0.47296742041950224, "learning_rate": 1.527595200413639e-05, "loss": 0.8175, "mean_token_accuracy": 0.7608013927936554, "step": 2375 }, { "epoch": 0.39158422968553974, "grad_norm": 0.4824608488710112, "learning_rate": 1.5251531207365835e-05, "loss": 0.8046, "mean_token_accuracy": 0.7651599958539009, "step": 2380 }, { "epoch": 0.3924068856302573, "grad_norm": 0.4745679035525498, "learning_rate": 1.5227067088489405e-05, "loss": 0.793, "mean_token_accuracy": 0.7676354691386222, "step": 2385 }, { "epoch": 0.3932295415749748, "grad_norm": 0.4658617483500754, "learning_rate": 1.5202559849321983e-05, "loss": 0.7957, "mean_token_accuracy": 0.766944108903408, "step": 2390 }, { "epoch": 0.39405219751969234, "grad_norm": 0.45366468743837146, "learning_rate": 1.5178009692034165e-05, "loss": 0.7897, "mean_token_accuracy": 0.7692028492689132, "step": 2395 }, { "epoch": 0.39487485346440987, "grad_norm": 0.4913678165597741, "learning_rate": 1.5153416819150606e-05, "loss": 0.7829, "mean_token_accuracy": 0.7704806506633759, "step": 2400 }, { "epoch": 0.39569750940912735, "grad_norm": 0.462702040559741, "learning_rate": 1.5128781433548328e-05, "loss": 0.7966, "mean_token_accuracy": 0.767667056620121, "step": 2405 }, { "epoch": 0.3965201653538449, "grad_norm": 0.4844567239683439, "learning_rate": 1.5104103738455065e-05, "loss": 0.7991, "mean_token_accuracy": 0.7657046720385552, "step": 2410 }, { "epoch": 0.3973428212985624, "grad_norm": 0.4763088718060662, "learning_rate": 1.507938393744758e-05, "loss": 0.8017, "mean_token_accuracy": 0.7657511979341507, "step": 2415 }, { "epoch": 0.39816547724327994, "grad_norm": 0.4737154803487421, "learning_rate": 1.5054622234449985e-05, "loss": 0.8003, "mean_token_accuracy": 0.7652138695120811, "step": 2420 }, { "epoch": 0.3989881331879975, "grad_norm": 0.4690540801427219, "learning_rate": 1.5029818833732059e-05, "loss": 0.816, "mean_token_accuracy": 0.7626078277826309, "step": 2425 }, { "epoch": 0.39981078913271495, "grad_norm": 0.47369218134155405, "learning_rate": 1.5004973939907563e-05, "loss": 0.8097, "mean_token_accuracy": 0.7635345906019211, "step": 2430 }, { "epoch": 0.4006334450774325, "grad_norm": 0.4828967261341461, "learning_rate": 1.4980087757932549e-05, "loss": 0.7926, "mean_token_accuracy": 0.7691109508275986, "step": 2435 }, { "epoch": 0.40145610102215, "grad_norm": 0.4646693981086129, "learning_rate": 1.4955160493103682e-05, "loss": 0.8223, "mean_token_accuracy": 0.7597701415419579, "step": 2440 }, { "epoch": 0.40227875696686755, "grad_norm": 0.4812711783303008, "learning_rate": 1.4930192351056526e-05, "loss": 0.7964, "mean_token_accuracy": 0.7671713173389435, "step": 2445 }, { "epoch": 0.4031014129115851, "grad_norm": 0.6041166093075327, "learning_rate": 1.4905183537763867e-05, "loss": 0.7961, "mean_token_accuracy": 0.7662586435675621, "step": 2450 }, { "epoch": 0.40392406885630255, "grad_norm": 0.4805824508988389, "learning_rate": 1.4880134259533998e-05, "loss": 0.7959, "mean_token_accuracy": 0.7665027692914009, "step": 2455 }, { "epoch": 0.4047467248010201, "grad_norm": 0.47551458679784053, "learning_rate": 1.485504472300903e-05, "loss": 0.8065, "mean_token_accuracy": 0.7642807126045227, "step": 2460 }, { "epoch": 0.4055693807457376, "grad_norm": 0.46667789008493893, "learning_rate": 1.4829915135163178e-05, "loss": 0.7919, "mean_token_accuracy": 0.7672284737229347, "step": 2465 }, { "epoch": 0.40639203669045515, "grad_norm": 0.468845687909243, "learning_rate": 1.4804745703301059e-05, "loss": 0.8187, "mean_token_accuracy": 0.7604504510760307, "step": 2470 }, { "epoch": 0.4072146926351727, "grad_norm": 0.4663158546978021, "learning_rate": 1.477953663505598e-05, "loss": 0.7967, "mean_token_accuracy": 0.7667260974645614, "step": 2475 }, { "epoch": 0.40803734857989016, "grad_norm": 0.4629479824480154, "learning_rate": 1.4754288138388227e-05, "loss": 0.7935, "mean_token_accuracy": 0.7677477866411209, "step": 2480 }, { "epoch": 0.4088600045246077, "grad_norm": 0.5410635654819581, "learning_rate": 1.4729000421583344e-05, "loss": 0.798, "mean_token_accuracy": 0.7660590931773186, "step": 2485 }, { "epoch": 0.4096826604693252, "grad_norm": 0.46916835075318736, "learning_rate": 1.4703673693250425e-05, "loss": 0.8073, "mean_token_accuracy": 0.7641143724322319, "step": 2490 }, { "epoch": 0.41050531641404275, "grad_norm": 0.5032195634867334, "learning_rate": 1.4678308162320376e-05, "loss": 0.802, "mean_token_accuracy": 0.7665286660194397, "step": 2495 }, { "epoch": 0.41132797235876023, "grad_norm": 0.49420235066087415, "learning_rate": 1.4652904038044211e-05, "loss": 0.7997, "mean_token_accuracy": 0.7657902359962463, "step": 2500 }, { "epoch": 0.41215062830347776, "grad_norm": 0.46999969404273073, "learning_rate": 1.462746152999132e-05, "loss": 0.7889, "mean_token_accuracy": 0.7686048895120621, "step": 2505 }, { "epoch": 0.4129732842481953, "grad_norm": 0.47009571457943045, "learning_rate": 1.4601980848047724e-05, "loss": 0.812, "mean_token_accuracy": 0.7631899088621139, "step": 2510 }, { "epoch": 0.4137959401929128, "grad_norm": 0.46228155749147887, "learning_rate": 1.4576462202414363e-05, "loss": 0.795, "mean_token_accuracy": 0.7677616506814957, "step": 2515 }, { "epoch": 0.41461859613763036, "grad_norm": 0.4758431754314983, "learning_rate": 1.4550905803605362e-05, "loss": 0.8054, "mean_token_accuracy": 0.7648142784833908, "step": 2520 }, { "epoch": 0.41544125208234783, "grad_norm": 0.4527526394606362, "learning_rate": 1.4525311862446284e-05, "loss": 0.7704, "mean_token_accuracy": 0.7738257959485054, "step": 2525 }, { "epoch": 0.41626390802706537, "grad_norm": 0.4779077154563903, "learning_rate": 1.4499680590072382e-05, "loss": 0.8111, "mean_token_accuracy": 0.7630104154348374, "step": 2530 }, { "epoch": 0.4170865639717829, "grad_norm": 0.4707056003630672, "learning_rate": 1.4474012197926892e-05, "loss": 0.7812, "mean_token_accuracy": 0.772066043317318, "step": 2535 }, { "epoch": 0.41790921991650043, "grad_norm": 0.47564550146216655, "learning_rate": 1.4448306897759252e-05, "loss": 0.7977, "mean_token_accuracy": 0.7660088017582893, "step": 2540 }, { "epoch": 0.41873187586121796, "grad_norm": 0.49848716734386017, "learning_rate": 1.4422564901623375e-05, "loss": 0.801, "mean_token_accuracy": 0.7652359053492546, "step": 2545 }, { "epoch": 0.41955453180593544, "grad_norm": 0.4682123173119025, "learning_rate": 1.4396786421875889e-05, "loss": 0.7936, "mean_token_accuracy": 0.7686336636543274, "step": 2550 }, { "epoch": 0.42037718775065297, "grad_norm": 0.4799810964001143, "learning_rate": 1.43709716711744e-05, "loss": 0.8051, "mean_token_accuracy": 0.7650800868868828, "step": 2555 }, { "epoch": 0.4211998436953705, "grad_norm": 0.46623836681983405, "learning_rate": 1.4345120862475724e-05, "loss": 0.7898, "mean_token_accuracy": 0.7687389358878136, "step": 2560 }, { "epoch": 0.42202249964008803, "grad_norm": 0.6219293690845119, "learning_rate": 1.431923420903413e-05, "loss": 0.7881, "mean_token_accuracy": 0.7689273729920387, "step": 2565 }, { "epoch": 0.42284515558480557, "grad_norm": 0.47814049209641374, "learning_rate": 1.429331192439959e-05, "loss": 0.8004, "mean_token_accuracy": 0.7657509535551071, "step": 2570 }, { "epoch": 0.42366781152952304, "grad_norm": 0.4806308564821128, "learning_rate": 1.4267354222416021e-05, "loss": 0.8046, "mean_token_accuracy": 0.7642430752515793, "step": 2575 }, { "epoch": 0.4244904674742406, "grad_norm": 0.4709571330154841, "learning_rate": 1.4241361317219496e-05, "loss": 0.8042, "mean_token_accuracy": 0.764810462296009, "step": 2580 }, { "epoch": 0.4253131234189581, "grad_norm": 0.4745802770287837, "learning_rate": 1.4215333423236502e-05, "loss": 0.8045, "mean_token_accuracy": 0.764426140487194, "step": 2585 }, { "epoch": 0.42613577936367564, "grad_norm": 0.4768913769311737, "learning_rate": 1.4189270755182168e-05, "loss": 0.7983, "mean_token_accuracy": 0.7660163179039955, "step": 2590 }, { "epoch": 0.42695843530839317, "grad_norm": 0.4926100518308143, "learning_rate": 1.4163173528058482e-05, "loss": 0.7871, "mean_token_accuracy": 0.7687412664294243, "step": 2595 }, { "epoch": 0.42778109125311065, "grad_norm": 0.5112657550672961, "learning_rate": 1.413704195715253e-05, "loss": 0.798, "mean_token_accuracy": 0.7659529626369477, "step": 2600 }, { "epoch": 0.4286037471978282, "grad_norm": 0.4761200997709054, "learning_rate": 1.4110876258034711e-05, "loss": 0.7865, "mean_token_accuracy": 0.7692981913685799, "step": 2605 }, { "epoch": 0.4294264031425457, "grad_norm": 0.4640359219047277, "learning_rate": 1.4084676646556962e-05, "loss": 0.8041, "mean_token_accuracy": 0.7644286021590233, "step": 2610 }, { "epoch": 0.43024905908726324, "grad_norm": 0.4683995262308605, "learning_rate": 1.4058443338850975e-05, "loss": 0.7852, "mean_token_accuracy": 0.7691784113645553, "step": 2615 }, { "epoch": 0.4310717150319808, "grad_norm": 0.4778747212631565, "learning_rate": 1.4032176551326429e-05, "loss": 0.7896, "mean_token_accuracy": 0.7684204697608947, "step": 2620 }, { "epoch": 0.43189437097669825, "grad_norm": 0.4735484167404302, "learning_rate": 1.4005876500669173e-05, "loss": 0.7979, "mean_token_accuracy": 0.765180166065693, "step": 2625 }, { "epoch": 0.4327170269214158, "grad_norm": 0.46838162854139237, "learning_rate": 1.3979543403839473e-05, "loss": 0.7872, "mean_token_accuracy": 0.7699773997068405, "step": 2630 }, { "epoch": 0.4335396828661333, "grad_norm": 0.46197478498955213, "learning_rate": 1.3953177478070197e-05, "loss": 0.7893, "mean_token_accuracy": 0.7686680600047111, "step": 2635 }, { "epoch": 0.43436233881085085, "grad_norm": 0.47178418614697837, "learning_rate": 1.3926778940865043e-05, "loss": 0.7876, "mean_token_accuracy": 0.7691331326961517, "step": 2640 }, { "epoch": 0.4351849947555684, "grad_norm": 0.4627417823470683, "learning_rate": 1.3900348009996725e-05, "loss": 0.7849, "mean_token_accuracy": 0.7704642966389657, "step": 2645 }, { "epoch": 0.43600765070028585, "grad_norm": 0.45754062623484126, "learning_rate": 1.387388490350519e-05, "loss": 0.7816, "mean_token_accuracy": 0.7705094590783119, "step": 2650 }, { "epoch": 0.4368303066450034, "grad_norm": 0.4670260489124018, "learning_rate": 1.3847389839695818e-05, "loss": 0.7876, "mean_token_accuracy": 0.7680950865149498, "step": 2655 }, { "epoch": 0.4376529625897209, "grad_norm": 0.5117528793338135, "learning_rate": 1.3820863037137615e-05, "loss": 0.7865, "mean_token_accuracy": 0.7688084736466407, "step": 2660 }, { "epoch": 0.43847561853443845, "grad_norm": 0.4824602721202857, "learning_rate": 1.3794304714661408e-05, "loss": 0.8106, "mean_token_accuracy": 0.7625801861286163, "step": 2665 }, { "epoch": 0.439298274479156, "grad_norm": 0.45845009551484045, "learning_rate": 1.3767715091358058e-05, "loss": 0.7815, "mean_token_accuracy": 0.7695310950279236, "step": 2670 }, { "epoch": 0.44012093042387346, "grad_norm": 0.4658908437355656, "learning_rate": 1.374109438657663e-05, "loss": 0.8086, "mean_token_accuracy": 0.7630115702748299, "step": 2675 }, { "epoch": 0.440943586368591, "grad_norm": 0.45481911565892535, "learning_rate": 1.3714442819922603e-05, "loss": 0.7773, "mean_token_accuracy": 0.7719591572880745, "step": 2680 }, { "epoch": 0.4417662423133085, "grad_norm": 0.5626099071838703, "learning_rate": 1.368776061125603e-05, "loss": 0.7874, "mean_token_accuracy": 0.7684416368603706, "step": 2685 }, { "epoch": 0.44258889825802605, "grad_norm": 0.5013316989426091, "learning_rate": 1.3661047980689763e-05, "loss": 0.8018, "mean_token_accuracy": 0.7659082531929016, "step": 2690 }, { "epoch": 0.4434115542027436, "grad_norm": 0.4717828681115714, "learning_rate": 1.3634305148587608e-05, "loss": 0.7988, "mean_token_accuracy": 0.7648628443479538, "step": 2695 }, { "epoch": 0.44423421014746106, "grad_norm": 0.5028626194045326, "learning_rate": 1.3607532335562516e-05, "loss": 0.7995, "mean_token_accuracy": 0.765478515625, "step": 2700 }, { "epoch": 0.4450568660921786, "grad_norm": 0.4723961630845055, "learning_rate": 1.3580729762474762e-05, "loss": 0.777, "mean_token_accuracy": 0.7718922600150109, "step": 2705 }, { "epoch": 0.4458795220368961, "grad_norm": 0.46986118205432337, "learning_rate": 1.3553897650430135e-05, "loss": 0.8017, "mean_token_accuracy": 0.7653205826878547, "step": 2710 }, { "epoch": 0.44670217798161366, "grad_norm": 0.4510124065347189, "learning_rate": 1.3527036220778087e-05, "loss": 0.7782, "mean_token_accuracy": 0.7707252070307732, "step": 2715 }, { "epoch": 0.44752483392633113, "grad_norm": 0.6156744930231819, "learning_rate": 1.3500145695109939e-05, "loss": 0.8118, "mean_token_accuracy": 0.7620943546295166, "step": 2720 }, { "epoch": 0.44834748987104867, "grad_norm": 0.46420904631916726, "learning_rate": 1.347322629525702e-05, "loss": 0.7887, "mean_token_accuracy": 0.7687474727630615, "step": 2725 }, { "epoch": 0.4491701458157662, "grad_norm": 0.4686914678989704, "learning_rate": 1.3446278243288877e-05, "loss": 0.794, "mean_token_accuracy": 0.7673821225762367, "step": 2730 }, { "epoch": 0.44999280176048373, "grad_norm": 0.47871021254486273, "learning_rate": 1.3419301761511396e-05, "loss": 0.7938, "mean_token_accuracy": 0.7673749104142189, "step": 2735 }, { "epoch": 0.45081545770520126, "grad_norm": 0.4985495847308479, "learning_rate": 1.3392297072465014e-05, "loss": 0.8181, "mean_token_accuracy": 0.7602700963616371, "step": 2740 }, { "epoch": 0.45163811364991874, "grad_norm": 0.4597749377426168, "learning_rate": 1.3365264398922845e-05, "loss": 0.7995, "mean_token_accuracy": 0.7657791376113892, "step": 2745 }, { "epoch": 0.45246076959463627, "grad_norm": 0.4562510376722243, "learning_rate": 1.3338203963888874e-05, "loss": 0.7879, "mean_token_accuracy": 0.7685150653123856, "step": 2750 }, { "epoch": 0.4532834255393538, "grad_norm": 0.47120174863041253, "learning_rate": 1.3311115990596087e-05, "loss": 0.7912, "mean_token_accuracy": 0.7675330832600593, "step": 2755 }, { "epoch": 0.45410608148407133, "grad_norm": 0.4501641200056066, "learning_rate": 1.3284000702504656e-05, "loss": 0.7795, "mean_token_accuracy": 0.771689559519291, "step": 2760 }, { "epoch": 0.45492873742878887, "grad_norm": 0.45874355297824915, "learning_rate": 1.3256858323300079e-05, "loss": 0.7951, "mean_token_accuracy": 0.7665720328688621, "step": 2765 }, { "epoch": 0.45575139337350634, "grad_norm": 0.46486395481522724, "learning_rate": 1.3229689076891342e-05, "loss": 0.7919, "mean_token_accuracy": 0.7676732882857322, "step": 2770 }, { "epoch": 0.4565740493182239, "grad_norm": 0.4726614193253512, "learning_rate": 1.3202493187409069e-05, "loss": 0.7855, "mean_token_accuracy": 0.7702659428119659, "step": 2775 }, { "epoch": 0.4573967052629414, "grad_norm": 0.4846766742790622, "learning_rate": 1.3175270879203677e-05, "loss": 0.7901, "mean_token_accuracy": 0.7670933231711388, "step": 2780 }, { "epoch": 0.45821936120765894, "grad_norm": 0.4671643549323303, "learning_rate": 1.3148022376843519e-05, "loss": 0.7945, "mean_token_accuracy": 0.7661603257060051, "step": 2785 }, { "epoch": 0.45904201715237647, "grad_norm": 0.4639536909511509, "learning_rate": 1.3120747905113032e-05, "loss": 0.7863, "mean_token_accuracy": 0.769007894396782, "step": 2790 }, { "epoch": 0.45986467309709395, "grad_norm": 0.4528608171759212, "learning_rate": 1.3093447689010892e-05, "loss": 0.7804, "mean_token_accuracy": 0.7701355412602424, "step": 2795 }, { "epoch": 0.4606873290418115, "grad_norm": 0.49697098267362394, "learning_rate": 1.3066121953748147e-05, "loss": 0.7892, "mean_token_accuracy": 0.7684523716568947, "step": 2800 }, { "epoch": 0.461509984986529, "grad_norm": 0.49005832281284717, "learning_rate": 1.3038770924746366e-05, "loss": 0.7743, "mean_token_accuracy": 0.7723978370428085, "step": 2805 }, { "epoch": 0.46233264093124654, "grad_norm": 0.5091411174416574, "learning_rate": 1.3011394827635773e-05, "loss": 0.7933, "mean_token_accuracy": 0.7656970754265785, "step": 2810 }, { "epoch": 0.4631552968759641, "grad_norm": 0.47969433433412045, "learning_rate": 1.2983993888253395e-05, "loss": 0.7834, "mean_token_accuracy": 0.7703572407364845, "step": 2815 }, { "epoch": 0.46397795282068155, "grad_norm": 0.46309105406909007, "learning_rate": 1.2956568332641188e-05, "loss": 0.7943, "mean_token_accuracy": 0.7669329807162285, "step": 2820 }, { "epoch": 0.4648006087653991, "grad_norm": 0.5604797340547285, "learning_rate": 1.2929118387044177e-05, "loss": 0.7884, "mean_token_accuracy": 0.7693686798214913, "step": 2825 }, { "epoch": 0.4656232647101166, "grad_norm": 0.5990654697554315, "learning_rate": 1.2901644277908595e-05, "loss": 0.7737, "mean_token_accuracy": 0.7724661782383919, "step": 2830 }, { "epoch": 0.46644592065483415, "grad_norm": 0.4859616468963596, "learning_rate": 1.2874146231880015e-05, "loss": 0.803, "mean_token_accuracy": 0.7637146398425102, "step": 2835 }, { "epoch": 0.4672685765995517, "grad_norm": 0.4724342136591092, "learning_rate": 1.2846624475801457e-05, "loss": 0.7877, "mean_token_accuracy": 0.7679189041256904, "step": 2840 }, { "epoch": 0.46809123254426915, "grad_norm": 0.45431709736616727, "learning_rate": 1.2819079236711556e-05, "loss": 0.7779, "mean_token_accuracy": 0.7711359709501266, "step": 2845 }, { "epoch": 0.4689138884889867, "grad_norm": 0.4855821687873824, "learning_rate": 1.2791510741842656e-05, "loss": 0.7862, "mean_token_accuracy": 0.7686131462454796, "step": 2850 }, { "epoch": 0.4697365444337042, "grad_norm": 0.46104688337904814, "learning_rate": 1.2763919218618951e-05, "loss": 0.7964, "mean_token_accuracy": 0.7660603493452072, "step": 2855 }, { "epoch": 0.47055920037842175, "grad_norm": 0.45952875057161346, "learning_rate": 1.2736304894654608e-05, "loss": 0.7866, "mean_token_accuracy": 0.7685164690017701, "step": 2860 }, { "epoch": 0.4713818563231393, "grad_norm": 0.4623243899766877, "learning_rate": 1.2708667997751885e-05, "loss": 0.7819, "mean_token_accuracy": 0.7701334491372108, "step": 2865 }, { "epoch": 0.47220451226785676, "grad_norm": 0.46172397944773286, "learning_rate": 1.268100875589925e-05, "loss": 0.7865, "mean_token_accuracy": 0.7683913841843605, "step": 2870 }, { "epoch": 0.4730271682125743, "grad_norm": 0.45376314723628175, "learning_rate": 1.2653327397269513e-05, "loss": 0.7956, "mean_token_accuracy": 0.7667783379554749, "step": 2875 }, { "epoch": 0.4738498241572918, "grad_norm": 0.46866059879105315, "learning_rate": 1.262562415021792e-05, "loss": 0.7982, "mean_token_accuracy": 0.7660658523440361, "step": 2880 }, { "epoch": 0.47467248010200935, "grad_norm": 0.4661094926979794, "learning_rate": 1.2597899243280308e-05, "loss": 0.7726, "mean_token_accuracy": 0.7717386990785599, "step": 2885 }, { "epoch": 0.4754951360467269, "grad_norm": 0.479312158745329, "learning_rate": 1.2570152905171167e-05, "loss": 0.7838, "mean_token_accuracy": 0.76998650431633, "step": 2890 }, { "epoch": 0.47631779199144436, "grad_norm": 0.45679096129087476, "learning_rate": 1.2542385364781802e-05, "loss": 0.7899, "mean_token_accuracy": 0.7679675027728081, "step": 2895 }, { "epoch": 0.4771404479361619, "grad_norm": 0.48499825846807737, "learning_rate": 1.2514596851178414e-05, "loss": 0.7751, "mean_token_accuracy": 0.7709692850708961, "step": 2900 }, { "epoch": 0.4779631038808794, "grad_norm": 0.4666113008798478, "learning_rate": 1.2486787593600232e-05, "loss": 0.7781, "mean_token_accuracy": 0.7725317612290382, "step": 2905 }, { "epoch": 0.47878575982559696, "grad_norm": 0.454281107258924, "learning_rate": 1.2458957821457595e-05, "loss": 0.7913, "mean_token_accuracy": 0.7671986296772957, "step": 2910 }, { "epoch": 0.47960841577031443, "grad_norm": 0.4842668053230154, "learning_rate": 1.2431107764330093e-05, "loss": 0.7755, "mean_token_accuracy": 0.7722512975335121, "step": 2915 }, { "epoch": 0.48043107171503197, "grad_norm": 0.46316187877090853, "learning_rate": 1.2403237651964639e-05, "loss": 0.7703, "mean_token_accuracy": 0.7731240600347519, "step": 2920 }, { "epoch": 0.4812537276597495, "grad_norm": 0.46222746849059476, "learning_rate": 1.23753477142736e-05, "loss": 0.7927, "mean_token_accuracy": 0.7663831248879432, "step": 2925 }, { "epoch": 0.48207638360446703, "grad_norm": 0.45290329569557336, "learning_rate": 1.2347438181332893e-05, "loss": 0.7681, "mean_token_accuracy": 0.773770771920681, "step": 2930 }, { "epoch": 0.48289903954918456, "grad_norm": 0.4766976155842292, "learning_rate": 1.2319509283380076e-05, "loss": 0.7827, "mean_token_accuracy": 0.7706102699041366, "step": 2935 }, { "epoch": 0.48372169549390204, "grad_norm": 0.4771222243803682, "learning_rate": 1.229156125081246e-05, "loss": 0.7856, "mean_token_accuracy": 0.7692234188318252, "step": 2940 }, { "epoch": 0.48454435143861957, "grad_norm": 0.46591356023957, "learning_rate": 1.2263594314185207e-05, "loss": 0.7768, "mean_token_accuracy": 0.770325568318367, "step": 2945 }, { "epoch": 0.4853670073833371, "grad_norm": 0.4651722796715673, "learning_rate": 1.223560870420943e-05, "loss": 0.7885, "mean_token_accuracy": 0.7686824157834053, "step": 2950 }, { "epoch": 0.48618966332805463, "grad_norm": 0.47179845623598565, "learning_rate": 1.2207604651750278e-05, "loss": 0.7814, "mean_token_accuracy": 0.770134799182415, "step": 2955 }, { "epoch": 0.48701231927277217, "grad_norm": 0.46765323791223684, "learning_rate": 1.2179582387825045e-05, "loss": 0.7899, "mean_token_accuracy": 0.7675438940525054, "step": 2960 }, { "epoch": 0.48783497521748964, "grad_norm": 0.4561089658653647, "learning_rate": 1.2151542143601258e-05, "loss": 0.7877, "mean_token_accuracy": 0.7683557704091072, "step": 2965 }, { "epoch": 0.4886576311622072, "grad_norm": 0.45483622342300467, "learning_rate": 1.2123484150394775e-05, "loss": 0.7862, "mean_token_accuracy": 0.7685320675373077, "step": 2970 }, { "epoch": 0.4894802871069247, "grad_norm": 0.481581721104823, "learning_rate": 1.2095408639667867e-05, "loss": 0.7885, "mean_token_accuracy": 0.7678330108523369, "step": 2975 }, { "epoch": 0.49030294305164224, "grad_norm": 0.4517242383700136, "learning_rate": 1.2067315843027316e-05, "loss": 0.7852, "mean_token_accuracy": 0.7699625015258789, "step": 2980 }, { "epoch": 0.49112559899635977, "grad_norm": 0.4654794743040154, "learning_rate": 1.2039205992222504e-05, "loss": 0.7932, "mean_token_accuracy": 0.7677570313215256, "step": 2985 }, { "epoch": 0.49194825494107725, "grad_norm": 0.46446792575244034, "learning_rate": 1.2011079319143505e-05, "loss": 0.7843, "mean_token_accuracy": 0.769979153573513, "step": 2990 }, { "epoch": 0.4927709108857948, "grad_norm": 0.45489834620047626, "learning_rate": 1.1982936055819155e-05, "loss": 0.7662, "mean_token_accuracy": 0.7742626756429672, "step": 2995 }, { "epoch": 0.4935935668305123, "grad_norm": 0.46792623816975887, "learning_rate": 1.1954776434415159e-05, "loss": 0.7848, "mean_token_accuracy": 0.7683902621269226, "step": 3000 }, { "epoch": 0.49441622277522984, "grad_norm": 0.47371649284385053, "learning_rate": 1.1926600687232164e-05, "loss": 0.7887, "mean_token_accuracy": 0.7674418568611145, "step": 3005 }, { "epoch": 0.4952388787199474, "grad_norm": 0.46298923083764404, "learning_rate": 1.1898409046703849e-05, "loss": 0.7844, "mean_token_accuracy": 0.7690287992358208, "step": 3010 }, { "epoch": 0.49606153466466485, "grad_norm": 0.49025425746157364, "learning_rate": 1.1870201745394995e-05, "loss": 0.7734, "mean_token_accuracy": 0.7726288199424743, "step": 3015 }, { "epoch": 0.4968841906093824, "grad_norm": 0.4609096492088201, "learning_rate": 1.1841979015999587e-05, "loss": 0.7806, "mean_token_accuracy": 0.7698292002081871, "step": 3020 }, { "epoch": 0.4977068465540999, "grad_norm": 0.46169271823019936, "learning_rate": 1.1813741091338875e-05, "loss": 0.7918, "mean_token_accuracy": 0.7667649328708649, "step": 3025 }, { "epoch": 0.49852950249881745, "grad_norm": 0.4808248285258695, "learning_rate": 1.1785488204359463e-05, "loss": 0.7874, "mean_token_accuracy": 0.768246577680111, "step": 3030 }, { "epoch": 0.499352158443535, "grad_norm": 0.4731018648687588, "learning_rate": 1.1757220588131384e-05, "loss": 0.7865, "mean_token_accuracy": 0.7689037337899208, "step": 3035 }, { "epoch": 0.5001748143882525, "grad_norm": 0.45745218802066334, "learning_rate": 1.1728938475846184e-05, "loss": 0.766, "mean_token_accuracy": 0.7744945481419563, "step": 3040 }, { "epoch": 0.50099747033297, "grad_norm": 0.46883157830070726, "learning_rate": 1.1700642100814987e-05, "loss": 0.8069, "mean_token_accuracy": 0.7632927536964417, "step": 3045 }, { "epoch": 0.5018201262776875, "grad_norm": 0.45300211663939566, "learning_rate": 1.1672331696466581e-05, "loss": 0.7865, "mean_token_accuracy": 0.7684119626879692, "step": 3050 }, { "epoch": 0.502642782222405, "grad_norm": 0.47404004805776906, "learning_rate": 1.1644007496345484e-05, "loss": 0.7774, "mean_token_accuracy": 0.7707745179533958, "step": 3055 }, { "epoch": 0.5034654381671225, "grad_norm": 0.47259430528784463, "learning_rate": 1.1615669734110026e-05, "loss": 0.7883, "mean_token_accuracy": 0.7685762092471122, "step": 3060 }, { "epoch": 0.5042880941118401, "grad_norm": 0.46667601411420057, "learning_rate": 1.1587318643530408e-05, "loss": 0.7992, "mean_token_accuracy": 0.7662133559584617, "step": 3065 }, { "epoch": 0.5051107500565576, "grad_norm": 0.5753166656776038, "learning_rate": 1.1558954458486794e-05, "loss": 0.7866, "mean_token_accuracy": 0.7688018932938576, "step": 3070 }, { "epoch": 0.5059334060012751, "grad_norm": 0.45995804208885427, "learning_rate": 1.153057741296736e-05, "loss": 0.794, "mean_token_accuracy": 0.7670333579182624, "step": 3075 }, { "epoch": 0.5067560619459927, "grad_norm": 0.45491955345066276, "learning_rate": 1.1502187741066382e-05, "loss": 0.7744, "mean_token_accuracy": 0.7718422040343285, "step": 3080 }, { "epoch": 0.5075787178907102, "grad_norm": 0.4681861582553631, "learning_rate": 1.1473785676982283e-05, "loss": 0.7725, "mean_token_accuracy": 0.7727426141500473, "step": 3085 }, { "epoch": 0.5084013738354277, "grad_norm": 0.4443046347956145, "learning_rate": 1.144537145501573e-05, "loss": 0.7702, "mean_token_accuracy": 0.7732807263731957, "step": 3090 }, { "epoch": 0.5092240297801452, "grad_norm": 0.4666632699823999, "learning_rate": 1.1416945309567678e-05, "loss": 0.7837, "mean_token_accuracy": 0.7698408752679825, "step": 3095 }, { "epoch": 0.5100466857248627, "grad_norm": 0.4936154497503301, "learning_rate": 1.1388507475137438e-05, "loss": 0.77, "mean_token_accuracy": 0.7732049807906151, "step": 3100 }, { "epoch": 0.5108693416695802, "grad_norm": 0.4901513567393619, "learning_rate": 1.1360058186320762e-05, "loss": 0.7933, "mean_token_accuracy": 0.7677827894687652, "step": 3105 }, { "epoch": 0.5116919976142977, "grad_norm": 0.47516814357393333, "learning_rate": 1.1331597677807884e-05, "loss": 0.7762, "mean_token_accuracy": 0.771317133307457, "step": 3110 }, { "epoch": 0.5125146535590153, "grad_norm": 0.46052033645437074, "learning_rate": 1.1303126184381595e-05, "loss": 0.7751, "mean_token_accuracy": 0.7715024754405022, "step": 3115 }, { "epoch": 0.5133373095037328, "grad_norm": 0.5495189706322401, "learning_rate": 1.1274643940915309e-05, "loss": 0.7885, "mean_token_accuracy": 0.7671706482768059, "step": 3120 }, { "epoch": 0.5141599654484503, "grad_norm": 0.479170477962212, "learning_rate": 1.1246151182371118e-05, "loss": 0.7758, "mean_token_accuracy": 0.7712239667773246, "step": 3125 }, { "epoch": 0.5149826213931679, "grad_norm": 0.46797478954522875, "learning_rate": 1.1217648143797863e-05, "loss": 0.7886, "mean_token_accuracy": 0.7686742693185806, "step": 3130 }, { "epoch": 0.5158052773378854, "grad_norm": 0.45990670077262785, "learning_rate": 1.1189135060329181e-05, "loss": 0.7695, "mean_token_accuracy": 0.7728790566325188, "step": 3135 }, { "epoch": 0.5166279332826029, "grad_norm": 0.4687902890192736, "learning_rate": 1.1160612167181577e-05, "loss": 0.7737, "mean_token_accuracy": 0.7713962331414222, "step": 3140 }, { "epoch": 0.5174505892273205, "grad_norm": 0.46821333847458774, "learning_rate": 1.1132079699652488e-05, "loss": 0.7829, "mean_token_accuracy": 0.7697373449802398, "step": 3145 }, { "epoch": 0.5182732451720379, "grad_norm": 0.45668909688466985, "learning_rate": 1.1103537893118315e-05, "loss": 0.7811, "mean_token_accuracy": 0.7698833554983139, "step": 3150 }, { "epoch": 0.5190959011167554, "grad_norm": 0.45184099179228815, "learning_rate": 1.1074986983032525e-05, "loss": 0.7678, "mean_token_accuracy": 0.7740291714668274, "step": 3155 }, { "epoch": 0.5199185570614729, "grad_norm": 1.0577498975032722, "learning_rate": 1.1046427204923662e-05, "loss": 0.7852, "mean_token_accuracy": 0.7699206933379174, "step": 3160 }, { "epoch": 0.5207412130061905, "grad_norm": 0.4715175503671157, "learning_rate": 1.1017858794393446e-05, "loss": 0.7895, "mean_token_accuracy": 0.7668691083788872, "step": 3165 }, { "epoch": 0.521563868950908, "grad_norm": 0.4651355648967318, "learning_rate": 1.0989281987114788e-05, "loss": 0.7944, "mean_token_accuracy": 0.7665895089507103, "step": 3170 }, { "epoch": 0.5223865248956255, "grad_norm": 0.5295116931171061, "learning_rate": 1.0960697018829886e-05, "loss": 0.788, "mean_token_accuracy": 0.767827396094799, "step": 3175 }, { "epoch": 0.5232091808403431, "grad_norm": 0.4573573787416488, "learning_rate": 1.0932104125348253e-05, "loss": 0.7843, "mean_token_accuracy": 0.7686645835638046, "step": 3180 }, { "epoch": 0.5240318367850606, "grad_norm": 0.4571858947790788, "learning_rate": 1.090350354254478e-05, "loss": 0.7785, "mean_token_accuracy": 0.771085238456726, "step": 3185 }, { "epoch": 0.5248544927297781, "grad_norm": 0.4665507281252133, "learning_rate": 1.087489550635779e-05, "loss": 0.7858, "mean_token_accuracy": 0.7684463292360306, "step": 3190 }, { "epoch": 0.5256771486744957, "grad_norm": 0.46248487532079646, "learning_rate": 1.0846280252787098e-05, "loss": 0.769, "mean_token_accuracy": 0.772568815946579, "step": 3195 }, { "epoch": 0.5264998046192131, "grad_norm": 0.4612234321530446, "learning_rate": 1.0817658017892051e-05, "loss": 0.7947, "mean_token_accuracy": 0.7661830738186837, "step": 3200 }, { "epoch": 0.5273224605639306, "grad_norm": 0.46256121979636095, "learning_rate": 1.0789029037789598e-05, "loss": 0.7904, "mean_token_accuracy": 0.7670846864581108, "step": 3205 }, { "epoch": 0.5281451165086481, "grad_norm": 0.45657147113608776, "learning_rate": 1.0760393548652312e-05, "loss": 0.7635, "mean_token_accuracy": 0.7739939898252487, "step": 3210 }, { "epoch": 0.5289677724533657, "grad_norm": 0.47508372467244614, "learning_rate": 1.0731751786706492e-05, "loss": 0.784, "mean_token_accuracy": 0.7688389793038368, "step": 3215 }, { "epoch": 0.5297904283980832, "grad_norm": 0.4679693622071408, "learning_rate": 1.0703103988230152e-05, "loss": 0.7694, "mean_token_accuracy": 0.7729762881994248, "step": 3220 }, { "epoch": 0.5306130843428007, "grad_norm": 0.4596215514355411, "learning_rate": 1.0674450389551127e-05, "loss": 0.7921, "mean_token_accuracy": 0.7659540921449661, "step": 3225 }, { "epoch": 0.5314357402875183, "grad_norm": 0.45432371024527507, "learning_rate": 1.0645791227045089e-05, "loss": 0.7785, "mean_token_accuracy": 0.769449009001255, "step": 3230 }, { "epoch": 0.5322583962322358, "grad_norm": 0.44814613731778336, "learning_rate": 1.0617126737133615e-05, "loss": 0.7714, "mean_token_accuracy": 0.7722168162465095, "step": 3235 }, { "epoch": 0.5330810521769533, "grad_norm": 0.4663847193727986, "learning_rate": 1.0588457156282223e-05, "loss": 0.776, "mean_token_accuracy": 0.7706132620573044, "step": 3240 }, { "epoch": 0.5339037081216708, "grad_norm": 0.4702778599492854, "learning_rate": 1.0559782720998437e-05, "loss": 0.7768, "mean_token_accuracy": 0.773099347949028, "step": 3245 }, { "epoch": 0.5347263640663883, "grad_norm": 0.4570270347016442, "learning_rate": 1.0531103667829818e-05, "loss": 0.7736, "mean_token_accuracy": 0.7731436893343926, "step": 3250 }, { "epoch": 0.5355490200111058, "grad_norm": 0.4584426610758232, "learning_rate": 1.0502420233362027e-05, "loss": 0.7865, "mean_token_accuracy": 0.7682988733053208, "step": 3255 }, { "epoch": 0.5363716759558234, "grad_norm": 0.47583959535754955, "learning_rate": 1.0473732654216866e-05, "loss": 0.7832, "mean_token_accuracy": 0.7701527640223503, "step": 3260 }, { "epoch": 0.5371943319005409, "grad_norm": 0.4573050145494101, "learning_rate": 1.0445041167050333e-05, "loss": 0.797, "mean_token_accuracy": 0.7666754394769668, "step": 3265 }, { "epoch": 0.5380169878452584, "grad_norm": 0.46617015243194043, "learning_rate": 1.0416346008550662e-05, "loss": 0.7818, "mean_token_accuracy": 0.7701493218541146, "step": 3270 }, { "epoch": 0.538839643789976, "grad_norm": 0.4591897932816748, "learning_rate": 1.0387647415436368e-05, "loss": 0.7794, "mean_token_accuracy": 0.7693421319127083, "step": 3275 }, { "epoch": 0.5396622997346935, "grad_norm": 0.4619322463845668, "learning_rate": 1.0358945624454308e-05, "loss": 0.7728, "mean_token_accuracy": 0.7716913402080536, "step": 3280 }, { "epoch": 0.540484955679411, "grad_norm": 0.4580189462711364, "learning_rate": 1.0330240872377719e-05, "loss": 0.7871, "mean_token_accuracy": 0.7686161130666733, "step": 3285 }, { "epoch": 0.5413076116241285, "grad_norm": 0.4475254462307451, "learning_rate": 1.0301533396004255e-05, "loss": 0.7827, "mean_token_accuracy": 0.7694762796163559, "step": 3290 }, { "epoch": 0.542130267568846, "grad_norm": 0.4629067021392271, "learning_rate": 1.0272823432154055e-05, "loss": 0.7809, "mean_token_accuracy": 0.7698782742023468, "step": 3295 }, { "epoch": 0.5429529235135635, "grad_norm": 0.4604383304724694, "learning_rate": 1.0244111217667778e-05, "loss": 0.7846, "mean_token_accuracy": 0.7683286279439926, "step": 3300 }, { "epoch": 0.543775579458281, "grad_norm": 0.45393878504467683, "learning_rate": 1.0215396989404638e-05, "loss": 0.7617, "mean_token_accuracy": 0.7751777544617653, "step": 3305 }, { "epoch": 0.5445982354029986, "grad_norm": 0.45322977220904237, "learning_rate": 1.0186680984240478e-05, "loss": 0.7774, "mean_token_accuracy": 0.7721681401133538, "step": 3310 }, { "epoch": 0.5454208913477161, "grad_norm": 0.46361354844336744, "learning_rate": 1.0157963439065783e-05, "loss": 0.7784, "mean_token_accuracy": 0.7711560308933259, "step": 3315 }, { "epoch": 0.5462435472924336, "grad_norm": 0.4606973861109438, "learning_rate": 1.0129244590783758e-05, "loss": 0.7935, "mean_token_accuracy": 0.7662897482514381, "step": 3320 }, { "epoch": 0.5470662032371512, "grad_norm": 0.4734137818843256, "learning_rate": 1.0100524676308342e-05, "loss": 0.7633, "mean_token_accuracy": 0.7745096743106842, "step": 3325 }, { "epoch": 0.5478888591818687, "grad_norm": 0.4725981801840983, "learning_rate": 1.0071803932562283e-05, "loss": 0.757, "mean_token_accuracy": 0.7766506478190423, "step": 3330 }, { "epoch": 0.5487115151265862, "grad_norm": 0.4603307098048824, "learning_rate": 1.0043082596475165e-05, "loss": 0.7795, "mean_token_accuracy": 0.77028938382864, "step": 3335 }, { "epoch": 0.5495341710713038, "grad_norm": 0.4790711358987626, "learning_rate": 1.0014360904981454e-05, "loss": 0.7801, "mean_token_accuracy": 0.7693585216999054, "step": 3340 }, { "epoch": 0.5503568270160212, "grad_norm": 0.4684665461355867, "learning_rate": 9.985639095018551e-06, "loss": 0.7725, "mean_token_accuracy": 0.7720367282629013, "step": 3345 }, { "epoch": 0.5511794829607387, "grad_norm": 0.4611581106534005, "learning_rate": 9.95691740352484e-06, "loss": 0.7775, "mean_token_accuracy": 0.7699839472770691, "step": 3350 }, { "epoch": 0.5520021389054562, "grad_norm": 0.4617277403547618, "learning_rate": 9.928196067437719e-06, "loss": 0.7807, "mean_token_accuracy": 0.7694995492696762, "step": 3355 }, { "epoch": 0.5528247948501738, "grad_norm": 0.4496135929136695, "learning_rate": 9.899475323691661e-06, "loss": 0.7692, "mean_token_accuracy": 0.7734804585576057, "step": 3360 }, { "epoch": 0.5536474507948913, "grad_norm": 0.5439320619394111, "learning_rate": 9.870755409216246e-06, "loss": 0.7905, "mean_token_accuracy": 0.7678239092230796, "step": 3365 }, { "epoch": 0.5544701067396088, "grad_norm": 0.459777091147703, "learning_rate": 9.842036560934218e-06, "loss": 0.773, "mean_token_accuracy": 0.7711254164576531, "step": 3370 }, { "epoch": 0.5552927626843264, "grad_norm": 0.4614023339908886, "learning_rate": 9.813319015759524e-06, "loss": 0.7726, "mean_token_accuracy": 0.7718210324645043, "step": 3375 }, { "epoch": 0.5561154186290439, "grad_norm": 0.45442198789805643, "learning_rate": 9.784603010595363e-06, "loss": 0.7547, "mean_token_accuracy": 0.7762991413474083, "step": 3380 }, { "epoch": 0.5569380745737614, "grad_norm": 0.45436520273358016, "learning_rate": 9.755888782332227e-06, "loss": 0.7765, "mean_token_accuracy": 0.7711010754108429, "step": 3385 }, { "epoch": 0.557760730518479, "grad_norm": 0.4632065867366389, "learning_rate": 9.727176567845948e-06, "loss": 0.7776, "mean_token_accuracy": 0.7708674401044846, "step": 3390 }, { "epoch": 0.5585833864631964, "grad_norm": 0.46525828138049913, "learning_rate": 9.698466603995748e-06, "loss": 0.7737, "mean_token_accuracy": 0.7711882501840591, "step": 3395 }, { "epoch": 0.5594060424079139, "grad_norm": 0.45763782743324566, "learning_rate": 9.669759127622284e-06, "loss": 0.7753, "mean_token_accuracy": 0.7715722754597664, "step": 3400 }, { "epoch": 0.5602286983526314, "grad_norm": 0.46682239206229076, "learning_rate": 9.64105437554569e-06, "loss": 0.757, "mean_token_accuracy": 0.7772704586386681, "step": 3405 }, { "epoch": 0.561051354297349, "grad_norm": 0.47500542812473273, "learning_rate": 9.612352584563632e-06, "loss": 0.7725, "mean_token_accuracy": 0.7725993901491165, "step": 3410 }, { "epoch": 0.5618740102420665, "grad_norm": 0.45574043155317645, "learning_rate": 9.583653991449342e-06, "loss": 0.7845, "mean_token_accuracy": 0.7690472900867462, "step": 3415 }, { "epoch": 0.562696666186784, "grad_norm": 0.45568964156922703, "learning_rate": 9.554958832949669e-06, "loss": 0.7911, "mean_token_accuracy": 0.7665956854820252, "step": 3420 }, { "epoch": 0.5635193221315016, "grad_norm": 0.4657013129357225, "learning_rate": 9.526267345783136e-06, "loss": 0.7812, "mean_token_accuracy": 0.7697620689868927, "step": 3425 }, { "epoch": 0.5643419780762191, "grad_norm": 0.44296362434619035, "learning_rate": 9.497579766637975e-06, "loss": 0.7561, "mean_token_accuracy": 0.7769164964556694, "step": 3430 }, { "epoch": 0.5651646340209366, "grad_norm": 0.4639875113805968, "learning_rate": 9.468896332170185e-06, "loss": 0.7833, "mean_token_accuracy": 0.7689115390181541, "step": 3435 }, { "epoch": 0.5659872899656541, "grad_norm": 0.4553858855333979, "learning_rate": 9.440217279001567e-06, "loss": 0.7505, "mean_token_accuracy": 0.7769271239638329, "step": 3440 }, { "epoch": 0.5668099459103716, "grad_norm": 0.4970279781044799, "learning_rate": 9.411542843717777e-06, "loss": 0.7707, "mean_token_accuracy": 0.771998131275177, "step": 3445 }, { "epoch": 0.5676326018550891, "grad_norm": 0.47708233979696346, "learning_rate": 9.38287326286639e-06, "loss": 0.7815, "mean_token_accuracy": 0.7694500774145127, "step": 3450 }, { "epoch": 0.5684552577998067, "grad_norm": 0.4662036758736906, "learning_rate": 9.354208772954915e-06, "loss": 0.7606, "mean_token_accuracy": 0.775182056427002, "step": 3455 }, { "epoch": 0.5692779137445242, "grad_norm": 0.4733777069079889, "learning_rate": 9.325549610448876e-06, "loss": 0.7711, "mean_token_accuracy": 0.7724026590585709, "step": 3460 }, { "epoch": 0.5701005696892417, "grad_norm": 0.4637617818160024, "learning_rate": 9.296896011769851e-06, "loss": 0.7685, "mean_token_accuracy": 0.7738096997141838, "step": 3465 }, { "epoch": 0.5709232256339593, "grad_norm": 0.45459466555745737, "learning_rate": 9.268248213293511e-06, "loss": 0.7756, "mean_token_accuracy": 0.7712172091007232, "step": 3470 }, { "epoch": 0.5717458815786768, "grad_norm": 0.45464213520263386, "learning_rate": 9.239606451347686e-06, "loss": 0.7709, "mean_token_accuracy": 0.7729450181126595, "step": 3475 }, { "epoch": 0.5725685375233943, "grad_norm": 0.45392423352602945, "learning_rate": 9.210970962210405e-06, "loss": 0.7838, "mean_token_accuracy": 0.7685442820191384, "step": 3480 }, { "epoch": 0.5733911934681118, "grad_norm": 0.4619154000820807, "learning_rate": 9.182341982107952e-06, "loss": 0.7596, "mean_token_accuracy": 0.7756431564688683, "step": 3485 }, { "epoch": 0.5742138494128293, "grad_norm": 0.4616693339858107, "learning_rate": 9.153719747212905e-06, "loss": 0.7805, "mean_token_accuracy": 0.7698265299201011, "step": 3490 }, { "epoch": 0.5750365053575468, "grad_norm": 0.4722706581315494, "learning_rate": 9.125104493642212e-06, "loss": 0.7846, "mean_token_accuracy": 0.7681806728243827, "step": 3495 }, { "epoch": 0.5758591613022643, "grad_norm": 0.47898567150725246, "learning_rate": 9.096496457455223e-06, "loss": 0.7764, "mean_token_accuracy": 0.7719867005944252, "step": 3500 }, { "epoch": 0.5766818172469819, "grad_norm": 0.4985876462211309, "learning_rate": 9.06789587465175e-06, "loss": 0.7673, "mean_token_accuracy": 0.7739386200904846, "step": 3505 }, { "epoch": 0.5775044731916994, "grad_norm": 0.46603755449533485, "learning_rate": 9.039302981170114e-06, "loss": 0.7544, "mean_token_accuracy": 0.7760583847761154, "step": 3510 }, { "epoch": 0.5783271291364169, "grad_norm": 0.47380881735575575, "learning_rate": 9.010718012885212e-06, "loss": 0.766, "mean_token_accuracy": 0.7731515884399414, "step": 3515 }, { "epoch": 0.5791497850811345, "grad_norm": 0.4562418956345965, "learning_rate": 8.982141205606559e-06, "loss": 0.7756, "mean_token_accuracy": 0.7705094218254089, "step": 3520 }, { "epoch": 0.579972441025852, "grad_norm": 0.4647998607821628, "learning_rate": 8.953572795076341e-06, "loss": 0.7706, "mean_token_accuracy": 0.7734148159623147, "step": 3525 }, { "epoch": 0.5807950969705695, "grad_norm": 0.45891021973591506, "learning_rate": 8.925013016967478e-06, "loss": 0.7717, "mean_token_accuracy": 0.7725850462913513, "step": 3530 }, { "epoch": 0.5816177529152871, "grad_norm": 0.44834434181177746, "learning_rate": 8.896462106881687e-06, "loss": 0.7606, "mean_token_accuracy": 0.7759411215782166, "step": 3535 }, { "epoch": 0.5824404088600045, "grad_norm": 0.46299530894272334, "learning_rate": 8.867920300347517e-06, "loss": 0.772, "mean_token_accuracy": 0.7722566545009613, "step": 3540 }, { "epoch": 0.583263064804722, "grad_norm": 0.44786844045676183, "learning_rate": 8.839387832818425e-06, "loss": 0.7685, "mean_token_accuracy": 0.7731774538755417, "step": 3545 }, { "epoch": 0.5840857207494395, "grad_norm": 0.4768671632764795, "learning_rate": 8.810864939670825e-06, "loss": 0.7751, "mean_token_accuracy": 0.7715122416615486, "step": 3550 }, { "epoch": 0.5849083766941571, "grad_norm": 0.45302339596804847, "learning_rate": 8.78235185620214e-06, "loss": 0.7752, "mean_token_accuracy": 0.7715805619955063, "step": 3555 }, { "epoch": 0.5857310326388746, "grad_norm": 0.45882507851350396, "learning_rate": 8.753848817628883e-06, "loss": 0.7778, "mean_token_accuracy": 0.7703679725527763, "step": 3560 }, { "epoch": 0.5865536885835921, "grad_norm": 0.4591511526188353, "learning_rate": 8.725356059084696e-06, "loss": 0.7701, "mean_token_accuracy": 0.7730422660708427, "step": 3565 }, { "epoch": 0.5873763445283097, "grad_norm": 0.46972118494588283, "learning_rate": 8.696873815618407e-06, "loss": 0.7833, "mean_token_accuracy": 0.7686605393886566, "step": 3570 }, { "epoch": 0.5881990004730272, "grad_norm": 0.461821851113267, "learning_rate": 8.66840232219212e-06, "loss": 0.7633, "mean_token_accuracy": 0.7746263861656189, "step": 3575 }, { "epoch": 0.5890216564177447, "grad_norm": 0.4817183887029049, "learning_rate": 8.639941813679238e-06, "loss": 0.7684, "mean_token_accuracy": 0.7735347226262093, "step": 3580 }, { "epoch": 0.5898443123624623, "grad_norm": 0.458351481627188, "learning_rate": 8.611492524862565e-06, "loss": 0.7758, "mean_token_accuracy": 0.7714830338954926, "step": 3585 }, { "epoch": 0.5906669683071797, "grad_norm": 0.4643126709864931, "learning_rate": 8.583054690432329e-06, "loss": 0.7778, "mean_token_accuracy": 0.7706717267632485, "step": 3590 }, { "epoch": 0.5914896242518972, "grad_norm": 0.4480103505121957, "learning_rate": 8.554628544984272e-06, "loss": 0.7678, "mean_token_accuracy": 0.7732058227062225, "step": 3595 }, { "epoch": 0.5923122801966147, "grad_norm": 0.46139088601753253, "learning_rate": 8.52621432301772e-06, "loss": 0.7768, "mean_token_accuracy": 0.7704577445983887, "step": 3600 }, { "epoch": 0.5931349361413323, "grad_norm": 0.4586306544190601, "learning_rate": 8.497812258933621e-06, "loss": 0.7513, "mean_token_accuracy": 0.7776725247502327, "step": 3605 }, { "epoch": 0.5939575920860498, "grad_norm": 0.46175133314293826, "learning_rate": 8.469422587032641e-06, "loss": 0.7744, "mean_token_accuracy": 0.7716501489281654, "step": 3610 }, { "epoch": 0.5947802480307673, "grad_norm": 0.4598736170218616, "learning_rate": 8.441045541513209e-06, "loss": 0.7643, "mean_token_accuracy": 0.7737933561205864, "step": 3615 }, { "epoch": 0.5956029039754849, "grad_norm": 0.4451350611590473, "learning_rate": 8.412681356469596e-06, "loss": 0.7704, "mean_token_accuracy": 0.7725784838199615, "step": 3620 }, { "epoch": 0.5964255599202024, "grad_norm": 0.4439598084597321, "learning_rate": 8.384330265889979e-06, "loss": 0.7623, "mean_token_accuracy": 0.7749139934778213, "step": 3625 }, { "epoch": 0.5972482158649199, "grad_norm": 0.479554019478234, "learning_rate": 8.355992503654519e-06, "loss": 0.7605, "mean_token_accuracy": 0.776183907687664, "step": 3630 }, { "epoch": 0.5980708718096374, "grad_norm": 0.45779201682703896, "learning_rate": 8.327668303533422e-06, "loss": 0.7743, "mean_token_accuracy": 0.7713848367333412, "step": 3635 }, { "epoch": 0.5988935277543549, "grad_norm": 0.4990121637569206, "learning_rate": 8.299357899185016e-06, "loss": 0.7729, "mean_token_accuracy": 0.7710495695471764, "step": 3640 }, { "epoch": 0.5997161836990724, "grad_norm": 0.44610915559317543, "learning_rate": 8.271061524153818e-06, "loss": 0.7644, "mean_token_accuracy": 0.7736274048686027, "step": 3645 }, { "epoch": 0.60053883964379, "grad_norm": 0.455153184568419, "learning_rate": 8.242779411868616e-06, "loss": 0.7888, "mean_token_accuracy": 0.7673418775200844, "step": 3650 }, { "epoch": 0.6013614955885075, "grad_norm": 0.4623123440511645, "learning_rate": 8.214511795640542e-06, "loss": 0.7738, "mean_token_accuracy": 0.771447129547596, "step": 3655 }, { "epoch": 0.602184151533225, "grad_norm": 0.4536568082014379, "learning_rate": 8.186258908661128e-06, "loss": 0.779, "mean_token_accuracy": 0.7700050801038743, "step": 3660 }, { "epoch": 0.6030068074779426, "grad_norm": 0.45928808070070753, "learning_rate": 8.158020984000415e-06, "loss": 0.7824, "mean_token_accuracy": 0.7694301664829254, "step": 3665 }, { "epoch": 0.6038294634226601, "grad_norm": 0.46289352128309513, "learning_rate": 8.129798254605006e-06, "loss": 0.7681, "mean_token_accuracy": 0.7733155444264412, "step": 3670 }, { "epoch": 0.6046521193673776, "grad_norm": 0.4810784379527575, "learning_rate": 8.101590953296154e-06, "loss": 0.7734, "mean_token_accuracy": 0.771414577960968, "step": 3675 }, { "epoch": 0.6054747753120951, "grad_norm": 0.4581719440943556, "learning_rate": 8.073399312767838e-06, "loss": 0.7696, "mean_token_accuracy": 0.7734584420919418, "step": 3680 }, { "epoch": 0.6062974312568126, "grad_norm": 0.47456800297124563, "learning_rate": 8.045223565584841e-06, "loss": 0.7677, "mean_token_accuracy": 0.7729291900992393, "step": 3685 }, { "epoch": 0.6071200872015301, "grad_norm": 0.46836742587477953, "learning_rate": 8.01706394418085e-06, "loss": 0.7513, "mean_token_accuracy": 0.777021586894989, "step": 3690 }, { "epoch": 0.6079427431462476, "grad_norm": 0.4601299887703487, "learning_rate": 7.9889206808565e-06, "loss": 0.77, "mean_token_accuracy": 0.772555972635746, "step": 3695 }, { "epoch": 0.6087653990909652, "grad_norm": 0.469939733926521, "learning_rate": 7.960794007777498e-06, "loss": 0.7715, "mean_token_accuracy": 0.7716592118144036, "step": 3700 }, { "epoch": 0.6095880550356827, "grad_norm": 0.4656008625571225, "learning_rate": 7.932684156972685e-06, "loss": 0.7759, "mean_token_accuracy": 0.7708727464079856, "step": 3705 }, { "epoch": 0.6104107109804002, "grad_norm": 0.4630865504287751, "learning_rate": 7.904591360332137e-06, "loss": 0.7684, "mean_token_accuracy": 0.7727461010217667, "step": 3710 }, { "epoch": 0.6112333669251178, "grad_norm": 0.4676371622493111, "learning_rate": 7.876515849605226e-06, "loss": 0.7875, "mean_token_accuracy": 0.7673552170395851, "step": 3715 }, { "epoch": 0.6120560228698353, "grad_norm": 0.47327777462598775, "learning_rate": 7.848457856398743e-06, "loss": 0.784, "mean_token_accuracy": 0.7693225339055061, "step": 3720 }, { "epoch": 0.6128786788145528, "grad_norm": 0.44854179801340505, "learning_rate": 7.82041761217496e-06, "loss": 0.7739, "mean_token_accuracy": 0.771759931743145, "step": 3725 }, { "epoch": 0.6137013347592704, "grad_norm": 0.4550739583064102, "learning_rate": 7.792395348249725e-06, "loss": 0.7638, "mean_token_accuracy": 0.7735701218247414, "step": 3730 }, { "epoch": 0.6145239907039878, "grad_norm": 0.4459044288791231, "learning_rate": 7.764391295790575e-06, "loss": 0.7638, "mean_token_accuracy": 0.7734913572669029, "step": 3735 }, { "epoch": 0.6153466466487053, "grad_norm": 0.4562743952752508, "learning_rate": 7.736405685814796e-06, "loss": 0.766, "mean_token_accuracy": 0.772436510026455, "step": 3740 }, { "epoch": 0.6161693025934228, "grad_norm": 0.45018078007793544, "learning_rate": 7.708438749187543e-06, "loss": 0.7682, "mean_token_accuracy": 0.7727370098233223, "step": 3745 }, { "epoch": 0.6169919585381404, "grad_norm": 0.4429638004867701, "learning_rate": 7.680490716619928e-06, "loss": 0.754, "mean_token_accuracy": 0.7775316894054413, "step": 3750 }, { "epoch": 0.6178146144828579, "grad_norm": 0.46175611227856006, "learning_rate": 7.652561818667108e-06, "loss": 0.7769, "mean_token_accuracy": 0.7694996193051338, "step": 3755 }, { "epoch": 0.6186372704275754, "grad_norm": 0.4662027363312584, "learning_rate": 7.6246522857264015e-06, "loss": 0.7723, "mean_token_accuracy": 0.771298025548458, "step": 3760 }, { "epoch": 0.619459926372293, "grad_norm": 0.45144150699061286, "learning_rate": 7.596762348035366e-06, "loss": 0.7575, "mean_token_accuracy": 0.7756345018744468, "step": 3765 }, { "epoch": 0.6202825823170105, "grad_norm": 0.466569306007744, "learning_rate": 7.568892235669911e-06, "loss": 0.7757, "mean_token_accuracy": 0.7711633428931236, "step": 3770 }, { "epoch": 0.621105238261728, "grad_norm": 0.4611210545317059, "learning_rate": 7.541042178542408e-06, "loss": 0.7592, "mean_token_accuracy": 0.7756694719195366, "step": 3775 }, { "epoch": 0.6219278942064456, "grad_norm": 0.4737163996540446, "learning_rate": 7.513212406399771e-06, "loss": 0.7705, "mean_token_accuracy": 0.7718516737222672, "step": 3780 }, { "epoch": 0.622750550151163, "grad_norm": 0.47425086382547155, "learning_rate": 7.485403148821588e-06, "loss": 0.7794, "mean_token_accuracy": 0.7692067638039589, "step": 3785 }, { "epoch": 0.6235732060958805, "grad_norm": 0.4471914064244973, "learning_rate": 7.457614635218202e-06, "loss": 0.7748, "mean_token_accuracy": 0.7713749289512635, "step": 3790 }, { "epoch": 0.624395862040598, "grad_norm": 0.46280611466276644, "learning_rate": 7.429847094828839e-06, "loss": 0.7675, "mean_token_accuracy": 0.7734036639332771, "step": 3795 }, { "epoch": 0.6252185179853156, "grad_norm": 0.4780903652457628, "learning_rate": 7.402100756719696e-06, "loss": 0.7842, "mean_token_accuracy": 0.7684579610824585, "step": 3800 }, { "epoch": 0.6260411739300331, "grad_norm": 0.4695836518392489, "learning_rate": 7.374375849782081e-06, "loss": 0.7733, "mean_token_accuracy": 0.7706691741943359, "step": 3805 }, { "epoch": 0.6268638298747506, "grad_norm": 0.4562023928373823, "learning_rate": 7.34667260273049e-06, "loss": 0.7568, "mean_token_accuracy": 0.775682607293129, "step": 3810 }, { "epoch": 0.6276864858194682, "grad_norm": 0.4600036661787852, "learning_rate": 7.318991244100753e-06, "loss": 0.754, "mean_token_accuracy": 0.7761555492877961, "step": 3815 }, { "epoch": 0.6285091417641857, "grad_norm": 0.4568670535784765, "learning_rate": 7.291332002248117e-06, "loss": 0.7772, "mean_token_accuracy": 0.7700536966323852, "step": 3820 }, { "epoch": 0.6293317977089032, "grad_norm": 0.4581119976875548, "learning_rate": 7.263695105345392e-06, "loss": 0.7589, "mean_token_accuracy": 0.7746703669428825, "step": 3825 }, { "epoch": 0.6301544536536207, "grad_norm": 0.46044142244207925, "learning_rate": 7.236080781381052e-06, "loss": 0.7557, "mean_token_accuracy": 0.7750990003347397, "step": 3830 }, { "epoch": 0.6309771095983382, "grad_norm": 0.4545602440042137, "learning_rate": 7.2084892581573485e-06, "loss": 0.756, "mean_token_accuracy": 0.7743361085653305, "step": 3835 }, { "epoch": 0.6317997655430557, "grad_norm": 0.4488302940589489, "learning_rate": 7.180920763288447e-06, "loss": 0.7785, "mean_token_accuracy": 0.771411019563675, "step": 3840 }, { "epoch": 0.6326224214877733, "grad_norm": 0.4323009497891946, "learning_rate": 7.153375524198545e-06, "loss": 0.753, "mean_token_accuracy": 0.7777467295527458, "step": 3845 }, { "epoch": 0.6334450774324908, "grad_norm": 0.4514651407779557, "learning_rate": 7.125853768119988e-06, "loss": 0.7588, "mean_token_accuracy": 0.7752303004264831, "step": 3850 }, { "epoch": 0.6342677333772083, "grad_norm": 0.5393599020503915, "learning_rate": 7.098355722091405e-06, "loss": 0.7652, "mean_token_accuracy": 0.771909749507904, "step": 3855 }, { "epoch": 0.6350903893219259, "grad_norm": 0.4566803362286354, "learning_rate": 7.070881612955823e-06, "loss": 0.7607, "mean_token_accuracy": 0.7749977692961693, "step": 3860 }, { "epoch": 0.6359130452666434, "grad_norm": 0.5348994375787252, "learning_rate": 7.0434316673588175e-06, "loss": 0.7709, "mean_token_accuracy": 0.7733311906456948, "step": 3865 }, { "epoch": 0.6367357012113609, "grad_norm": 0.45097114864684495, "learning_rate": 7.016006111746607e-06, "loss": 0.7696, "mean_token_accuracy": 0.7730101883411408, "step": 3870 }, { "epoch": 0.6375583571560784, "grad_norm": 0.4584827870530558, "learning_rate": 6.988605172364229e-06, "loss": 0.7676, "mean_token_accuracy": 0.772914570569992, "step": 3875 }, { "epoch": 0.6383810131007959, "grad_norm": 0.4656055961154652, "learning_rate": 6.961229075253635e-06, "loss": 0.7545, "mean_token_accuracy": 0.7773006170988083, "step": 3880 }, { "epoch": 0.6392036690455134, "grad_norm": 0.47278904907170016, "learning_rate": 6.933878046251855e-06, "loss": 0.7629, "mean_token_accuracy": 0.77431710511446, "step": 3885 }, { "epoch": 0.6400263249902309, "grad_norm": 0.46845696324601277, "learning_rate": 6.90655231098911e-06, "loss": 0.7553, "mean_token_accuracy": 0.776340226829052, "step": 3890 }, { "epoch": 0.6408489809349485, "grad_norm": 0.46205885491160964, "learning_rate": 6.87925209488697e-06, "loss": 0.7711, "mean_token_accuracy": 0.7720604494214058, "step": 3895 }, { "epoch": 0.641671636879666, "grad_norm": 0.45244908163689995, "learning_rate": 6.851977623156487e-06, "loss": 0.7822, "mean_token_accuracy": 0.7691676452755928, "step": 3900 }, { "epoch": 0.6424942928243835, "grad_norm": 0.45956615637568937, "learning_rate": 6.824729120796325e-06, "loss": 0.7501, "mean_token_accuracy": 0.7775457292795181, "step": 3905 }, { "epoch": 0.6433169487691011, "grad_norm": 0.4609317547378707, "learning_rate": 6.797506812590933e-06, "loss": 0.7606, "mean_token_accuracy": 0.7753668203949928, "step": 3910 }, { "epoch": 0.6441396047138186, "grad_norm": 0.454884670345376, "learning_rate": 6.770310923108662e-06, "loss": 0.7733, "mean_token_accuracy": 0.7720229342579842, "step": 3915 }, { "epoch": 0.6449622606585361, "grad_norm": 0.4692381985981877, "learning_rate": 6.743141676699923e-06, "loss": 0.7687, "mean_token_accuracy": 0.7735409364104271, "step": 3920 }, { "epoch": 0.6457849166032537, "grad_norm": 0.48094585324403266, "learning_rate": 6.715999297495346e-06, "loss": 0.7715, "mean_token_accuracy": 0.7724312484264374, "step": 3925 }, { "epoch": 0.6466075725479711, "grad_norm": 0.4736368969563962, "learning_rate": 6.688884009403913e-06, "loss": 0.7548, "mean_token_accuracy": 0.7763358741998673, "step": 3930 }, { "epoch": 0.6474302284926886, "grad_norm": 0.4665898023988688, "learning_rate": 6.66179603611113e-06, "loss": 0.7719, "mean_token_accuracy": 0.7717908099293709, "step": 3935 }, { "epoch": 0.6482528844374061, "grad_norm": 0.453729361603286, "learning_rate": 6.6347356010771554e-06, "loss": 0.7753, "mean_token_accuracy": 0.771356637775898, "step": 3940 }, { "epoch": 0.6490755403821237, "grad_norm": 0.47893907454640133, "learning_rate": 6.607702927534988e-06, "loss": 0.7662, "mean_token_accuracy": 0.7737555369734764, "step": 3945 }, { "epoch": 0.6498981963268412, "grad_norm": 0.4602174936583016, "learning_rate": 6.580698238488605e-06, "loss": 0.7741, "mean_token_accuracy": 0.7708314999938011, "step": 3950 }, { "epoch": 0.6507208522715587, "grad_norm": 0.4426316981736859, "learning_rate": 6.553721756711126e-06, "loss": 0.7548, "mean_token_accuracy": 0.776660680770874, "step": 3955 }, { "epoch": 0.6515435082162763, "grad_norm": 0.4597243515573918, "learning_rate": 6.52677370474298e-06, "loss": 0.7654, "mean_token_accuracy": 0.773367291688919, "step": 3960 }, { "epoch": 0.6523661641609938, "grad_norm": 0.46448069930804897, "learning_rate": 6.499854304890068e-06, "loss": 0.7791, "mean_token_accuracy": 0.7697444453835487, "step": 3965 }, { "epoch": 0.6531888201057113, "grad_norm": 0.46957619862330435, "learning_rate": 6.472963779221917e-06, "loss": 0.7737, "mean_token_accuracy": 0.7713129743933678, "step": 3970 }, { "epoch": 0.6540114760504289, "grad_norm": 0.4489453324595893, "learning_rate": 6.446102349569869e-06, "loss": 0.7577, "mean_token_accuracy": 0.7758531466126442, "step": 3975 }, { "epoch": 0.6548341319951463, "grad_norm": 0.44701897130815016, "learning_rate": 6.41927023752524e-06, "loss": 0.7618, "mean_token_accuracy": 0.7751427501440048, "step": 3980 }, { "epoch": 0.6556567879398638, "grad_norm": 0.45439971660084266, "learning_rate": 6.392467664437487e-06, "loss": 0.7527, "mean_token_accuracy": 0.7768514856696129, "step": 3985 }, { "epoch": 0.6564794438845813, "grad_norm": 0.4731057043320774, "learning_rate": 6.3656948514123966e-06, "loss": 0.7605, "mean_token_accuracy": 0.7753668650984764, "step": 3990 }, { "epoch": 0.6573020998292989, "grad_norm": 0.47468073943854877, "learning_rate": 6.338952019310238e-06, "loss": 0.7713, "mean_token_accuracy": 0.7712651416659355, "step": 3995 }, { "epoch": 0.6581247557740164, "grad_norm": 0.4438164033465551, "learning_rate": 6.312239388743976e-06, "loss": 0.7563, "mean_token_accuracy": 0.7760533064603805, "step": 4000 }, { "epoch": 0.6589474117187339, "grad_norm": 0.46094407034528234, "learning_rate": 6.285557180077403e-06, "loss": 0.7664, "mean_token_accuracy": 0.7739064082503319, "step": 4005 }, { "epoch": 0.6597700676634515, "grad_norm": 0.4570734592508089, "learning_rate": 6.258905613423372e-06, "loss": 0.7549, "mean_token_accuracy": 0.776693370938301, "step": 4010 }, { "epoch": 0.660592723608169, "grad_norm": 0.4601116292637463, "learning_rate": 6.232284908641944e-06, "loss": 0.7642, "mean_token_accuracy": 0.7730260610580444, "step": 4015 }, { "epoch": 0.6614153795528865, "grad_norm": 0.46190942795433293, "learning_rate": 6.205695285338596e-06, "loss": 0.7727, "mean_token_accuracy": 0.7725880980491638, "step": 4020 }, { "epoch": 0.662238035497604, "grad_norm": 0.45982900250445286, "learning_rate": 6.179136962862389e-06, "loss": 0.7602, "mean_token_accuracy": 0.7743444696068764, "step": 4025 }, { "epoch": 0.6630606914423215, "grad_norm": 0.4510532265536869, "learning_rate": 6.1526101603041845e-06, "loss": 0.7424, "mean_token_accuracy": 0.7799021378159523, "step": 4030 }, { "epoch": 0.663883347387039, "grad_norm": 0.45802687155387184, "learning_rate": 6.126115096494813e-06, "loss": 0.7737, "mean_token_accuracy": 0.7713621735572815, "step": 4035 }, { "epoch": 0.6647060033317566, "grad_norm": 0.8397538110885859, "learning_rate": 6.0996519900032805e-06, "loss": 0.7648, "mean_token_accuracy": 0.7745592251420022, "step": 4040 }, { "epoch": 0.6655286592764741, "grad_norm": 0.4759427372487236, "learning_rate": 6.07322105913496e-06, "loss": 0.7626, "mean_token_accuracy": 0.7740123614668846, "step": 4045 }, { "epoch": 0.6663513152211916, "grad_norm": 0.47416040355739536, "learning_rate": 6.046822521929806e-06, "loss": 0.758, "mean_token_accuracy": 0.7760253787040711, "step": 4050 }, { "epoch": 0.6671739711659092, "grad_norm": 0.4632083062253306, "learning_rate": 6.020456596160531e-06, "loss": 0.7668, "mean_token_accuracy": 0.7731039538979531, "step": 4055 }, { "epoch": 0.6679966271106267, "grad_norm": 0.4602138737520666, "learning_rate": 5.9941234993308305e-06, "loss": 0.7571, "mean_token_accuracy": 0.7755627349019051, "step": 4060 }, { "epoch": 0.6688192830553442, "grad_norm": 0.4906522553806413, "learning_rate": 5.9678234486735735e-06, "loss": 0.7543, "mean_token_accuracy": 0.7767635330557823, "step": 4065 }, { "epoch": 0.6696419390000617, "grad_norm": 0.4601250086446542, "learning_rate": 5.941556661149026e-06, "loss": 0.7564, "mean_token_accuracy": 0.7765131279826164, "step": 4070 }, { "epoch": 0.6704645949447792, "grad_norm": 0.4553079671892362, "learning_rate": 5.915323353443042e-06, "loss": 0.7566, "mean_token_accuracy": 0.7765011787414551, "step": 4075 }, { "epoch": 0.6712872508894967, "grad_norm": 0.46815133785078084, "learning_rate": 5.88912374196529e-06, "loss": 0.7685, "mean_token_accuracy": 0.771938169002533, "step": 4080 }, { "epoch": 0.6721099068342142, "grad_norm": 0.44918664244904677, "learning_rate": 5.862958042847472e-06, "loss": 0.7646, "mean_token_accuracy": 0.7742537468671798, "step": 4085 }, { "epoch": 0.6729325627789318, "grad_norm": 0.4547462837254723, "learning_rate": 5.836826471941518e-06, "loss": 0.7509, "mean_token_accuracy": 0.7778204038739205, "step": 4090 }, { "epoch": 0.6737552187236493, "grad_norm": 0.44898211992117176, "learning_rate": 5.8107292448178365e-06, "loss": 0.7584, "mean_token_accuracy": 0.7755276337265968, "step": 4095 }, { "epoch": 0.6745778746683668, "grad_norm": 0.46830065610629157, "learning_rate": 5.7846665767635e-06, "loss": 0.7782, "mean_token_accuracy": 0.7700761422514916, "step": 4100 }, { "epoch": 0.6754005306130844, "grad_norm": 0.4577158911316175, "learning_rate": 5.758638682780511e-06, "loss": 0.7532, "mean_token_accuracy": 0.7771164670586586, "step": 4105 }, { "epoch": 0.6762231865578019, "grad_norm": 0.4531651779880498, "learning_rate": 5.732645777583983e-06, "loss": 0.7597, "mean_token_accuracy": 0.775702728331089, "step": 4110 }, { "epoch": 0.6770458425025194, "grad_norm": 0.4548254936294526, "learning_rate": 5.706688075600408e-06, "loss": 0.7704, "mean_token_accuracy": 0.7720891103148461, "step": 4115 }, { "epoch": 0.677868498447237, "grad_norm": 0.4597185570223001, "learning_rate": 5.680765790965874e-06, "loss": 0.7563, "mean_token_accuracy": 0.776373165845871, "step": 4120 }, { "epoch": 0.6786911543919544, "grad_norm": 0.4674212258179096, "learning_rate": 5.654879137524279e-06, "loss": 0.7671, "mean_token_accuracy": 0.7736944913864136, "step": 4125 }, { "epoch": 0.6795138103366719, "grad_norm": 0.4574224037225107, "learning_rate": 5.6290283288256005e-06, "loss": 0.7761, "mean_token_accuracy": 0.7705698952078819, "step": 4130 }, { "epoch": 0.6803364662813894, "grad_norm": 0.4490980435577234, "learning_rate": 5.603213578124115e-06, "loss": 0.7506, "mean_token_accuracy": 0.7778286948800087, "step": 4135 }, { "epoch": 0.681159122226107, "grad_norm": 0.5091574243749065, "learning_rate": 5.577435098376633e-06, "loss": 0.7726, "mean_token_accuracy": 0.7711112067103386, "step": 4140 }, { "epoch": 0.6819817781708245, "grad_norm": 0.4695208793901739, "learning_rate": 5.551693102240754e-06, "loss": 0.7725, "mean_token_accuracy": 0.7720778390765191, "step": 4145 }, { "epoch": 0.682804434115542, "grad_norm": 0.47599714386277486, "learning_rate": 5.525987802073111e-06, "loss": 0.7554, "mean_token_accuracy": 0.7764408767223359, "step": 4150 }, { "epoch": 0.6836270900602596, "grad_norm": 0.44459705060678956, "learning_rate": 5.500319409927619e-06, "loss": 0.7601, "mean_token_accuracy": 0.7756818816065788, "step": 4155 }, { "epoch": 0.6844497460049771, "grad_norm": 0.45599341660456744, "learning_rate": 5.474688137553723e-06, "loss": 0.7611, "mean_token_accuracy": 0.7748866334557534, "step": 4160 }, { "epoch": 0.6852724019496946, "grad_norm": 0.7404870406731603, "learning_rate": 5.44909419639464e-06, "loss": 0.7455, "mean_token_accuracy": 0.778318066895008, "step": 4165 }, { "epoch": 0.6860950578944122, "grad_norm": 0.46051607649711274, "learning_rate": 5.4235377975856365e-06, "loss": 0.7637, "mean_token_accuracy": 0.774312736093998, "step": 4170 }, { "epoch": 0.6869177138391296, "grad_norm": 0.4509609219293775, "learning_rate": 5.3980191519522805e-06, "loss": 0.762, "mean_token_accuracy": 0.7747115671634675, "step": 4175 }, { "epoch": 0.6877403697838471, "grad_norm": 0.459270376444976, "learning_rate": 5.372538470008686e-06, "loss": 0.7629, "mean_token_accuracy": 0.7749601185321808, "step": 4180 }, { "epoch": 0.6885630257285646, "grad_norm": 0.45910408307647066, "learning_rate": 5.34709596195579e-06, "loss": 0.7746, "mean_token_accuracy": 0.7706676989793777, "step": 4185 }, { "epoch": 0.6893856816732822, "grad_norm": 0.4637955734636076, "learning_rate": 5.3216918376796255e-06, "loss": 0.7662, "mean_token_accuracy": 0.772616659104824, "step": 4190 }, { "epoch": 0.6902083376179997, "grad_norm": 0.4480065196798341, "learning_rate": 5.2963263067495775e-06, "loss": 0.7547, "mean_token_accuracy": 0.7760295331478119, "step": 4195 }, { "epoch": 0.6910309935627172, "grad_norm": 0.4572442605489735, "learning_rate": 5.270999578416658e-06, "loss": 0.7586, "mean_token_accuracy": 0.7765922248363495, "step": 4200 }, { "epoch": 0.6918536495074348, "grad_norm": 0.4642495260119232, "learning_rate": 5.245711861611776e-06, "loss": 0.7506, "mean_token_accuracy": 0.7767052724957466, "step": 4205 }, { "epoch": 0.6926763054521523, "grad_norm": 0.4533969138006868, "learning_rate": 5.220463364944024e-06, "loss": 0.751, "mean_token_accuracy": 0.7771932721138001, "step": 4210 }, { "epoch": 0.6934989613968698, "grad_norm": 0.5034899882305789, "learning_rate": 5.195254296698945e-06, "loss": 0.7609, "mean_token_accuracy": 0.7747261628508568, "step": 4215 }, { "epoch": 0.6943216173415873, "grad_norm": 0.4583194174788914, "learning_rate": 5.170084864836829e-06, "loss": 0.7637, "mean_token_accuracy": 0.7734209105372429, "step": 4220 }, { "epoch": 0.6951442732863048, "grad_norm": 0.46207517604451914, "learning_rate": 5.144955276990975e-06, "loss": 0.7585, "mean_token_accuracy": 0.77558224350214, "step": 4225 }, { "epoch": 0.6959669292310223, "grad_norm": 0.46953508636731506, "learning_rate": 5.119865740466003e-06, "loss": 0.7677, "mean_token_accuracy": 0.7723922997713089, "step": 4230 }, { "epoch": 0.6967895851757399, "grad_norm": 0.4541898616879353, "learning_rate": 5.094816462236133e-06, "loss": 0.7665, "mean_token_accuracy": 0.7730496510863304, "step": 4235 }, { "epoch": 0.6976122411204574, "grad_norm": 0.45607280167058284, "learning_rate": 5.069807648943474e-06, "loss": 0.7624, "mean_token_accuracy": 0.7748121082782745, "step": 4240 }, { "epoch": 0.6984348970651749, "grad_norm": 0.45108793246117657, "learning_rate": 5.044839506896322e-06, "loss": 0.7506, "mean_token_accuracy": 0.7784677252173424, "step": 4245 }, { "epoch": 0.6992575530098925, "grad_norm": 0.46693675409123014, "learning_rate": 5.019912242067453e-06, "loss": 0.7529, "mean_token_accuracy": 0.7776501134037972, "step": 4250 }, { "epoch": 0.70008020895461, "grad_norm": 0.45113339032896355, "learning_rate": 4.995026060092441e-06, "loss": 0.7626, "mean_token_accuracy": 0.774859470129013, "step": 4255 }, { "epoch": 0.7009028648993275, "grad_norm": 0.45599644962694463, "learning_rate": 4.970181166267942e-06, "loss": 0.7531, "mean_token_accuracy": 0.7769437432289124, "step": 4260 }, { "epoch": 0.701725520844045, "grad_norm": 0.44925634861144564, "learning_rate": 4.945377765550018e-06, "loss": 0.7717, "mean_token_accuracy": 0.7718919664621353, "step": 4265 }, { "epoch": 0.7025481767887625, "grad_norm": 0.47367194411517904, "learning_rate": 4.920616062552422e-06, "loss": 0.7674, "mean_token_accuracy": 0.7727532312273979, "step": 4270 }, { "epoch": 0.70337083273348, "grad_norm": 0.4557461469684282, "learning_rate": 4.895896261544936e-06, "loss": 0.7628, "mean_token_accuracy": 0.7736784398555756, "step": 4275 }, { "epoch": 0.7041934886781975, "grad_norm": 0.4519245744597467, "learning_rate": 4.871218566451675e-06, "loss": 0.7517, "mean_token_accuracy": 0.7769881278276444, "step": 4280 }, { "epoch": 0.7050161446229151, "grad_norm": 0.4570682597814927, "learning_rate": 4.8465831808493994e-06, "loss": 0.7604, "mean_token_accuracy": 0.7743771329522133, "step": 4285 }, { "epoch": 0.7058388005676326, "grad_norm": 0.4504353308376181, "learning_rate": 4.8219903079658355e-06, "loss": 0.7563, "mean_token_accuracy": 0.7755854785442352, "step": 4290 }, { "epoch": 0.7066614565123501, "grad_norm": 0.4445807263194282, "learning_rate": 4.797440150678019e-06, "loss": 0.7626, "mean_token_accuracy": 0.7735432744026184, "step": 4295 }, { "epoch": 0.7074841124570677, "grad_norm": 0.46317727838765704, "learning_rate": 4.772932911510595e-06, "loss": 0.7612, "mean_token_accuracy": 0.7750167936086655, "step": 4300 }, { "epoch": 0.7083067684017852, "grad_norm": 0.6580756996522307, "learning_rate": 4.748468792634169e-06, "loss": 0.7722, "mean_token_accuracy": 0.7717395380139351, "step": 4305 }, { "epoch": 0.7091294243465027, "grad_norm": 0.46281903960848425, "learning_rate": 4.7240479958636125e-06, "loss": 0.7641, "mean_token_accuracy": 0.7737842112779617, "step": 4310 }, { "epoch": 0.7099520802912203, "grad_norm": 0.4548784620593125, "learning_rate": 4.699670722656433e-06, "loss": 0.7637, "mean_token_accuracy": 0.7731479197740555, "step": 4315 }, { "epoch": 0.7107747362359377, "grad_norm": 0.45919693021810537, "learning_rate": 4.6753371741110785e-06, "loss": 0.7682, "mean_token_accuracy": 0.7729432970285416, "step": 4320 }, { "epoch": 0.7115973921806552, "grad_norm": 0.4516944783706439, "learning_rate": 4.6510475509653075e-06, "loss": 0.7498, "mean_token_accuracy": 0.7773463204503059, "step": 4325 }, { "epoch": 0.7124200481253727, "grad_norm": 0.4518164765098139, "learning_rate": 4.6268020535945045e-06, "loss": 0.7565, "mean_token_accuracy": 0.7763844445347786, "step": 4330 }, { "epoch": 0.7132427040700903, "grad_norm": 0.45878233155066256, "learning_rate": 4.602600882010053e-06, "loss": 0.7605, "mean_token_accuracy": 0.774750930070877, "step": 4335 }, { "epoch": 0.7140653600148078, "grad_norm": 0.461988021002741, "learning_rate": 4.578444235857672e-06, "loss": 0.7515, "mean_token_accuracy": 0.7768089339137078, "step": 4340 }, { "epoch": 0.7148880159595253, "grad_norm": 0.4492155500822266, "learning_rate": 4.554332314415774e-06, "loss": 0.757, "mean_token_accuracy": 0.7758794248104095, "step": 4345 }, { "epoch": 0.7157106719042429, "grad_norm": 0.4484925717548311, "learning_rate": 4.530265316593808e-06, "loss": 0.7546, "mean_token_accuracy": 0.7757991969585418, "step": 4350 }, { "epoch": 0.7165333278489604, "grad_norm": 0.45139324441318496, "learning_rate": 4.506243440930635e-06, "loss": 0.7535, "mean_token_accuracy": 0.7761650532484055, "step": 4355 }, { "epoch": 0.7173559837936779, "grad_norm": 0.45651009241834994, "learning_rate": 4.482266885592889e-06, "loss": 0.7535, "mean_token_accuracy": 0.7769538760185242, "step": 4360 }, { "epoch": 0.7181786397383955, "grad_norm": 0.45467121821870116, "learning_rate": 4.458335848373333e-06, "loss": 0.7652, "mean_token_accuracy": 0.772624684870243, "step": 4365 }, { "epoch": 0.7190012956831129, "grad_norm": 0.4661890057115758, "learning_rate": 4.434450526689228e-06, "loss": 0.7584, "mean_token_accuracy": 0.7750358328223228, "step": 4370 }, { "epoch": 0.7198239516278304, "grad_norm": 0.48938849094470266, "learning_rate": 4.410611117580716e-06, "loss": 0.7592, "mean_token_accuracy": 0.7746859535574913, "step": 4375 }, { "epoch": 0.720646607572548, "grad_norm": 0.4539841082523151, "learning_rate": 4.386817817709185e-06, "loss": 0.7656, "mean_token_accuracy": 0.7734770685434341, "step": 4380 }, { "epoch": 0.7214692635172655, "grad_norm": 0.4629757058037324, "learning_rate": 4.36307082335564e-06, "loss": 0.7616, "mean_token_accuracy": 0.7751207754015923, "step": 4385 }, { "epoch": 0.722291919461983, "grad_norm": 0.45452405735972257, "learning_rate": 4.3393703304191035e-06, "loss": 0.7463, "mean_token_accuracy": 0.7782783165574074, "step": 4390 }, { "epoch": 0.7231145754067005, "grad_norm": 0.45150071570632233, "learning_rate": 4.3157165344149785e-06, "loss": 0.7589, "mean_token_accuracy": 0.7759007066488266, "step": 4395 }, { "epoch": 0.7239372313514181, "grad_norm": 0.5346767792414896, "learning_rate": 4.292109630473454e-06, "loss": 0.7461, "mean_token_accuracy": 0.7783639162778855, "step": 4400 }, { "epoch": 0.7247598872961356, "grad_norm": 0.462260877988652, "learning_rate": 4.268549813337882e-06, "loss": 0.7549, "mean_token_accuracy": 0.7753161311149597, "step": 4405 }, { "epoch": 0.7255825432408531, "grad_norm": 0.4528891098163022, "learning_rate": 4.2450372773631735e-06, "loss": 0.767, "mean_token_accuracy": 0.7727597311139107, "step": 4410 }, { "epoch": 0.7264051991855706, "grad_norm": 0.4507727544865607, "learning_rate": 4.221572216514201e-06, "loss": 0.758, "mean_token_accuracy": 0.7745763584971428, "step": 4415 }, { "epoch": 0.7272278551302881, "grad_norm": 0.4527152786698503, "learning_rate": 4.1981548243641915e-06, "loss": 0.7484, "mean_token_accuracy": 0.7774866044521331, "step": 4420 }, { "epoch": 0.7280505110750056, "grad_norm": 0.45480082649885883, "learning_rate": 4.174785294093134e-06, "loss": 0.7451, "mean_token_accuracy": 0.7789734438061714, "step": 4425 }, { "epoch": 0.7288731670197232, "grad_norm": 0.4542162430945047, "learning_rate": 4.15146381848619e-06, "loss": 0.7467, "mean_token_accuracy": 0.7783559635281563, "step": 4430 }, { "epoch": 0.7296958229644407, "grad_norm": 0.47104067280478806, "learning_rate": 4.128190589932084e-06, "loss": 0.7469, "mean_token_accuracy": 0.7784626111388206, "step": 4435 }, { "epoch": 0.7305184789091582, "grad_norm": 0.4614564517721067, "learning_rate": 4.104965800421544e-06, "loss": 0.7771, "mean_token_accuracy": 0.7705422267317772, "step": 4440 }, { "epoch": 0.7313411348538758, "grad_norm": 0.4575939021467565, "learning_rate": 4.0817896415457014e-06, "loss": 0.7391, "mean_token_accuracy": 0.7801946252584457, "step": 4445 }, { "epoch": 0.7321637907985933, "grad_norm": 0.4545938891820146, "learning_rate": 4.058662304494505e-06, "loss": 0.7627, "mean_token_accuracy": 0.7745560288429261, "step": 4450 }, { "epoch": 0.7329864467433108, "grad_norm": 0.4585536548158602, "learning_rate": 4.035583980055155e-06, "loss": 0.7697, "mean_token_accuracy": 0.7717498689889908, "step": 4455 }, { "epoch": 0.7338091026880283, "grad_norm": 0.4612169390412934, "learning_rate": 4.01255485861053e-06, "loss": 0.7631, "mean_token_accuracy": 0.773409353196621, "step": 4460 }, { "epoch": 0.7346317586327458, "grad_norm": 0.8661179357997616, "learning_rate": 3.98957513013761e-06, "loss": 0.7479, "mean_token_accuracy": 0.7769591033458709, "step": 4465 }, { "epoch": 0.7354544145774633, "grad_norm": 0.46473777928556437, "learning_rate": 3.966644984205914e-06, "loss": 0.7668, "mean_token_accuracy": 0.7737424924969674, "step": 4470 }, { "epoch": 0.7362770705221808, "grad_norm": 0.4767737283768491, "learning_rate": 3.943764609975925e-06, "loss": 0.7814, "mean_token_accuracy": 0.7691428795456886, "step": 4475 }, { "epoch": 0.7370997264668984, "grad_norm": 0.45490349229607435, "learning_rate": 3.920934196197546e-06, "loss": 0.7693, "mean_token_accuracy": 0.7719476729631424, "step": 4480 }, { "epoch": 0.7379223824116159, "grad_norm": 0.5305179614726209, "learning_rate": 3.898153931208537e-06, "loss": 0.751, "mean_token_accuracy": 0.7768639504909516, "step": 4485 }, { "epoch": 0.7387450383563334, "grad_norm": 0.4697966715715965, "learning_rate": 3.8754240029329515e-06, "loss": 0.7521, "mean_token_accuracy": 0.7770631209015846, "step": 4490 }, { "epoch": 0.739567694301051, "grad_norm": 0.45844438142439703, "learning_rate": 3.852744598879595e-06, "loss": 0.7635, "mean_token_accuracy": 0.7739180430769921, "step": 4495 }, { "epoch": 0.7403903502457685, "grad_norm": 0.4605384553809832, "learning_rate": 3.8301159061404845e-06, "loss": 0.7569, "mean_token_accuracy": 0.7757810413837433, "step": 4500 }, { "epoch": 0.741213006190486, "grad_norm": 0.45303293330437755, "learning_rate": 3.807538111389292e-06, "loss": 0.7614, "mean_token_accuracy": 0.7754284694790841, "step": 4505 }, { "epoch": 0.7420356621352036, "grad_norm": 0.45388671233037653, "learning_rate": 3.7850114008798165e-06, "loss": 0.7512, "mean_token_accuracy": 0.7772485420107842, "step": 4510 }, { "epoch": 0.742858318079921, "grad_norm": 0.4553816612463552, "learning_rate": 3.762535960444429e-06, "loss": 0.7528, "mean_token_accuracy": 0.7759792044758796, "step": 4515 }, { "epoch": 0.7436809740246385, "grad_norm": 0.4566564151396422, "learning_rate": 3.7401119754925686e-06, "loss": 0.7739, "mean_token_accuracy": 0.7711115226149559, "step": 4520 }, { "epoch": 0.744503629969356, "grad_norm": 0.4719253982177264, "learning_rate": 3.7177396310091783e-06, "loss": 0.7642, "mean_token_accuracy": 0.7741918414831161, "step": 4525 }, { "epoch": 0.7453262859140736, "grad_norm": 0.46437196306348744, "learning_rate": 3.695419111553217e-06, "loss": 0.7626, "mean_token_accuracy": 0.7733220234513283, "step": 4530 }, { "epoch": 0.7461489418587911, "grad_norm": 0.46756406299929837, "learning_rate": 3.6731506012560946e-06, "loss": 0.7607, "mean_token_accuracy": 0.7737310588359833, "step": 4535 }, { "epoch": 0.7469715978035086, "grad_norm": 0.45821947101640753, "learning_rate": 3.6509342838201956e-06, "loss": 0.7683, "mean_token_accuracy": 0.7719688341021538, "step": 4540 }, { "epoch": 0.7477942537482262, "grad_norm": 0.4562824049606341, "learning_rate": 3.6287703425173305e-06, "loss": 0.7403, "mean_token_accuracy": 0.7807064011693001, "step": 4545 }, { "epoch": 0.7486169096929437, "grad_norm": 0.4514361830446328, "learning_rate": 3.606658960187248e-06, "loss": 0.7435, "mean_token_accuracy": 0.779884296655655, "step": 4550 }, { "epoch": 0.7494395656376612, "grad_norm": 0.45302809092134555, "learning_rate": 3.5846003192361044e-06, "loss": 0.7588, "mean_token_accuracy": 0.7744943097233772, "step": 4555 }, { "epoch": 0.7502622215823788, "grad_norm": 0.45178783829148905, "learning_rate": 3.562594601634972e-06, "loss": 0.7335, "mean_token_accuracy": 0.7820829957723617, "step": 4560 }, { "epoch": 0.7510848775270962, "grad_norm": 0.445680765982709, "learning_rate": 3.5406419889183407e-06, "loss": 0.7607, "mean_token_accuracy": 0.774886916577816, "step": 4565 }, { "epoch": 0.7519075334718137, "grad_norm": 0.4473398271371335, "learning_rate": 3.518742662182618e-06, "loss": 0.7596, "mean_token_accuracy": 0.7750257462263107, "step": 4570 }, { "epoch": 0.7527301894165312, "grad_norm": 0.4548780189540175, "learning_rate": 3.4968968020846215e-06, "loss": 0.7574, "mean_token_accuracy": 0.7753012299537658, "step": 4575 }, { "epoch": 0.7535528453612488, "grad_norm": 0.47212794938806674, "learning_rate": 3.475104588840109e-06, "loss": 0.7699, "mean_token_accuracy": 0.7702872931957245, "step": 4580 }, { "epoch": 0.7543755013059663, "grad_norm": 0.4699185661681817, "learning_rate": 3.453366202222281e-06, "loss": 0.7691, "mean_token_accuracy": 0.7727020382881165, "step": 4585 }, { "epoch": 0.7551981572506838, "grad_norm": 0.4453054653989745, "learning_rate": 3.431681821560303e-06, "loss": 0.7579, "mean_token_accuracy": 0.7747899144887924, "step": 4590 }, { "epoch": 0.7560208131954014, "grad_norm": 0.4514109564998647, "learning_rate": 3.4100516257378135e-06, "loss": 0.7581, "mean_token_accuracy": 0.77606081366539, "step": 4595 }, { "epoch": 0.7568434691401189, "grad_norm": 0.45996661200891736, "learning_rate": 3.3884757931914604e-06, "loss": 0.7501, "mean_token_accuracy": 0.7772179991006851, "step": 4600 }, { "epoch": 0.7576661250848364, "grad_norm": 0.4564528306897701, "learning_rate": 3.3669545019094285e-06, "loss": 0.7476, "mean_token_accuracy": 0.7771415904164314, "step": 4605 }, { "epoch": 0.7584887810295539, "grad_norm": 0.4508605428190896, "learning_rate": 3.3454879294299693e-06, "loss": 0.7588, "mean_token_accuracy": 0.7753650680184364, "step": 4610 }, { "epoch": 0.7593114369742714, "grad_norm": 0.449063017339619, "learning_rate": 3.3240762528399373e-06, "loss": 0.7445, "mean_token_accuracy": 0.7779753386974335, "step": 4615 }, { "epoch": 0.7601340929189889, "grad_norm": 0.4489578915754036, "learning_rate": 3.3027196487733172e-06, "loss": 0.7609, "mean_token_accuracy": 0.7749652817845345, "step": 4620 }, { "epoch": 0.7609567488637065, "grad_norm": 0.46472570287884646, "learning_rate": 3.2814182934097905e-06, "loss": 0.7591, "mean_token_accuracy": 0.7749405935406685, "step": 4625 }, { "epoch": 0.761779404808424, "grad_norm": 0.4587400236124694, "learning_rate": 3.260172362473256e-06, "loss": 0.7539, "mean_token_accuracy": 0.7767691686749458, "step": 4630 }, { "epoch": 0.7626020607531415, "grad_norm": 0.4506623081445744, "learning_rate": 3.2389820312304065e-06, "loss": 0.7596, "mean_token_accuracy": 0.7751207500696182, "step": 4635 }, { "epoch": 0.763424716697859, "grad_norm": 0.453591720148907, "learning_rate": 3.2178474744892564e-06, "loss": 0.7531, "mean_token_accuracy": 0.7770177185535431, "step": 4640 }, { "epoch": 0.7642473726425766, "grad_norm": 0.44706119795819627, "learning_rate": 3.19676886659772e-06, "loss": 0.7542, "mean_token_accuracy": 0.7764334037899971, "step": 4645 }, { "epoch": 0.7650700285872941, "grad_norm": 0.45877389691129683, "learning_rate": 3.175746381442165e-06, "loss": 0.7445, "mean_token_accuracy": 0.7782713577151299, "step": 4650 }, { "epoch": 0.7658926845320116, "grad_norm": 0.4618081385459698, "learning_rate": 3.154780192445982e-06, "loss": 0.7618, "mean_token_accuracy": 0.7746585324406624, "step": 4655 }, { "epoch": 0.7667153404767291, "grad_norm": 0.4615102311226442, "learning_rate": 3.1338704725681425e-06, "loss": 0.7492, "mean_token_accuracy": 0.7781389579176903, "step": 4660 }, { "epoch": 0.7675379964214466, "grad_norm": 0.47245810039907243, "learning_rate": 3.1130173943017816e-06, "loss": 0.755, "mean_token_accuracy": 0.7757288441061974, "step": 4665 }, { "epoch": 0.7683606523661641, "grad_norm": 0.49476468332027557, "learning_rate": 3.092221129672781e-06, "loss": 0.757, "mean_token_accuracy": 0.7753856807947159, "step": 4670 }, { "epoch": 0.7691833083108817, "grad_norm": 0.4488328341950593, "learning_rate": 3.0714818502383425e-06, "loss": 0.7528, "mean_token_accuracy": 0.7771218538284301, "step": 4675 }, { "epoch": 0.7700059642555992, "grad_norm": 0.44830164104811504, "learning_rate": 3.0507997270855593e-06, "loss": 0.7536, "mean_token_accuracy": 0.7762871950864791, "step": 4680 }, { "epoch": 0.7708286202003167, "grad_norm": 0.4487427476063173, "learning_rate": 3.0301749308300344e-06, "loss": 0.7476, "mean_token_accuracy": 0.777713580429554, "step": 4685 }, { "epoch": 0.7716512761450343, "grad_norm": 0.46019614597279757, "learning_rate": 3.0096076316144485e-06, "loss": 0.7679, "mean_token_accuracy": 0.7721841782331467, "step": 4690 }, { "epoch": 0.7724739320897518, "grad_norm": 0.45928014960590724, "learning_rate": 2.9890979991071733e-06, "loss": 0.7783, "mean_token_accuracy": 0.7705237150192261, "step": 4695 }, { "epoch": 0.7732965880344693, "grad_norm": 0.44778840546004417, "learning_rate": 2.9686462025008433e-06, "loss": 0.7657, "mean_token_accuracy": 0.772407004237175, "step": 4700 }, { "epoch": 0.7741192439791869, "grad_norm": 0.44938881242750045, "learning_rate": 2.948252410511002e-06, "loss": 0.7401, "mean_token_accuracy": 0.7794214293360711, "step": 4705 }, { "epoch": 0.7749418999239043, "grad_norm": 0.44701522155912304, "learning_rate": 2.9279167913746787e-06, "loss": 0.7559, "mean_token_accuracy": 0.7753959491848945, "step": 4710 }, { "epoch": 0.7757645558686218, "grad_norm": 0.4453884428214154, "learning_rate": 2.9076395128490142e-06, "loss": 0.7451, "mean_token_accuracy": 0.7779855608940125, "step": 4715 }, { "epoch": 0.7765872118133393, "grad_norm": 0.4719944562631911, "learning_rate": 2.887420742209863e-06, "loss": 0.7665, "mean_token_accuracy": 0.7726840317249298, "step": 4720 }, { "epoch": 0.7774098677580569, "grad_norm": 0.4770278715388468, "learning_rate": 2.867260646250436e-06, "loss": 0.7518, "mean_token_accuracy": 0.7767297640442848, "step": 4725 }, { "epoch": 0.7782325237027744, "grad_norm": 0.598349752425175, "learning_rate": 2.8471593912799078e-06, "loss": 0.7681, "mean_token_accuracy": 0.7728853732347488, "step": 4730 }, { "epoch": 0.7790551796474919, "grad_norm": 0.44577085473211525, "learning_rate": 2.82711714312204e-06, "loss": 0.7509, "mean_token_accuracy": 0.7767453342676163, "step": 4735 }, { "epoch": 0.7798778355922095, "grad_norm": 0.4556509559356986, "learning_rate": 2.8071340671138368e-06, "loss": 0.7533, "mean_token_accuracy": 0.7765545532107353, "step": 4740 }, { "epoch": 0.780700491536927, "grad_norm": 0.4531491219282271, "learning_rate": 2.7872103281041507e-06, "loss": 0.7593, "mean_token_accuracy": 0.7750761911273003, "step": 4745 }, { "epoch": 0.7815231474816445, "grad_norm": 0.4584119112514478, "learning_rate": 2.7673460904523506e-06, "loss": 0.7448, "mean_token_accuracy": 0.7786479026079178, "step": 4750 }, { "epoch": 0.7823458034263621, "grad_norm": 0.44952187171365166, "learning_rate": 2.7475415180269517e-06, "loss": 0.7569, "mean_token_accuracy": 0.7755636006593705, "step": 4755 }, { "epoch": 0.7831684593710795, "grad_norm": 0.4664515109934718, "learning_rate": 2.7277967742042577e-06, "loss": 0.7428, "mean_token_accuracy": 0.7788804203271866, "step": 4760 }, { "epoch": 0.783991115315797, "grad_norm": 0.45924509508757105, "learning_rate": 2.7081120218670253e-06, "loss": 0.7677, "mean_token_accuracy": 0.7727869749069214, "step": 4765 }, { "epoch": 0.7848137712605145, "grad_norm": 0.45788744310998797, "learning_rate": 2.688487423403119e-06, "loss": 0.768, "mean_token_accuracy": 0.7724412441253662, "step": 4770 }, { "epoch": 0.7856364272052321, "grad_norm": 0.4500755499182482, "learning_rate": 2.668923140704165e-06, "loss": 0.7576, "mean_token_accuracy": 0.7754251524806023, "step": 4775 }, { "epoch": 0.7864590831499496, "grad_norm": 0.45993741292006335, "learning_rate": 2.6494193351642227e-06, "loss": 0.7612, "mean_token_accuracy": 0.7746811181306839, "step": 4780 }, { "epoch": 0.7872817390946671, "grad_norm": 0.46565288232317464, "learning_rate": 2.629976167678443e-06, "loss": 0.765, "mean_token_accuracy": 0.7741100504994393, "step": 4785 }, { "epoch": 0.7881043950393847, "grad_norm": 0.48584418751163455, "learning_rate": 2.610593798641754e-06, "loss": 0.7611, "mean_token_accuracy": 0.7737690359354019, "step": 4790 }, { "epoch": 0.7889270509841022, "grad_norm": 0.46679892020236813, "learning_rate": 2.5912723879475334e-06, "loss": 0.7459, "mean_token_accuracy": 0.7791336432099343, "step": 4795 }, { "epoch": 0.7897497069288197, "grad_norm": 0.4664448392599311, "learning_rate": 2.5720120949862802e-06, "loss": 0.735, "mean_token_accuracy": 0.7821645587682724, "step": 4800 }, { "epoch": 0.7905723628735372, "grad_norm": 0.45768498362357635, "learning_rate": 2.5528130786443093e-06, "loss": 0.7527, "mean_token_accuracy": 0.776323638856411, "step": 4805 }, { "epoch": 0.7913950188182547, "grad_norm": 0.4499352697982891, "learning_rate": 2.5336754973024424e-06, "loss": 0.7529, "mean_token_accuracy": 0.776432503759861, "step": 4810 }, { "epoch": 0.7922176747629722, "grad_norm": 0.6035637338594386, "learning_rate": 2.5145995088346963e-06, "loss": 0.7555, "mean_token_accuracy": 0.776105310022831, "step": 4815 }, { "epoch": 0.7930403307076898, "grad_norm": 0.4592727748331765, "learning_rate": 2.495585270606986e-06, "loss": 0.7672, "mean_token_accuracy": 0.7730541467666626, "step": 4820 }, { "epoch": 0.7938629866524073, "grad_norm": 0.45209784304438017, "learning_rate": 2.4766329394758113e-06, "loss": 0.7575, "mean_token_accuracy": 0.7754378780722618, "step": 4825 }, { "epoch": 0.7946856425971248, "grad_norm": 0.44865271722449784, "learning_rate": 2.4577426717869877e-06, "loss": 0.7601, "mean_token_accuracy": 0.7752453282475471, "step": 4830 }, { "epoch": 0.7955082985418424, "grad_norm": 0.44982605445285445, "learning_rate": 2.4389146233743312e-06, "loss": 0.7623, "mean_token_accuracy": 0.773838748037815, "step": 4835 }, { "epoch": 0.7963309544865599, "grad_norm": 0.4580134090351707, "learning_rate": 2.4201489495583954e-06, "loss": 0.7559, "mean_token_accuracy": 0.7758683785796165, "step": 4840 }, { "epoch": 0.7971536104312774, "grad_norm": 0.4488987136836358, "learning_rate": 2.4014458051451696e-06, "loss": 0.7476, "mean_token_accuracy": 0.7781781032681465, "step": 4845 }, { "epoch": 0.797976266375995, "grad_norm": 0.46573124246103864, "learning_rate": 2.382805344424817e-06, "loss": 0.766, "mean_token_accuracy": 0.7739240199327468, "step": 4850 }, { "epoch": 0.7987989223207124, "grad_norm": 0.49016504847988857, "learning_rate": 2.3642277211703957e-06, "loss": 0.7593, "mean_token_accuracy": 0.7750187516212463, "step": 4855 }, { "epoch": 0.7996215782654299, "grad_norm": 0.4403951424770007, "learning_rate": 2.3457130886365964e-06, "loss": 0.7431, "mean_token_accuracy": 0.7791830554604531, "step": 4860 }, { "epoch": 0.8004442342101474, "grad_norm": 0.44928445154183244, "learning_rate": 2.3272615995584614e-06, "loss": 0.7447, "mean_token_accuracy": 0.7789540514349937, "step": 4865 }, { "epoch": 0.801266890154865, "grad_norm": 0.45219836496879356, "learning_rate": 2.3088734061501395e-06, "loss": 0.7336, "mean_token_accuracy": 0.7810052052140236, "step": 4870 }, { "epoch": 0.8020895460995825, "grad_norm": 0.45310640453333556, "learning_rate": 2.2905486601036318e-06, "loss": 0.7558, "mean_token_accuracy": 0.7761013254523277, "step": 4875 }, { "epoch": 0.8029122020443, "grad_norm": 0.45616111640731144, "learning_rate": 2.2722875125875354e-06, "loss": 0.7457, "mean_token_accuracy": 0.7781639114022255, "step": 4880 }, { "epoch": 0.8037348579890176, "grad_norm": 0.45484299177922377, "learning_rate": 2.254090114245787e-06, "loss": 0.746, "mean_token_accuracy": 0.7785458162426948, "step": 4885 }, { "epoch": 0.8045575139337351, "grad_norm": 0.45392067610030357, "learning_rate": 2.2359566151964385e-06, "loss": 0.7546, "mean_token_accuracy": 0.7755633801221847, "step": 4890 }, { "epoch": 0.8053801698784526, "grad_norm": 0.44818960639696015, "learning_rate": 2.2178871650304046e-06, "loss": 0.7469, "mean_token_accuracy": 0.778185498714447, "step": 4895 }, { "epoch": 0.8062028258231702, "grad_norm": 0.44955298559254003, "learning_rate": 2.19988191281024e-06, "loss": 0.7469, "mean_token_accuracy": 0.7784252226352691, "step": 4900 }, { "epoch": 0.8070254817678876, "grad_norm": 0.45433636817992795, "learning_rate": 2.181941007068895e-06, "loss": 0.747, "mean_token_accuracy": 0.7790346369147301, "step": 4905 }, { "epoch": 0.8078481377126051, "grad_norm": 0.4621586039396848, "learning_rate": 2.1640645958085003e-06, "loss": 0.7719, "mean_token_accuracy": 0.7709806621074676, "step": 4910 }, { "epoch": 0.8086707936573226, "grad_norm": 0.44464878041151695, "learning_rate": 2.146252826499148e-06, "loss": 0.757, "mean_token_accuracy": 0.7755614250898362, "step": 4915 }, { "epoch": 0.8094934496020402, "grad_norm": 0.4519260542335169, "learning_rate": 2.128505846077672e-06, "loss": 0.7666, "mean_token_accuracy": 0.7731157541275024, "step": 4920 }, { "epoch": 0.8103161055467577, "grad_norm": 0.45871660182570095, "learning_rate": 2.1108238009464277e-06, "loss": 0.7607, "mean_token_accuracy": 0.773801825940609, "step": 4925 }, { "epoch": 0.8111387614914752, "grad_norm": 0.4498914540984321, "learning_rate": 2.0932068369721003e-06, "loss": 0.7442, "mean_token_accuracy": 0.7789148986339569, "step": 4930 }, { "epoch": 0.8119614174361928, "grad_norm": 0.45657825622438386, "learning_rate": 2.0756550994844903e-06, "loss": 0.7644, "mean_token_accuracy": 0.7740113973617554, "step": 4935 }, { "epoch": 0.8127840733809103, "grad_norm": 0.4519919934660599, "learning_rate": 2.058168733275314e-06, "loss": 0.7676, "mean_token_accuracy": 0.7730287864804268, "step": 4940 }, { "epoch": 0.8136067293256278, "grad_norm": 0.454708515606137, "learning_rate": 2.0407478825970172e-06, "loss": 0.7593, "mean_token_accuracy": 0.7736750692129135, "step": 4945 }, { "epoch": 0.8144293852703454, "grad_norm": 0.44754392098938994, "learning_rate": 2.0233926911615743e-06, "loss": 0.7576, "mean_token_accuracy": 0.7746249347925186, "step": 4950 }, { "epoch": 0.8152520412150628, "grad_norm": 0.44857446068081464, "learning_rate": 2.006103302139315e-06, "loss": 0.7441, "mean_token_accuracy": 0.779543386399746, "step": 4955 }, { "epoch": 0.8160746971597803, "grad_norm": 0.5167442985462488, "learning_rate": 1.988879858157734e-06, "loss": 0.733, "mean_token_accuracy": 0.7821626350283623, "step": 4960 }, { "epoch": 0.8168973531044978, "grad_norm": 0.46440740977224854, "learning_rate": 1.9717225013003205e-06, "loss": 0.7612, "mean_token_accuracy": 0.7739380061626434, "step": 4965 }, { "epoch": 0.8177200090492154, "grad_norm": 0.470915430038146, "learning_rate": 1.954631373105379e-06, "loss": 0.7584, "mean_token_accuracy": 0.774423734843731, "step": 4970 }, { "epoch": 0.8185426649939329, "grad_norm": 0.4483976353968263, "learning_rate": 1.937606614564862e-06, "loss": 0.7673, "mean_token_accuracy": 0.7734792694449425, "step": 4975 }, { "epoch": 0.8193653209386504, "grad_norm": 0.44525498115021345, "learning_rate": 1.92064836612322e-06, "loss": 0.7522, "mean_token_accuracy": 0.7767123430967331, "step": 4980 }, { "epoch": 0.820187976883368, "grad_norm": 0.4577802329126903, "learning_rate": 1.9037567676762314e-06, "loss": 0.7509, "mean_token_accuracy": 0.777600210905075, "step": 4985 }, { "epoch": 0.8210106328280855, "grad_norm": 1.2073819670643449, "learning_rate": 1.886931958569843e-06, "loss": 0.7648, "mean_token_accuracy": 0.7722527593374252, "step": 4990 }, { "epoch": 0.821833288772803, "grad_norm": 0.45581373852703877, "learning_rate": 1.8701740775990363e-06, "loss": 0.7596, "mean_token_accuracy": 0.7753255382180214, "step": 4995 }, { "epoch": 0.8226559447175205, "grad_norm": 0.46624320002522784, "learning_rate": 1.85348326300667e-06, "loss": 0.7456, "mean_token_accuracy": 0.777042618393898, "step": 5000 }, { "epoch": 0.823478600662238, "grad_norm": 0.4691634960735166, "learning_rate": 1.8368596524823534e-06, "loss": 0.749, "mean_token_accuracy": 0.776347927749157, "step": 5005 }, { "epoch": 0.8243012566069555, "grad_norm": 0.4786154127156054, "learning_rate": 1.8203033831612794e-06, "loss": 0.752, "mean_token_accuracy": 0.7776790499687195, "step": 5010 }, { "epoch": 0.825123912551673, "grad_norm": 0.45098159727174386, "learning_rate": 1.8038145916231343e-06, "loss": 0.762, "mean_token_accuracy": 0.7744000017642975, "step": 5015 }, { "epoch": 0.8259465684963906, "grad_norm": 0.46187041722228395, "learning_rate": 1.7873934138909454e-06, "loss": 0.758, "mean_token_accuracy": 0.7748545706272125, "step": 5020 }, { "epoch": 0.8267692244411081, "grad_norm": 0.4495300807714366, "learning_rate": 1.7710399854299687e-06, "loss": 0.7542, "mean_token_accuracy": 0.7757770672440529, "step": 5025 }, { "epoch": 0.8275918803858257, "grad_norm": 0.4544997971082489, "learning_rate": 1.7547544411465567e-06, "loss": 0.7653, "mean_token_accuracy": 0.7723886862397193, "step": 5030 }, { "epoch": 0.8284145363305432, "grad_norm": 0.4422036133015202, "learning_rate": 1.738536915387068e-06, "loss": 0.7371, "mean_token_accuracy": 0.7802234813570976, "step": 5035 }, { "epoch": 0.8292371922752607, "grad_norm": 0.46140099895942943, "learning_rate": 1.7223875419367465e-06, "loss": 0.7656, "mean_token_accuracy": 0.7729993358254432, "step": 5040 }, { "epoch": 0.8300598482199782, "grad_norm": 0.45756629465945936, "learning_rate": 1.7063064540186125e-06, "loss": 0.7471, "mean_token_accuracy": 0.778562781214714, "step": 5045 }, { "epoch": 0.8308825041646957, "grad_norm": 0.45478746488715277, "learning_rate": 1.6902937842923706e-06, "loss": 0.7678, "mean_token_accuracy": 0.7721138253808022, "step": 5050 }, { "epoch": 0.8317051601094132, "grad_norm": 0.45954061623470877, "learning_rate": 1.6743496648533208e-06, "loss": 0.7589, "mean_token_accuracy": 0.774648317694664, "step": 5055 }, { "epoch": 0.8325278160541307, "grad_norm": 0.4496739885867956, "learning_rate": 1.6584742272312604e-06, "loss": 0.7535, "mean_token_accuracy": 0.7759391143918037, "step": 5060 }, { "epoch": 0.8333504719988483, "grad_norm": 0.46151954259604117, "learning_rate": 1.6426676023894007e-06, "loss": 0.749, "mean_token_accuracy": 0.7772417515516281, "step": 5065 }, { "epoch": 0.8341731279435658, "grad_norm": 0.4431469873320078, "learning_rate": 1.6269299207232824e-06, "loss": 0.7488, "mean_token_accuracy": 0.7777144953608512, "step": 5070 }, { "epoch": 0.8349957838882833, "grad_norm": 0.4611868606350716, "learning_rate": 1.6112613120597164e-06, "loss": 0.7606, "mean_token_accuracy": 0.7754010453820228, "step": 5075 }, { "epoch": 0.8358184398330009, "grad_norm": 0.47702586551589743, "learning_rate": 1.5956619056556865e-06, "loss": 0.745, "mean_token_accuracy": 0.778921264410019, "step": 5080 }, { "epoch": 0.8366410957777184, "grad_norm": 0.4425677682617662, "learning_rate": 1.5801318301973078e-06, "loss": 0.7499, "mean_token_accuracy": 0.7768089964985847, "step": 5085 }, { "epoch": 0.8374637517224359, "grad_norm": 0.4523405216752326, "learning_rate": 1.5646712137987575e-06, "loss": 0.7538, "mean_token_accuracy": 0.7765919134020806, "step": 5090 }, { "epoch": 0.8382864076671535, "grad_norm": 0.4603215521474645, "learning_rate": 1.5492801840012018e-06, "loss": 0.75, "mean_token_accuracy": 0.7767136886715889, "step": 5095 }, { "epoch": 0.8391090636118709, "grad_norm": 0.4874284025364324, "learning_rate": 1.533958867771772e-06, "loss": 0.7624, "mean_token_accuracy": 0.7744793921709061, "step": 5100 }, { "epoch": 0.8399317195565884, "grad_norm": 0.4408726951896832, "learning_rate": 1.5187073915024985e-06, "loss": 0.7567, "mean_token_accuracy": 0.7752280026674271, "step": 5105 }, { "epoch": 0.8407543755013059, "grad_norm": 0.4562707355979529, "learning_rate": 1.5035258810092668e-06, "loss": 0.7427, "mean_token_accuracy": 0.7797864437103271, "step": 5110 }, { "epoch": 0.8415770314460235, "grad_norm": 0.44881173422128795, "learning_rate": 1.4884144615307882e-06, "loss": 0.7538, "mean_token_accuracy": 0.7763380840420723, "step": 5115 }, { "epoch": 0.842399687390741, "grad_norm": 0.45562789911287727, "learning_rate": 1.4733732577275672e-06, "loss": 0.7467, "mean_token_accuracy": 0.7782832205295562, "step": 5120 }, { "epoch": 0.8432223433354585, "grad_norm": 0.5421684195517157, "learning_rate": 1.4584023936808645e-06, "loss": 0.7476, "mean_token_accuracy": 0.7765630632638931, "step": 5125 }, { "epoch": 0.8440449992801761, "grad_norm": 0.4611353495051131, "learning_rate": 1.443501992891685e-06, "loss": 0.7684, "mean_token_accuracy": 0.7724569365382195, "step": 5130 }, { "epoch": 0.8448676552248936, "grad_norm": 0.4588084839852691, "learning_rate": 1.4286721782797407e-06, "loss": 0.7651, "mean_token_accuracy": 0.7733467847108841, "step": 5135 }, { "epoch": 0.8456903111696111, "grad_norm": 0.45457711943121504, "learning_rate": 1.413913072182459e-06, "loss": 0.7554, "mean_token_accuracy": 0.775216418504715, "step": 5140 }, { "epoch": 0.8465129671143287, "grad_norm": 0.45620511123335766, "learning_rate": 1.3992247963539618e-06, "loss": 0.7669, "mean_token_accuracy": 0.7727680012583733, "step": 5145 }, { "epoch": 0.8473356230590461, "grad_norm": 0.4773932338359636, "learning_rate": 1.384607471964058e-06, "loss": 0.7718, "mean_token_accuracy": 0.7714467778801918, "step": 5150 }, { "epoch": 0.8481582790037636, "grad_norm": 0.45255757708337135, "learning_rate": 1.370061219597244e-06, "loss": 0.746, "mean_token_accuracy": 0.7779589399695397, "step": 5155 }, { "epoch": 0.8489809349484811, "grad_norm": 0.46194731875057976, "learning_rate": 1.3555861592517239e-06, "loss": 0.7567, "mean_token_accuracy": 0.7747567281126976, "step": 5160 }, { "epoch": 0.8498035908931987, "grad_norm": 0.4548384337555146, "learning_rate": 1.3411824103384018e-06, "loss": 0.751, "mean_token_accuracy": 0.7774652823805809, "step": 5165 }, { "epoch": 0.8506262468379162, "grad_norm": 0.4571724171722975, "learning_rate": 1.3268500916799077e-06, "loss": 0.7597, "mean_token_accuracy": 0.7748971879482269, "step": 5170 }, { "epoch": 0.8514489027826337, "grad_norm": 0.46317663357014593, "learning_rate": 1.3125893215096075e-06, "loss": 0.7656, "mean_token_accuracy": 0.7728927344083786, "step": 5175 }, { "epoch": 0.8522715587273513, "grad_norm": 0.4530624539524837, "learning_rate": 1.2984002174706346e-06, "loss": 0.7474, "mean_token_accuracy": 0.7788657501339913, "step": 5180 }, { "epoch": 0.8530942146720688, "grad_norm": 0.5143861775272591, "learning_rate": 1.2842828966149223e-06, "loss": 0.7615, "mean_token_accuracy": 0.7744556188583374, "step": 5185 }, { "epoch": 0.8539168706167863, "grad_norm": 0.4491472577711793, "learning_rate": 1.2702374754022329e-06, "loss": 0.7506, "mean_token_accuracy": 0.7771938264369964, "step": 5190 }, { "epoch": 0.8547395265615038, "grad_norm": 0.44450225546884303, "learning_rate": 1.2562640696991934e-06, "loss": 0.7488, "mean_token_accuracy": 0.7772212013602257, "step": 5195 }, { "epoch": 0.8555621825062213, "grad_norm": 0.4521644248160052, "learning_rate": 1.242362794778349e-06, "loss": 0.7624, "mean_token_accuracy": 0.7742334112524987, "step": 5200 }, { "epoch": 0.8563848384509388, "grad_norm": 0.4481135673483707, "learning_rate": 1.228533765317207e-06, "loss": 0.7455, "mean_token_accuracy": 0.7781358554959297, "step": 5205 }, { "epoch": 0.8572074943956564, "grad_norm": 0.45622360166570936, "learning_rate": 1.2147770953972915e-06, "loss": 0.7666, "mean_token_accuracy": 0.7718599781394004, "step": 5210 }, { "epoch": 0.8580301503403739, "grad_norm": 0.47048398157336385, "learning_rate": 1.2010928985031989e-06, "loss": 0.7701, "mean_token_accuracy": 0.7719263911247254, "step": 5215 }, { "epoch": 0.8588528062850914, "grad_norm": 0.4415881179571527, "learning_rate": 1.1874812875216656e-06, "loss": 0.7331, "mean_token_accuracy": 0.7817918747663498, "step": 5220 }, { "epoch": 0.859675462229809, "grad_norm": 0.44547650317217063, "learning_rate": 1.1739423747406387e-06, "loss": 0.7506, "mean_token_accuracy": 0.7769679322838783, "step": 5225 }, { "epoch": 0.8604981181745265, "grad_norm": 0.45849259655752383, "learning_rate": 1.1604762718483498e-06, "loss": 0.7476, "mean_token_accuracy": 0.7775586381554603, "step": 5230 }, { "epoch": 0.861320774119244, "grad_norm": 0.44962357416062615, "learning_rate": 1.1470830899323814e-06, "loss": 0.7471, "mean_token_accuracy": 0.7777603000402451, "step": 5235 }, { "epoch": 0.8621434300639615, "grad_norm": 0.46467354791192156, "learning_rate": 1.1337629394787663e-06, "loss": 0.7507, "mean_token_accuracy": 0.7766257613897324, "step": 5240 }, { "epoch": 0.862966086008679, "grad_norm": 0.45414302945065527, "learning_rate": 1.1205159303710744e-06, "loss": 0.7532, "mean_token_accuracy": 0.7755645141005516, "step": 5245 }, { "epoch": 0.8637887419533965, "grad_norm": 0.4527537949206399, "learning_rate": 1.1073421718894894e-06, "loss": 0.7548, "mean_token_accuracy": 0.7755493834614754, "step": 5250 }, { "epoch": 0.864611397898114, "grad_norm": 0.44306192338112677, "learning_rate": 1.0942417727099309e-06, "loss": 0.755, "mean_token_accuracy": 0.7756238400936126, "step": 5255 }, { "epoch": 0.8654340538428316, "grad_norm": 0.44663788609163824, "learning_rate": 1.0812148409031387e-06, "loss": 0.7384, "mean_token_accuracy": 0.7804351940751075, "step": 5260 }, { "epoch": 0.8662567097875491, "grad_norm": 0.4477052598347741, "learning_rate": 1.0682614839337947e-06, "loss": 0.7447, "mean_token_accuracy": 0.7770758211612702, "step": 5265 }, { "epoch": 0.8670793657322666, "grad_norm": 0.4571564328821459, "learning_rate": 1.055381808659629e-06, "loss": 0.7636, "mean_token_accuracy": 0.7744614854454994, "step": 5270 }, { "epoch": 0.8679020216769842, "grad_norm": 0.45812909771411514, "learning_rate": 1.0425759213305374e-06, "loss": 0.7489, "mean_token_accuracy": 0.7774764329195023, "step": 5275 }, { "epoch": 0.8687246776217017, "grad_norm": 0.4423394513697046, "learning_rate": 1.0298439275877137e-06, "loss": 0.7514, "mean_token_accuracy": 0.7770288631319999, "step": 5280 }, { "epoch": 0.8695473335664192, "grad_norm": 0.46329627066511253, "learning_rate": 1.0171859324627641e-06, "loss": 0.7495, "mean_token_accuracy": 0.7785645380616188, "step": 5285 }, { "epoch": 0.8703699895111368, "grad_norm": 0.44350863537326346, "learning_rate": 1.0046020403768552e-06, "loss": 0.7536, "mean_token_accuracy": 0.7765577554702758, "step": 5290 }, { "epoch": 0.8711926454558542, "grad_norm": 0.4414470568715232, "learning_rate": 9.920923551398465e-07, "loss": 0.7541, "mean_token_accuracy": 0.7754393994808197, "step": 5295 }, { "epoch": 0.8720153014005717, "grad_norm": 0.45564846665911685, "learning_rate": 9.796569799494282e-07, "loss": 0.7553, "mean_token_accuracy": 0.7764479413628578, "step": 5300 }, { "epoch": 0.8728379573452892, "grad_norm": 0.45948740131036453, "learning_rate": 9.67296017390279e-07, "loss": 0.762, "mean_token_accuracy": 0.7739602044224739, "step": 5305 }, { "epoch": 0.8736606132900068, "grad_norm": 0.45108358401844423, "learning_rate": 9.550095694332184e-07, "loss": 0.7597, "mean_token_accuracy": 0.7742050752043724, "step": 5310 }, { "epoch": 0.8744832692347243, "grad_norm": 0.4552045889436472, "learning_rate": 9.427977374343633e-07, "loss": 0.7561, "mean_token_accuracy": 0.7753622144460678, "step": 5315 }, { "epoch": 0.8753059251794418, "grad_norm": 0.44380865267745817, "learning_rate": 9.306606221342829e-07, "loss": 0.7467, "mean_token_accuracy": 0.7786542281508446, "step": 5320 }, { "epoch": 0.8761285811241594, "grad_norm": 0.447338666759802, "learning_rate": 9.18598323657186e-07, "loss": 0.7461, "mean_token_accuracy": 0.7787746682763099, "step": 5325 }, { "epoch": 0.8769512370688769, "grad_norm": 0.4676000715148859, "learning_rate": 9.066109415100855e-07, "loss": 0.7536, "mean_token_accuracy": 0.7766070172190667, "step": 5330 }, { "epoch": 0.8777738930135944, "grad_norm": 0.45637586057026486, "learning_rate": 8.946985745819747e-07, "loss": 0.7477, "mean_token_accuracy": 0.7775689244270325, "step": 5335 }, { "epoch": 0.878596548958312, "grad_norm": 0.4418654103506246, "learning_rate": 8.828613211430137e-07, "loss": 0.7635, "mean_token_accuracy": 0.7737632393836975, "step": 5340 }, { "epoch": 0.8794192049030294, "grad_norm": 0.453886257095775, "learning_rate": 8.710992788437233e-07, "loss": 0.7492, "mean_token_accuracy": 0.7768275111913681, "step": 5345 }, { "epoch": 0.8802418608477469, "grad_norm": 0.4520892354507425, "learning_rate": 8.594125447141732e-07, "loss": 0.7559, "mean_token_accuracy": 0.775428768992424, "step": 5350 }, { "epoch": 0.8810645167924644, "grad_norm": 0.44408110073617796, "learning_rate": 8.478012151631854e-07, "loss": 0.7558, "mean_token_accuracy": 0.7759406581521034, "step": 5355 }, { "epoch": 0.881887172737182, "grad_norm": 0.4616306131581194, "learning_rate": 8.362653859775327e-07, "loss": 0.7472, "mean_token_accuracy": 0.778403989970684, "step": 5360 }, { "epoch": 0.8827098286818995, "grad_norm": 0.4558936187209564, "learning_rate": 8.248051523211609e-07, "loss": 0.7526, "mean_token_accuracy": 0.7767306223511696, "step": 5365 }, { "epoch": 0.883532484626617, "grad_norm": 0.45349564065991677, "learning_rate": 8.134206087343933e-07, "loss": 0.7531, "mean_token_accuracy": 0.77746422290802, "step": 5370 }, { "epoch": 0.8843551405713346, "grad_norm": 0.44013461902874046, "learning_rate": 8.021118491331537e-07, "loss": 0.7544, "mean_token_accuracy": 0.775374884903431, "step": 5375 }, { "epoch": 0.8851777965160521, "grad_norm": 0.4529079339749554, "learning_rate": 7.908789668081873e-07, "loss": 0.7485, "mean_token_accuracy": 0.7780358821153641, "step": 5380 }, { "epoch": 0.8860004524607696, "grad_norm": 0.4820501861139083, "learning_rate": 7.797220544243045e-07, "loss": 0.7451, "mean_token_accuracy": 0.7785145387053489, "step": 5385 }, { "epoch": 0.8868231084054872, "grad_norm": 0.4550985418239331, "learning_rate": 7.68641204019599e-07, "loss": 0.7444, "mean_token_accuracy": 0.7792016997933388, "step": 5390 }, { "epoch": 0.8876457643502046, "grad_norm": 0.44901686494965937, "learning_rate": 7.576365070047032e-07, "loss": 0.7404, "mean_token_accuracy": 0.7805183827877045, "step": 5395 }, { "epoch": 0.8884684202949221, "grad_norm": 0.44541585672165007, "learning_rate": 7.467080541620209e-07, "loss": 0.7467, "mean_token_accuracy": 0.7783748626708984, "step": 5400 }, { "epoch": 0.8892910762396397, "grad_norm": 0.4612853052522339, "learning_rate": 7.358559356449935e-07, "loss": 0.7468, "mean_token_accuracy": 0.7784297719597817, "step": 5405 }, { "epoch": 0.8901137321843572, "grad_norm": 0.4473503219055565, "learning_rate": 7.250802409773428e-07, "loss": 0.7578, "mean_token_accuracy": 0.7757548198103905, "step": 5410 }, { "epoch": 0.8909363881290747, "grad_norm": 0.44544816733806114, "learning_rate": 7.143810590523426e-07, "loss": 0.7535, "mean_token_accuracy": 0.7764914765954017, "step": 5415 }, { "epoch": 0.8917590440737923, "grad_norm": 0.4456489631448736, "learning_rate": 7.037584781320739e-07, "loss": 0.7571, "mean_token_accuracy": 0.7758123815059662, "step": 5420 }, { "epoch": 0.8925817000185098, "grad_norm": 0.44852732566556525, "learning_rate": 6.932125858467076e-07, "loss": 0.7377, "mean_token_accuracy": 0.7800244882702827, "step": 5425 }, { "epoch": 0.8934043559632273, "grad_norm": 0.449716740617411, "learning_rate": 6.827434691937773e-07, "loss": 0.7597, "mean_token_accuracy": 0.7749462381005288, "step": 5430 }, { "epoch": 0.8942270119079448, "grad_norm": 0.47859915802006914, "learning_rate": 6.723512145374633e-07, "loss": 0.7493, "mean_token_accuracy": 0.7777243986725807, "step": 5435 }, { "epoch": 0.8950496678526623, "grad_norm": 0.45288389804450013, "learning_rate": 6.620359076078775e-07, "loss": 0.7319, "mean_token_accuracy": 0.7819717019796372, "step": 5440 }, { "epoch": 0.8958723237973798, "grad_norm": 0.45037916839905456, "learning_rate": 6.517976335003551e-07, "loss": 0.7516, "mean_token_accuracy": 0.7776877984404564, "step": 5445 }, { "epoch": 0.8966949797420973, "grad_norm": 0.5913812708319135, "learning_rate": 6.416364766747563e-07, "loss": 0.7502, "mean_token_accuracy": 0.7769842103123665, "step": 5450 }, { "epoch": 0.8975176356868149, "grad_norm": 0.4474796865508915, "learning_rate": 6.315525209547735e-07, "loss": 0.7438, "mean_token_accuracy": 0.7778741076588631, "step": 5455 }, { "epoch": 0.8983402916315324, "grad_norm": 0.4850690519891261, "learning_rate": 6.215458495272253e-07, "loss": 0.7502, "mean_token_accuracy": 0.7772305890917778, "step": 5460 }, { "epoch": 0.8991629475762499, "grad_norm": 0.47599556748773075, "learning_rate": 6.116165449413847e-07, "loss": 0.7529, "mean_token_accuracy": 0.7764015629887581, "step": 5465 }, { "epoch": 0.8999856035209675, "grad_norm": 0.46116258559283796, "learning_rate": 6.017646891082918e-07, "loss": 0.7464, "mean_token_accuracy": 0.778635123372078, "step": 5470 }, { "epoch": 0.900808259465685, "grad_norm": 0.44503112105916165, "learning_rate": 5.9199036330008e-07, "loss": 0.7432, "mean_token_accuracy": 0.7780135944485664, "step": 5475 }, { "epoch": 0.9016309154104025, "grad_norm": 0.4434729276604829, "learning_rate": 5.822936481493047e-07, "loss": 0.7442, "mean_token_accuracy": 0.7784598052501679, "step": 5480 }, { "epoch": 0.90245357135512, "grad_norm": 0.45650240106599926, "learning_rate": 5.726746236482772e-07, "loss": 0.7433, "mean_token_accuracy": 0.7800303131341935, "step": 5485 }, { "epoch": 0.9032762272998375, "grad_norm": 0.4581521819790634, "learning_rate": 5.631333691484053e-07, "loss": 0.761, "mean_token_accuracy": 0.7743619292974472, "step": 5490 }, { "epoch": 0.904098883244555, "grad_norm": 0.44973259824493583, "learning_rate": 5.5366996335954e-07, "loss": 0.7479, "mean_token_accuracy": 0.7772611483931542, "step": 5495 }, { "epoch": 0.9049215391892725, "grad_norm": 0.45676133483970377, "learning_rate": 5.442844843493256e-07, "loss": 0.7602, "mean_token_accuracy": 0.7742813900113106, "step": 5500 }, { "epoch": 0.9057441951339901, "grad_norm": 0.4469787204477241, "learning_rate": 5.349770095425533e-07, "loss": 0.7496, "mean_token_accuracy": 0.7784984081983566, "step": 5505 }, { "epoch": 0.9065668510787076, "grad_norm": 0.4504759310015148, "learning_rate": 5.257476157205266e-07, "loss": 0.7472, "mean_token_accuracy": 0.7774579241871834, "step": 5510 }, { "epoch": 0.9073895070234251, "grad_norm": 0.4540141872610964, "learning_rate": 5.165963790204265e-07, "loss": 0.7589, "mean_token_accuracy": 0.7746942058205605, "step": 5515 }, { "epoch": 0.9082121629681427, "grad_norm": 0.45123192804752477, "learning_rate": 5.07523374934682e-07, "loss": 0.7596, "mean_token_accuracy": 0.7746987491846085, "step": 5520 }, { "epoch": 0.9090348189128602, "grad_norm": 0.47027144585532094, "learning_rate": 4.985286783103471e-07, "loss": 0.7517, "mean_token_accuracy": 0.7773586988449097, "step": 5525 }, { "epoch": 0.9098574748575777, "grad_norm": 0.4488694488706293, "learning_rate": 4.89612363348484e-07, "loss": 0.7575, "mean_token_accuracy": 0.7745656073093414, "step": 5530 }, { "epoch": 0.9106801308022953, "grad_norm": 0.4644095257024458, "learning_rate": 4.807745036035538e-07, "loss": 0.7458, "mean_token_accuracy": 0.7790784955024719, "step": 5535 }, { "epoch": 0.9115027867470127, "grad_norm": 0.4486545889740564, "learning_rate": 4.720151719828059e-07, "loss": 0.7504, "mean_token_accuracy": 0.7769330680370331, "step": 5540 }, { "epoch": 0.9123254426917302, "grad_norm": 0.4836659409545379, "learning_rate": 4.6333444074567637e-07, "loss": 0.7583, "mean_token_accuracy": 0.7759171262383461, "step": 5545 }, { "epoch": 0.9131480986364477, "grad_norm": 0.44101405630240303, "learning_rate": 4.5473238150319676e-07, "loss": 0.7559, "mean_token_accuracy": 0.7744720235466958, "step": 5550 }, { "epoch": 0.9139707545811653, "grad_norm": 0.43971922943736186, "learning_rate": 4.462090652173989e-07, "loss": 0.7443, "mean_token_accuracy": 0.7783125743269921, "step": 5555 }, { "epoch": 0.9147934105258828, "grad_norm": 0.45534332662680016, "learning_rate": 4.377645622007276e-07, "loss": 0.7439, "mean_token_accuracy": 0.7784717574715614, "step": 5560 }, { "epoch": 0.9156160664706003, "grad_norm": 0.451897056510664, "learning_rate": 4.2939894211546763e-07, "loss": 0.7487, "mean_token_accuracy": 0.7775903835892677, "step": 5565 }, { "epoch": 0.9164387224153179, "grad_norm": 0.45110733123011293, "learning_rate": 4.211122739731621e-07, "loss": 0.738, "mean_token_accuracy": 0.7803859367966652, "step": 5570 }, { "epoch": 0.9172613783600354, "grad_norm": 0.43974596187133075, "learning_rate": 4.1290462613404746e-07, "loss": 0.7495, "mean_token_accuracy": 0.7770720511674881, "step": 5575 }, { "epoch": 0.9180840343047529, "grad_norm": 0.48743441953523925, "learning_rate": 4.047760663064915e-07, "loss": 0.7596, "mean_token_accuracy": 0.7746918871998787, "step": 5580 }, { "epoch": 0.9189066902494705, "grad_norm": 0.45635990420771355, "learning_rate": 3.967266615464227e-07, "loss": 0.7544, "mean_token_accuracy": 0.7755736202001572, "step": 5585 }, { "epoch": 0.9197293461941879, "grad_norm": 0.454428624199265, "learning_rate": 3.8875647825679543e-07, "loss": 0.7468, "mean_token_accuracy": 0.7779560372233391, "step": 5590 }, { "epoch": 0.9205520021389054, "grad_norm": 0.4577121054677454, "learning_rate": 3.8086558218702663e-07, "loss": 0.751, "mean_token_accuracy": 0.7767623484134674, "step": 5595 }, { "epoch": 0.921374658083623, "grad_norm": 0.45133545451766693, "learning_rate": 3.7305403843246104e-07, "loss": 0.7501, "mean_token_accuracy": 0.7773380026221275, "step": 5600 }, { "epoch": 0.9221973140283405, "grad_norm": 0.46549489229218793, "learning_rate": 3.6532191143383354e-07, "loss": 0.7491, "mean_token_accuracy": 0.7784052848815918, "step": 5605 }, { "epoch": 0.923019969973058, "grad_norm": 0.4518356632960425, "learning_rate": 3.576692649767355e-07, "loss": 0.7722, "mean_token_accuracy": 0.7708909615874291, "step": 5610 }, { "epoch": 0.9238426259177756, "grad_norm": 0.4598173614027414, "learning_rate": 3.500961621910881e-07, "loss": 0.756, "mean_token_accuracy": 0.7759104192256927, "step": 5615 }, { "epoch": 0.9246652818624931, "grad_norm": 0.4562251267236369, "learning_rate": 3.426026655506276e-07, "loss": 0.7494, "mean_token_accuracy": 0.7776082113385201, "step": 5620 }, { "epoch": 0.9254879378072106, "grad_norm": 0.46624544931402495, "learning_rate": 3.3518883687238103e-07, "loss": 0.7442, "mean_token_accuracy": 0.7793371543288231, "step": 5625 }, { "epoch": 0.9263105937519281, "grad_norm": 0.45011954781317903, "learning_rate": 3.2785473731616224e-07, "loss": 0.7599, "mean_token_accuracy": 0.7746311292052269, "step": 5630 }, { "epoch": 0.9271332496966456, "grad_norm": 0.44410787211573954, "learning_rate": 3.206004273840646e-07, "loss": 0.7462, "mean_token_accuracy": 0.7789055734872818, "step": 5635 }, { "epoch": 0.9279559056413631, "grad_norm": 0.4480814977855624, "learning_rate": 3.1342596691996577e-07, "loss": 0.7646, "mean_token_accuracy": 0.773674713075161, "step": 5640 }, { "epoch": 0.9287785615860806, "grad_norm": 0.5114459515559566, "learning_rate": 3.063314151090313e-07, "loss": 0.7521, "mean_token_accuracy": 0.7761005654931068, "step": 5645 }, { "epoch": 0.9296012175307982, "grad_norm": 0.440725539644043, "learning_rate": 2.9931683047722095e-07, "loss": 0.7541, "mean_token_accuracy": 0.7768652409315109, "step": 5650 }, { "epoch": 0.9304238734755157, "grad_norm": 0.4502259420815018, "learning_rate": 2.923822708908175e-07, "loss": 0.7675, "mean_token_accuracy": 0.7728524684906006, "step": 5655 }, { "epoch": 0.9312465294202332, "grad_norm": 0.45750465605134205, "learning_rate": 2.8552779355594195e-07, "loss": 0.7626, "mean_token_accuracy": 0.7737421125173569, "step": 5660 }, { "epoch": 0.9320691853649508, "grad_norm": 0.4442403218512623, "learning_rate": 2.7875345501808147e-07, "loss": 0.744, "mean_token_accuracy": 0.7784305810928345, "step": 5665 }, { "epoch": 0.9328918413096683, "grad_norm": 0.4494601477204588, "learning_rate": 2.720593111616221e-07, "loss": 0.7692, "mean_token_accuracy": 0.7718387633562088, "step": 5670 }, { "epoch": 0.9337144972543858, "grad_norm": 0.4471129791831627, "learning_rate": 2.654454172093945e-07, "loss": 0.752, "mean_token_accuracy": 0.7765217125415802, "step": 5675 }, { "epoch": 0.9345371531991034, "grad_norm": 0.4596349066067918, "learning_rate": 2.5891182772221247e-07, "loss": 0.7512, "mean_token_accuracy": 0.7767874285578728, "step": 5680 }, { "epoch": 0.9353598091438208, "grad_norm": 0.4608186655475542, "learning_rate": 2.5245859659842387e-07, "loss": 0.7594, "mean_token_accuracy": 0.7754180327057838, "step": 5685 }, { "epoch": 0.9361824650885383, "grad_norm": 0.45332386852005885, "learning_rate": 2.460857770734659e-07, "loss": 0.7456, "mean_token_accuracy": 0.7782048270106315, "step": 5690 }, { "epoch": 0.9370051210332558, "grad_norm": 0.4427048169814939, "learning_rate": 2.3979342171942757e-07, "loss": 0.7389, "mean_token_accuracy": 0.7792840272188186, "step": 5695 }, { "epoch": 0.9378277769779734, "grad_norm": 0.45649920479580847, "learning_rate": 2.3358158244461193e-07, "loss": 0.7459, "mean_token_accuracy": 0.7788773998618126, "step": 5700 }, { "epoch": 0.9386504329226909, "grad_norm": 0.4587820349965553, "learning_rate": 2.2745031049311582e-07, "loss": 0.7543, "mean_token_accuracy": 0.7756284520030021, "step": 5705 }, { "epoch": 0.9394730888674084, "grad_norm": 0.4553493110133557, "learning_rate": 2.2139965644439764e-07, "loss": 0.7491, "mean_token_accuracy": 0.7767566189169883, "step": 5710 }, { "epoch": 0.940295744812126, "grad_norm": 4.731856449290639, "learning_rate": 2.1542967021286553e-07, "loss": 0.7359, "mean_token_accuracy": 0.781352449953556, "step": 5715 }, { "epoch": 0.9411184007568435, "grad_norm": 0.4421518464390205, "learning_rate": 2.0954040104746554e-07, "loss": 0.7564, "mean_token_accuracy": 0.7755159273743629, "step": 5720 }, { "epoch": 0.941941056701561, "grad_norm": 0.4553871023652811, "learning_rate": 2.037318975312763e-07, "loss": 0.7707, "mean_token_accuracy": 0.771423852443695, "step": 5725 }, { "epoch": 0.9427637126462786, "grad_norm": 0.4615194260939641, "learning_rate": 1.9800420758110284e-07, "loss": 0.7444, "mean_token_accuracy": 0.7790366724133492, "step": 5730 }, { "epoch": 0.943586368590996, "grad_norm": 0.4588572332668005, "learning_rate": 1.9235737844708445e-07, "loss": 0.7316, "mean_token_accuracy": 0.7822706729173661, "step": 5735 }, { "epoch": 0.9444090245357135, "grad_norm": 0.4405611733519534, "learning_rate": 1.8679145671230858e-07, "loss": 0.7337, "mean_token_accuracy": 0.781727097928524, "step": 5740 }, { "epoch": 0.945231680480431, "grad_norm": 0.445024903199555, "learning_rate": 1.8130648829242314e-07, "loss": 0.7418, "mean_token_accuracy": 0.7790333375334739, "step": 5745 }, { "epoch": 0.9460543364251486, "grad_norm": 0.46611132680489187, "learning_rate": 1.7590251843525474e-07, "loss": 0.7595, "mean_token_accuracy": 0.7746739342808724, "step": 5750 }, { "epoch": 0.9468769923698661, "grad_norm": 0.44763520078236585, "learning_rate": 1.7057959172044002e-07, "loss": 0.7582, "mean_token_accuracy": 0.7751786157488822, "step": 5755 }, { "epoch": 0.9476996483145836, "grad_norm": 0.4517839124668547, "learning_rate": 1.6533775205905712e-07, "loss": 0.7477, "mean_token_accuracy": 0.7770958289504051, "step": 5760 }, { "epoch": 0.9485223042593012, "grad_norm": 0.44845706030266597, "learning_rate": 1.6017704269326363e-07, "loss": 0.7302, "mean_token_accuracy": 0.7825514763593674, "step": 5765 }, { "epoch": 0.9493449602040187, "grad_norm": 0.45999381249433086, "learning_rate": 1.55097506195937e-07, "loss": 0.7407, "mean_token_accuracy": 0.7799299120903015, "step": 5770 }, { "epoch": 0.9501676161487362, "grad_norm": 0.7358524105241566, "learning_rate": 1.5009918447032368e-07, "loss": 0.7535, "mean_token_accuracy": 0.7767059206962585, "step": 5775 }, { "epoch": 0.9509902720934538, "grad_norm": 0.44163047565105074, "learning_rate": 1.4518211874969823e-07, "loss": 0.7485, "mean_token_accuracy": 0.7770814657211303, "step": 5780 }, { "epoch": 0.9518129280381712, "grad_norm": 0.458171905458851, "learning_rate": 1.4034634959702032e-07, "loss": 0.7457, "mean_token_accuracy": 0.7787189275026322, "step": 5785 }, { "epoch": 0.9526355839828887, "grad_norm": 0.46111520703885855, "learning_rate": 1.355919169045994e-07, "loss": 0.7704, "mean_token_accuracy": 0.7715024873614311, "step": 5790 }, { "epoch": 0.9534582399276063, "grad_norm": 0.45235239066186106, "learning_rate": 1.3091885989376164e-07, "loss": 0.7312, "mean_token_accuracy": 0.7820179462432861, "step": 5795 }, { "epoch": 0.9542808958723238, "grad_norm": 0.4423315597867002, "learning_rate": 1.263272171145369e-07, "loss": 0.7588, "mean_token_accuracy": 0.7744734302163124, "step": 5800 }, { "epoch": 0.9551035518170413, "grad_norm": 0.4458579316470923, "learning_rate": 1.2181702644533e-07, "loss": 0.7394, "mean_token_accuracy": 0.7798651322722435, "step": 5805 }, { "epoch": 0.9559262077617589, "grad_norm": 0.44803472164784147, "learning_rate": 1.1738832509261667e-07, "loss": 0.7506, "mean_token_accuracy": 0.7774027287960052, "step": 5810 }, { "epoch": 0.9567488637064764, "grad_norm": 0.4549915517451572, "learning_rate": 1.1304114959062918e-07, "loss": 0.7575, "mean_token_accuracy": 0.7752734228968621, "step": 5815 }, { "epoch": 0.9575715196511939, "grad_norm": 0.4540473918288485, "learning_rate": 1.0877553580106004e-07, "loss": 0.7666, "mean_token_accuracy": 0.773671455681324, "step": 5820 }, { "epoch": 0.9583941755959114, "grad_norm": 0.4605457383062201, "learning_rate": 1.0459151891276554e-07, "loss": 0.7511, "mean_token_accuracy": 0.7771538272500038, "step": 5825 }, { "epoch": 0.9592168315406289, "grad_norm": 0.45072757260934937, "learning_rate": 1.004891334414737e-07, "loss": 0.7565, "mean_token_accuracy": 0.7750105679035186, "step": 5830 }, { "epoch": 0.9600394874853464, "grad_norm": 0.4429947354253443, "learning_rate": 9.646841322949907e-08, "loss": 0.7414, "mean_token_accuracy": 0.7799411609768867, "step": 5835 }, { "epoch": 0.9608621434300639, "grad_norm": 0.4563538941223601, "learning_rate": 9.252939144546724e-08, "loss": 0.7431, "mean_token_accuracy": 0.7801319226622582, "step": 5840 }, { "epoch": 0.9616847993747815, "grad_norm": 0.4485539567263921, "learning_rate": 8.86721005840363e-08, "loss": 0.7566, "mean_token_accuracy": 0.7754243984818459, "step": 5845 }, { "epoch": 0.962507455319499, "grad_norm": 0.45882821536035695, "learning_rate": 8.489657246563588e-08, "loss": 0.7345, "mean_token_accuracy": 0.7815362140536308, "step": 5850 }, { "epoch": 0.9633301112642165, "grad_norm": 0.453792847846846, "learning_rate": 8.120283823619291e-08, "loss": 0.7437, "mean_token_accuracy": 0.7794279009103775, "step": 5855 }, { "epoch": 0.9641527672089341, "grad_norm": 0.4505048977026569, "learning_rate": 7.759092836688963e-08, "loss": 0.7521, "mean_token_accuracy": 0.7778195083141327, "step": 5860 }, { "epoch": 0.9649754231536516, "grad_norm": 0.44685629597449394, "learning_rate": 7.406087265389938e-08, "loss": 0.7541, "mean_token_accuracy": 0.775357460975647, "step": 5865 }, { "epoch": 0.9657980790983691, "grad_norm": 0.46068954867214335, "learning_rate": 7.06127002181467e-08, "loss": 0.7499, "mean_token_accuracy": 0.7766010835766792, "step": 5870 }, { "epoch": 0.9666207350430867, "grad_norm": 0.4471524544391484, "learning_rate": 6.724643950506871e-08, "loss": 0.7488, "mean_token_accuracy": 0.7771037504076957, "step": 5875 }, { "epoch": 0.9674433909878041, "grad_norm": 0.4498568327668567, "learning_rate": 6.396211828437415e-08, "loss": 0.7522, "mean_token_accuracy": 0.7760146751999855, "step": 5880 }, { "epoch": 0.9682660469325216, "grad_norm": 0.45784438198803984, "learning_rate": 6.075976364982027e-08, "loss": 0.7404, "mean_token_accuracy": 0.7802209571003914, "step": 5885 }, { "epoch": 0.9690887028772391, "grad_norm": 0.4507171628496811, "learning_rate": 5.763940201898965e-08, "loss": 0.7461, "mean_token_accuracy": 0.7771788358688354, "step": 5890 }, { "epoch": 0.9699113588219567, "grad_norm": 0.4504899266002272, "learning_rate": 5.46010591330659e-08, "loss": 0.7557, "mean_token_accuracy": 0.7755205482244492, "step": 5895 }, { "epoch": 0.9707340147666742, "grad_norm": 0.4511955067068251, "learning_rate": 5.164476005662611e-08, "loss": 0.7686, "mean_token_accuracy": 0.7712117403745651, "step": 5900 }, { "epoch": 0.9715566707113917, "grad_norm": 0.45525113701350833, "learning_rate": 4.87705291774343e-08, "loss": 0.751, "mean_token_accuracy": 0.7766298398375511, "step": 5905 }, { "epoch": 0.9723793266561093, "grad_norm": 0.4570453775434546, "learning_rate": 4.5978390206239395e-08, "loss": 0.7587, "mean_token_accuracy": 0.7754485875368118, "step": 5910 }, { "epoch": 0.9732019826008268, "grad_norm": 0.44654797519447686, "learning_rate": 4.32683661765787e-08, "loss": 0.744, "mean_token_accuracy": 0.7790850877761841, "step": 5915 }, { "epoch": 0.9740246385455443, "grad_norm": 0.45402254964040634, "learning_rate": 4.064047944458693e-08, "loss": 0.7597, "mean_token_accuracy": 0.7751212686300277, "step": 5920 }, { "epoch": 0.9748472944902619, "grad_norm": 0.4664477551926387, "learning_rate": 3.809475168881638e-08, "loss": 0.7499, "mean_token_accuracy": 0.7768390193581581, "step": 5925 }, { "epoch": 0.9756699504349793, "grad_norm": 0.44848861368679777, "learning_rate": 3.563120391005481e-08, "loss": 0.7582, "mean_token_accuracy": 0.77487383633852, "step": 5930 }, { "epoch": 0.9764926063796968, "grad_norm": 0.45086781427841577, "learning_rate": 3.3249856431150084e-08, "loss": 0.7499, "mean_token_accuracy": 0.7774128139019012, "step": 5935 }, { "epoch": 0.9773152623244143, "grad_norm": 0.4447594030981159, "learning_rate": 3.095072889684802e-08, "loss": 0.7531, "mean_token_accuracy": 0.7756358876824379, "step": 5940 }, { "epoch": 0.9781379182691319, "grad_norm": 0.44848636566642797, "learning_rate": 2.8733840273623693e-08, "loss": 0.7462, "mean_token_accuracy": 0.7772621959447861, "step": 5945 }, { "epoch": 0.9789605742138494, "grad_norm": 0.45467097067345374, "learning_rate": 2.6599208849531488e-08, "loss": 0.7536, "mean_token_accuracy": 0.7759514361619949, "step": 5950 }, { "epoch": 0.9797832301585669, "grad_norm": 0.4563923766332378, "learning_rate": 2.4546852234050844e-08, "loss": 0.7486, "mean_token_accuracy": 0.7775369539856911, "step": 5955 }, { "epoch": 0.9806058861032845, "grad_norm": 0.47494807813831275, "learning_rate": 2.2576787357940777e-08, "loss": 0.7345, "mean_token_accuracy": 0.7816342487931252, "step": 5960 }, { "epoch": 0.981428542048002, "grad_norm": 0.4614885735908709, "learning_rate": 2.068903047310111e-08, "loss": 0.7334, "mean_token_accuracy": 0.7814917072653771, "step": 5965 }, { "epoch": 0.9822511979927195, "grad_norm": 0.4592050890972982, "learning_rate": 1.888359715243815e-08, "loss": 0.7536, "mean_token_accuracy": 0.7761042430996895, "step": 5970 }, { "epoch": 0.9830738539374371, "grad_norm": 0.458151383366414, "learning_rate": 1.716050228973698e-08, "loss": 0.7451, "mean_token_accuracy": 0.7794349446892739, "step": 5975 }, { "epoch": 0.9838965098821545, "grad_norm": 0.45001260727327136, "learning_rate": 1.551976009953493e-08, "loss": 0.733, "mean_token_accuracy": 0.7825687080621719, "step": 5980 }, { "epoch": 0.984719165826872, "grad_norm": 0.44241564447865017, "learning_rate": 1.3961384117011644e-08, "loss": 0.7474, "mean_token_accuracy": 0.7777259394526481, "step": 5985 }, { "epoch": 0.9855418217715896, "grad_norm": 0.4616890655688477, "learning_rate": 1.2485387197870291e-08, "loss": 0.7685, "mean_token_accuracy": 0.77273228764534, "step": 5990 }, { "epoch": 0.9863644777163071, "grad_norm": 0.46207963041367633, "learning_rate": 1.1091781518235422e-08, "loss": 0.7501, "mean_token_accuracy": 0.7767863020300865, "step": 5995 }, { "epoch": 0.9871871336610246, "grad_norm": 0.5032966101487214, "learning_rate": 9.780578574548615e-09, "loss": 0.7527, "mean_token_accuracy": 0.775577288866043, "step": 6000 }, { "epoch": 0.9880097896057422, "grad_norm": 0.4355620376238061, "learning_rate": 8.551789183480763e-09, "loss": 0.7315, "mean_token_accuracy": 0.7823772370815277, "step": 6005 }, { "epoch": 0.9888324455504597, "grad_norm": 0.45769891479047287, "learning_rate": 7.405423481834373e-09, "loss": 0.745, "mean_token_accuracy": 0.7788512855768204, "step": 6010 }, { "epoch": 0.9896551014951772, "grad_norm": 0.4524094902800803, "learning_rate": 6.3414909264680744e-09, "loss": 0.751, "mean_token_accuracy": 0.7768659979104996, "step": 6015 }, { "epoch": 0.9904777574398947, "grad_norm": 0.4533655422617914, "learning_rate": 5.360000294210022e-09, "loss": 0.7431, "mean_token_accuracy": 0.7788541808724403, "step": 6020 }, { "epoch": 0.9913004133846122, "grad_norm": 0.4661228569878443, "learning_rate": 4.460959681792387e-09, "loss": 0.7403, "mean_token_accuracy": 0.7796418890357018, "step": 6025 }, { "epoch": 0.9921230693293297, "grad_norm": 0.45639931788553617, "learning_rate": 3.644376505783642e-09, "loss": 0.7579, "mean_token_accuracy": 0.7749292984604835, "step": 6030 }, { "epoch": 0.9929457252740472, "grad_norm": 0.47315987682008687, "learning_rate": 2.910257502524161e-09, "loss": 0.7471, "mean_token_accuracy": 0.7778731539845467, "step": 6035 }, { "epoch": 0.9937683812187648, "grad_norm": 0.4541278974568057, "learning_rate": 2.2586087280718203e-09, "loss": 0.7356, "mean_token_accuracy": 0.7806911557912827, "step": 6040 }, { "epoch": 0.9945910371634823, "grad_norm": 0.45570518562060813, "learning_rate": 1.6894355581531518e-09, "loss": 0.7523, "mean_token_accuracy": 0.7767167553305626, "step": 6045 }, { "epoch": 0.9954136931081998, "grad_norm": 0.44820253892323186, "learning_rate": 1.2027426881189298e-09, "loss": 0.7345, "mean_token_accuracy": 0.7809069648385047, "step": 6050 }, { "epoch": 0.9962363490529174, "grad_norm": 0.44801125751276416, "learning_rate": 7.985341329064256e-10, "loss": 0.7553, "mean_token_accuracy": 0.7756011754274368, "step": 6055 }, { "epoch": 0.9970590049976349, "grad_norm": 0.442936078697438, "learning_rate": 4.768132270005499e-10, "loss": 0.7442, "mean_token_accuracy": 0.7779188930988312, "step": 6060 }, { "epoch": 0.9978816609423524, "grad_norm": 0.4383729756522858, "learning_rate": 2.375826244160884e-10, "loss": 0.7494, "mean_token_accuracy": 0.777212829887867, "step": 6065 }, { "epoch": 0.99870431688707, "grad_norm": 0.46217188980246654, "learning_rate": 8.084429866550558e-11, "loss": 0.7543, "mean_token_accuracy": 0.7761295482516288, "step": 6070 }, { "epoch": 0.9995269728317874, "grad_norm": 0.4593482379713257, "learning_rate": 6.59954275117336e-12, "loss": 0.7546, "mean_token_accuracy": 0.7762762635946274, "step": 6075 }, { "epoch": 0.9998560352096745, "mean_token_accuracy": 0.7777425087988377, "step": 6077, "total_flos": 636186997555200.0, "train_loss": 0.8093022745220956, "train_runtime": 266903.0441, "train_samples_per_second": 2.915, "train_steps_per_second": 0.023 } ], "logging_steps": 5, "max_steps": 6077, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 636186997555200.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }