{ "best_metric": 1.07904065, "best_model_checkpoint": "/yldm0226/llm_pretrain_output/qwen2-7b/v0-20240722-150910/checkpoint-90000", "epoch": 1.6106324109014087, "eval_steps": 2000, "global_step": 144000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.58401263, "epoch": 1.118494729792645e-05, "grad_norm": 7.40625, "learning_rate": 1.1184431271669837e-09, "loss": 1.83790052, "memory(GiB)": 101.9, "step": 1, "train_speed(iter/s)": 0.030671 }, { "acc": 0.61300579, "epoch": 0.00022369894595852902, "grad_norm": 14.25, "learning_rate": 2.2368862543339672e-08, "loss": 1.68681797, "memory(GiB)": 104.53, "step": 20, "train_speed(iter/s)": 0.250224 }, { "acc": 0.60555315, "epoch": 0.00044739789191705805, "grad_norm": 12.3125, "learning_rate": 4.4737725086679345e-08, "loss": 1.70955944, "memory(GiB)": 117.61, "step": 40, "train_speed(iter/s)": 0.316797 }, { "acc": 0.61611967, "epoch": 0.000671096837875587, "grad_norm": 18.625, "learning_rate": 6.710658763001902e-08, "loss": 1.65505829, "memory(GiB)": 117.61, "step": 60, "train_speed(iter/s)": 0.345463 }, { "acc": 0.60689883, "epoch": 0.0008947957838341161, "grad_norm": 11.375, "learning_rate": 8.947545017335869e-08, "loss": 1.69493446, "memory(GiB)": 117.61, "step": 80, "train_speed(iter/s)": 0.366829 }, { "acc": 0.62171593, "epoch": 0.001118494729792645, "grad_norm": 8.3125, "learning_rate": 1.1184431271669837e-07, "loss": 1.63419189, "memory(GiB)": 117.61, "step": 100, "train_speed(iter/s)": 0.380286 }, { "acc": 0.61374025, "epoch": 0.001342193675751174, "grad_norm": 10.5625, "learning_rate": 1.3421317526003804e-07, "loss": 1.68054218, "memory(GiB)": 117.61, "step": 120, "train_speed(iter/s)": 0.386464 }, { "acc": 0.61110582, "epoch": 0.0015658926217097032, "grad_norm": 16.375, "learning_rate": 1.565820378033777e-07, "loss": 1.68423538, "memory(GiB)": 117.61, "step": 140, "train_speed(iter/s)": 0.393206 }, { "acc": 0.62144737, "epoch": 0.0017895915676682322, "grad_norm": 12.4375, "learning_rate": 1.7895090034671738e-07, "loss": 1.6396986, "memory(GiB)": 117.61, "step": 160, "train_speed(iter/s)": 0.399442 }, { "acc": 0.61322927, "epoch": 0.002013290513626761, "grad_norm": 18.875, "learning_rate": 2.0131976289005705e-07, "loss": 1.66895962, "memory(GiB)": 117.61, "step": 180, "train_speed(iter/s)": 0.403831 }, { "acc": 0.6071619, "epoch": 0.00223698945958529, "grad_norm": 12.5625, "learning_rate": 2.2368862543339674e-07, "loss": 1.69814606, "memory(GiB)": 117.61, "step": 200, "train_speed(iter/s)": 0.407863 }, { "acc": 0.60482159, "epoch": 0.002460688405543819, "grad_norm": 13.4375, "learning_rate": 2.460574879767364e-07, "loss": 1.69889221, "memory(GiB)": 117.61, "step": 220, "train_speed(iter/s)": 0.41246 }, { "acc": 0.6119339, "epoch": 0.002684387351502348, "grad_norm": 9.25, "learning_rate": 2.684263505200761e-07, "loss": 1.68139858, "memory(GiB)": 117.61, "step": 240, "train_speed(iter/s)": 0.415354 }, { "acc": 0.61980085, "epoch": 0.002908086297460877, "grad_norm": 13.1875, "learning_rate": 2.9079521306341575e-07, "loss": 1.63855476, "memory(GiB)": 117.61, "step": 260, "train_speed(iter/s)": 0.416129 }, { "acc": 0.61686287, "epoch": 0.0031317852434194064, "grad_norm": 14.5625, "learning_rate": 3.131640756067554e-07, "loss": 1.67242813, "memory(GiB)": 117.62, "step": 280, "train_speed(iter/s)": 0.418215 }, { "acc": 0.61823235, "epoch": 0.0033554841893779354, "grad_norm": 10.5, "learning_rate": 3.3553293815009514e-07, "loss": 1.62561264, "memory(GiB)": 117.62, "step": 300, "train_speed(iter/s)": 0.420709 }, { "acc": 0.61807985, "epoch": 0.0035791831353364644, "grad_norm": 17.125, "learning_rate": 3.5790180069343476e-07, "loss": 1.65389442, "memory(GiB)": 117.62, "step": 320, "train_speed(iter/s)": 0.422735 }, { "acc": 0.62486267, "epoch": 0.0038028820812949934, "grad_norm": 14.125, "learning_rate": 3.802706632367745e-07, "loss": 1.61433296, "memory(GiB)": 117.62, "step": 340, "train_speed(iter/s)": 0.424214 }, { "acc": 0.62037683, "epoch": 0.004026581027253522, "grad_norm": 10.8125, "learning_rate": 4.026395257801141e-07, "loss": 1.65096283, "memory(GiB)": 117.62, "step": 360, "train_speed(iter/s)": 0.42614 }, { "acc": 0.63148808, "epoch": 0.004250279973212051, "grad_norm": 12.0, "learning_rate": 4.2500838832345377e-07, "loss": 1.58975954, "memory(GiB)": 117.62, "step": 380, "train_speed(iter/s)": 0.426345 }, { "acc": 0.61639957, "epoch": 0.00447397891917058, "grad_norm": 9.25, "learning_rate": 4.473772508667935e-07, "loss": 1.66262512, "memory(GiB)": 117.62, "step": 400, "train_speed(iter/s)": 0.426644 }, { "acc": 0.62606716, "epoch": 0.004697677865129109, "grad_norm": 8.9375, "learning_rate": 4.697461134101331e-07, "loss": 1.60076771, "memory(GiB)": 117.62, "step": 420, "train_speed(iter/s)": 0.427066 }, { "acc": 0.6258441, "epoch": 0.004921376811087638, "grad_norm": 11.8125, "learning_rate": 4.921149759534728e-07, "loss": 1.61572151, "memory(GiB)": 117.62, "step": 440, "train_speed(iter/s)": 0.427763 }, { "acc": 0.63357754, "epoch": 0.005145075757046167, "grad_norm": 12.6875, "learning_rate": 5.144838384968125e-07, "loss": 1.57787266, "memory(GiB)": 117.62, "step": 460, "train_speed(iter/s)": 0.428303 }, { "acc": 0.6152401, "epoch": 0.005368774703004696, "grad_norm": 12.5, "learning_rate": 5.368527010401522e-07, "loss": 1.67216053, "memory(GiB)": 117.62, "step": 480, "train_speed(iter/s)": 0.429007 }, { "acc": 0.62834096, "epoch": 0.005592473648963225, "grad_norm": 10.375, "learning_rate": 5.592215635834918e-07, "loss": 1.60352955, "memory(GiB)": 117.62, "step": 500, "train_speed(iter/s)": 0.429554 }, { "acc": 0.62614722, "epoch": 0.005816172594921754, "grad_norm": 12.375, "learning_rate": 5.815904261268315e-07, "loss": 1.60007286, "memory(GiB)": 117.62, "step": 520, "train_speed(iter/s)": 0.430894 }, { "acc": 0.62772303, "epoch": 0.006039871540880284, "grad_norm": 10.1875, "learning_rate": 6.039592886701712e-07, "loss": 1.60175133, "memory(GiB)": 117.62, "step": 540, "train_speed(iter/s)": 0.431724 }, { "acc": 0.62388868, "epoch": 0.006263570486838813, "grad_norm": 14.4375, "learning_rate": 6.263281512135108e-07, "loss": 1.62173958, "memory(GiB)": 117.62, "step": 560, "train_speed(iter/s)": 0.432991 }, { "acc": 0.62458601, "epoch": 0.006487269432797342, "grad_norm": 12.125, "learning_rate": 6.486970137568505e-07, "loss": 1.61973534, "memory(GiB)": 117.62, "step": 580, "train_speed(iter/s)": 0.433956 }, { "acc": 0.62618923, "epoch": 0.006710968378755871, "grad_norm": 14.75, "learning_rate": 6.710658763001903e-07, "loss": 1.61276169, "memory(GiB)": 117.62, "step": 600, "train_speed(iter/s)": 0.43348 }, { "acc": 0.62921262, "epoch": 0.0069346673247144, "grad_norm": 11.9375, "learning_rate": 6.934347388435298e-07, "loss": 1.59251509, "memory(GiB)": 117.62, "step": 620, "train_speed(iter/s)": 0.434227 }, { "acc": 0.63283491, "epoch": 0.007158366270672929, "grad_norm": 11.0625, "learning_rate": 7.158036013868695e-07, "loss": 1.55291252, "memory(GiB)": 117.62, "step": 640, "train_speed(iter/s)": 0.433938 }, { "acc": 0.62195764, "epoch": 0.007382065216631458, "grad_norm": 9.0625, "learning_rate": 7.381724639302092e-07, "loss": 1.61795368, "memory(GiB)": 117.62, "step": 660, "train_speed(iter/s)": 0.434023 }, { "acc": 0.63778925, "epoch": 0.007605764162589987, "grad_norm": 10.25, "learning_rate": 7.60541326473549e-07, "loss": 1.54154186, "memory(GiB)": 117.62, "step": 680, "train_speed(iter/s)": 0.433819 }, { "acc": 0.63311768, "epoch": 0.007829463108548516, "grad_norm": 9.375, "learning_rate": 7.829101890168886e-07, "loss": 1.56080818, "memory(GiB)": 117.62, "step": 700, "train_speed(iter/s)": 0.433748 }, { "acc": 0.63229198, "epoch": 0.008053162054507044, "grad_norm": 12.0625, "learning_rate": 8.052790515602282e-07, "loss": 1.54572201, "memory(GiB)": 117.62, "step": 720, "train_speed(iter/s)": 0.434145 }, { "acc": 0.63715706, "epoch": 0.008276861000465574, "grad_norm": 9.25, "learning_rate": 8.276479141035679e-07, "loss": 1.52899294, "memory(GiB)": 117.62, "step": 740, "train_speed(iter/s)": 0.433695 }, { "acc": 0.630971, "epoch": 0.008500559946424102, "grad_norm": 8.9375, "learning_rate": 8.500167766469075e-07, "loss": 1.57853565, "memory(GiB)": 117.62, "step": 760, "train_speed(iter/s)": 0.434085 }, { "acc": 0.65611, "epoch": 0.008724258892382632, "grad_norm": 11.0625, "learning_rate": 8.723856391902473e-07, "loss": 1.45154285, "memory(GiB)": 117.62, "step": 780, "train_speed(iter/s)": 0.434881 }, { "acc": 0.63270435, "epoch": 0.00894795783834116, "grad_norm": 8.75, "learning_rate": 8.94754501733587e-07, "loss": 1.54720831, "memory(GiB)": 117.62, "step": 800, "train_speed(iter/s)": 0.435051 }, { "acc": 0.64731321, "epoch": 0.00917165678429969, "grad_norm": 10.6875, "learning_rate": 9.171233642769265e-07, "loss": 1.52091694, "memory(GiB)": 117.62, "step": 820, "train_speed(iter/s)": 0.434897 }, { "acc": 0.64316664, "epoch": 0.009395355730258218, "grad_norm": 11.8125, "learning_rate": 9.394922268202662e-07, "loss": 1.52376089, "memory(GiB)": 117.62, "step": 840, "train_speed(iter/s)": 0.434729 }, { "acc": 0.647855, "epoch": 0.009619054676216748, "grad_norm": 7.96875, "learning_rate": 9.61861089363606e-07, "loss": 1.50671206, "memory(GiB)": 117.62, "step": 860, "train_speed(iter/s)": 0.435146 }, { "acc": 0.63743463, "epoch": 0.009842753622175276, "grad_norm": 10.625, "learning_rate": 9.842299519069457e-07, "loss": 1.54877014, "memory(GiB)": 117.62, "step": 880, "train_speed(iter/s)": 0.435699 }, { "acc": 0.65347614, "epoch": 0.010066452568133806, "grad_norm": 9.9375, "learning_rate": 1.0065988144502852e-06, "loss": 1.4734108, "memory(GiB)": 117.62, "step": 900, "train_speed(iter/s)": 0.435871 }, { "acc": 0.64756165, "epoch": 0.010290151514092334, "grad_norm": 8.125, "learning_rate": 1.028967676993625e-06, "loss": 1.50872746, "memory(GiB)": 117.62, "step": 920, "train_speed(iter/s)": 0.436141 }, { "acc": 0.64697986, "epoch": 0.010513850460050864, "grad_norm": 8.625, "learning_rate": 1.0513365395369646e-06, "loss": 1.50849056, "memory(GiB)": 117.62, "step": 940, "train_speed(iter/s)": 0.436795 }, { "acc": 0.66154251, "epoch": 0.010737549406009392, "grad_norm": 9.6875, "learning_rate": 1.0737054020803043e-06, "loss": 1.45164938, "memory(GiB)": 117.62, "step": 960, "train_speed(iter/s)": 0.436982 }, { "acc": 0.65337954, "epoch": 0.010961248351967922, "grad_norm": 8.5625, "learning_rate": 1.0960742646236439e-06, "loss": 1.48134422, "memory(GiB)": 117.62, "step": 980, "train_speed(iter/s)": 0.43626 }, { "acc": 0.65412164, "epoch": 0.01118494729792645, "grad_norm": 7.875, "learning_rate": 1.1184431271669837e-06, "loss": 1.46567478, "memory(GiB)": 117.62, "step": 1000, "train_speed(iter/s)": 0.436391 }, { "acc": 0.63818817, "epoch": 0.01140864624388498, "grad_norm": 7.8125, "learning_rate": 1.1408119897103232e-06, "loss": 1.52636375, "memory(GiB)": 117.62, "step": 1020, "train_speed(iter/s)": 0.43661 }, { "acc": 0.64717693, "epoch": 0.011632345189843508, "grad_norm": 9.3125, "learning_rate": 1.163180852253663e-06, "loss": 1.50802441, "memory(GiB)": 117.62, "step": 1040, "train_speed(iter/s)": 0.436821 }, { "acc": 0.65894604, "epoch": 0.011856044135802038, "grad_norm": 10.5625, "learning_rate": 1.1855497147970028e-06, "loss": 1.44905519, "memory(GiB)": 117.62, "step": 1060, "train_speed(iter/s)": 0.437259 }, { "acc": 0.65834684, "epoch": 0.012079743081760567, "grad_norm": 7.78125, "learning_rate": 1.2079185773403423e-06, "loss": 1.44631176, "memory(GiB)": 117.62, "step": 1080, "train_speed(iter/s)": 0.437563 }, { "acc": 0.65706844, "epoch": 0.012303442027719096, "grad_norm": 9.75, "learning_rate": 1.230287439883682e-06, "loss": 1.46356163, "memory(GiB)": 117.62, "step": 1100, "train_speed(iter/s)": 0.4376 }, { "acc": 0.65510035, "epoch": 0.012527140973677625, "grad_norm": 9.125, "learning_rate": 1.2526563024270217e-06, "loss": 1.45755072, "memory(GiB)": 117.62, "step": 1120, "train_speed(iter/s)": 0.437948 }, { "acc": 0.66718187, "epoch": 0.012750839919636154, "grad_norm": 11.125, "learning_rate": 1.2750251649703612e-06, "loss": 1.41157808, "memory(GiB)": 117.62, "step": 1140, "train_speed(iter/s)": 0.437314 }, { "acc": 0.65866957, "epoch": 0.012974538865594683, "grad_norm": 13.5625, "learning_rate": 1.297394027513701e-06, "loss": 1.45369205, "memory(GiB)": 117.62, "step": 1160, "train_speed(iter/s)": 0.437593 }, { "acc": 0.66622529, "epoch": 0.013198237811553212, "grad_norm": 9.1875, "learning_rate": 1.3197628900570408e-06, "loss": 1.41443348, "memory(GiB)": 117.62, "step": 1180, "train_speed(iter/s)": 0.437673 }, { "acc": 0.66682572, "epoch": 0.013421936757511741, "grad_norm": 8.625, "learning_rate": 1.3421317526003806e-06, "loss": 1.42252331, "memory(GiB)": 117.62, "step": 1200, "train_speed(iter/s)": 0.437812 }, { "acc": 0.6648139, "epoch": 0.01364563570347027, "grad_norm": 8.25, "learning_rate": 1.3645006151437201e-06, "loss": 1.42548504, "memory(GiB)": 117.62, "step": 1220, "train_speed(iter/s)": 0.438007 }, { "acc": 0.66223741, "epoch": 0.0138693346494288, "grad_norm": 7.65625, "learning_rate": 1.3868694776870597e-06, "loss": 1.42827988, "memory(GiB)": 117.62, "step": 1240, "train_speed(iter/s)": 0.438233 }, { "acc": 0.66515379, "epoch": 0.014093033595387328, "grad_norm": 8.0625, "learning_rate": 1.4092383402303995e-06, "loss": 1.40433578, "memory(GiB)": 117.62, "step": 1260, "train_speed(iter/s)": 0.438249 }, { "acc": 0.65045047, "epoch": 0.014316732541345857, "grad_norm": 11.8125, "learning_rate": 1.431607202773739e-06, "loss": 1.47604408, "memory(GiB)": 117.62, "step": 1280, "train_speed(iter/s)": 0.438443 }, { "acc": 0.65648441, "epoch": 0.014540431487304386, "grad_norm": 8.125, "learning_rate": 1.4539760653170786e-06, "loss": 1.44408751, "memory(GiB)": 117.62, "step": 1300, "train_speed(iter/s)": 0.438855 }, { "acc": 0.65582542, "epoch": 0.014764130433262915, "grad_norm": 8.125, "learning_rate": 1.4763449278604184e-06, "loss": 1.44951611, "memory(GiB)": 117.62, "step": 1320, "train_speed(iter/s)": 0.439074 }, { "acc": 0.66775432, "epoch": 0.014987829379221444, "grad_norm": 8.5625, "learning_rate": 1.498713790403758e-06, "loss": 1.3989584, "memory(GiB)": 117.62, "step": 1340, "train_speed(iter/s)": 0.439258 }, { "acc": 0.67278481, "epoch": 0.015211528325179973, "grad_norm": 9.9375, "learning_rate": 1.521082652947098e-06, "loss": 1.36517639, "memory(GiB)": 117.62, "step": 1360, "train_speed(iter/s)": 0.439329 }, { "acc": 0.66518183, "epoch": 0.015435227271138502, "grad_norm": 7.3125, "learning_rate": 1.5434515154904375e-06, "loss": 1.39231644, "memory(GiB)": 117.62, "step": 1380, "train_speed(iter/s)": 0.439479 }, { "acc": 0.66069012, "epoch": 0.01565892621709703, "grad_norm": 8.3125, "learning_rate": 1.5658203780337773e-06, "loss": 1.42770844, "memory(GiB)": 117.62, "step": 1400, "train_speed(iter/s)": 0.439684 }, { "acc": 0.6686748, "epoch": 0.01588262516305556, "grad_norm": 6.3125, "learning_rate": 1.5881892405771168e-06, "loss": 1.38092718, "memory(GiB)": 117.62, "step": 1420, "train_speed(iter/s)": 0.439902 }, { "acc": 0.6749342, "epoch": 0.016106324109014088, "grad_norm": 8.3125, "learning_rate": 1.6105581031204564e-06, "loss": 1.37825413, "memory(GiB)": 117.62, "step": 1440, "train_speed(iter/s)": 0.439669 }, { "acc": 0.6698019, "epoch": 0.01633002305497262, "grad_norm": 8.25, "learning_rate": 1.6329269656637962e-06, "loss": 1.39120235, "memory(GiB)": 117.62, "step": 1460, "train_speed(iter/s)": 0.439776 }, { "acc": 0.67976723, "epoch": 0.016553722000931147, "grad_norm": 7.40625, "learning_rate": 1.6552958282071357e-06, "loss": 1.31465454, "memory(GiB)": 117.62, "step": 1480, "train_speed(iter/s)": 0.440002 }, { "acc": 0.65486193, "epoch": 0.016777420946889676, "grad_norm": 8.4375, "learning_rate": 1.6776646907504753e-06, "loss": 1.43418674, "memory(GiB)": 117.62, "step": 1500, "train_speed(iter/s)": 0.440199 }, { "acc": 0.67428155, "epoch": 0.017001119892848204, "grad_norm": 7.40625, "learning_rate": 1.700033553293815e-06, "loss": 1.36995335, "memory(GiB)": 117.62, "step": 1520, "train_speed(iter/s)": 0.440113 }, { "acc": 0.66365395, "epoch": 0.017224818838806735, "grad_norm": 7.3125, "learning_rate": 1.722402415837155e-06, "loss": 1.39805012, "memory(GiB)": 117.62, "step": 1540, "train_speed(iter/s)": 0.440404 }, { "acc": 0.67380295, "epoch": 0.017448517784765263, "grad_norm": 7.8125, "learning_rate": 1.7447712783804946e-06, "loss": 1.37404919, "memory(GiB)": 117.62, "step": 1560, "train_speed(iter/s)": 0.440504 }, { "acc": 0.66488619, "epoch": 0.01767221673072379, "grad_norm": 8.0625, "learning_rate": 1.7671401409238342e-06, "loss": 1.40051918, "memory(GiB)": 117.62, "step": 1580, "train_speed(iter/s)": 0.440725 }, { "acc": 0.65940495, "epoch": 0.01789591567668232, "grad_norm": 9.125, "learning_rate": 1.789509003467174e-06, "loss": 1.44215679, "memory(GiB)": 117.62, "step": 1600, "train_speed(iter/s)": 0.440702 }, { "acc": 0.67575378, "epoch": 0.01811961462264085, "grad_norm": 6.96875, "learning_rate": 1.8118778660105135e-06, "loss": 1.35349331, "memory(GiB)": 117.62, "step": 1620, "train_speed(iter/s)": 0.440618 }, { "acc": 0.66316595, "epoch": 0.01834331356859938, "grad_norm": 8.3125, "learning_rate": 1.834246728553853e-06, "loss": 1.40664482, "memory(GiB)": 117.62, "step": 1640, "train_speed(iter/s)": 0.440401 }, { "acc": 0.66099038, "epoch": 0.018567012514557907, "grad_norm": 7.625, "learning_rate": 1.8566155910971929e-06, "loss": 1.42417212, "memory(GiB)": 117.62, "step": 1660, "train_speed(iter/s)": 0.440507 }, { "acc": 0.68456678, "epoch": 0.018790711460516436, "grad_norm": 8.4375, "learning_rate": 1.8789844536405324e-06, "loss": 1.31471424, "memory(GiB)": 117.62, "step": 1680, "train_speed(iter/s)": 0.440804 }, { "acc": 0.66889639, "epoch": 0.019014410406474967, "grad_norm": 8.1875, "learning_rate": 1.9013533161838722e-06, "loss": 1.36207504, "memory(GiB)": 117.62, "step": 1700, "train_speed(iter/s)": 0.440267 }, { "acc": 0.66627913, "epoch": 0.019238109352433495, "grad_norm": 10.375, "learning_rate": 1.923722178727212e-06, "loss": 1.39730244, "memory(GiB)": 117.62, "step": 1720, "train_speed(iter/s)": 0.440086 }, { "acc": 0.68036075, "epoch": 0.019461808298392023, "grad_norm": 7.46875, "learning_rate": 1.9460910412705515e-06, "loss": 1.34147625, "memory(GiB)": 117.62, "step": 1740, "train_speed(iter/s)": 0.440302 }, { "acc": 0.68648028, "epoch": 0.01968550724435055, "grad_norm": 7.71875, "learning_rate": 1.9684599038138913e-06, "loss": 1.2967001, "memory(GiB)": 117.62, "step": 1760, "train_speed(iter/s)": 0.440328 }, { "acc": 0.67244539, "epoch": 0.019909206190309083, "grad_norm": 9.4375, "learning_rate": 1.990828766357231e-06, "loss": 1.35570087, "memory(GiB)": 117.62, "step": 1780, "train_speed(iter/s)": 0.440498 }, { "acc": 0.67700491, "epoch": 0.02013290513626761, "grad_norm": 5.5625, "learning_rate": 2.0131976289005704e-06, "loss": 1.33803444, "memory(GiB)": 117.62, "step": 1800, "train_speed(iter/s)": 0.440695 }, { "acc": 0.68336315, "epoch": 0.02035660408222614, "grad_norm": 13.3125, "learning_rate": 2.03556649144391e-06, "loss": 1.31315908, "memory(GiB)": 117.62, "step": 1820, "train_speed(iter/s)": 0.440716 }, { "acc": 0.68210378, "epoch": 0.020580303028184668, "grad_norm": 7.65625, "learning_rate": 2.05793535398725e-06, "loss": 1.32528343, "memory(GiB)": 117.62, "step": 1840, "train_speed(iter/s)": 0.44069 }, { "acc": 0.67458482, "epoch": 0.0208040019741432, "grad_norm": 8.375, "learning_rate": 2.0803042165305893e-06, "loss": 1.35738935, "memory(GiB)": 117.62, "step": 1860, "train_speed(iter/s)": 0.440848 }, { "acc": 0.67145729, "epoch": 0.021027700920101727, "grad_norm": 8.125, "learning_rate": 2.102673079073929e-06, "loss": 1.36822014, "memory(GiB)": 117.62, "step": 1880, "train_speed(iter/s)": 0.440675 }, { "acc": 0.67249699, "epoch": 0.021251399866060255, "grad_norm": 9.8125, "learning_rate": 2.125041941617269e-06, "loss": 1.36932373, "memory(GiB)": 117.62, "step": 1900, "train_speed(iter/s)": 0.440863 }, { "acc": 0.67407575, "epoch": 0.021475098812018784, "grad_norm": 7.75, "learning_rate": 2.1474108041606087e-06, "loss": 1.3441061, "memory(GiB)": 117.62, "step": 1920, "train_speed(iter/s)": 0.440882 }, { "acc": 0.68230267, "epoch": 0.021698797757977315, "grad_norm": 6.46875, "learning_rate": 2.1697796667039484e-06, "loss": 1.32266197, "memory(GiB)": 117.62, "step": 1940, "train_speed(iter/s)": 0.440745 }, { "acc": 0.66909533, "epoch": 0.021922496703935843, "grad_norm": 9.5, "learning_rate": 2.1921485292472878e-06, "loss": 1.38041115, "memory(GiB)": 117.62, "step": 1960, "train_speed(iter/s)": 0.440869 }, { "acc": 0.67759514, "epoch": 0.02214619564989437, "grad_norm": 7.125, "learning_rate": 2.2145173917906276e-06, "loss": 1.34123259, "memory(GiB)": 117.62, "step": 1980, "train_speed(iter/s)": 0.440932 }, { "acc": 0.67666321, "epoch": 0.0223698945958529, "grad_norm": 8.3125, "learning_rate": 2.2368862543339673e-06, "loss": 1.3396966, "memory(GiB)": 117.62, "step": 2000, "train_speed(iter/s)": 0.441053 }, { "epoch": 0.0223698945958529, "eval_acc": 0.6361438046527366, "eval_loss": 1.3316701650619507, "eval_runtime": 2450.8927, "eval_samples_per_second": 30.717, "eval_steps_per_second": 15.358, "step": 2000 }, { "acc": 0.67806082, "epoch": 0.02259359354181143, "grad_norm": 8.375, "learning_rate": 2.259255116877307e-06, "loss": 1.34437637, "memory(GiB)": 129.59, "step": 2020, "train_speed(iter/s)": 0.285254 }, { "acc": 0.67953653, "epoch": 0.02281729248776996, "grad_norm": 7.0625, "learning_rate": 2.2816239794206465e-06, "loss": 1.33933029, "memory(GiB)": 122.96, "step": 2040, "train_speed(iter/s)": 0.286267 }, { "acc": 0.69605422, "epoch": 0.023040991433728487, "grad_norm": 9.3125, "learning_rate": 2.3039928419639862e-06, "loss": 1.2860816, "memory(GiB)": 122.96, "step": 2060, "train_speed(iter/s)": 0.287251 }, { "acc": 0.69226422, "epoch": 0.023264690379687016, "grad_norm": 6.78125, "learning_rate": 2.326361704507326e-06, "loss": 1.27455425, "memory(GiB)": 122.96, "step": 2080, "train_speed(iter/s)": 0.288238 }, { "acc": 0.67887859, "epoch": 0.023488389325645547, "grad_norm": 8.1875, "learning_rate": 2.3487305670506658e-06, "loss": 1.32311249, "memory(GiB)": 122.96, "step": 2100, "train_speed(iter/s)": 0.289151 }, { "acc": 0.68429108, "epoch": 0.023712088271604075, "grad_norm": 7.28125, "learning_rate": 2.3710994295940056e-06, "loss": 1.30750351, "memory(GiB)": 122.96, "step": 2120, "train_speed(iter/s)": 0.290112 }, { "acc": 0.69901991, "epoch": 0.023935787217562603, "grad_norm": 8.5625, "learning_rate": 2.393468292137345e-06, "loss": 1.25955629, "memory(GiB)": 122.96, "step": 2140, "train_speed(iter/s)": 0.291001 }, { "acc": 0.68692999, "epoch": 0.024159486163521135, "grad_norm": 7.53125, "learning_rate": 2.4158371546806847e-06, "loss": 1.30947609, "memory(GiB)": 122.96, "step": 2160, "train_speed(iter/s)": 0.291826 }, { "acc": 0.68384209, "epoch": 0.024383185109479663, "grad_norm": 7.21875, "learning_rate": 2.4382060172240245e-06, "loss": 1.31815376, "memory(GiB)": 122.96, "step": 2180, "train_speed(iter/s)": 0.292689 }, { "acc": 0.68035574, "epoch": 0.02460688405543819, "grad_norm": 7.09375, "learning_rate": 2.460574879767364e-06, "loss": 1.31523647, "memory(GiB)": 122.96, "step": 2200, "train_speed(iter/s)": 0.293722 }, { "acc": 0.68033056, "epoch": 0.02483058300139672, "grad_norm": 8.375, "learning_rate": 2.4829437423107036e-06, "loss": 1.33527203, "memory(GiB)": 122.96, "step": 2220, "train_speed(iter/s)": 0.294627 }, { "acc": 0.684198, "epoch": 0.02505428194735525, "grad_norm": 7.71875, "learning_rate": 2.5053126048540434e-06, "loss": 1.32213411, "memory(GiB)": 122.96, "step": 2240, "train_speed(iter/s)": 0.295618 }, { "acc": 0.70079365, "epoch": 0.02527798089331378, "grad_norm": 8.6875, "learning_rate": 2.527681467397383e-06, "loss": 1.23095894, "memory(GiB)": 122.96, "step": 2260, "train_speed(iter/s)": 0.296581 }, { "acc": 0.68951759, "epoch": 0.025501679839272307, "grad_norm": 8.4375, "learning_rate": 2.5500503299407225e-06, "loss": 1.29883347, "memory(GiB)": 122.96, "step": 2280, "train_speed(iter/s)": 0.297487 }, { "acc": 0.68598266, "epoch": 0.025725378785230835, "grad_norm": 8.4375, "learning_rate": 2.5724191924840623e-06, "loss": 1.29278316, "memory(GiB)": 122.96, "step": 2300, "train_speed(iter/s)": 0.298302 }, { "acc": 0.68801665, "epoch": 0.025949077731189367, "grad_norm": 8.5625, "learning_rate": 2.594788055027402e-06, "loss": 1.31824036, "memory(GiB)": 122.96, "step": 2320, "train_speed(iter/s)": 0.299241 }, { "acc": 0.69589891, "epoch": 0.026172776677147895, "grad_norm": 6.59375, "learning_rate": 2.617156917570742e-06, "loss": 1.25048132, "memory(GiB)": 122.96, "step": 2340, "train_speed(iter/s)": 0.300158 }, { "acc": 0.67066317, "epoch": 0.026396475623106423, "grad_norm": 6.5625, "learning_rate": 2.6395257801140816e-06, "loss": 1.38632107, "memory(GiB)": 122.96, "step": 2360, "train_speed(iter/s)": 0.300898 }, { "acc": 0.69249887, "epoch": 0.02662017456906495, "grad_norm": 8.4375, "learning_rate": 2.661894642657421e-06, "loss": 1.27349033, "memory(GiB)": 122.96, "step": 2380, "train_speed(iter/s)": 0.301758 }, { "acc": 0.68169699, "epoch": 0.026843873515023483, "grad_norm": 6.28125, "learning_rate": 2.684263505200761e-06, "loss": 1.31538391, "memory(GiB)": 122.96, "step": 2400, "train_speed(iter/s)": 0.302645 }, { "acc": 0.68120146, "epoch": 0.02706757246098201, "grad_norm": 10.625, "learning_rate": 2.7066323677441005e-06, "loss": 1.33719597, "memory(GiB)": 122.96, "step": 2420, "train_speed(iter/s)": 0.303478 }, { "acc": 0.68212957, "epoch": 0.02729127140694054, "grad_norm": 7.59375, "learning_rate": 2.7290012302874403e-06, "loss": 1.32233181, "memory(GiB)": 122.96, "step": 2440, "train_speed(iter/s)": 0.304326 }, { "acc": 0.67873197, "epoch": 0.027514970352899067, "grad_norm": 5.46875, "learning_rate": 2.7513700928307796e-06, "loss": 1.348242, "memory(GiB)": 122.96, "step": 2460, "train_speed(iter/s)": 0.305046 }, { "acc": 0.68609877, "epoch": 0.0277386692988576, "grad_norm": 7.71875, "learning_rate": 2.7737389553741194e-06, "loss": 1.29516029, "memory(GiB)": 122.96, "step": 2480, "train_speed(iter/s)": 0.305733 }, { "acc": 0.68035035, "epoch": 0.027962368244816127, "grad_norm": 14.4375, "learning_rate": 2.7961078179174587e-06, "loss": 1.30853586, "memory(GiB)": 122.96, "step": 2500, "train_speed(iter/s)": 0.306517 }, { "acc": 0.69968171, "epoch": 0.028186067190774655, "grad_norm": 7.0, "learning_rate": 2.818476680460799e-06, "loss": 1.21350842, "memory(GiB)": 122.96, "step": 2520, "train_speed(iter/s)": 0.307264 }, { "acc": 0.6851182, "epoch": 0.028409766136733183, "grad_norm": 7.125, "learning_rate": 2.8408455430041387e-06, "loss": 1.314816, "memory(GiB)": 122.96, "step": 2540, "train_speed(iter/s)": 0.307994 }, { "acc": 0.69167528, "epoch": 0.028633465082691715, "grad_norm": 7.625, "learning_rate": 2.863214405547478e-06, "loss": 1.28667297, "memory(GiB)": 122.96, "step": 2560, "train_speed(iter/s)": 0.308706 }, { "acc": 0.68522787, "epoch": 0.028857164028650243, "grad_norm": 7.21875, "learning_rate": 2.885583268090818e-06, "loss": 1.30050545, "memory(GiB)": 122.96, "step": 2580, "train_speed(iter/s)": 0.309493 }, { "acc": 0.69020834, "epoch": 0.02908086297460877, "grad_norm": 7.71875, "learning_rate": 2.907952130634157e-06, "loss": 1.28359261, "memory(GiB)": 122.96, "step": 2600, "train_speed(iter/s)": 0.310289 }, { "acc": 0.68383312, "epoch": 0.0293045619205673, "grad_norm": 6.375, "learning_rate": 2.9303209931774974e-06, "loss": 1.30797844, "memory(GiB)": 122.96, "step": 2620, "train_speed(iter/s)": 0.311001 }, { "acc": 0.69512386, "epoch": 0.02952826086652583, "grad_norm": 7.8125, "learning_rate": 2.9526898557208367e-06, "loss": 1.26219387, "memory(GiB)": 122.96, "step": 2640, "train_speed(iter/s)": 0.31168 }, { "acc": 0.6981988, "epoch": 0.02975195981248436, "grad_norm": 10.8125, "learning_rate": 2.9750587182641765e-06, "loss": 1.26064796, "memory(GiB)": 122.96, "step": 2660, "train_speed(iter/s)": 0.312345 }, { "acc": 0.68848524, "epoch": 0.029975658758442887, "grad_norm": 7.40625, "learning_rate": 2.997427580807516e-06, "loss": 1.3048914, "memory(GiB)": 122.96, "step": 2680, "train_speed(iter/s)": 0.313074 }, { "acc": 0.70083714, "epoch": 0.030199357704401415, "grad_norm": 7.34375, "learning_rate": 3.019796443350856e-06, "loss": 1.24813862, "memory(GiB)": 122.96, "step": 2700, "train_speed(iter/s)": 0.31372 }, { "acc": 0.69680061, "epoch": 0.030423056650359947, "grad_norm": 14.4375, "learning_rate": 3.042165305894196e-06, "loss": 1.26735382, "memory(GiB)": 122.96, "step": 2720, "train_speed(iter/s)": 0.314369 }, { "acc": 0.69484329, "epoch": 0.030646755596318475, "grad_norm": 8.375, "learning_rate": 3.064534168437535e-06, "loss": 1.26485119, "memory(GiB)": 122.96, "step": 2740, "train_speed(iter/s)": 0.315092 }, { "acc": 0.69722424, "epoch": 0.030870454542277003, "grad_norm": 8.5625, "learning_rate": 3.086903030980875e-06, "loss": 1.24916506, "memory(GiB)": 122.96, "step": 2760, "train_speed(iter/s)": 0.315715 }, { "acc": 0.68573112, "epoch": 0.03109415348823553, "grad_norm": 7.09375, "learning_rate": 3.1092718935242143e-06, "loss": 1.3050993, "memory(GiB)": 122.96, "step": 2780, "train_speed(iter/s)": 0.316414 }, { "acc": 0.68715401, "epoch": 0.03131785243419406, "grad_norm": 10.0625, "learning_rate": 3.1316407560675545e-06, "loss": 1.30362148, "memory(GiB)": 122.96, "step": 2800, "train_speed(iter/s)": 0.317094 }, { "acc": 0.68556404, "epoch": 0.03154155138015259, "grad_norm": 5.5625, "learning_rate": 3.154009618610894e-06, "loss": 1.29131784, "memory(GiB)": 122.96, "step": 2820, "train_speed(iter/s)": 0.317766 }, { "acc": 0.68590207, "epoch": 0.03176525032611112, "grad_norm": 8.0625, "learning_rate": 3.1763784811542336e-06, "loss": 1.31351461, "memory(GiB)": 122.96, "step": 2840, "train_speed(iter/s)": 0.318412 }, { "acc": 0.69050026, "epoch": 0.03198894927206965, "grad_norm": 7.03125, "learning_rate": 3.198747343697573e-06, "loss": 1.27213497, "memory(GiB)": 122.96, "step": 2860, "train_speed(iter/s)": 0.319072 }, { "acc": 0.69015007, "epoch": 0.032212648218028175, "grad_norm": 7.25, "learning_rate": 3.2211162062409128e-06, "loss": 1.28307972, "memory(GiB)": 122.96, "step": 2880, "train_speed(iter/s)": 0.319715 }, { "acc": 0.68392997, "epoch": 0.032436347163986703, "grad_norm": 6.46875, "learning_rate": 3.243485068784253e-06, "loss": 1.30688152, "memory(GiB)": 122.96, "step": 2900, "train_speed(iter/s)": 0.320303 }, { "acc": 0.69477625, "epoch": 0.03266004610994524, "grad_norm": 10.8125, "learning_rate": 3.2658539313275923e-06, "loss": 1.25764999, "memory(GiB)": 122.96, "step": 2920, "train_speed(iter/s)": 0.320982 }, { "acc": 0.67980356, "epoch": 0.03288374505590377, "grad_norm": 7.21875, "learning_rate": 3.288222793870932e-06, "loss": 1.32958632, "memory(GiB)": 122.96, "step": 2940, "train_speed(iter/s)": 0.321633 }, { "acc": 0.68618908, "epoch": 0.033107444001862295, "grad_norm": 8.75, "learning_rate": 3.3105916564142715e-06, "loss": 1.29489031, "memory(GiB)": 122.96, "step": 2960, "train_speed(iter/s)": 0.322288 }, { "acc": 0.69523234, "epoch": 0.03333114294782082, "grad_norm": 7.15625, "learning_rate": 3.3329605189576112e-06, "loss": 1.27237606, "memory(GiB)": 122.96, "step": 2980, "train_speed(iter/s)": 0.322991 }, { "acc": 0.69146729, "epoch": 0.03355484189377935, "grad_norm": 7.59375, "learning_rate": 3.3553293815009506e-06, "loss": 1.27050953, "memory(GiB)": 122.96, "step": 3000, "train_speed(iter/s)": 0.323659 }, { "acc": 0.69193916, "epoch": 0.03377854083973788, "grad_norm": 6.8125, "learning_rate": 3.3776982440442908e-06, "loss": 1.28379269, "memory(GiB)": 122.96, "step": 3020, "train_speed(iter/s)": 0.324202 }, { "acc": 0.69968233, "epoch": 0.03400223978569641, "grad_norm": 10.75, "learning_rate": 3.40006710658763e-06, "loss": 1.24143791, "memory(GiB)": 122.96, "step": 3040, "train_speed(iter/s)": 0.324818 }, { "acc": 0.69856262, "epoch": 0.034225938731654935, "grad_norm": 7.0, "learning_rate": 3.42243596913097e-06, "loss": 1.25594292, "memory(GiB)": 122.96, "step": 3060, "train_speed(iter/s)": 0.325361 }, { "acc": 0.69569182, "epoch": 0.03444963767761347, "grad_norm": 7.21875, "learning_rate": 3.44480483167431e-06, "loss": 1.24745741, "memory(GiB)": 122.96, "step": 3080, "train_speed(iter/s)": 0.325953 }, { "acc": 0.68625526, "epoch": 0.034673336623572, "grad_norm": 7.8125, "learning_rate": 3.4671736942176495e-06, "loss": 1.30939159, "memory(GiB)": 122.96, "step": 3100, "train_speed(iter/s)": 0.326571 }, { "acc": 0.68599644, "epoch": 0.03489703556953053, "grad_norm": 6.5625, "learning_rate": 3.4895425567609892e-06, "loss": 1.31146297, "memory(GiB)": 122.96, "step": 3120, "train_speed(iter/s)": 0.32718 }, { "acc": 0.70258794, "epoch": 0.035120734515489055, "grad_norm": 15.5, "learning_rate": 3.5119114193043286e-06, "loss": 1.23724995, "memory(GiB)": 122.96, "step": 3140, "train_speed(iter/s)": 0.327774 }, { "acc": 0.70028515, "epoch": 0.03534443346144758, "grad_norm": 6.625, "learning_rate": 3.5342802818476684e-06, "loss": 1.22909374, "memory(GiB)": 122.96, "step": 3160, "train_speed(iter/s)": 0.328392 }, { "acc": 0.68161101, "epoch": 0.03556813240740611, "grad_norm": 6.1875, "learning_rate": 3.5566491443910077e-06, "loss": 1.31649561, "memory(GiB)": 122.96, "step": 3180, "train_speed(iter/s)": 0.32896 }, { "acc": 0.69252539, "epoch": 0.03579183135336464, "grad_norm": 8.6875, "learning_rate": 3.579018006934348e-06, "loss": 1.27021894, "memory(GiB)": 122.96, "step": 3200, "train_speed(iter/s)": 0.329556 }, { "acc": 0.70157781, "epoch": 0.036015530299323174, "grad_norm": 9.8125, "learning_rate": 3.6013868694776873e-06, "loss": 1.23860283, "memory(GiB)": 122.96, "step": 3220, "train_speed(iter/s)": 0.330004 }, { "acc": 0.7053298, "epoch": 0.0362392292452817, "grad_norm": 9.875, "learning_rate": 3.623755732021027e-06, "loss": 1.20884666, "memory(GiB)": 122.96, "step": 3240, "train_speed(iter/s)": 0.330582 }, { "acc": 0.69035926, "epoch": 0.03646292819124023, "grad_norm": 7.90625, "learning_rate": 3.646124594564367e-06, "loss": 1.27745438, "memory(GiB)": 122.96, "step": 3260, "train_speed(iter/s)": 0.331144 }, { "acc": 0.70003691, "epoch": 0.03668662713719876, "grad_norm": 6.0625, "learning_rate": 3.668493457107706e-06, "loss": 1.23731956, "memory(GiB)": 122.96, "step": 3280, "train_speed(iter/s)": 0.331626 }, { "acc": 0.70545235, "epoch": 0.03691032608315729, "grad_norm": 6.21875, "learning_rate": 3.6908623196510464e-06, "loss": 1.20646734, "memory(GiB)": 122.96, "step": 3300, "train_speed(iter/s)": 0.332183 }, { "acc": 0.68275557, "epoch": 0.037134025029115815, "grad_norm": 7.125, "learning_rate": 3.7132311821943857e-06, "loss": 1.30821152, "memory(GiB)": 122.96, "step": 3320, "train_speed(iter/s)": 0.332692 }, { "acc": 0.69393644, "epoch": 0.03735772397507434, "grad_norm": 8.25, "learning_rate": 3.7356000447377255e-06, "loss": 1.25308952, "memory(GiB)": 122.96, "step": 3340, "train_speed(iter/s)": 0.333243 }, { "acc": 0.69886985, "epoch": 0.03758142292103287, "grad_norm": 6.5, "learning_rate": 3.757968907281065e-06, "loss": 1.23730488, "memory(GiB)": 122.96, "step": 3360, "train_speed(iter/s)": 0.333784 }, { "acc": 0.69545145, "epoch": 0.037805121866991406, "grad_norm": 8.0, "learning_rate": 3.7803377698244046e-06, "loss": 1.26314678, "memory(GiB)": 122.96, "step": 3380, "train_speed(iter/s)": 0.33429 }, { "acc": 0.70097685, "epoch": 0.038028820812949934, "grad_norm": 7.9375, "learning_rate": 3.8027066323677444e-06, "loss": 1.21980438, "memory(GiB)": 122.96, "step": 3400, "train_speed(iter/s)": 0.334822 }, { "acc": 0.69828978, "epoch": 0.03825251975890846, "grad_norm": 5.75, "learning_rate": 3.825075494911084e-06, "loss": 1.23926668, "memory(GiB)": 122.96, "step": 3420, "train_speed(iter/s)": 0.335347 }, { "acc": 0.68717194, "epoch": 0.03847621870486699, "grad_norm": 8.0625, "learning_rate": 3.847444357454424e-06, "loss": 1.30212011, "memory(GiB)": 122.96, "step": 3440, "train_speed(iter/s)": 0.335736 }, { "acc": 0.69220037, "epoch": 0.03869991765082552, "grad_norm": 7.375, "learning_rate": 3.869813219997764e-06, "loss": 1.27510223, "memory(GiB)": 122.96, "step": 3460, "train_speed(iter/s)": 0.336283 }, { "acc": 0.70324659, "epoch": 0.03892361659678405, "grad_norm": 7.3125, "learning_rate": 3.892182082541103e-06, "loss": 1.23261719, "memory(GiB)": 122.96, "step": 3480, "train_speed(iter/s)": 0.336812 }, { "acc": 0.69968443, "epoch": 0.039147315542742575, "grad_norm": 5.125, "learning_rate": 3.914550945084442e-06, "loss": 1.22574787, "memory(GiB)": 122.96, "step": 3500, "train_speed(iter/s)": 0.337298 }, { "acc": 0.70147686, "epoch": 0.0393710144887011, "grad_norm": 6.4375, "learning_rate": 3.936919807627783e-06, "loss": 1.23704071, "memory(GiB)": 122.96, "step": 3520, "train_speed(iter/s)": 0.337773 }, { "acc": 0.71094656, "epoch": 0.03959471343465964, "grad_norm": 7.3125, "learning_rate": 3.959288670171122e-06, "loss": 1.18971481, "memory(GiB)": 122.96, "step": 3540, "train_speed(iter/s)": 0.338174 }, { "acc": 0.68740625, "epoch": 0.039818412380618166, "grad_norm": 6.75, "learning_rate": 3.981657532714462e-06, "loss": 1.28546543, "memory(GiB)": 122.96, "step": 3560, "train_speed(iter/s)": 0.338705 }, { "acc": 0.70357609, "epoch": 0.040042111326576695, "grad_norm": 6.65625, "learning_rate": 4.0040263952578015e-06, "loss": 1.20702896, "memory(GiB)": 122.96, "step": 3580, "train_speed(iter/s)": 0.339219 }, { "acc": 0.7003489, "epoch": 0.04026581027253522, "grad_norm": 7.09375, "learning_rate": 4.026395257801141e-06, "loss": 1.23482552, "memory(GiB)": 122.96, "step": 3600, "train_speed(iter/s)": 0.339699 }, { "acc": 0.68519983, "epoch": 0.04048950921849375, "grad_norm": 7.65625, "learning_rate": 4.048764120344481e-06, "loss": 1.3021596, "memory(GiB)": 122.96, "step": 3620, "train_speed(iter/s)": 0.340116 }, { "acc": 0.69530478, "epoch": 0.04071320816445228, "grad_norm": 7.40625, "learning_rate": 4.07113298288782e-06, "loss": 1.27201824, "memory(GiB)": 122.96, "step": 3640, "train_speed(iter/s)": 0.340556 }, { "acc": 0.69185171, "epoch": 0.04093690711041081, "grad_norm": 7.875, "learning_rate": 4.093501845431161e-06, "loss": 1.26926661, "memory(GiB)": 122.96, "step": 3660, "train_speed(iter/s)": 0.340977 }, { "acc": 0.70344629, "epoch": 0.041160606056369335, "grad_norm": 6.875, "learning_rate": 4.1158707079745e-06, "loss": 1.22163429, "memory(GiB)": 133.94, "step": 3680, "train_speed(iter/s)": 0.341372 }, { "acc": 0.69899244, "epoch": 0.04138430500232787, "grad_norm": 9.3125, "learning_rate": 4.138239570517839e-06, "loss": 1.23351192, "memory(GiB)": 133.94, "step": 3700, "train_speed(iter/s)": 0.341823 }, { "acc": 0.69377975, "epoch": 0.0416080039482864, "grad_norm": 6.09375, "learning_rate": 4.160608433061179e-06, "loss": 1.25965052, "memory(GiB)": 133.94, "step": 3720, "train_speed(iter/s)": 0.342267 }, { "acc": 0.70693645, "epoch": 0.041831702894244926, "grad_norm": 7.8125, "learning_rate": 4.182977295604519e-06, "loss": 1.20307341, "memory(GiB)": 133.94, "step": 3740, "train_speed(iter/s)": 0.342596 }, { "acc": 0.69127769, "epoch": 0.042055401840203455, "grad_norm": 8.25, "learning_rate": 4.205346158147858e-06, "loss": 1.27115593, "memory(GiB)": 133.94, "step": 3760, "train_speed(iter/s)": 0.343016 }, { "acc": 0.68842306, "epoch": 0.04227910078616198, "grad_norm": 6.1875, "learning_rate": 4.227715020691198e-06, "loss": 1.2952919, "memory(GiB)": 133.94, "step": 3780, "train_speed(iter/s)": 0.34346 }, { "acc": 0.70342836, "epoch": 0.04250279973212051, "grad_norm": 8.25, "learning_rate": 4.250083883234538e-06, "loss": 1.21378288, "memory(GiB)": 133.94, "step": 3800, "train_speed(iter/s)": 0.343907 }, { "acc": 0.68314009, "epoch": 0.04272649867807904, "grad_norm": 6.15625, "learning_rate": 4.272452745777877e-06, "loss": 1.29691277, "memory(GiB)": 133.94, "step": 3820, "train_speed(iter/s)": 0.344317 }, { "acc": 0.69909706, "epoch": 0.04295019762403757, "grad_norm": 9.8125, "learning_rate": 4.294821608321217e-06, "loss": 1.23779316, "memory(GiB)": 133.94, "step": 3840, "train_speed(iter/s)": 0.344752 }, { "acc": 0.70969334, "epoch": 0.0431738965699961, "grad_norm": 7.5625, "learning_rate": 4.317190470864557e-06, "loss": 1.20853262, "memory(GiB)": 133.94, "step": 3860, "train_speed(iter/s)": 0.345221 }, { "acc": 0.69392776, "epoch": 0.04339759551595463, "grad_norm": 7.6875, "learning_rate": 4.339559333407897e-06, "loss": 1.26406288, "memory(GiB)": 133.94, "step": 3880, "train_speed(iter/s)": 0.345596 }, { "acc": 0.69592214, "epoch": 0.04362129446191316, "grad_norm": 8.5625, "learning_rate": 4.361928195951236e-06, "loss": 1.24691944, "memory(GiB)": 133.94, "step": 3900, "train_speed(iter/s)": 0.346017 }, { "acc": 0.69118729, "epoch": 0.04384499340787169, "grad_norm": 7.5625, "learning_rate": 4.3842970584945756e-06, "loss": 1.26892252, "memory(GiB)": 133.94, "step": 3920, "train_speed(iter/s)": 0.346439 }, { "acc": 0.69272909, "epoch": 0.044068692353830215, "grad_norm": 5.34375, "learning_rate": 4.406665921037916e-06, "loss": 1.26197815, "memory(GiB)": 133.94, "step": 3940, "train_speed(iter/s)": 0.346902 }, { "acc": 0.70219755, "epoch": 0.04429239129978874, "grad_norm": 8.0625, "learning_rate": 4.429034783581255e-06, "loss": 1.22058382, "memory(GiB)": 133.94, "step": 3960, "train_speed(iter/s)": 0.3473 }, { "acc": 0.7008009, "epoch": 0.04451609024574727, "grad_norm": 6.5625, "learning_rate": 4.451403646124595e-06, "loss": 1.23451271, "memory(GiB)": 133.94, "step": 3980, "train_speed(iter/s)": 0.347605 }, { "acc": 0.69164658, "epoch": 0.0447397891917058, "grad_norm": 7.5625, "learning_rate": 4.473772508667935e-06, "loss": 1.28374214, "memory(GiB)": 133.94, "step": 4000, "train_speed(iter/s)": 0.347969 }, { "epoch": 0.0447397891917058, "eval_acc": 0.6545729154712228, "eval_loss": 1.2316155433654785, "eval_runtime": 2227.6419, "eval_samples_per_second": 33.795, "eval_steps_per_second": 16.898, "step": 4000 }, { "acc": 0.71144052, "epoch": 0.044963488137664334, "grad_norm": 7.40625, "learning_rate": 4.496141371211275e-06, "loss": 1.18051014, "memory(GiB)": 133.94, "step": 4020, "train_speed(iter/s)": 0.290931 }, { "acc": 0.70320768, "epoch": 0.04518718708362286, "grad_norm": 9.1875, "learning_rate": 4.518510233754614e-06, "loss": 1.21955719, "memory(GiB)": 133.94, "step": 4040, "train_speed(iter/s)": 0.291382 }, { "acc": 0.71792331, "epoch": 0.04541088602958139, "grad_norm": 7.4375, "learning_rate": 4.5408790962979536e-06, "loss": 1.15604763, "memory(GiB)": 133.94, "step": 4060, "train_speed(iter/s)": 0.29185 }, { "acc": 0.70167837, "epoch": 0.04563458497553992, "grad_norm": 6.78125, "learning_rate": 4.563247958841293e-06, "loss": 1.21637735, "memory(GiB)": 133.94, "step": 4080, "train_speed(iter/s)": 0.292326 }, { "acc": 0.71449327, "epoch": 0.04585828392149845, "grad_norm": 7.375, "learning_rate": 4.585616821384633e-06, "loss": 1.17052717, "memory(GiB)": 133.94, "step": 4100, "train_speed(iter/s)": 0.292799 }, { "acc": 0.71154375, "epoch": 0.046081982867456975, "grad_norm": 6.5, "learning_rate": 4.6079856839279725e-06, "loss": 1.17908382, "memory(GiB)": 133.94, "step": 4120, "train_speed(iter/s)": 0.293287 }, { "acc": 0.70784016, "epoch": 0.0463056818134155, "grad_norm": 7.46875, "learning_rate": 4.630354546471313e-06, "loss": 1.19791832, "memory(GiB)": 133.94, "step": 4140, "train_speed(iter/s)": 0.293754 }, { "acc": 0.7047698, "epoch": 0.04652938075937403, "grad_norm": 8.6875, "learning_rate": 4.652723409014652e-06, "loss": 1.19707241, "memory(GiB)": 133.94, "step": 4160, "train_speed(iter/s)": 0.294256 }, { "acc": 0.70071936, "epoch": 0.046753079705332566, "grad_norm": 7.53125, "learning_rate": 4.675092271557991e-06, "loss": 1.23339539, "memory(GiB)": 133.94, "step": 4180, "train_speed(iter/s)": 0.294711 }, { "acc": 0.69165998, "epoch": 0.046976778651291094, "grad_norm": 6.40625, "learning_rate": 4.6974611341013316e-06, "loss": 1.27133436, "memory(GiB)": 133.94, "step": 4200, "train_speed(iter/s)": 0.295233 }, { "acc": 0.70484653, "epoch": 0.04720047759724962, "grad_norm": 10.25, "learning_rate": 4.719829996644671e-06, "loss": 1.18337936, "memory(GiB)": 133.94, "step": 4220, "train_speed(iter/s)": 0.295732 }, { "acc": 0.71020508, "epoch": 0.04742417654320815, "grad_norm": 6.40625, "learning_rate": 4.742198859188011e-06, "loss": 1.1633297, "memory(GiB)": 133.94, "step": 4240, "train_speed(iter/s)": 0.296256 }, { "acc": 0.70359659, "epoch": 0.04764787548916668, "grad_norm": 6.4375, "learning_rate": 4.7645677217313505e-06, "loss": 1.2192565, "memory(GiB)": 133.94, "step": 4260, "train_speed(iter/s)": 0.296708 }, { "acc": 0.68969221, "epoch": 0.04787157443512521, "grad_norm": 8.25, "learning_rate": 4.78693658427469e-06, "loss": 1.28762779, "memory(GiB)": 133.94, "step": 4280, "train_speed(iter/s)": 0.297197 }, { "acc": 0.69996471, "epoch": 0.048095273381083735, "grad_norm": 8.5625, "learning_rate": 4.809305446818029e-06, "loss": 1.2330616, "memory(GiB)": 133.94, "step": 4300, "train_speed(iter/s)": 0.297622 }, { "acc": 0.68728313, "epoch": 0.04831897232704227, "grad_norm": 8.875, "learning_rate": 4.831674309361369e-06, "loss": 1.29330387, "memory(GiB)": 133.94, "step": 4320, "train_speed(iter/s)": 0.298057 }, { "acc": 0.71414585, "epoch": 0.0485426712730008, "grad_norm": 7.375, "learning_rate": 4.8540431719047096e-06, "loss": 1.16966562, "memory(GiB)": 133.94, "step": 4340, "train_speed(iter/s)": 0.298497 }, { "acc": 0.70067911, "epoch": 0.048766370218959326, "grad_norm": 6.625, "learning_rate": 4.876412034448049e-06, "loss": 1.23172951, "memory(GiB)": 133.94, "step": 4360, "train_speed(iter/s)": 0.298949 }, { "acc": 0.69487553, "epoch": 0.048990069164917854, "grad_norm": 7.28125, "learning_rate": 4.898780896991388e-06, "loss": 1.25579786, "memory(GiB)": 133.94, "step": 4380, "train_speed(iter/s)": 0.299408 }, { "acc": 0.70673351, "epoch": 0.04921376811087638, "grad_norm": 7.28125, "learning_rate": 4.921149759534728e-06, "loss": 1.1976243, "memory(GiB)": 133.94, "step": 4400, "train_speed(iter/s)": 0.299843 }, { "acc": 0.70842714, "epoch": 0.04943746705683491, "grad_norm": 4.96875, "learning_rate": 4.943518622078068e-06, "loss": 1.19306202, "memory(GiB)": 133.94, "step": 4420, "train_speed(iter/s)": 0.300219 }, { "acc": 0.69686451, "epoch": 0.04966116600279344, "grad_norm": 6.59375, "learning_rate": 4.965887484621407e-06, "loss": 1.25427818, "memory(GiB)": 133.94, "step": 4440, "train_speed(iter/s)": 0.300632 }, { "acc": 0.6900528, "epoch": 0.04988486494875197, "grad_norm": 6.5625, "learning_rate": 4.988256347164747e-06, "loss": 1.28272963, "memory(GiB)": 133.94, "step": 4460, "train_speed(iter/s)": 0.301104 }, { "acc": 0.71013498, "epoch": 0.0501085638947105, "grad_norm": 7.5625, "learning_rate": 5.010625209708087e-06, "loss": 1.17185001, "memory(GiB)": 133.94, "step": 4480, "train_speed(iter/s)": 0.301558 }, { "acc": 0.69956307, "epoch": 0.05033226284066903, "grad_norm": 6.625, "learning_rate": 5.032994072251426e-06, "loss": 1.22955732, "memory(GiB)": 133.94, "step": 4500, "train_speed(iter/s)": 0.301977 }, { "acc": 0.70256701, "epoch": 0.05055596178662756, "grad_norm": 9.0, "learning_rate": 5.055362934794766e-06, "loss": 1.21367788, "memory(GiB)": 133.94, "step": 4520, "train_speed(iter/s)": 0.30241 }, { "acc": 0.70415564, "epoch": 0.050779660732586086, "grad_norm": 7.46875, "learning_rate": 5.077731797338106e-06, "loss": 1.23468304, "memory(GiB)": 133.94, "step": 4540, "train_speed(iter/s)": 0.302795 }, { "acc": 0.70975976, "epoch": 0.051003359678544614, "grad_norm": 7.9375, "learning_rate": 5.100100659881445e-06, "loss": 1.18833523, "memory(GiB)": 133.94, "step": 4560, "train_speed(iter/s)": 0.303201 }, { "acc": 0.6964035, "epoch": 0.05122705862450314, "grad_norm": 8.125, "learning_rate": 5.122469522424785e-06, "loss": 1.23227415, "memory(GiB)": 133.94, "step": 4580, "train_speed(iter/s)": 0.303622 }, { "acc": 0.70087137, "epoch": 0.05145075757046167, "grad_norm": 4.8125, "learning_rate": 5.1448383849681245e-06, "loss": 1.2193121, "memory(GiB)": 133.94, "step": 4600, "train_speed(iter/s)": 0.304033 }, { "acc": 0.71043568, "epoch": 0.0516744565164202, "grad_norm": 7.0, "learning_rate": 5.167207247511464e-06, "loss": 1.18272305, "memory(GiB)": 133.94, "step": 4620, "train_speed(iter/s)": 0.304498 }, { "acc": 0.68662424, "epoch": 0.051898155462378734, "grad_norm": 7.875, "learning_rate": 5.189576110054804e-06, "loss": 1.28136826, "memory(GiB)": 133.94, "step": 4640, "train_speed(iter/s)": 0.304891 }, { "acc": 0.6951714, "epoch": 0.05212185440833726, "grad_norm": 7.1875, "learning_rate": 5.211944972598144e-06, "loss": 1.252034, "memory(GiB)": 133.94, "step": 4660, "train_speed(iter/s)": 0.305324 }, { "acc": 0.6954412, "epoch": 0.05234555335429579, "grad_norm": 7.0, "learning_rate": 5.234313835141484e-06, "loss": 1.25223427, "memory(GiB)": 133.94, "step": 4680, "train_speed(iter/s)": 0.305699 }, { "acc": 0.70625296, "epoch": 0.05256925230025432, "grad_norm": 8.0625, "learning_rate": 5.256682697684823e-06, "loss": 1.19204254, "memory(GiB)": 133.94, "step": 4700, "train_speed(iter/s)": 0.30612 }, { "acc": 0.70415258, "epoch": 0.052792951246212846, "grad_norm": 7.25, "learning_rate": 5.279051560228163e-06, "loss": 1.19117861, "memory(GiB)": 133.94, "step": 4720, "train_speed(iter/s)": 0.306452 }, { "acc": 0.71505089, "epoch": 0.053016650192171375, "grad_norm": 7.34375, "learning_rate": 5.3014204227715025e-06, "loss": 1.176157, "memory(GiB)": 133.94, "step": 4740, "train_speed(iter/s)": 0.306856 }, { "acc": 0.71358032, "epoch": 0.0532403491381299, "grad_norm": 6.4375, "learning_rate": 5.323789285314842e-06, "loss": 1.15648518, "memory(GiB)": 133.94, "step": 4760, "train_speed(iter/s)": 0.307253 }, { "acc": 0.70210137, "epoch": 0.05346404808408843, "grad_norm": 8.4375, "learning_rate": 5.346158147858181e-06, "loss": 1.22877235, "memory(GiB)": 133.94, "step": 4780, "train_speed(iter/s)": 0.30762 }, { "acc": 0.70027905, "epoch": 0.053687747030046966, "grad_norm": 7.1875, "learning_rate": 5.368527010401522e-06, "loss": 1.24358749, "memory(GiB)": 133.94, "step": 4800, "train_speed(iter/s)": 0.308051 }, { "acc": 0.69930339, "epoch": 0.053911445976005494, "grad_norm": 8.0, "learning_rate": 5.390895872944862e-06, "loss": 1.2314579, "memory(GiB)": 133.94, "step": 4820, "train_speed(iter/s)": 0.308425 }, { "acc": 0.72059245, "epoch": 0.05413514492196402, "grad_norm": 5.9375, "learning_rate": 5.413264735488201e-06, "loss": 1.13674879, "memory(GiB)": 133.94, "step": 4840, "train_speed(iter/s)": 0.308801 }, { "acc": 0.69668798, "epoch": 0.05435884386792255, "grad_norm": 6.09375, "learning_rate": 5.43563359803154e-06, "loss": 1.26371136, "memory(GiB)": 133.94, "step": 4860, "train_speed(iter/s)": 0.309244 }, { "acc": 0.70971222, "epoch": 0.05458254281388108, "grad_norm": 7.34375, "learning_rate": 5.4580024605748805e-06, "loss": 1.18700638, "memory(GiB)": 133.94, "step": 4880, "train_speed(iter/s)": 0.309676 }, { "acc": 0.69180269, "epoch": 0.054806241759839606, "grad_norm": 8.4375, "learning_rate": 5.48037132311822e-06, "loss": 1.27055044, "memory(GiB)": 133.94, "step": 4900, "train_speed(iter/s)": 0.310041 }, { "acc": 0.72104521, "epoch": 0.055029940705798135, "grad_norm": 7.625, "learning_rate": 5.502740185661559e-06, "loss": 1.14737167, "memory(GiB)": 133.94, "step": 4920, "train_speed(iter/s)": 0.31042 }, { "acc": 0.70087247, "epoch": 0.05525363965175666, "grad_norm": 8.0625, "learning_rate": 5.5251090482048994e-06, "loss": 1.2462883, "memory(GiB)": 133.94, "step": 4940, "train_speed(iter/s)": 0.310824 }, { "acc": 0.71955991, "epoch": 0.0554773385977152, "grad_norm": 8.125, "learning_rate": 5.547477910748239e-06, "loss": 1.13706083, "memory(GiB)": 133.94, "step": 4960, "train_speed(iter/s)": 0.311248 }, { "acc": 0.71838131, "epoch": 0.055701037543673726, "grad_norm": 7.34375, "learning_rate": 5.569846773291578e-06, "loss": 1.14620428, "memory(GiB)": 133.94, "step": 4980, "train_speed(iter/s)": 0.311581 }, { "acc": 0.70051203, "epoch": 0.055924736489632254, "grad_norm": 7.0625, "learning_rate": 5.5922156358349175e-06, "loss": 1.22514, "memory(GiB)": 133.94, "step": 5000, "train_speed(iter/s)": 0.311945 }, { "acc": 0.71180286, "epoch": 0.05614843543559078, "grad_norm": 8.625, "learning_rate": 5.6145844983782585e-06, "loss": 1.17334223, "memory(GiB)": 133.94, "step": 5020, "train_speed(iter/s)": 0.312251 }, { "acc": 0.68782821, "epoch": 0.05637213438154931, "grad_norm": 8.625, "learning_rate": 5.636953360921598e-06, "loss": 1.27938023, "memory(GiB)": 133.94, "step": 5040, "train_speed(iter/s)": 0.312639 }, { "acc": 0.71260052, "epoch": 0.05659583332750784, "grad_norm": 5.625, "learning_rate": 5.659322223464937e-06, "loss": 1.17678318, "memory(GiB)": 133.94, "step": 5060, "train_speed(iter/s)": 0.313022 }, { "acc": 0.69580712, "epoch": 0.05681953227346637, "grad_norm": 8.125, "learning_rate": 5.6816910860082774e-06, "loss": 1.24280796, "memory(GiB)": 133.94, "step": 5080, "train_speed(iter/s)": 0.3134 }, { "acc": 0.70884991, "epoch": 0.057043231219424895, "grad_norm": 8.3125, "learning_rate": 5.704059948551617e-06, "loss": 1.19112091, "memory(GiB)": 133.94, "step": 5100, "train_speed(iter/s)": 0.313821 }, { "acc": 0.70917234, "epoch": 0.05726693016538343, "grad_norm": 7.90625, "learning_rate": 5.726428811094956e-06, "loss": 1.19651384, "memory(GiB)": 133.94, "step": 5120, "train_speed(iter/s)": 0.314162 }, { "acc": 0.71320543, "epoch": 0.05749062911134196, "grad_norm": 5.75, "learning_rate": 5.7487976736382955e-06, "loss": 1.1548027, "memory(GiB)": 133.94, "step": 5140, "train_speed(iter/s)": 0.31449 }, { "acc": 0.70506725, "epoch": 0.057714328057300486, "grad_norm": 6.5, "learning_rate": 5.771166536181636e-06, "loss": 1.2190588, "memory(GiB)": 133.94, "step": 5160, "train_speed(iter/s)": 0.314886 }, { "acc": 0.71700821, "epoch": 0.057938027003259014, "grad_norm": 7.5625, "learning_rate": 5.793535398724975e-06, "loss": 1.15571423, "memory(GiB)": 133.94, "step": 5180, "train_speed(iter/s)": 0.315227 }, { "acc": 0.71175995, "epoch": 0.05816172594921754, "grad_norm": 6.875, "learning_rate": 5.815904261268314e-06, "loss": 1.16734171, "memory(GiB)": 133.94, "step": 5200, "train_speed(iter/s)": 0.315581 }, { "acc": 0.70732269, "epoch": 0.05838542489517607, "grad_norm": 8.375, "learning_rate": 5.838273123811654e-06, "loss": 1.19824858, "memory(GiB)": 133.94, "step": 5220, "train_speed(iter/s)": 0.315949 }, { "acc": 0.71267414, "epoch": 0.0586091238411346, "grad_norm": 6.8125, "learning_rate": 5.860641986354995e-06, "loss": 1.15677395, "memory(GiB)": 133.94, "step": 5240, "train_speed(iter/s)": 0.316327 }, { "acc": 0.70294409, "epoch": 0.05883282278709313, "grad_norm": 6.25, "learning_rate": 5.883010848898334e-06, "loss": 1.22120609, "memory(GiB)": 133.94, "step": 5260, "train_speed(iter/s)": 0.316676 }, { "acc": 0.70331631, "epoch": 0.05905652173305166, "grad_norm": 5.6875, "learning_rate": 5.9053797114416735e-06, "loss": 1.21643677, "memory(GiB)": 133.94, "step": 5280, "train_speed(iter/s)": 0.317 }, { "acc": 0.69919395, "epoch": 0.05928022067901019, "grad_norm": 6.5, "learning_rate": 5.927748573985014e-06, "loss": 1.23891363, "memory(GiB)": 133.94, "step": 5300, "train_speed(iter/s)": 0.317356 }, { "acc": 0.70114317, "epoch": 0.05950391962496872, "grad_norm": 6.96875, "learning_rate": 5.950117436528353e-06, "loss": 1.21891851, "memory(GiB)": 133.94, "step": 5320, "train_speed(iter/s)": 0.317717 }, { "acc": 0.69542036, "epoch": 0.059727618570927246, "grad_norm": 7.1875, "learning_rate": 5.972486299071692e-06, "loss": 1.24147148, "memory(GiB)": 133.94, "step": 5340, "train_speed(iter/s)": 0.31805 }, { "acc": 0.71833568, "epoch": 0.059951317516885774, "grad_norm": 6.75, "learning_rate": 5.994855161615032e-06, "loss": 1.1319294, "memory(GiB)": 133.94, "step": 5360, "train_speed(iter/s)": 0.318389 }, { "acc": 0.71129875, "epoch": 0.0601750164628443, "grad_norm": 9.875, "learning_rate": 6.017224024158373e-06, "loss": 1.17768116, "memory(GiB)": 133.94, "step": 5380, "train_speed(iter/s)": 0.318723 }, { "acc": 0.7064271, "epoch": 0.06039871540880283, "grad_norm": 6.875, "learning_rate": 6.039592886701712e-06, "loss": 1.19056797, "memory(GiB)": 133.94, "step": 5400, "train_speed(iter/s)": 0.319092 }, { "acc": 0.70623207, "epoch": 0.060622414354761366, "grad_norm": 6.65625, "learning_rate": 6.0619617492450515e-06, "loss": 1.20473309, "memory(GiB)": 133.94, "step": 5420, "train_speed(iter/s)": 0.319428 }, { "acc": 0.70178003, "epoch": 0.060846113300719894, "grad_norm": 6.15625, "learning_rate": 6.084330611788392e-06, "loss": 1.21495361, "memory(GiB)": 133.94, "step": 5440, "train_speed(iter/s)": 0.319789 }, { "acc": 0.71124325, "epoch": 0.06106981224667842, "grad_norm": 7.78125, "learning_rate": 6.106699474331731e-06, "loss": 1.17225895, "memory(GiB)": 133.94, "step": 5460, "train_speed(iter/s)": 0.320151 }, { "acc": 0.71542439, "epoch": 0.06129351119263695, "grad_norm": 8.6875, "learning_rate": 6.12906833687507e-06, "loss": 1.16530876, "memory(GiB)": 133.94, "step": 5480, "train_speed(iter/s)": 0.3205 }, { "acc": 0.7102325, "epoch": 0.06151721013859548, "grad_norm": 7.90625, "learning_rate": 6.15143719941841e-06, "loss": 1.19558744, "memory(GiB)": 133.94, "step": 5500, "train_speed(iter/s)": 0.320768 }, { "acc": 0.6879271, "epoch": 0.061740909084554006, "grad_norm": 8.3125, "learning_rate": 6.17380606196175e-06, "loss": 1.28028545, "memory(GiB)": 133.94, "step": 5520, "train_speed(iter/s)": 0.321097 }, { "acc": 0.70252686, "epoch": 0.061964608030512534, "grad_norm": 7.0625, "learning_rate": 6.196174924505089e-06, "loss": 1.21234112, "memory(GiB)": 133.94, "step": 5540, "train_speed(iter/s)": 0.321433 }, { "acc": 0.71264577, "epoch": 0.06218830697647106, "grad_norm": 8.75, "learning_rate": 6.218543787048429e-06, "loss": 1.19027557, "memory(GiB)": 133.94, "step": 5560, "train_speed(iter/s)": 0.321793 }, { "acc": 0.70524907, "epoch": 0.0624120059224296, "grad_norm": 7.46875, "learning_rate": 6.240912649591768e-06, "loss": 1.20299397, "memory(GiB)": 133.94, "step": 5580, "train_speed(iter/s)": 0.322128 }, { "acc": 0.69689822, "epoch": 0.06263570486838813, "grad_norm": 6.9375, "learning_rate": 6.263281512135109e-06, "loss": 1.24121914, "memory(GiB)": 133.94, "step": 5600, "train_speed(iter/s)": 0.322457 }, { "acc": 0.70542755, "epoch": 0.06285940381434665, "grad_norm": 7.0, "learning_rate": 6.285650374678448e-06, "loss": 1.20799913, "memory(GiB)": 133.94, "step": 5620, "train_speed(iter/s)": 0.322767 }, { "acc": 0.71085777, "epoch": 0.06308310276030518, "grad_norm": 6.78125, "learning_rate": 6.308019237221788e-06, "loss": 1.17015629, "memory(GiB)": 133.94, "step": 5640, "train_speed(iter/s)": 0.323054 }, { "acc": 0.70876389, "epoch": 0.06330680170626371, "grad_norm": 7.09375, "learning_rate": 6.330388099765128e-06, "loss": 1.18746033, "memory(GiB)": 133.94, "step": 5660, "train_speed(iter/s)": 0.3234 }, { "acc": 0.7041647, "epoch": 0.06353050065222224, "grad_norm": 6.21875, "learning_rate": 6.352756962308467e-06, "loss": 1.21224499, "memory(GiB)": 133.94, "step": 5680, "train_speed(iter/s)": 0.323704 }, { "acc": 0.706534, "epoch": 0.06375419959818077, "grad_norm": 8.5, "learning_rate": 6.375125824851807e-06, "loss": 1.1935791, "memory(GiB)": 133.94, "step": 5700, "train_speed(iter/s)": 0.324012 }, { "acc": 0.713269, "epoch": 0.0639778985441393, "grad_norm": 8.125, "learning_rate": 6.397494687395146e-06, "loss": 1.16382656, "memory(GiB)": 133.94, "step": 5720, "train_speed(iter/s)": 0.324338 }, { "acc": 0.70960655, "epoch": 0.06420159749009782, "grad_norm": 7.1875, "learning_rate": 6.419863549938486e-06, "loss": 1.19942303, "memory(GiB)": 133.94, "step": 5740, "train_speed(iter/s)": 0.324673 }, { "acc": 0.71067586, "epoch": 0.06442529643605635, "grad_norm": 6.90625, "learning_rate": 6.4422324124818256e-06, "loss": 1.17117081, "memory(GiB)": 133.94, "step": 5760, "train_speed(iter/s)": 0.324943 }, { "acc": 0.70324068, "epoch": 0.06464899538201488, "grad_norm": 5.0, "learning_rate": 6.464601275025165e-06, "loss": 1.19875126, "memory(GiB)": 133.94, "step": 5780, "train_speed(iter/s)": 0.325276 }, { "acc": 0.70012817, "epoch": 0.06487269432797341, "grad_norm": 8.0625, "learning_rate": 6.486970137568506e-06, "loss": 1.24320927, "memory(GiB)": 133.94, "step": 5800, "train_speed(iter/s)": 0.325593 }, { "acc": 0.71798878, "epoch": 0.06509639327393195, "grad_norm": 6.65625, "learning_rate": 6.509339000111845e-06, "loss": 1.1504734, "memory(GiB)": 133.94, "step": 5820, "train_speed(iter/s)": 0.325856 }, { "acc": 0.70864697, "epoch": 0.06532009221989048, "grad_norm": 7.4375, "learning_rate": 6.531707862655185e-06, "loss": 1.18512163, "memory(GiB)": 133.94, "step": 5840, "train_speed(iter/s)": 0.326184 }, { "acc": 0.70438123, "epoch": 0.065543791165849, "grad_norm": 9.375, "learning_rate": 6.554076725198524e-06, "loss": 1.21252937, "memory(GiB)": 133.94, "step": 5860, "train_speed(iter/s)": 0.326495 }, { "acc": 0.6936429, "epoch": 0.06576749011180753, "grad_norm": 6.46875, "learning_rate": 6.576445587741864e-06, "loss": 1.25809441, "memory(GiB)": 133.94, "step": 5880, "train_speed(iter/s)": 0.326839 }, { "acc": 0.69664536, "epoch": 0.06599118905776606, "grad_norm": 6.25, "learning_rate": 6.5988144502852036e-06, "loss": 1.24299068, "memory(GiB)": 133.94, "step": 5900, "train_speed(iter/s)": 0.327089 }, { "acc": 0.70921383, "epoch": 0.06621488800372459, "grad_norm": 6.28125, "learning_rate": 6.621183312828543e-06, "loss": 1.17858963, "memory(GiB)": 133.94, "step": 5920, "train_speed(iter/s)": 0.327372 }, { "acc": 0.7070662, "epoch": 0.06643858694968312, "grad_norm": 6.625, "learning_rate": 6.643552175371882e-06, "loss": 1.19148922, "memory(GiB)": 133.94, "step": 5940, "train_speed(iter/s)": 0.327661 }, { "acc": 0.70910864, "epoch": 0.06666228589564165, "grad_norm": 7.09375, "learning_rate": 6.6659210379152225e-06, "loss": 1.18391342, "memory(GiB)": 133.94, "step": 5960, "train_speed(iter/s)": 0.327988 }, { "acc": 0.70565214, "epoch": 0.06688598484160017, "grad_norm": 6.1875, "learning_rate": 6.688289900458562e-06, "loss": 1.20718775, "memory(GiB)": 133.94, "step": 5980, "train_speed(iter/s)": 0.328246 }, { "acc": 0.70225859, "epoch": 0.0671096837875587, "grad_norm": 8.5, "learning_rate": 6.710658763001901e-06, "loss": 1.21963902, "memory(GiB)": 133.94, "step": 6000, "train_speed(iter/s)": 0.328536 }, { "epoch": 0.0671096837875587, "eval_acc": 0.6634258552673871, "eval_loss": 1.1868011951446533, "eval_runtime": 2226.3478, "eval_samples_per_second": 33.815, "eval_steps_per_second": 16.908, "step": 6000 }, { "acc": 0.71475754, "epoch": 0.06733338273351723, "grad_norm": 8.9375, "learning_rate": 6.733027625545242e-06, "loss": 1.16224632, "memory(GiB)": 133.94, "step": 6020, "train_speed(iter/s)": 0.292479 }, { "acc": 0.70758181, "epoch": 0.06755708167947576, "grad_norm": 6.65625, "learning_rate": 6.7553964880885816e-06, "loss": 1.19684181, "memory(GiB)": 133.94, "step": 6040, "train_speed(iter/s)": 0.292793 }, { "acc": 0.72041988, "epoch": 0.06778078062543429, "grad_norm": 6.75, "learning_rate": 6.777765350631921e-06, "loss": 1.14944115, "memory(GiB)": 133.94, "step": 6060, "train_speed(iter/s)": 0.293131 }, { "acc": 0.71069708, "epoch": 0.06800447957139281, "grad_norm": 8.25, "learning_rate": 6.80013421317526e-06, "loss": 1.18205814, "memory(GiB)": 133.94, "step": 6080, "train_speed(iter/s)": 0.293483 }, { "acc": 0.72092438, "epoch": 0.06822817851735134, "grad_norm": 6.5, "learning_rate": 6.8225030757186005e-06, "loss": 1.14166431, "memory(GiB)": 133.94, "step": 6100, "train_speed(iter/s)": 0.293818 }, { "acc": 0.71446552, "epoch": 0.06845187746330987, "grad_norm": 7.5625, "learning_rate": 6.84487193826194e-06, "loss": 1.17158928, "memory(GiB)": 133.94, "step": 6120, "train_speed(iter/s)": 0.294107 }, { "acc": 0.70979633, "epoch": 0.06867557640926841, "grad_norm": 7.0625, "learning_rate": 6.867240800805279e-06, "loss": 1.17613716, "memory(GiB)": 133.94, "step": 6140, "train_speed(iter/s)": 0.294419 }, { "acc": 0.69972682, "epoch": 0.06889927535522694, "grad_norm": 7.65625, "learning_rate": 6.88960966334862e-06, "loss": 1.22395, "memory(GiB)": 133.94, "step": 6160, "train_speed(iter/s)": 0.294773 }, { "acc": 0.70565343, "epoch": 0.06912297430118547, "grad_norm": 10.125, "learning_rate": 6.9119785258919596e-06, "loss": 1.20875196, "memory(GiB)": 133.94, "step": 6180, "train_speed(iter/s)": 0.295109 }, { "acc": 0.70540972, "epoch": 0.069346673247144, "grad_norm": 5.34375, "learning_rate": 6.934347388435299e-06, "loss": 1.21815147, "memory(GiB)": 133.94, "step": 6200, "train_speed(iter/s)": 0.295426 }, { "acc": 0.70349545, "epoch": 0.06957037219310253, "grad_norm": 6.90625, "learning_rate": 6.956716250978638e-06, "loss": 1.22017365, "memory(GiB)": 133.94, "step": 6220, "train_speed(iter/s)": 0.295719 }, { "acc": 0.71152563, "epoch": 0.06979407113906105, "grad_norm": 9.1875, "learning_rate": 6.9790851135219785e-06, "loss": 1.18589411, "memory(GiB)": 133.94, "step": 6240, "train_speed(iter/s)": 0.296072 }, { "acc": 0.71993084, "epoch": 0.07001777008501958, "grad_norm": 7.5, "learning_rate": 7.001453976065318e-06, "loss": 1.13230572, "memory(GiB)": 133.94, "step": 6260, "train_speed(iter/s)": 0.29635 }, { "acc": 0.69122057, "epoch": 0.07024146903097811, "grad_norm": 6.78125, "learning_rate": 7.023822838608657e-06, "loss": 1.25324364, "memory(GiB)": 133.94, "step": 6280, "train_speed(iter/s)": 0.296673 }, { "acc": 0.7071538, "epoch": 0.07046516797693664, "grad_norm": 7.0625, "learning_rate": 7.0461917011519965e-06, "loss": 1.19000664, "memory(GiB)": 133.94, "step": 6300, "train_speed(iter/s)": 0.297012 }, { "acc": 0.71617451, "epoch": 0.07068886692289517, "grad_norm": 7.0625, "learning_rate": 7.068560563695337e-06, "loss": 1.14369497, "memory(GiB)": 133.94, "step": 6320, "train_speed(iter/s)": 0.297339 }, { "acc": 0.7028152, "epoch": 0.0709125658688537, "grad_norm": 7.9375, "learning_rate": 7.090929426238676e-06, "loss": 1.22286034, "memory(GiB)": 133.94, "step": 6340, "train_speed(iter/s)": 0.297648 }, { "acc": 0.70769863, "epoch": 0.07113626481481222, "grad_norm": 5.1875, "learning_rate": 7.113298288782015e-06, "loss": 1.18925772, "memory(GiB)": 133.94, "step": 6360, "train_speed(iter/s)": 0.297961 }, { "acc": 0.69733582, "epoch": 0.07135996376077075, "grad_norm": 7.15625, "learning_rate": 7.1356671513253565e-06, "loss": 1.25076542, "memory(GiB)": 133.94, "step": 6380, "train_speed(iter/s)": 0.29827 }, { "acc": 0.71634984, "epoch": 0.07158366270672928, "grad_norm": 6.6875, "learning_rate": 7.158036013868696e-06, "loss": 1.14090681, "memory(GiB)": 133.94, "step": 6400, "train_speed(iter/s)": 0.29856 }, { "acc": 0.71134529, "epoch": 0.0718073616526878, "grad_norm": 7.25, "learning_rate": 7.180404876412035e-06, "loss": 1.16146927, "memory(GiB)": 141.16, "step": 6420, "train_speed(iter/s)": 0.298811 }, { "acc": 0.70635548, "epoch": 0.07203106059864635, "grad_norm": 8.375, "learning_rate": 7.2027737389553745e-06, "loss": 1.18586845, "memory(GiB)": 141.16, "step": 6440, "train_speed(iter/s)": 0.299104 }, { "acc": 0.71682491, "epoch": 0.07225475954460488, "grad_norm": 6.8125, "learning_rate": 7.225142601498715e-06, "loss": 1.15319319, "memory(GiB)": 141.16, "step": 6460, "train_speed(iter/s)": 0.299425 }, { "acc": 0.72486391, "epoch": 0.0724784584905634, "grad_norm": 6.84375, "learning_rate": 7.247511464042054e-06, "loss": 1.10170918, "memory(GiB)": 141.16, "step": 6480, "train_speed(iter/s)": 0.299744 }, { "acc": 0.71807365, "epoch": 0.07270215743652193, "grad_norm": 7.53125, "learning_rate": 7.269880326585393e-06, "loss": 1.14699602, "memory(GiB)": 141.16, "step": 6500, "train_speed(iter/s)": 0.300056 }, { "acc": 0.70191994, "epoch": 0.07292585638248046, "grad_norm": 7.4375, "learning_rate": 7.292249189128734e-06, "loss": 1.22725506, "memory(GiB)": 141.16, "step": 6520, "train_speed(iter/s)": 0.300378 }, { "acc": 0.71579719, "epoch": 0.07314955532843899, "grad_norm": 7.90625, "learning_rate": 7.314618051672073e-06, "loss": 1.15196152, "memory(GiB)": 141.16, "step": 6540, "train_speed(iter/s)": 0.300702 }, { "acc": 0.71728005, "epoch": 0.07337325427439752, "grad_norm": 7.53125, "learning_rate": 7.336986914215412e-06, "loss": 1.1454505, "memory(GiB)": 141.16, "step": 6560, "train_speed(iter/s)": 0.301021 }, { "acc": 0.72508287, "epoch": 0.07359695322035605, "grad_norm": 8.125, "learning_rate": 7.359355776758752e-06, "loss": 1.1259778, "memory(GiB)": 141.16, "step": 6580, "train_speed(iter/s)": 0.301344 }, { "acc": 0.71181598, "epoch": 0.07382065216631457, "grad_norm": 7.09375, "learning_rate": 7.381724639302093e-06, "loss": 1.17416019, "memory(GiB)": 141.16, "step": 6600, "train_speed(iter/s)": 0.301639 }, { "acc": 0.70789728, "epoch": 0.0740443511122731, "grad_norm": 5.40625, "learning_rate": 7.404093501845432e-06, "loss": 1.18944798, "memory(GiB)": 141.16, "step": 6620, "train_speed(iter/s)": 0.301941 }, { "acc": 0.72131305, "epoch": 0.07426805005823163, "grad_norm": 9.8125, "learning_rate": 7.426462364388771e-06, "loss": 1.1319376, "memory(GiB)": 141.16, "step": 6640, "train_speed(iter/s)": 0.302243 }, { "acc": 0.7154706, "epoch": 0.07449174900419016, "grad_norm": 6.0, "learning_rate": 7.448831226932111e-06, "loss": 1.16500502, "memory(GiB)": 141.16, "step": 6660, "train_speed(iter/s)": 0.302547 }, { "acc": 0.70834274, "epoch": 0.07471544795014869, "grad_norm": 6.0, "learning_rate": 7.471200089475451e-06, "loss": 1.18073711, "memory(GiB)": 141.16, "step": 6680, "train_speed(iter/s)": 0.302803 }, { "acc": 0.71213508, "epoch": 0.07493914689610721, "grad_norm": 6.5, "learning_rate": 7.49356895201879e-06, "loss": 1.16633759, "memory(GiB)": 141.16, "step": 6700, "train_speed(iter/s)": 0.30308 }, { "acc": 0.72741919, "epoch": 0.07516284584206574, "grad_norm": 7.75, "learning_rate": 7.51593781456213e-06, "loss": 1.09792519, "memory(GiB)": 141.16, "step": 6720, "train_speed(iter/s)": 0.303388 }, { "acc": 0.72319527, "epoch": 0.07538654478802427, "grad_norm": 8.0625, "learning_rate": 7.53830667710547e-06, "loss": 1.10844402, "memory(GiB)": 141.16, "step": 6740, "train_speed(iter/s)": 0.303695 }, { "acc": 0.70836792, "epoch": 0.07561024373398281, "grad_norm": 6.375, "learning_rate": 7.560675539648809e-06, "loss": 1.18512821, "memory(GiB)": 141.16, "step": 6760, "train_speed(iter/s)": 0.303954 }, { "acc": 0.71272287, "epoch": 0.07583394267994134, "grad_norm": 8.0, "learning_rate": 7.5830444021921486e-06, "loss": 1.16944399, "memory(GiB)": 141.16, "step": 6780, "train_speed(iter/s)": 0.304244 }, { "acc": 0.71158438, "epoch": 0.07605764162589987, "grad_norm": 8.5625, "learning_rate": 7.605413264735489e-06, "loss": 1.18131618, "memory(GiB)": 141.16, "step": 6800, "train_speed(iter/s)": 0.304559 }, { "acc": 0.72281461, "epoch": 0.0762813405718584, "grad_norm": 9.5, "learning_rate": 7.627782127278829e-06, "loss": 1.11913595, "memory(GiB)": 141.16, "step": 6820, "train_speed(iter/s)": 0.30485 }, { "acc": 0.724998, "epoch": 0.07650503951781693, "grad_norm": 8.875, "learning_rate": 7.650150989822168e-06, "loss": 1.10483551, "memory(GiB)": 141.16, "step": 6840, "train_speed(iter/s)": 0.305135 }, { "acc": 0.70575495, "epoch": 0.07672873846377545, "grad_norm": 6.71875, "learning_rate": 7.672519852365508e-06, "loss": 1.20812073, "memory(GiB)": 141.16, "step": 6860, "train_speed(iter/s)": 0.305441 }, { "acc": 0.71598692, "epoch": 0.07695243740973398, "grad_norm": 6.46875, "learning_rate": 7.694888714908849e-06, "loss": 1.14304409, "memory(GiB)": 141.16, "step": 6880, "train_speed(iter/s)": 0.305725 }, { "acc": 0.70324221, "epoch": 0.07717613635569251, "grad_norm": 7.25, "learning_rate": 7.717257577452188e-06, "loss": 1.20446196, "memory(GiB)": 141.16, "step": 6900, "train_speed(iter/s)": 0.305963 }, { "acc": 0.71601944, "epoch": 0.07739983530165104, "grad_norm": 7.9375, "learning_rate": 7.739626439995527e-06, "loss": 1.16437931, "memory(GiB)": 141.16, "step": 6920, "train_speed(iter/s)": 0.306254 }, { "acc": 0.70772123, "epoch": 0.07762353424760957, "grad_norm": 7.3125, "learning_rate": 7.761995302538867e-06, "loss": 1.1851553, "memory(GiB)": 141.16, "step": 6940, "train_speed(iter/s)": 0.306548 }, { "acc": 0.71981106, "epoch": 0.0778472331935681, "grad_norm": 9.375, "learning_rate": 7.784364165082206e-06, "loss": 1.13492928, "memory(GiB)": 141.16, "step": 6960, "train_speed(iter/s)": 0.306852 }, { "acc": 0.71937857, "epoch": 0.07807093213952662, "grad_norm": 6.8125, "learning_rate": 7.806733027625545e-06, "loss": 1.1304759, "memory(GiB)": 141.16, "step": 6980, "train_speed(iter/s)": 0.307154 }, { "acc": 0.72077675, "epoch": 0.07829463108548515, "grad_norm": 7.90625, "learning_rate": 7.829101890168885e-06, "loss": 1.11814613, "memory(GiB)": 141.16, "step": 7000, "train_speed(iter/s)": 0.307417 }, { "acc": 0.7165369, "epoch": 0.07851833003144368, "grad_norm": 6.75, "learning_rate": 7.851470752712224e-06, "loss": 1.16566753, "memory(GiB)": 141.16, "step": 7020, "train_speed(iter/s)": 0.307699 }, { "acc": 0.69686995, "epoch": 0.0787420289774022, "grad_norm": 6.625, "learning_rate": 7.873839615255565e-06, "loss": 1.24185352, "memory(GiB)": 141.16, "step": 7040, "train_speed(iter/s)": 0.307948 }, { "acc": 0.71572356, "epoch": 0.07896572792336073, "grad_norm": 7.28125, "learning_rate": 7.896208477798905e-06, "loss": 1.14827499, "memory(GiB)": 141.16, "step": 7060, "train_speed(iter/s)": 0.308242 }, { "acc": 0.71081533, "epoch": 0.07918942686931928, "grad_norm": 7.53125, "learning_rate": 7.918577340342244e-06, "loss": 1.17477732, "memory(GiB)": 141.16, "step": 7080, "train_speed(iter/s)": 0.308518 }, { "acc": 0.71574192, "epoch": 0.0794131258152778, "grad_norm": 8.4375, "learning_rate": 7.940946202885585e-06, "loss": 1.17847881, "memory(GiB)": 141.16, "step": 7100, "train_speed(iter/s)": 0.308752 }, { "acc": 0.71168976, "epoch": 0.07963682476123633, "grad_norm": 8.5, "learning_rate": 7.963315065428924e-06, "loss": 1.16988287, "memory(GiB)": 141.16, "step": 7120, "train_speed(iter/s)": 0.309018 }, { "acc": 0.72389612, "epoch": 0.07986052370719486, "grad_norm": 5.84375, "learning_rate": 7.985683927972264e-06, "loss": 1.11617413, "memory(GiB)": 141.16, "step": 7140, "train_speed(iter/s)": 0.309279 }, { "acc": 0.70494933, "epoch": 0.08008422265315339, "grad_norm": 9.125, "learning_rate": 8.008052790515603e-06, "loss": 1.21170311, "memory(GiB)": 141.16, "step": 7160, "train_speed(iter/s)": 0.309585 }, { "acc": 0.71317148, "epoch": 0.08030792159911192, "grad_norm": 6.96875, "learning_rate": 8.030421653058942e-06, "loss": 1.16966152, "memory(GiB)": 141.16, "step": 7180, "train_speed(iter/s)": 0.309845 }, { "acc": 0.72392874, "epoch": 0.08053162054507045, "grad_norm": 7.40625, "learning_rate": 8.052790515602282e-06, "loss": 1.09903889, "memory(GiB)": 141.16, "step": 7200, "train_speed(iter/s)": 0.310115 }, { "acc": 0.7290535, "epoch": 0.08075531949102897, "grad_norm": 5.84375, "learning_rate": 8.075159378145621e-06, "loss": 1.09157524, "memory(GiB)": 141.16, "step": 7220, "train_speed(iter/s)": 0.310398 }, { "acc": 0.713235, "epoch": 0.0809790184369875, "grad_norm": 7.28125, "learning_rate": 8.097528240688962e-06, "loss": 1.14938354, "memory(GiB)": 141.16, "step": 7240, "train_speed(iter/s)": 0.310665 }, { "acc": 0.71275673, "epoch": 0.08120271738294603, "grad_norm": 7.875, "learning_rate": 8.119897103232301e-06, "loss": 1.16720638, "memory(GiB)": 141.16, "step": 7260, "train_speed(iter/s)": 0.31095 }, { "acc": 0.70986166, "epoch": 0.08142641632890456, "grad_norm": 9.25, "learning_rate": 8.14226596577564e-06, "loss": 1.171208, "memory(GiB)": 141.16, "step": 7280, "train_speed(iter/s)": 0.311186 }, { "acc": 0.71379604, "epoch": 0.08165011527486309, "grad_norm": 6.9375, "learning_rate": 8.16463482831898e-06, "loss": 1.1542778, "memory(GiB)": 141.16, "step": 7300, "train_speed(iter/s)": 0.311465 }, { "acc": 0.72240152, "epoch": 0.08187381422082161, "grad_norm": 8.625, "learning_rate": 8.187003690862321e-06, "loss": 1.12941933, "memory(GiB)": 141.16, "step": 7320, "train_speed(iter/s)": 0.311742 }, { "acc": 0.71054149, "epoch": 0.08209751316678014, "grad_norm": 6.375, "learning_rate": 8.20937255340566e-06, "loss": 1.17212315, "memory(GiB)": 141.16, "step": 7340, "train_speed(iter/s)": 0.312015 }, { "acc": 0.71440239, "epoch": 0.08232121211273867, "grad_norm": 7.40625, "learning_rate": 8.231741415949e-06, "loss": 1.16435986, "memory(GiB)": 141.16, "step": 7360, "train_speed(iter/s)": 0.312294 }, { "acc": 0.70585217, "epoch": 0.0825449110586972, "grad_norm": 7.03125, "learning_rate": 8.25411027849234e-06, "loss": 1.20853939, "memory(GiB)": 141.16, "step": 7380, "train_speed(iter/s)": 0.312529 }, { "acc": 0.70392599, "epoch": 0.08276861000465574, "grad_norm": 6.90625, "learning_rate": 8.276479141035679e-06, "loss": 1.21216679, "memory(GiB)": 141.16, "step": 7400, "train_speed(iter/s)": 0.312785 }, { "acc": 0.70335913, "epoch": 0.08299230895061427, "grad_norm": 7.71875, "learning_rate": 8.298848003579018e-06, "loss": 1.19221869, "memory(GiB)": 141.16, "step": 7420, "train_speed(iter/s)": 0.313036 }, { "acc": 0.70746393, "epoch": 0.0832160078965728, "grad_norm": 6.90625, "learning_rate": 8.321216866122357e-06, "loss": 1.18170586, "memory(GiB)": 141.16, "step": 7440, "train_speed(iter/s)": 0.313305 }, { "acc": 0.71743059, "epoch": 0.08343970684253132, "grad_norm": 6.84375, "learning_rate": 8.343585728665698e-06, "loss": 1.15294905, "memory(GiB)": 141.16, "step": 7460, "train_speed(iter/s)": 0.313579 }, { "acc": 0.70390387, "epoch": 0.08366340578848985, "grad_norm": 8.1875, "learning_rate": 8.365954591209038e-06, "loss": 1.20189981, "memory(GiB)": 141.16, "step": 7480, "train_speed(iter/s)": 0.313838 }, { "acc": 0.72352753, "epoch": 0.08388710473444838, "grad_norm": 7.25, "learning_rate": 8.388323453752377e-06, "loss": 1.10145111, "memory(GiB)": 141.16, "step": 7500, "train_speed(iter/s)": 0.314101 }, { "acc": 0.71385088, "epoch": 0.08411080368040691, "grad_norm": 6.125, "learning_rate": 8.410692316295716e-06, "loss": 1.16132393, "memory(GiB)": 141.16, "step": 7520, "train_speed(iter/s)": 0.314375 }, { "acc": 0.71382227, "epoch": 0.08433450262636544, "grad_norm": 7.0625, "learning_rate": 8.433061178839057e-06, "loss": 1.1496151, "memory(GiB)": 141.16, "step": 7540, "train_speed(iter/s)": 0.31458 }, { "acc": 0.7065176, "epoch": 0.08455820157232397, "grad_norm": 8.5, "learning_rate": 8.455430041382397e-06, "loss": 1.19822636, "memory(GiB)": 141.16, "step": 7560, "train_speed(iter/s)": 0.314822 }, { "acc": 0.70520244, "epoch": 0.0847819005182825, "grad_norm": 6.78125, "learning_rate": 8.477798903925736e-06, "loss": 1.20282516, "memory(GiB)": 141.16, "step": 7580, "train_speed(iter/s)": 0.315078 }, { "acc": 0.7152297, "epoch": 0.08500559946424102, "grad_norm": 5.0625, "learning_rate": 8.500167766469076e-06, "loss": 1.13658752, "memory(GiB)": 141.16, "step": 7600, "train_speed(iter/s)": 0.315291 }, { "acc": 0.70678492, "epoch": 0.08522929841019955, "grad_norm": 7.9375, "learning_rate": 8.522536629012415e-06, "loss": 1.19770832, "memory(GiB)": 141.16, "step": 7620, "train_speed(iter/s)": 0.315553 }, { "acc": 0.72117043, "epoch": 0.08545299735615808, "grad_norm": 6.53125, "learning_rate": 8.544905491555754e-06, "loss": 1.1299305, "memory(GiB)": 141.16, "step": 7640, "train_speed(iter/s)": 0.315771 }, { "acc": 0.71528606, "epoch": 0.0856766963021166, "grad_norm": 7.40625, "learning_rate": 8.567274354099094e-06, "loss": 1.15379362, "memory(GiB)": 141.16, "step": 7660, "train_speed(iter/s)": 0.31601 }, { "acc": 0.71244287, "epoch": 0.08590039524807513, "grad_norm": 7.65625, "learning_rate": 8.589643216642435e-06, "loss": 1.16588526, "memory(GiB)": 141.16, "step": 7680, "train_speed(iter/s)": 0.316233 }, { "acc": 0.70960889, "epoch": 0.08612409419403368, "grad_norm": 6.3125, "learning_rate": 8.612012079185774e-06, "loss": 1.18273439, "memory(GiB)": 141.16, "step": 7700, "train_speed(iter/s)": 0.316435 }, { "acc": 0.7161622, "epoch": 0.0863477931399922, "grad_norm": 7.6875, "learning_rate": 8.634380941729113e-06, "loss": 1.15480938, "memory(GiB)": 141.16, "step": 7720, "train_speed(iter/s)": 0.316708 }, { "acc": 0.71423445, "epoch": 0.08657149208595073, "grad_norm": 8.1875, "learning_rate": 8.656749804272453e-06, "loss": 1.15549469, "memory(GiB)": 141.16, "step": 7740, "train_speed(iter/s)": 0.316965 }, { "acc": 0.70616369, "epoch": 0.08679519103190926, "grad_norm": 8.625, "learning_rate": 8.679118666815794e-06, "loss": 1.1968441, "memory(GiB)": 141.16, "step": 7760, "train_speed(iter/s)": 0.317178 }, { "acc": 0.71168804, "epoch": 0.08701888997786779, "grad_norm": 8.0, "learning_rate": 8.701487529359133e-06, "loss": 1.17479, "memory(GiB)": 141.16, "step": 7780, "train_speed(iter/s)": 0.317395 }, { "acc": 0.72053566, "epoch": 0.08724258892382632, "grad_norm": 9.0625, "learning_rate": 8.723856391902472e-06, "loss": 1.13488503, "memory(GiB)": 141.16, "step": 7800, "train_speed(iter/s)": 0.317653 }, { "acc": 0.70716314, "epoch": 0.08746628786978485, "grad_norm": 5.5, "learning_rate": 8.746225254445812e-06, "loss": 1.20437202, "memory(GiB)": 141.16, "step": 7820, "train_speed(iter/s)": 0.317902 }, { "acc": 0.71018648, "epoch": 0.08768998681574337, "grad_norm": 6.6875, "learning_rate": 8.768594116989151e-06, "loss": 1.18072243, "memory(GiB)": 141.16, "step": 7840, "train_speed(iter/s)": 0.318133 }, { "acc": 0.7207418, "epoch": 0.0879136857617019, "grad_norm": 7.78125, "learning_rate": 8.79096297953249e-06, "loss": 1.12091427, "memory(GiB)": 141.16, "step": 7860, "train_speed(iter/s)": 0.31836 }, { "acc": 0.72215738, "epoch": 0.08813738470766043, "grad_norm": 6.15625, "learning_rate": 8.813331842075832e-06, "loss": 1.13350239, "memory(GiB)": 141.16, "step": 7880, "train_speed(iter/s)": 0.318588 }, { "acc": 0.71722159, "epoch": 0.08836108365361896, "grad_norm": 6.375, "learning_rate": 8.835700704619171e-06, "loss": 1.14713058, "memory(GiB)": 141.16, "step": 7900, "train_speed(iter/s)": 0.318831 }, { "acc": 0.71197739, "epoch": 0.08858478259957749, "grad_norm": 5.9375, "learning_rate": 8.85806956716251e-06, "loss": 1.16932487, "memory(GiB)": 141.16, "step": 7920, "train_speed(iter/s)": 0.319076 }, { "acc": 0.70625763, "epoch": 0.08880848154553601, "grad_norm": 8.4375, "learning_rate": 8.88043842970585e-06, "loss": 1.19067173, "memory(GiB)": 141.16, "step": 7940, "train_speed(iter/s)": 0.319315 }, { "acc": 0.71665268, "epoch": 0.08903218049149454, "grad_norm": 6.84375, "learning_rate": 8.90280729224919e-06, "loss": 1.13891163, "memory(GiB)": 141.16, "step": 7960, "train_speed(iter/s)": 0.319536 }, { "acc": 0.71053896, "epoch": 0.08925587943745307, "grad_norm": 6.84375, "learning_rate": 8.92517615479253e-06, "loss": 1.17213249, "memory(GiB)": 141.16, "step": 7980, "train_speed(iter/s)": 0.319749 }, { "acc": 0.718748, "epoch": 0.0894795783834116, "grad_norm": 6.9375, "learning_rate": 8.94754501733587e-06, "loss": 1.13411446, "memory(GiB)": 141.16, "step": 8000, "train_speed(iter/s)": 0.320002 }, { "epoch": 0.0894795783834116, "eval_acc": 0.6695967612492009, "eval_loss": 1.1587116718292236, "eval_runtime": 2231.1412, "eval_samples_per_second": 33.742, "eval_steps_per_second": 16.871, "step": 8000 }, { "acc": 0.71932325, "epoch": 0.08970327732937014, "grad_norm": 7.65625, "learning_rate": 8.969913879879209e-06, "loss": 1.12569351, "memory(GiB)": 141.16, "step": 8020, "train_speed(iter/s)": 0.293516 }, { "acc": 0.70547142, "epoch": 0.08992697627532867, "grad_norm": 5.28125, "learning_rate": 8.99228274242255e-06, "loss": 1.19735203, "memory(GiB)": 141.16, "step": 8040, "train_speed(iter/s)": 0.293753 }, { "acc": 0.72309136, "epoch": 0.0901506752212872, "grad_norm": 8.5625, "learning_rate": 9.014651604965889e-06, "loss": 1.11686363, "memory(GiB)": 141.16, "step": 8060, "train_speed(iter/s)": 0.294019 }, { "acc": 0.70369825, "epoch": 0.09037437416724572, "grad_norm": 6.5625, "learning_rate": 9.037020467509228e-06, "loss": 1.21279373, "memory(GiB)": 141.16, "step": 8080, "train_speed(iter/s)": 0.294271 }, { "acc": 0.7045434, "epoch": 0.09059807311320425, "grad_norm": 6.9375, "learning_rate": 9.059389330052568e-06, "loss": 1.21017675, "memory(GiB)": 141.16, "step": 8100, "train_speed(iter/s)": 0.294542 }, { "acc": 0.7059433, "epoch": 0.09082177205916278, "grad_norm": 5.90625, "learning_rate": 9.081758192595907e-06, "loss": 1.20811014, "memory(GiB)": 141.16, "step": 8120, "train_speed(iter/s)": 0.294769 }, { "acc": 0.71574473, "epoch": 0.09104547100512131, "grad_norm": 7.5625, "learning_rate": 9.104127055139246e-06, "loss": 1.14583178, "memory(GiB)": 141.16, "step": 8140, "train_speed(iter/s)": 0.295 }, { "acc": 0.70517769, "epoch": 0.09126916995107984, "grad_norm": 5.6875, "learning_rate": 9.126495917682586e-06, "loss": 1.22298212, "memory(GiB)": 141.16, "step": 8160, "train_speed(iter/s)": 0.295253 }, { "acc": 0.71860967, "epoch": 0.09149286889703837, "grad_norm": 6.46875, "learning_rate": 9.148864780225927e-06, "loss": 1.14090099, "memory(GiB)": 141.16, "step": 8180, "train_speed(iter/s)": 0.295506 }, { "acc": 0.71474085, "epoch": 0.0917165678429969, "grad_norm": 8.1875, "learning_rate": 9.171233642769266e-06, "loss": 1.15051308, "memory(GiB)": 141.16, "step": 8200, "train_speed(iter/s)": 0.295738 }, { "acc": 0.70977011, "epoch": 0.09194026678895542, "grad_norm": 7.4375, "learning_rate": 9.193602505312606e-06, "loss": 1.17767773, "memory(GiB)": 141.16, "step": 8220, "train_speed(iter/s)": 0.295988 }, { "acc": 0.71591053, "epoch": 0.09216396573491395, "grad_norm": 8.5625, "learning_rate": 9.215971367855945e-06, "loss": 1.13090401, "memory(GiB)": 141.16, "step": 8240, "train_speed(iter/s)": 0.29624 }, { "acc": 0.72492337, "epoch": 0.09238766468087248, "grad_norm": 5.78125, "learning_rate": 9.238340230399286e-06, "loss": 1.09965029, "memory(GiB)": 141.16, "step": 8260, "train_speed(iter/s)": 0.29645 }, { "acc": 0.71484776, "epoch": 0.092611363626831, "grad_norm": 6.40625, "learning_rate": 9.260709092942625e-06, "loss": 1.15623379, "memory(GiB)": 141.16, "step": 8280, "train_speed(iter/s)": 0.296646 }, { "acc": 0.72368031, "epoch": 0.09283506257278953, "grad_norm": 5.15625, "learning_rate": 9.283077955485965e-06, "loss": 1.11338978, "memory(GiB)": 141.16, "step": 8300, "train_speed(iter/s)": 0.296862 }, { "acc": 0.72214203, "epoch": 0.09305876151874806, "grad_norm": 8.5625, "learning_rate": 9.305446818029304e-06, "loss": 1.12088203, "memory(GiB)": 141.16, "step": 8320, "train_speed(iter/s)": 0.297117 }, { "acc": 0.72608328, "epoch": 0.0932824604647066, "grad_norm": 5.0625, "learning_rate": 9.327815680572643e-06, "loss": 1.10275993, "memory(GiB)": 141.16, "step": 8340, "train_speed(iter/s)": 0.297334 }, { "acc": 0.71622853, "epoch": 0.09350615941066513, "grad_norm": 6.625, "learning_rate": 9.350184543115983e-06, "loss": 1.14040852, "memory(GiB)": 141.16, "step": 8360, "train_speed(iter/s)": 0.297583 }, { "acc": 0.70731578, "epoch": 0.09372985835662366, "grad_norm": 6.96875, "learning_rate": 9.372553405659322e-06, "loss": 1.19207439, "memory(GiB)": 141.16, "step": 8380, "train_speed(iter/s)": 0.297798 }, { "acc": 0.71130791, "epoch": 0.09395355730258219, "grad_norm": 6.65625, "learning_rate": 9.394922268202663e-06, "loss": 1.17943096, "memory(GiB)": 141.16, "step": 8400, "train_speed(iter/s)": 0.298029 }, { "acc": 0.71327915, "epoch": 0.09417725624854072, "grad_norm": 5.6875, "learning_rate": 9.417291130746002e-06, "loss": 1.18578548, "memory(GiB)": 141.16, "step": 8420, "train_speed(iter/s)": 0.298235 }, { "acc": 0.71498408, "epoch": 0.09440095519449924, "grad_norm": 6.28125, "learning_rate": 9.439659993289342e-06, "loss": 1.16287546, "memory(GiB)": 141.16, "step": 8440, "train_speed(iter/s)": 0.298473 }, { "acc": 0.71234188, "epoch": 0.09462465414045777, "grad_norm": 7.46875, "learning_rate": 9.462028855832681e-06, "loss": 1.17571268, "memory(GiB)": 141.16, "step": 8460, "train_speed(iter/s)": 0.298702 }, { "acc": 0.72509427, "epoch": 0.0948483530864163, "grad_norm": 4.9375, "learning_rate": 9.484397718376022e-06, "loss": 1.11577015, "memory(GiB)": 141.16, "step": 8480, "train_speed(iter/s)": 0.298939 }, { "acc": 0.72661161, "epoch": 0.09507205203237483, "grad_norm": 8.6875, "learning_rate": 9.506766580919362e-06, "loss": 1.10669432, "memory(GiB)": 141.16, "step": 8500, "train_speed(iter/s)": 0.299163 }, { "acc": 0.72819452, "epoch": 0.09529575097833336, "grad_norm": 6.46875, "learning_rate": 9.529135443462701e-06, "loss": 1.0868866, "memory(GiB)": 141.16, "step": 8520, "train_speed(iter/s)": 0.299398 }, { "acc": 0.70892701, "epoch": 0.09551944992429189, "grad_norm": 6.40625, "learning_rate": 9.55150430600604e-06, "loss": 1.19066906, "memory(GiB)": 141.16, "step": 8540, "train_speed(iter/s)": 0.299604 }, { "acc": 0.71882486, "epoch": 0.09574314887025041, "grad_norm": 7.09375, "learning_rate": 9.57387316854938e-06, "loss": 1.12249546, "memory(GiB)": 141.16, "step": 8560, "train_speed(iter/s)": 0.299827 }, { "acc": 0.7147963, "epoch": 0.09596684781620894, "grad_norm": 7.9375, "learning_rate": 9.596242031092719e-06, "loss": 1.15369244, "memory(GiB)": 141.16, "step": 8580, "train_speed(iter/s)": 0.300073 }, { "acc": 0.71383448, "epoch": 0.09619054676216747, "grad_norm": 7.375, "learning_rate": 9.618610893636058e-06, "loss": 1.16297455, "memory(GiB)": 141.16, "step": 8600, "train_speed(iter/s)": 0.300301 }, { "acc": 0.71744857, "epoch": 0.096414245708126, "grad_norm": 5.65625, "learning_rate": 9.6409797561794e-06, "loss": 1.1513176, "memory(GiB)": 141.16, "step": 8620, "train_speed(iter/s)": 0.300519 }, { "acc": 0.71971207, "epoch": 0.09663794465408454, "grad_norm": 6.875, "learning_rate": 9.663348618722739e-06, "loss": 1.13067532, "memory(GiB)": 141.16, "step": 8640, "train_speed(iter/s)": 0.300749 }, { "acc": 0.71451855, "epoch": 0.09686164360004307, "grad_norm": 6.6875, "learning_rate": 9.685717481266078e-06, "loss": 1.15176716, "memory(GiB)": 141.16, "step": 8660, "train_speed(iter/s)": 0.300974 }, { "acc": 0.7082942, "epoch": 0.0970853425460016, "grad_norm": 6.78125, "learning_rate": 9.708086343809419e-06, "loss": 1.18397446, "memory(GiB)": 141.16, "step": 8680, "train_speed(iter/s)": 0.301222 }, { "acc": 0.70710211, "epoch": 0.09730904149196012, "grad_norm": 7.3125, "learning_rate": 9.730455206352758e-06, "loss": 1.201682, "memory(GiB)": 141.16, "step": 8700, "train_speed(iter/s)": 0.301464 }, { "acc": 0.72472992, "epoch": 0.09753274043791865, "grad_norm": 6.78125, "learning_rate": 9.752824068896098e-06, "loss": 1.1116436, "memory(GiB)": 141.16, "step": 8720, "train_speed(iter/s)": 0.301702 }, { "acc": 0.721772, "epoch": 0.09775643938387718, "grad_norm": 7.34375, "learning_rate": 9.775192931439437e-06, "loss": 1.11486092, "memory(GiB)": 141.16, "step": 8740, "train_speed(iter/s)": 0.301905 }, { "acc": 0.71613264, "epoch": 0.09798013832983571, "grad_norm": 8.375, "learning_rate": 9.797561793982777e-06, "loss": 1.15829773, "memory(GiB)": 141.16, "step": 8760, "train_speed(iter/s)": 0.302102 }, { "acc": 0.71239414, "epoch": 0.09820383727579424, "grad_norm": 7.90625, "learning_rate": 9.819930656526116e-06, "loss": 1.17284374, "memory(GiB)": 141.16, "step": 8780, "train_speed(iter/s)": 0.30231 }, { "acc": 0.707232, "epoch": 0.09842753622175276, "grad_norm": 5.15625, "learning_rate": 9.842299519069455e-06, "loss": 1.19533186, "memory(GiB)": 141.16, "step": 8800, "train_speed(iter/s)": 0.302537 }, { "acc": 0.71829605, "epoch": 0.0986512351677113, "grad_norm": 7.3125, "learning_rate": 9.864668381612795e-06, "loss": 1.15611486, "memory(GiB)": 141.16, "step": 8820, "train_speed(iter/s)": 0.302767 }, { "acc": 0.72254786, "epoch": 0.09887493411366982, "grad_norm": 6.3125, "learning_rate": 9.887037244156136e-06, "loss": 1.10635147, "memory(GiB)": 141.16, "step": 8840, "train_speed(iter/s)": 0.302984 }, { "acc": 0.71805177, "epoch": 0.09909863305962835, "grad_norm": 7.125, "learning_rate": 9.909406106699475e-06, "loss": 1.14598961, "memory(GiB)": 141.16, "step": 8860, "train_speed(iter/s)": 0.303206 }, { "acc": 0.71767063, "epoch": 0.09932233200558688, "grad_norm": 8.25, "learning_rate": 9.931774969242814e-06, "loss": 1.15316067, "memory(GiB)": 141.16, "step": 8880, "train_speed(iter/s)": 0.303425 }, { "acc": 0.71223812, "epoch": 0.0995460309515454, "grad_norm": 6.9375, "learning_rate": 9.954143831786155e-06, "loss": 1.16494675, "memory(GiB)": 141.16, "step": 8900, "train_speed(iter/s)": 0.303631 }, { "acc": 0.70800028, "epoch": 0.09976972989750393, "grad_norm": 7.75, "learning_rate": 9.976512694329495e-06, "loss": 1.18339319, "memory(GiB)": 141.16, "step": 8920, "train_speed(iter/s)": 0.303869 }, { "acc": 0.70874577, "epoch": 0.09999342884346246, "grad_norm": 5.6875, "learning_rate": 9.998881556872834e-06, "loss": 1.17672577, "memory(GiB)": 141.16, "step": 8940, "train_speed(iter/s)": 0.304054 }, { "acc": 0.70547829, "epoch": 0.100217127789421, "grad_norm": 6.59375, "learning_rate": 9.999999691312751e-06, "loss": 1.20707188, "memory(GiB)": 141.16, "step": 8960, "train_speed(iter/s)": 0.304222 }, { "acc": 0.71141605, "epoch": 0.10044082673537953, "grad_norm": 6.5, "learning_rate": 9.999998699409167e-06, "loss": 1.17943668, "memory(GiB)": 141.16, "step": 8980, "train_speed(iter/s)": 0.304449 }, { "acc": 0.73094792, "epoch": 0.10066452568133806, "grad_norm": 7.15625, "learning_rate": 9.999997023434294e-06, "loss": 1.09484177, "memory(GiB)": 141.16, "step": 9000, "train_speed(iter/s)": 0.304648 }, { "acc": 0.72445908, "epoch": 0.10088822462729659, "grad_norm": 9.375, "learning_rate": 9.999994663388362e-06, "loss": 1.11308594, "memory(GiB)": 141.16, "step": 9020, "train_speed(iter/s)": 0.304853 }, { "acc": 0.70593333, "epoch": 0.10111192357325512, "grad_norm": 6.90625, "learning_rate": 9.999991619271693e-06, "loss": 1.18902664, "memory(GiB)": 141.16, "step": 9040, "train_speed(iter/s)": 0.30505 }, { "acc": 0.72304869, "epoch": 0.10133562251921364, "grad_norm": 7.96875, "learning_rate": 9.999987891084703e-06, "loss": 1.1258091, "memory(GiB)": 141.16, "step": 9060, "train_speed(iter/s)": 0.305277 }, { "acc": 0.72457733, "epoch": 0.10155932146517217, "grad_norm": 6.0625, "learning_rate": 9.999983478827906e-06, "loss": 1.11925278, "memory(GiB)": 141.16, "step": 9080, "train_speed(iter/s)": 0.305446 }, { "acc": 0.7163661, "epoch": 0.1017830204111307, "grad_norm": 8.5625, "learning_rate": 9.999978382501902e-06, "loss": 1.14900045, "memory(GiB)": 141.16, "step": 9100, "train_speed(iter/s)": 0.305649 }, { "acc": 0.70860815, "epoch": 0.10200671935708923, "grad_norm": 7.9375, "learning_rate": 9.999972602107388e-06, "loss": 1.19164524, "memory(GiB)": 141.16, "step": 9120, "train_speed(iter/s)": 0.305872 }, { "acc": 0.7061203, "epoch": 0.10223041830304776, "grad_norm": 5.8125, "learning_rate": 9.999966137645157e-06, "loss": 1.21669922, "memory(GiB)": 141.16, "step": 9140, "train_speed(iter/s)": 0.306093 }, { "acc": 0.69828596, "epoch": 0.10245411724900629, "grad_norm": 7.625, "learning_rate": 9.999958989116093e-06, "loss": 1.23983173, "memory(GiB)": 141.16, "step": 9160, "train_speed(iter/s)": 0.306303 }, { "acc": 0.72006087, "epoch": 0.10267781619496481, "grad_norm": 5.5, "learning_rate": 9.999951156521172e-06, "loss": 1.13935032, "memory(GiB)": 141.16, "step": 9180, "train_speed(iter/s)": 0.306517 }, { "acc": 0.71682901, "epoch": 0.10290151514092334, "grad_norm": 6.96875, "learning_rate": 9.999942639861467e-06, "loss": 1.15634184, "memory(GiB)": 141.16, "step": 9200, "train_speed(iter/s)": 0.306745 }, { "acc": 0.70610576, "epoch": 0.10312521408688187, "grad_norm": 7.125, "learning_rate": 9.999933439138144e-06, "loss": 1.20755911, "memory(GiB)": 141.16, "step": 9220, "train_speed(iter/s)": 0.306962 }, { "acc": 0.71804056, "epoch": 0.1033489130328404, "grad_norm": 6.1875, "learning_rate": 9.999923554352461e-06, "loss": 1.137286, "memory(GiB)": 141.16, "step": 9240, "train_speed(iter/s)": 0.307194 }, { "acc": 0.71043787, "epoch": 0.10357261197879893, "grad_norm": 8.9375, "learning_rate": 9.999912985505772e-06, "loss": 1.16284142, "memory(GiB)": 141.16, "step": 9260, "train_speed(iter/s)": 0.30739 }, { "acc": 0.71032882, "epoch": 0.10379631092475747, "grad_norm": 8.75, "learning_rate": 9.999901732599518e-06, "loss": 1.1818162, "memory(GiB)": 141.16, "step": 9280, "train_speed(iter/s)": 0.307602 }, { "acc": 0.73000984, "epoch": 0.104020009870716, "grad_norm": 6.90625, "learning_rate": 9.999889795635243e-06, "loss": 1.08515053, "memory(GiB)": 141.16, "step": 9300, "train_speed(iter/s)": 0.307814 }, { "acc": 0.72542405, "epoch": 0.10424370881667452, "grad_norm": 6.8125, "learning_rate": 9.99987717461458e-06, "loss": 1.11376743, "memory(GiB)": 141.16, "step": 9320, "train_speed(iter/s)": 0.308026 }, { "acc": 0.7300252, "epoch": 0.10446740776263305, "grad_norm": 8.4375, "learning_rate": 9.999863869539254e-06, "loss": 1.08728085, "memory(GiB)": 141.16, "step": 9340, "train_speed(iter/s)": 0.30825 }, { "acc": 0.71602769, "epoch": 0.10469110670859158, "grad_norm": 8.4375, "learning_rate": 9.999849880411086e-06, "loss": 1.14979744, "memory(GiB)": 141.16, "step": 9360, "train_speed(iter/s)": 0.308446 }, { "acc": 0.7283145, "epoch": 0.10491480565455011, "grad_norm": 6.0, "learning_rate": 9.99983520723199e-06, "loss": 1.07819672, "memory(GiB)": 141.16, "step": 9380, "train_speed(iter/s)": 0.308653 }, { "acc": 0.70960159, "epoch": 0.10513850460050864, "grad_norm": 5.59375, "learning_rate": 9.999819850003975e-06, "loss": 1.17764835, "memory(GiB)": 141.16, "step": 9400, "train_speed(iter/s)": 0.308866 }, { "acc": 0.72610078, "epoch": 0.10536220354646716, "grad_norm": 7.65625, "learning_rate": 9.99980380872914e-06, "loss": 1.09945335, "memory(GiB)": 141.16, "step": 9420, "train_speed(iter/s)": 0.309045 }, { "acc": 0.71468258, "epoch": 0.10558590249242569, "grad_norm": 6.625, "learning_rate": 9.999787083409679e-06, "loss": 1.15428362, "memory(GiB)": 141.16, "step": 9440, "train_speed(iter/s)": 0.309237 }, { "acc": 0.72211809, "epoch": 0.10580960143838422, "grad_norm": 7.125, "learning_rate": 9.999769674047883e-06, "loss": 1.12047319, "memory(GiB)": 141.16, "step": 9460, "train_speed(iter/s)": 0.309433 }, { "acc": 0.70703726, "epoch": 0.10603330038434275, "grad_norm": 6.75, "learning_rate": 9.999751580646132e-06, "loss": 1.18048964, "memory(GiB)": 141.16, "step": 9480, "train_speed(iter/s)": 0.309602 }, { "acc": 0.72303057, "epoch": 0.10625699933030128, "grad_norm": 6.40625, "learning_rate": 9.999732803206901e-06, "loss": 1.1100174, "memory(GiB)": 141.16, "step": 9500, "train_speed(iter/s)": 0.309822 }, { "acc": 0.71840858, "epoch": 0.1064806982762598, "grad_norm": 6.53125, "learning_rate": 9.999713341732762e-06, "loss": 1.13819494, "memory(GiB)": 141.16, "step": 9520, "train_speed(iter/s)": 0.310041 }, { "acc": 0.71922302, "epoch": 0.10670439722221833, "grad_norm": 7.34375, "learning_rate": 9.999693196226373e-06, "loss": 1.12713137, "memory(GiB)": 141.16, "step": 9540, "train_speed(iter/s)": 0.31023 }, { "acc": 0.71830711, "epoch": 0.10692809616817686, "grad_norm": 8.1875, "learning_rate": 9.999672366690494e-06, "loss": 1.14808664, "memory(GiB)": 141.16, "step": 9560, "train_speed(iter/s)": 0.310413 }, { "acc": 0.71473069, "epoch": 0.10715179511413539, "grad_norm": 6.9375, "learning_rate": 9.999650853127973e-06, "loss": 1.15781231, "memory(GiB)": 141.16, "step": 9580, "train_speed(iter/s)": 0.310633 }, { "acc": 0.71121063, "epoch": 0.10737549406009393, "grad_norm": 6.28125, "learning_rate": 9.999628655541754e-06, "loss": 1.15480309, "memory(GiB)": 141.16, "step": 9600, "train_speed(iter/s)": 0.310833 }, { "acc": 0.72246828, "epoch": 0.10759919300605246, "grad_norm": 7.03125, "learning_rate": 9.999605773934873e-06, "loss": 1.11923084, "memory(GiB)": 141.16, "step": 9620, "train_speed(iter/s)": 0.311029 }, { "acc": 0.73168583, "epoch": 0.10782289195201099, "grad_norm": 8.125, "learning_rate": 9.999582208310463e-06, "loss": 1.07572765, "memory(GiB)": 141.16, "step": 9640, "train_speed(iter/s)": 0.311249 }, { "acc": 0.71741576, "epoch": 0.10804659089796952, "grad_norm": 6.8125, "learning_rate": 9.999557958671746e-06, "loss": 1.14278221, "memory(GiB)": 141.16, "step": 9660, "train_speed(iter/s)": 0.311446 }, { "acc": 0.71837111, "epoch": 0.10827028984392804, "grad_norm": 6.65625, "learning_rate": 9.99953302502204e-06, "loss": 1.14450769, "memory(GiB)": 141.16, "step": 9680, "train_speed(iter/s)": 0.311645 }, { "acc": 0.72764072, "epoch": 0.10849398878988657, "grad_norm": 8.1875, "learning_rate": 9.999507407364755e-06, "loss": 1.09211884, "memory(GiB)": 141.16, "step": 9700, "train_speed(iter/s)": 0.31182 }, { "acc": 0.72260532, "epoch": 0.1087176877358451, "grad_norm": 5.53125, "learning_rate": 9.999481105703397e-06, "loss": 1.1027195, "memory(GiB)": 141.16, "step": 9720, "train_speed(iter/s)": 0.312006 }, { "acc": 0.71677752, "epoch": 0.10894138668180363, "grad_norm": 7.5, "learning_rate": 9.999454120041567e-06, "loss": 1.13859329, "memory(GiB)": 141.16, "step": 9740, "train_speed(iter/s)": 0.31217 }, { "acc": 0.7307642, "epoch": 0.10916508562776216, "grad_norm": 9.3125, "learning_rate": 9.999426450382953e-06, "loss": 1.07876091, "memory(GiB)": 141.16, "step": 9760, "train_speed(iter/s)": 0.312379 }, { "acc": 0.70318599, "epoch": 0.10938878457372068, "grad_norm": 5.78125, "learning_rate": 9.999398096731343e-06, "loss": 1.20900669, "memory(GiB)": 141.16, "step": 9780, "train_speed(iter/s)": 0.312582 }, { "acc": 0.72007236, "epoch": 0.10961248351967921, "grad_norm": 6.78125, "learning_rate": 9.999369059090616e-06, "loss": 1.13052082, "memory(GiB)": 141.16, "step": 9800, "train_speed(iter/s)": 0.312782 }, { "acc": 0.71256857, "epoch": 0.10983618246563774, "grad_norm": 7.5, "learning_rate": 9.999339337464744e-06, "loss": 1.16625557, "memory(GiB)": 141.16, "step": 9820, "train_speed(iter/s)": 0.312962 }, { "acc": 0.72264123, "epoch": 0.11005988141159627, "grad_norm": 6.1875, "learning_rate": 9.999308931857794e-06, "loss": 1.10587292, "memory(GiB)": 141.16, "step": 9840, "train_speed(iter/s)": 0.313145 }, { "acc": 0.70891428, "epoch": 0.1102835803575548, "grad_norm": 6.90625, "learning_rate": 9.999277842273925e-06, "loss": 1.17519455, "memory(GiB)": 141.16, "step": 9860, "train_speed(iter/s)": 0.31333 }, { "acc": 0.71576076, "epoch": 0.11050727930351333, "grad_norm": 7.25, "learning_rate": 9.99924606871739e-06, "loss": 1.15565577, "memory(GiB)": 141.16, "step": 9880, "train_speed(iter/s)": 0.313514 }, { "acc": 0.70867152, "epoch": 0.11073097824947187, "grad_norm": 6.65625, "learning_rate": 9.999213611192537e-06, "loss": 1.18384323, "memory(GiB)": 141.16, "step": 9900, "train_speed(iter/s)": 0.313699 }, { "acc": 0.72997732, "epoch": 0.1109546771954304, "grad_norm": 6.21875, "learning_rate": 9.999180469703809e-06, "loss": 1.07277632, "memory(GiB)": 141.16, "step": 9920, "train_speed(iter/s)": 0.31389 }, { "acc": 0.7202486, "epoch": 0.11117837614138892, "grad_norm": 8.125, "learning_rate": 9.999146644255738e-06, "loss": 1.14711704, "memory(GiB)": 141.16, "step": 9940, "train_speed(iter/s)": 0.314082 }, { "acc": 0.71830425, "epoch": 0.11140207508734745, "grad_norm": 6.8125, "learning_rate": 9.99911213485295e-06, "loss": 1.13535919, "memory(GiB)": 141.16, "step": 9960, "train_speed(iter/s)": 0.314255 }, { "acc": 0.713626, "epoch": 0.11162577403330598, "grad_norm": 6.40625, "learning_rate": 9.999076941500167e-06, "loss": 1.15873194, "memory(GiB)": 141.16, "step": 9980, "train_speed(iter/s)": 0.314433 }, { "acc": 0.72129707, "epoch": 0.11184947297926451, "grad_norm": 9.3125, "learning_rate": 9.999041064202208e-06, "loss": 1.12825861, "memory(GiB)": 141.16, "step": 10000, "train_speed(iter/s)": 0.314629 }, { "epoch": 0.11184947297926451, "eval_acc": 0.674384994650327, "eval_loss": 1.1401276588439941, "eval_runtime": 2236.98, "eval_samples_per_second": 33.654, "eval_steps_per_second": 16.827, "step": 10000 }, { "acc": 0.72068505, "epoch": 0.11207317192522304, "grad_norm": 7.5625, "learning_rate": 9.999004502963978e-06, "loss": 1.13043289, "memory(GiB)": 141.16, "step": 10020, "train_speed(iter/s)": 0.293713 }, { "acc": 0.70924597, "epoch": 0.11229687087118156, "grad_norm": 6.0, "learning_rate": 9.99896725779048e-06, "loss": 1.17785358, "memory(GiB)": 141.16, "step": 10040, "train_speed(iter/s)": 0.293915 }, { "acc": 0.71802025, "epoch": 0.11252056981714009, "grad_norm": 7.0, "learning_rate": 9.998929328686808e-06, "loss": 1.14819813, "memory(GiB)": 141.16, "step": 10060, "train_speed(iter/s)": 0.294135 }, { "acc": 0.71291761, "epoch": 0.11274426876309862, "grad_norm": 6.09375, "learning_rate": 9.998890715658153e-06, "loss": 1.14421282, "memory(GiB)": 141.16, "step": 10080, "train_speed(iter/s)": 0.294323 }, { "acc": 0.71999254, "epoch": 0.11296796770905715, "grad_norm": 7.21875, "learning_rate": 9.998851418709798e-06, "loss": 1.12922592, "memory(GiB)": 141.16, "step": 10100, "train_speed(iter/s)": 0.294518 }, { "acc": 0.72185888, "epoch": 0.11319166665501568, "grad_norm": 6.15625, "learning_rate": 9.998811437847117e-06, "loss": 1.11468258, "memory(GiB)": 141.16, "step": 10120, "train_speed(iter/s)": 0.294706 }, { "acc": 0.72071781, "epoch": 0.1134153656009742, "grad_norm": 7.4375, "learning_rate": 9.998770773075586e-06, "loss": 1.12565823, "memory(GiB)": 141.16, "step": 10140, "train_speed(iter/s)": 0.294899 }, { "acc": 0.72406244, "epoch": 0.11363906454693273, "grad_norm": 7.34375, "learning_rate": 9.998729424400761e-06, "loss": 1.12807503, "memory(GiB)": 141.16, "step": 10160, "train_speed(iter/s)": 0.295085 }, { "acc": 0.71305866, "epoch": 0.11386276349289126, "grad_norm": 6.78125, "learning_rate": 9.998687391828303e-06, "loss": 1.15329161, "memory(GiB)": 141.16, "step": 10180, "train_speed(iter/s)": 0.295294 }, { "acc": 0.71626377, "epoch": 0.11408646243884979, "grad_norm": 7.09375, "learning_rate": 9.998644675363961e-06, "loss": 1.14867506, "memory(GiB)": 141.16, "step": 10200, "train_speed(iter/s)": 0.295477 }, { "acc": 0.71969161, "epoch": 0.11431016138480833, "grad_norm": 7.21875, "learning_rate": 9.998601275013584e-06, "loss": 1.12677259, "memory(GiB)": 141.16, "step": 10220, "train_speed(iter/s)": 0.29569 }, { "acc": 0.71895137, "epoch": 0.11453386033076686, "grad_norm": 6.375, "learning_rate": 9.998557190783104e-06, "loss": 1.15654469, "memory(GiB)": 141.16, "step": 10240, "train_speed(iter/s)": 0.295882 }, { "acc": 0.724614, "epoch": 0.11475755927672539, "grad_norm": 8.0625, "learning_rate": 9.998512422678555e-06, "loss": 1.12359009, "memory(GiB)": 141.16, "step": 10260, "train_speed(iter/s)": 0.296075 }, { "acc": 0.72085872, "epoch": 0.11498125822268392, "grad_norm": 9.0625, "learning_rate": 9.99846697070606e-06, "loss": 1.13310089, "memory(GiB)": 141.16, "step": 10280, "train_speed(iter/s)": 0.296281 }, { "acc": 0.72497964, "epoch": 0.11520495716864244, "grad_norm": 6.65625, "learning_rate": 9.99842083487184e-06, "loss": 1.12004557, "memory(GiB)": 141.16, "step": 10300, "train_speed(iter/s)": 0.296475 }, { "acc": 0.71920357, "epoch": 0.11542865611460097, "grad_norm": 8.8125, "learning_rate": 9.998374015182205e-06, "loss": 1.13476601, "memory(GiB)": 141.16, "step": 10320, "train_speed(iter/s)": 0.296659 }, { "acc": 0.70440826, "epoch": 0.1156523550605595, "grad_norm": 8.25, "learning_rate": 9.998326511643562e-06, "loss": 1.20229874, "memory(GiB)": 141.16, "step": 10340, "train_speed(iter/s)": 0.296862 }, { "acc": 0.7170352, "epoch": 0.11587605400651803, "grad_norm": 8.1875, "learning_rate": 9.998278324262408e-06, "loss": 1.14598255, "memory(GiB)": 141.16, "step": 10360, "train_speed(iter/s)": 0.297061 }, { "acc": 0.72517843, "epoch": 0.11609975295247656, "grad_norm": 8.125, "learning_rate": 9.998229453045341e-06, "loss": 1.11194, "memory(GiB)": 141.16, "step": 10380, "train_speed(iter/s)": 0.297271 }, { "acc": 0.71765299, "epoch": 0.11632345189843508, "grad_norm": 7.5625, "learning_rate": 9.998179897999041e-06, "loss": 1.12464085, "memory(GiB)": 141.16, "step": 10400, "train_speed(iter/s)": 0.297458 }, { "acc": 0.71761885, "epoch": 0.11654715084439361, "grad_norm": 7.375, "learning_rate": 9.998129659130292e-06, "loss": 1.13767147, "memory(GiB)": 141.16, "step": 10420, "train_speed(iter/s)": 0.297633 }, { "acc": 0.7339201, "epoch": 0.11677084979035214, "grad_norm": 6.3125, "learning_rate": 9.998078736445964e-06, "loss": 1.05534534, "memory(GiB)": 141.16, "step": 10440, "train_speed(iter/s)": 0.297826 }, { "acc": 0.71597176, "epoch": 0.11699454873631067, "grad_norm": 6.1875, "learning_rate": 9.998027129953027e-06, "loss": 1.16383839, "memory(GiB)": 141.16, "step": 10460, "train_speed(iter/s)": 0.298048 }, { "acc": 0.72279434, "epoch": 0.1172182476822692, "grad_norm": 6.59375, "learning_rate": 9.99797483965854e-06, "loss": 1.10671072, "memory(GiB)": 141.16, "step": 10480, "train_speed(iter/s)": 0.298229 }, { "acc": 0.72658777, "epoch": 0.11744194662822773, "grad_norm": 6.78125, "learning_rate": 9.997921865569657e-06, "loss": 1.08834095, "memory(GiB)": 141.16, "step": 10500, "train_speed(iter/s)": 0.298423 }, { "acc": 0.73186712, "epoch": 0.11766564557418625, "grad_norm": 5.9375, "learning_rate": 9.997868207693628e-06, "loss": 1.08183098, "memory(GiB)": 141.16, "step": 10520, "train_speed(iter/s)": 0.298629 }, { "acc": 0.72167244, "epoch": 0.1178893445201448, "grad_norm": 8.0625, "learning_rate": 9.997813866037792e-06, "loss": 1.11885796, "memory(GiB)": 141.16, "step": 10540, "train_speed(iter/s)": 0.298815 }, { "acc": 0.7101047, "epoch": 0.11811304346610332, "grad_norm": 6.21875, "learning_rate": 9.99775884060958e-06, "loss": 1.17942772, "memory(GiB)": 141.16, "step": 10560, "train_speed(iter/s)": 0.298988 }, { "acc": 0.71388707, "epoch": 0.11833674241206185, "grad_norm": 6.59375, "learning_rate": 9.997703131416527e-06, "loss": 1.14271603, "memory(GiB)": 141.16, "step": 10580, "train_speed(iter/s)": 0.299189 }, { "acc": 0.70784111, "epoch": 0.11856044135802038, "grad_norm": 6.40625, "learning_rate": 9.997646738466254e-06, "loss": 1.18877096, "memory(GiB)": 141.16, "step": 10600, "train_speed(iter/s)": 0.299384 }, { "acc": 0.71550021, "epoch": 0.11878414030397891, "grad_norm": 7.21875, "learning_rate": 9.997589661766471e-06, "loss": 1.16848297, "memory(GiB)": 141.16, "step": 10620, "train_speed(iter/s)": 0.299568 }, { "acc": 0.71500711, "epoch": 0.11900783924993744, "grad_norm": 7.71875, "learning_rate": 9.997531901324991e-06, "loss": 1.13316841, "memory(GiB)": 141.16, "step": 10640, "train_speed(iter/s)": 0.299772 }, { "acc": 0.72571502, "epoch": 0.11923153819589596, "grad_norm": 10.4375, "learning_rate": 9.997473457149717e-06, "loss": 1.09239511, "memory(GiB)": 141.16, "step": 10660, "train_speed(iter/s)": 0.299972 }, { "acc": 0.71923914, "epoch": 0.11945523714185449, "grad_norm": 6.40625, "learning_rate": 9.997414329248642e-06, "loss": 1.16623287, "memory(GiB)": 141.16, "step": 10680, "train_speed(iter/s)": 0.300162 }, { "acc": 0.7287159, "epoch": 0.11967893608781302, "grad_norm": 6.90625, "learning_rate": 9.99735451762986e-06, "loss": 1.08237343, "memory(GiB)": 141.16, "step": 10700, "train_speed(iter/s)": 0.300351 }, { "acc": 0.72079935, "epoch": 0.11990263503377155, "grad_norm": 7.46875, "learning_rate": 9.99729402230155e-06, "loss": 1.13482189, "memory(GiB)": 141.16, "step": 10720, "train_speed(iter/s)": 0.30054 }, { "acc": 0.72338629, "epoch": 0.12012633397973008, "grad_norm": 7.84375, "learning_rate": 9.99723284327199e-06, "loss": 1.11242695, "memory(GiB)": 141.16, "step": 10740, "train_speed(iter/s)": 0.300738 }, { "acc": 0.72439075, "epoch": 0.1203500329256886, "grad_norm": 7.59375, "learning_rate": 9.997170980549547e-06, "loss": 1.09978542, "memory(GiB)": 141.16, "step": 10760, "train_speed(iter/s)": 0.300921 }, { "acc": 0.73005772, "epoch": 0.12057373187164713, "grad_norm": 7.15625, "learning_rate": 9.99710843414269e-06, "loss": 1.09595928, "memory(GiB)": 141.16, "step": 10780, "train_speed(iter/s)": 0.301106 }, { "acc": 0.72470946, "epoch": 0.12079743081760566, "grad_norm": 7.34375, "learning_rate": 9.997045204059977e-06, "loss": 1.11520901, "memory(GiB)": 141.16, "step": 10800, "train_speed(iter/s)": 0.301285 }, { "acc": 0.72279916, "epoch": 0.12102112976356419, "grad_norm": 7.4375, "learning_rate": 9.996981290310052e-06, "loss": 1.12368851, "memory(GiB)": 141.16, "step": 10820, "train_speed(iter/s)": 0.301459 }, { "acc": 0.7202384, "epoch": 0.12124482870952273, "grad_norm": 7.09375, "learning_rate": 9.996916692901665e-06, "loss": 1.13381062, "memory(GiB)": 141.16, "step": 10840, "train_speed(iter/s)": 0.301641 }, { "acc": 0.72855463, "epoch": 0.12146852765548126, "grad_norm": 6.8125, "learning_rate": 9.996851411843652e-06, "loss": 1.10309105, "memory(GiB)": 141.16, "step": 10860, "train_speed(iter/s)": 0.301837 }, { "acc": 0.71766605, "epoch": 0.12169222660143979, "grad_norm": 8.125, "learning_rate": 9.996785447144943e-06, "loss": 1.13328695, "memory(GiB)": 141.16, "step": 10880, "train_speed(iter/s)": 0.302016 }, { "acc": 0.71138263, "epoch": 0.12191592554739832, "grad_norm": 6.15625, "learning_rate": 9.996718798814565e-06, "loss": 1.14381657, "memory(GiB)": 141.16, "step": 10900, "train_speed(iter/s)": 0.302208 }, { "acc": 0.71838317, "epoch": 0.12213962449335684, "grad_norm": 6.28125, "learning_rate": 9.996651466861636e-06, "loss": 1.13969288, "memory(GiB)": 141.16, "step": 10920, "train_speed(iter/s)": 0.302394 }, { "acc": 0.72201343, "epoch": 0.12236332343931537, "grad_norm": 8.25, "learning_rate": 9.996583451295368e-06, "loss": 1.12809734, "memory(GiB)": 141.16, "step": 10940, "train_speed(iter/s)": 0.302606 }, { "acc": 0.72088943, "epoch": 0.1225870223852739, "grad_norm": 8.625, "learning_rate": 9.996514752125065e-06, "loss": 1.11500988, "memory(GiB)": 141.16, "step": 10960, "train_speed(iter/s)": 0.302784 }, { "acc": 0.71869192, "epoch": 0.12281072133123243, "grad_norm": 7.84375, "learning_rate": 9.996445369360129e-06, "loss": 1.1357317, "memory(GiB)": 141.16, "step": 10980, "train_speed(iter/s)": 0.302934 }, { "acc": 0.71287622, "epoch": 0.12303442027719096, "grad_norm": 7.75, "learning_rate": 9.996375303010051e-06, "loss": 1.13787851, "memory(GiB)": 141.16, "step": 11000, "train_speed(iter/s)": 0.303111 }, { "acc": 0.71206837, "epoch": 0.12325811922314948, "grad_norm": 6.21875, "learning_rate": 9.996304553084416e-06, "loss": 1.1671052, "memory(GiB)": 141.16, "step": 11020, "train_speed(iter/s)": 0.303293 }, { "acc": 0.708147, "epoch": 0.12348181816910801, "grad_norm": 6.84375, "learning_rate": 9.996233119592905e-06, "loss": 1.19803085, "memory(GiB)": 141.16, "step": 11040, "train_speed(iter/s)": 0.303466 }, { "acc": 0.70491347, "epoch": 0.12370551711506654, "grad_norm": 6.125, "learning_rate": 9.996161002545288e-06, "loss": 1.20548143, "memory(GiB)": 141.16, "step": 11060, "train_speed(iter/s)": 0.30365 }, { "acc": 0.73745718, "epoch": 0.12392921606102507, "grad_norm": 6.0, "learning_rate": 9.996088201951438e-06, "loss": 1.05149117, "memory(GiB)": 141.16, "step": 11080, "train_speed(iter/s)": 0.303832 }, { "acc": 0.72144356, "epoch": 0.1241529150069836, "grad_norm": 6.4375, "learning_rate": 9.996014717821309e-06, "loss": 1.11468468, "memory(GiB)": 141.16, "step": 11100, "train_speed(iter/s)": 0.304015 }, { "acc": 0.71301227, "epoch": 0.12437661395294212, "grad_norm": 7.5, "learning_rate": 9.995940550164958e-06, "loss": 1.15985403, "memory(GiB)": 141.16, "step": 11120, "train_speed(iter/s)": 0.304195 }, { "acc": 0.7139502, "epoch": 0.12460031289890065, "grad_norm": 6.0625, "learning_rate": 9.995865698992531e-06, "loss": 1.15104771, "memory(GiB)": 141.16, "step": 11140, "train_speed(iter/s)": 0.304337 }, { "acc": 0.7143796, "epoch": 0.1248240118448592, "grad_norm": 5.5625, "learning_rate": 9.995790164314269e-06, "loss": 1.14285736, "memory(GiB)": 141.16, "step": 11160, "train_speed(iter/s)": 0.304513 }, { "acc": 0.71981759, "epoch": 0.12504771079081772, "grad_norm": 8.8125, "learning_rate": 9.995713946140507e-06, "loss": 1.13705692, "memory(GiB)": 141.16, "step": 11180, "train_speed(iter/s)": 0.304698 }, { "acc": 0.71992197, "epoch": 0.12527140973677625, "grad_norm": 6.90625, "learning_rate": 9.99563704448167e-06, "loss": 1.12847939, "memory(GiB)": 141.16, "step": 11200, "train_speed(iter/s)": 0.304864 }, { "acc": 0.72531848, "epoch": 0.12549510868273478, "grad_norm": 6.90625, "learning_rate": 9.995559459348282e-06, "loss": 1.10917587, "memory(GiB)": 141.16, "step": 11220, "train_speed(iter/s)": 0.305045 }, { "acc": 0.7182436, "epoch": 0.1257188076286933, "grad_norm": 7.65625, "learning_rate": 9.995481190750958e-06, "loss": 1.14892206, "memory(GiB)": 141.16, "step": 11240, "train_speed(iter/s)": 0.305192 }, { "acc": 0.70796075, "epoch": 0.12594250657465184, "grad_norm": 6.6875, "learning_rate": 9.995402238700406e-06, "loss": 1.18919764, "memory(GiB)": 141.16, "step": 11260, "train_speed(iter/s)": 0.305369 }, { "acc": 0.73323016, "epoch": 0.12616620552061036, "grad_norm": 7.375, "learning_rate": 9.995322603207425e-06, "loss": 1.06931639, "memory(GiB)": 141.16, "step": 11280, "train_speed(iter/s)": 0.305533 }, { "acc": 0.72061939, "epoch": 0.1263899044665689, "grad_norm": 7.3125, "learning_rate": 9.995242284282912e-06, "loss": 1.1151309, "memory(GiB)": 141.16, "step": 11300, "train_speed(iter/s)": 0.305698 }, { "acc": 0.73728628, "epoch": 0.12661360341252742, "grad_norm": 6.6875, "learning_rate": 9.995161281937858e-06, "loss": 1.06106615, "memory(GiB)": 141.16, "step": 11320, "train_speed(iter/s)": 0.30587 }, { "acc": 0.7249033, "epoch": 0.12683730235848595, "grad_norm": 6.15625, "learning_rate": 9.995079596183343e-06, "loss": 1.10987644, "memory(GiB)": 141.16, "step": 11340, "train_speed(iter/s)": 0.306028 }, { "acc": 0.71769648, "epoch": 0.12706100130444448, "grad_norm": 7.03125, "learning_rate": 9.994997227030543e-06, "loss": 1.14149361, "memory(GiB)": 141.16, "step": 11360, "train_speed(iter/s)": 0.306183 }, { "acc": 0.71928864, "epoch": 0.127284700250403, "grad_norm": 8.625, "learning_rate": 9.994914174490727e-06, "loss": 1.11380854, "memory(GiB)": 141.16, "step": 11380, "train_speed(iter/s)": 0.306354 }, { "acc": 0.73417563, "epoch": 0.12750839919636153, "grad_norm": 8.0625, "learning_rate": 9.994830438575257e-06, "loss": 1.06210117, "memory(GiB)": 141.16, "step": 11400, "train_speed(iter/s)": 0.306511 }, { "acc": 0.71360426, "epoch": 0.12773209814232006, "grad_norm": 8.5, "learning_rate": 9.994746019295592e-06, "loss": 1.14176569, "memory(GiB)": 141.16, "step": 11420, "train_speed(iter/s)": 0.306678 }, { "acc": 0.7104373, "epoch": 0.1279557970882786, "grad_norm": 6.46875, "learning_rate": 9.994660916663279e-06, "loss": 1.16514835, "memory(GiB)": 141.16, "step": 11440, "train_speed(iter/s)": 0.306858 }, { "acc": 0.71732903, "epoch": 0.12817949603423712, "grad_norm": 7.125, "learning_rate": 9.994575130689963e-06, "loss": 1.12920132, "memory(GiB)": 141.16, "step": 11460, "train_speed(iter/s)": 0.307035 }, { "acc": 0.71096821, "epoch": 0.12840319498019565, "grad_norm": 7.0, "learning_rate": 9.99448866138738e-06, "loss": 1.16533298, "memory(GiB)": 141.16, "step": 11480, "train_speed(iter/s)": 0.307186 }, { "acc": 0.71320901, "epoch": 0.12862689392615417, "grad_norm": 6.40625, "learning_rate": 9.994401508767361e-06, "loss": 1.174119, "memory(GiB)": 141.16, "step": 11500, "train_speed(iter/s)": 0.307358 }, { "acc": 0.72359753, "epoch": 0.1288505928721127, "grad_norm": 8.125, "learning_rate": 9.994313672841829e-06, "loss": 1.10466137, "memory(GiB)": 141.16, "step": 11520, "train_speed(iter/s)": 0.307535 }, { "acc": 0.73395538, "epoch": 0.12907429181807123, "grad_norm": 7.15625, "learning_rate": 9.994225153622801e-06, "loss": 1.05791683, "memory(GiB)": 141.16, "step": 11540, "train_speed(iter/s)": 0.307698 }, { "acc": 0.70833044, "epoch": 0.12929799076402976, "grad_norm": 8.5, "learning_rate": 9.994135951122387e-06, "loss": 1.18450031, "memory(GiB)": 141.16, "step": 11560, "train_speed(iter/s)": 0.307891 }, { "acc": 0.72284694, "epoch": 0.12952168970998829, "grad_norm": 7.21875, "learning_rate": 9.994046065352794e-06, "loss": 1.1054266, "memory(GiB)": 141.16, "step": 11580, "train_speed(iter/s)": 0.308059 }, { "acc": 0.71486034, "epoch": 0.12974538865594681, "grad_norm": 6.96875, "learning_rate": 9.993955496326318e-06, "loss": 1.15618706, "memory(GiB)": 141.16, "step": 11600, "train_speed(iter/s)": 0.308232 }, { "acc": 0.72342367, "epoch": 0.12996908760190534, "grad_norm": 6.90625, "learning_rate": 9.99386424405535e-06, "loss": 1.10747833, "memory(GiB)": 141.16, "step": 11620, "train_speed(iter/s)": 0.308393 }, { "acc": 0.71724834, "epoch": 0.1301927865478639, "grad_norm": 6.71875, "learning_rate": 9.993772308552374e-06, "loss": 1.13667469, "memory(GiB)": 141.16, "step": 11640, "train_speed(iter/s)": 0.30856 }, { "acc": 0.71977472, "epoch": 0.13041648549382243, "grad_norm": 6.625, "learning_rate": 9.993679689829968e-06, "loss": 1.12059879, "memory(GiB)": 141.16, "step": 11660, "train_speed(iter/s)": 0.308703 }, { "acc": 0.70723848, "epoch": 0.13064018443978095, "grad_norm": 7.0, "learning_rate": 9.993586387900805e-06, "loss": 1.19163189, "memory(GiB)": 141.16, "step": 11680, "train_speed(iter/s)": 0.30888 }, { "acc": 0.72030325, "epoch": 0.13086388338573948, "grad_norm": 5.96875, "learning_rate": 9.99349240277765e-06, "loss": 1.14548569, "memory(GiB)": 141.16, "step": 11700, "train_speed(iter/s)": 0.309042 }, { "acc": 0.71539774, "epoch": 0.131087582331698, "grad_norm": 7.4375, "learning_rate": 9.99339773447336e-06, "loss": 1.16689682, "memory(GiB)": 141.16, "step": 11720, "train_speed(iter/s)": 0.309205 }, { "acc": 0.7123106, "epoch": 0.13131128127765654, "grad_norm": 6.5, "learning_rate": 9.993302383000887e-06, "loss": 1.1538662, "memory(GiB)": 141.16, "step": 11740, "train_speed(iter/s)": 0.309353 }, { "acc": 0.70649099, "epoch": 0.13153498022361507, "grad_norm": 6.09375, "learning_rate": 9.993206348373278e-06, "loss": 1.1886611, "memory(GiB)": 141.16, "step": 11760, "train_speed(iter/s)": 0.30953 }, { "acc": 0.72167072, "epoch": 0.1317586791695736, "grad_norm": 6.53125, "learning_rate": 9.993109630603672e-06, "loss": 1.12690525, "memory(GiB)": 141.16, "step": 11780, "train_speed(iter/s)": 0.309679 }, { "acc": 0.71053247, "epoch": 0.13198237811553212, "grad_norm": 7.90625, "learning_rate": 9.993012229705302e-06, "loss": 1.16362762, "memory(GiB)": 141.16, "step": 11800, "train_speed(iter/s)": 0.309814 }, { "acc": 0.71496906, "epoch": 0.13220607706149065, "grad_norm": 7.71875, "learning_rate": 9.99291414569149e-06, "loss": 1.16085796, "memory(GiB)": 141.16, "step": 11820, "train_speed(iter/s)": 0.309975 }, { "acc": 0.72172432, "epoch": 0.13242977600744918, "grad_norm": 7.3125, "learning_rate": 9.992815378575658e-06, "loss": 1.12694006, "memory(GiB)": 141.16, "step": 11840, "train_speed(iter/s)": 0.310128 }, { "acc": 0.73010559, "epoch": 0.1326534749534077, "grad_norm": 9.75, "learning_rate": 9.992715928371318e-06, "loss": 1.1042448, "memory(GiB)": 141.16, "step": 11860, "train_speed(iter/s)": 0.310284 }, { "acc": 0.72427034, "epoch": 0.13287717389936624, "grad_norm": 7.6875, "learning_rate": 9.992615795092078e-06, "loss": 1.10511532, "memory(GiB)": 141.16, "step": 11880, "train_speed(iter/s)": 0.310457 }, { "acc": 0.72276354, "epoch": 0.13310087284532476, "grad_norm": 6.25, "learning_rate": 9.992514978751635e-06, "loss": 1.09967136, "memory(GiB)": 141.16, "step": 11900, "train_speed(iter/s)": 0.310624 }, { "acc": 0.7249917, "epoch": 0.1333245717912833, "grad_norm": 6.90625, "learning_rate": 9.992413479363785e-06, "loss": 1.11063919, "memory(GiB)": 141.16, "step": 11920, "train_speed(iter/s)": 0.310764 }, { "acc": 0.72075529, "epoch": 0.13354827073724182, "grad_norm": 7.1875, "learning_rate": 9.992311296942412e-06, "loss": 1.13336086, "memory(GiB)": 141.16, "step": 11940, "train_speed(iter/s)": 0.310932 }, { "acc": 0.72545686, "epoch": 0.13377196968320035, "grad_norm": 8.5, "learning_rate": 9.992208431501495e-06, "loss": 1.12074432, "memory(GiB)": 141.16, "step": 11960, "train_speed(iter/s)": 0.311097 }, { "acc": 0.7263813, "epoch": 0.13399566862915888, "grad_norm": 6.75, "learning_rate": 9.992104883055112e-06, "loss": 1.1069253, "memory(GiB)": 141.16, "step": 11980, "train_speed(iter/s)": 0.311262 }, { "acc": 0.71217098, "epoch": 0.1342193675751174, "grad_norm": 7.15625, "learning_rate": 9.992000651617429e-06, "loss": 1.18231659, "memory(GiB)": 141.16, "step": 12000, "train_speed(iter/s)": 0.311426 }, { "epoch": 0.1342193675751174, "eval_acc": 0.6774369505641312, "eval_loss": 1.1278555393218994, "eval_runtime": 2222.4292, "eval_samples_per_second": 33.874, "eval_steps_per_second": 16.937, "step": 12000 }, { "acc": 0.7306982, "epoch": 0.13444306652107593, "grad_norm": 7.21875, "learning_rate": 9.991895737202701e-06, "loss": 1.08962479, "memory(GiB)": 141.16, "step": 12020, "train_speed(iter/s)": 0.294265 }, { "acc": 0.72208152, "epoch": 0.13466676546703446, "grad_norm": 6.65625, "learning_rate": 9.991790139825288e-06, "loss": 1.12383232, "memory(GiB)": 141.16, "step": 12040, "train_speed(iter/s)": 0.294436 }, { "acc": 0.74008722, "epoch": 0.134890464412993, "grad_norm": 8.9375, "learning_rate": 9.991683859499632e-06, "loss": 1.03680611, "memory(GiB)": 141.16, "step": 12060, "train_speed(iter/s)": 0.294611 }, { "acc": 0.71729589, "epoch": 0.13511416335895152, "grad_norm": 7.71875, "learning_rate": 9.99157689624028e-06, "loss": 1.14917583, "memory(GiB)": 141.16, "step": 12080, "train_speed(iter/s)": 0.294792 }, { "acc": 0.72151217, "epoch": 0.13533786230491004, "grad_norm": 8.125, "learning_rate": 9.99146925006186e-06, "loss": 1.11376286, "memory(GiB)": 141.16, "step": 12100, "train_speed(iter/s)": 0.294957 }, { "acc": 0.70576153, "epoch": 0.13556156125086857, "grad_norm": 6.875, "learning_rate": 9.991360920979103e-06, "loss": 1.18191423, "memory(GiB)": 141.16, "step": 12120, "train_speed(iter/s)": 0.29512 }, { "acc": 0.71624589, "epoch": 0.1357852601968271, "grad_norm": 7.15625, "learning_rate": 9.991251909006829e-06, "loss": 1.14758644, "memory(GiB)": 141.16, "step": 12140, "train_speed(iter/s)": 0.295281 }, { "acc": 0.72445583, "epoch": 0.13600895914278563, "grad_norm": 6.8125, "learning_rate": 9.991142214159953e-06, "loss": 1.09648113, "memory(GiB)": 141.16, "step": 12160, "train_speed(iter/s)": 0.29544 }, { "acc": 0.73188248, "epoch": 0.13623265808874416, "grad_norm": 8.5625, "learning_rate": 9.991031836453482e-06, "loss": 1.07805653, "memory(GiB)": 141.16, "step": 12180, "train_speed(iter/s)": 0.295608 }, { "acc": 0.73198752, "epoch": 0.13645635703470269, "grad_norm": 8.8125, "learning_rate": 9.990920775902514e-06, "loss": 1.09384003, "memory(GiB)": 141.16, "step": 12200, "train_speed(iter/s)": 0.29577 }, { "acc": 0.71278005, "epoch": 0.1366800559806612, "grad_norm": 4.28125, "learning_rate": 9.990809032522252e-06, "loss": 1.1568079, "memory(GiB)": 141.16, "step": 12220, "train_speed(iter/s)": 0.295931 }, { "acc": 0.73277974, "epoch": 0.13690375492661974, "grad_norm": 8.25, "learning_rate": 9.990696606327978e-06, "loss": 1.07171593, "memory(GiB)": 141.16, "step": 12240, "train_speed(iter/s)": 0.296092 }, { "acc": 0.71332712, "epoch": 0.1371274538725783, "grad_norm": 7.0625, "learning_rate": 9.990583497335074e-06, "loss": 1.16728983, "memory(GiB)": 141.16, "step": 12260, "train_speed(iter/s)": 0.296263 }, { "acc": 0.73154674, "epoch": 0.13735115281853683, "grad_norm": 5.1875, "learning_rate": 9.990469705559016e-06, "loss": 1.0981554, "memory(GiB)": 141.16, "step": 12280, "train_speed(iter/s)": 0.296424 }, { "acc": 0.74045172, "epoch": 0.13757485176449535, "grad_norm": 6.78125, "learning_rate": 9.990355231015372e-06, "loss": 1.04195681, "memory(GiB)": 141.16, "step": 12300, "train_speed(iter/s)": 0.296586 }, { "acc": 0.7372777, "epoch": 0.13779855071045388, "grad_norm": 6.84375, "learning_rate": 9.990240073719804e-06, "loss": 1.05561504, "memory(GiB)": 141.16, "step": 12320, "train_speed(iter/s)": 0.296754 }, { "acc": 0.7189661, "epoch": 0.1380222496564124, "grad_norm": 8.0625, "learning_rate": 9.990124233688066e-06, "loss": 1.13360949, "memory(GiB)": 141.16, "step": 12340, "train_speed(iter/s)": 0.296921 }, { "acc": 0.72097058, "epoch": 0.13824594860237094, "grad_norm": 6.4375, "learning_rate": 9.990007710936006e-06, "loss": 1.13094072, "memory(GiB)": 141.16, "step": 12360, "train_speed(iter/s)": 0.297098 }, { "acc": 0.73482256, "epoch": 0.13846964754832947, "grad_norm": 6.84375, "learning_rate": 9.989890505479571e-06, "loss": 1.06293154, "memory(GiB)": 141.16, "step": 12380, "train_speed(iter/s)": 0.297262 }, { "acc": 0.72000513, "epoch": 0.138693346494288, "grad_norm": 7.15625, "learning_rate": 9.989772617334792e-06, "loss": 1.1235363, "memory(GiB)": 141.16, "step": 12400, "train_speed(iter/s)": 0.297424 }, { "acc": 0.71680174, "epoch": 0.13891704544024652, "grad_norm": 7.03125, "learning_rate": 9.989654046517799e-06, "loss": 1.15973644, "memory(GiB)": 141.16, "step": 12420, "train_speed(iter/s)": 0.297585 }, { "acc": 0.71937604, "epoch": 0.13914074438620505, "grad_norm": 7.125, "learning_rate": 9.989534793044813e-06, "loss": 1.13388109, "memory(GiB)": 141.16, "step": 12440, "train_speed(iter/s)": 0.297729 }, { "acc": 0.72059665, "epoch": 0.13936444333216358, "grad_norm": 5.75, "learning_rate": 9.98941485693215e-06, "loss": 1.12519341, "memory(GiB)": 141.16, "step": 12460, "train_speed(iter/s)": 0.297891 }, { "acc": 0.71288948, "epoch": 0.1395881422781221, "grad_norm": 7.25, "learning_rate": 9.98929423819622e-06, "loss": 1.15420761, "memory(GiB)": 141.16, "step": 12480, "train_speed(iter/s)": 0.298039 }, { "acc": 0.72607579, "epoch": 0.13981184122408064, "grad_norm": 7.75, "learning_rate": 9.989172936853525e-06, "loss": 1.10121975, "memory(GiB)": 141.16, "step": 12500, "train_speed(iter/s)": 0.298206 }, { "acc": 0.72805252, "epoch": 0.14003554017003916, "grad_norm": 10.125, "learning_rate": 9.98905095292066e-06, "loss": 1.09418697, "memory(GiB)": 141.16, "step": 12520, "train_speed(iter/s)": 0.298387 }, { "acc": 0.71894188, "epoch": 0.1402592391159977, "grad_norm": 8.5625, "learning_rate": 9.988928286414315e-06, "loss": 1.13834124, "memory(GiB)": 141.16, "step": 12540, "train_speed(iter/s)": 0.29853 }, { "acc": 0.72348828, "epoch": 0.14048293806195622, "grad_norm": 8.1875, "learning_rate": 9.988804937351272e-06, "loss": 1.1071661, "memory(GiB)": 141.16, "step": 12560, "train_speed(iter/s)": 0.298689 }, { "acc": 0.72184291, "epoch": 0.14070663700791475, "grad_norm": 7.5, "learning_rate": 9.988680905748407e-06, "loss": 1.11359806, "memory(GiB)": 141.16, "step": 12580, "train_speed(iter/s)": 0.29885 }, { "acc": 0.71947002, "epoch": 0.14093033595387328, "grad_norm": 8.375, "learning_rate": 9.988556191622689e-06, "loss": 1.13307819, "memory(GiB)": 141.16, "step": 12600, "train_speed(iter/s)": 0.299004 }, { "acc": 0.71700449, "epoch": 0.1411540348998318, "grad_norm": 6.53125, "learning_rate": 9.988430794991181e-06, "loss": 1.11746826, "memory(GiB)": 141.16, "step": 12620, "train_speed(iter/s)": 0.299174 }, { "acc": 0.72079296, "epoch": 0.14137773384579033, "grad_norm": 9.1875, "learning_rate": 9.98830471587104e-06, "loss": 1.12187948, "memory(GiB)": 141.16, "step": 12640, "train_speed(iter/s)": 0.299338 }, { "acc": 0.71671019, "epoch": 0.14160143279174886, "grad_norm": 6.75, "learning_rate": 9.988177954279515e-06, "loss": 1.15028934, "memory(GiB)": 141.16, "step": 12660, "train_speed(iter/s)": 0.299485 }, { "acc": 0.72333341, "epoch": 0.1418251317377074, "grad_norm": 8.375, "learning_rate": 9.988050510233948e-06, "loss": 1.10685167, "memory(GiB)": 141.16, "step": 12680, "train_speed(iter/s)": 0.299665 }, { "acc": 0.73154984, "epoch": 0.14204883068366592, "grad_norm": 6.125, "learning_rate": 9.987922383751777e-06, "loss": 1.08156672, "memory(GiB)": 141.16, "step": 12700, "train_speed(iter/s)": 0.299843 }, { "acc": 0.72147503, "epoch": 0.14227252962962444, "grad_norm": 6.90625, "learning_rate": 9.987793574850526e-06, "loss": 1.09817429, "memory(GiB)": 141.16, "step": 12720, "train_speed(iter/s)": 0.300019 }, { "acc": 0.72647901, "epoch": 0.14249622857558297, "grad_norm": 7.8125, "learning_rate": 9.987664083547826e-06, "loss": 1.09593277, "memory(GiB)": 141.16, "step": 12740, "train_speed(iter/s)": 0.300184 }, { "acc": 0.72718797, "epoch": 0.1427199275215415, "grad_norm": 9.5625, "learning_rate": 9.987533909861387e-06, "loss": 1.08809261, "memory(GiB)": 141.16, "step": 12760, "train_speed(iter/s)": 0.300318 }, { "acc": 0.72024021, "epoch": 0.14294362646750003, "grad_norm": 6.40625, "learning_rate": 9.987403053809022e-06, "loss": 1.12795305, "memory(GiB)": 141.16, "step": 12780, "train_speed(iter/s)": 0.300474 }, { "acc": 0.70947094, "epoch": 0.14316732541345856, "grad_norm": 4.625, "learning_rate": 9.987271515408633e-06, "loss": 1.18909578, "memory(GiB)": 141.16, "step": 12800, "train_speed(iter/s)": 0.300632 }, { "acc": 0.71772547, "epoch": 0.14339102435941709, "grad_norm": 5.84375, "learning_rate": 9.987139294678213e-06, "loss": 1.13778477, "memory(GiB)": 141.16, "step": 12820, "train_speed(iter/s)": 0.300766 }, { "acc": 0.72204957, "epoch": 0.1436147233053756, "grad_norm": 6.875, "learning_rate": 9.987006391635859e-06, "loss": 1.11839905, "memory(GiB)": 141.16, "step": 12840, "train_speed(iter/s)": 0.300922 }, { "acc": 0.7227478, "epoch": 0.14383842225133414, "grad_norm": 6.625, "learning_rate": 9.986872806299747e-06, "loss": 1.1225956, "memory(GiB)": 141.16, "step": 12860, "train_speed(iter/s)": 0.301081 }, { "acc": 0.71378417, "epoch": 0.1440621211972927, "grad_norm": 7.3125, "learning_rate": 9.986738538688156e-06, "loss": 1.16749582, "memory(GiB)": 141.16, "step": 12880, "train_speed(iter/s)": 0.301233 }, { "acc": 0.7083591, "epoch": 0.14428582014325123, "grad_norm": 6.46875, "learning_rate": 9.98660358881946e-06, "loss": 1.18641586, "memory(GiB)": 141.16, "step": 12900, "train_speed(iter/s)": 0.301389 }, { "acc": 0.71649055, "epoch": 0.14450951908920975, "grad_norm": 7.0625, "learning_rate": 9.986467956712114e-06, "loss": 1.16955805, "memory(GiB)": 141.16, "step": 12920, "train_speed(iter/s)": 0.301541 }, { "acc": 0.72688131, "epoch": 0.14473321803516828, "grad_norm": 8.0, "learning_rate": 9.98633164238468e-06, "loss": 1.0814333, "memory(GiB)": 141.16, "step": 12940, "train_speed(iter/s)": 0.301695 }, { "acc": 0.71787882, "epoch": 0.1449569169811268, "grad_norm": 7.21875, "learning_rate": 9.986194645855807e-06, "loss": 1.15404339, "memory(GiB)": 141.16, "step": 12960, "train_speed(iter/s)": 0.30183 }, { "acc": 0.71652942, "epoch": 0.14518061592708534, "grad_norm": 7.28125, "learning_rate": 9.986056967144236e-06, "loss": 1.14375458, "memory(GiB)": 141.16, "step": 12980, "train_speed(iter/s)": 0.301983 }, { "acc": 0.72896667, "epoch": 0.14540431487304387, "grad_norm": 8.1875, "learning_rate": 9.985918606268805e-06, "loss": 1.08948545, "memory(GiB)": 141.16, "step": 13000, "train_speed(iter/s)": 0.302137 }, { "acc": 0.71371841, "epoch": 0.1456280138190024, "grad_norm": 7.40625, "learning_rate": 9.985779563248444e-06, "loss": 1.15465622, "memory(GiB)": 141.16, "step": 13020, "train_speed(iter/s)": 0.302296 }, { "acc": 0.71773896, "epoch": 0.14585171276496092, "grad_norm": 7.40625, "learning_rate": 9.985639838102174e-06, "loss": 1.13333082, "memory(GiB)": 141.16, "step": 13040, "train_speed(iter/s)": 0.302466 }, { "acc": 0.72237701, "epoch": 0.14607541171091945, "grad_norm": 8.3125, "learning_rate": 9.985499430849114e-06, "loss": 1.11929131, "memory(GiB)": 141.16, "step": 13060, "train_speed(iter/s)": 0.302628 }, { "acc": 0.72698622, "epoch": 0.14629911065687798, "grad_norm": 7.71875, "learning_rate": 9.985358341508473e-06, "loss": 1.07898521, "memory(GiB)": 141.16, "step": 13080, "train_speed(iter/s)": 0.302793 }, { "acc": 0.71783094, "epoch": 0.1465228096028365, "grad_norm": 6.875, "learning_rate": 9.985216570099555e-06, "loss": 1.13497906, "memory(GiB)": 141.16, "step": 13100, "train_speed(iter/s)": 0.302924 }, { "acc": 0.72827263, "epoch": 0.14674650854879503, "grad_norm": 8.125, "learning_rate": 9.985074116641752e-06, "loss": 1.08939648, "memory(GiB)": 141.16, "step": 13120, "train_speed(iter/s)": 0.303049 }, { "acc": 0.72308989, "epoch": 0.14697020749475356, "grad_norm": 5.40625, "learning_rate": 9.984930981154558e-06, "loss": 1.10828552, "memory(GiB)": 141.16, "step": 13140, "train_speed(iter/s)": 0.303211 }, { "acc": 0.72038155, "epoch": 0.1471939064407121, "grad_norm": 7.28125, "learning_rate": 9.984787163657554e-06, "loss": 1.12431889, "memory(GiB)": 141.16, "step": 13160, "train_speed(iter/s)": 0.303351 }, { "acc": 0.73182125, "epoch": 0.14741760538667062, "grad_norm": 7.34375, "learning_rate": 9.984642664170419e-06, "loss": 1.07325897, "memory(GiB)": 141.16, "step": 13180, "train_speed(iter/s)": 0.303503 }, { "acc": 0.72136974, "epoch": 0.14764130433262915, "grad_norm": 7.03125, "learning_rate": 9.984497482712919e-06, "loss": 1.12330246, "memory(GiB)": 141.16, "step": 13200, "train_speed(iter/s)": 0.30366 }, { "acc": 0.73169775, "epoch": 0.14786500327858768, "grad_norm": 6.78125, "learning_rate": 9.984351619304919e-06, "loss": 1.06437788, "memory(GiB)": 141.16, "step": 13220, "train_speed(iter/s)": 0.303804 }, { "acc": 0.72345772, "epoch": 0.1480887022245462, "grad_norm": 5.46875, "learning_rate": 9.984205073966375e-06, "loss": 1.12295942, "memory(GiB)": 141.16, "step": 13240, "train_speed(iter/s)": 0.303921 }, { "acc": 0.72915177, "epoch": 0.14831240117050473, "grad_norm": 5.53125, "learning_rate": 9.984057846717335e-06, "loss": 1.10490093, "memory(GiB)": 141.16, "step": 13260, "train_speed(iter/s)": 0.304062 }, { "acc": 0.72030134, "epoch": 0.14853610011646326, "grad_norm": 7.25, "learning_rate": 9.983909937577944e-06, "loss": 1.13369446, "memory(GiB)": 141.16, "step": 13280, "train_speed(iter/s)": 0.304201 }, { "acc": 0.72700214, "epoch": 0.1487597990624218, "grad_norm": 7.8125, "learning_rate": 9.983761346568437e-06, "loss": 1.10250473, "memory(GiB)": 141.16, "step": 13300, "train_speed(iter/s)": 0.304336 }, { "acc": 0.72509074, "epoch": 0.14898349800838032, "grad_norm": 8.0, "learning_rate": 9.983612073709144e-06, "loss": 1.09342003, "memory(GiB)": 141.16, "step": 13320, "train_speed(iter/s)": 0.304482 }, { "acc": 0.71965418, "epoch": 0.14920719695433884, "grad_norm": 7.5625, "learning_rate": 9.983462119020487e-06, "loss": 1.11164951, "memory(GiB)": 141.16, "step": 13340, "train_speed(iter/s)": 0.304638 }, { "acc": 0.72058477, "epoch": 0.14943089590029737, "grad_norm": 7.3125, "learning_rate": 9.983311482522979e-06, "loss": 1.12796688, "memory(GiB)": 141.16, "step": 13360, "train_speed(iter/s)": 0.304764 }, { "acc": 0.73086405, "epoch": 0.1496545948462559, "grad_norm": 6.96875, "learning_rate": 9.983160164237236e-06, "loss": 1.07825098, "memory(GiB)": 141.16, "step": 13380, "train_speed(iter/s)": 0.304925 }, { "acc": 0.72191191, "epoch": 0.14987829379221443, "grad_norm": 5.625, "learning_rate": 9.983008164183955e-06, "loss": 1.11179867, "memory(GiB)": 141.16, "step": 13400, "train_speed(iter/s)": 0.305075 }, { "acc": 0.71641626, "epoch": 0.15010199273817296, "grad_norm": 6.6875, "learning_rate": 9.982855482383934e-06, "loss": 1.14148283, "memory(GiB)": 141.16, "step": 13420, "train_speed(iter/s)": 0.305234 }, { "acc": 0.72575045, "epoch": 0.15032569168413148, "grad_norm": 7.59375, "learning_rate": 9.982702118858061e-06, "loss": 1.10931997, "memory(GiB)": 141.16, "step": 13440, "train_speed(iter/s)": 0.305386 }, { "acc": 0.73448172, "epoch": 0.15054939063009, "grad_norm": 7.1875, "learning_rate": 9.98254807362732e-06, "loss": 1.07718287, "memory(GiB)": 141.16, "step": 13460, "train_speed(iter/s)": 0.305522 }, { "acc": 0.73215656, "epoch": 0.15077308957604854, "grad_norm": 5.34375, "learning_rate": 9.982393346712785e-06, "loss": 1.07244034, "memory(GiB)": 141.16, "step": 13480, "train_speed(iter/s)": 0.305665 }, { "acc": 0.71504903, "epoch": 0.15099678852200707, "grad_norm": 6.71875, "learning_rate": 9.982237938135625e-06, "loss": 1.15769072, "memory(GiB)": 141.16, "step": 13500, "train_speed(iter/s)": 0.305801 }, { "acc": 0.7166265, "epoch": 0.15122048746796563, "grad_norm": 7.78125, "learning_rate": 9.982081847917102e-06, "loss": 1.13872223, "memory(GiB)": 141.16, "step": 13520, "train_speed(iter/s)": 0.305935 }, { "acc": 0.71871753, "epoch": 0.15144418641392415, "grad_norm": 9.4375, "learning_rate": 9.981925076078573e-06, "loss": 1.13938618, "memory(GiB)": 141.16, "step": 13540, "train_speed(iter/s)": 0.30607 }, { "acc": 0.71197462, "epoch": 0.15166788535988268, "grad_norm": 7.25, "learning_rate": 9.981767622641485e-06, "loss": 1.17197065, "memory(GiB)": 141.16, "step": 13560, "train_speed(iter/s)": 0.306219 }, { "acc": 0.71274066, "epoch": 0.1518915843058412, "grad_norm": 8.4375, "learning_rate": 9.98160948762738e-06, "loss": 1.17409992, "memory(GiB)": 141.16, "step": 13580, "train_speed(iter/s)": 0.306365 }, { "acc": 0.7353354, "epoch": 0.15211528325179974, "grad_norm": 6.59375, "learning_rate": 9.981450671057896e-06, "loss": 1.06311111, "memory(GiB)": 141.16, "step": 13600, "train_speed(iter/s)": 0.30652 }, { "acc": 0.72120543, "epoch": 0.15233898219775827, "grad_norm": 6.875, "learning_rate": 9.981291172954755e-06, "loss": 1.12898436, "memory(GiB)": 141.16, "step": 13620, "train_speed(iter/s)": 0.306656 }, { "acc": 0.72232628, "epoch": 0.1525626811437168, "grad_norm": 6.5625, "learning_rate": 9.981130993339785e-06, "loss": 1.12070656, "memory(GiB)": 141.16, "step": 13640, "train_speed(iter/s)": 0.306803 }, { "acc": 0.7339139, "epoch": 0.15278638008967532, "grad_norm": 7.78125, "learning_rate": 9.980970132234897e-06, "loss": 1.07526007, "memory(GiB)": 141.16, "step": 13660, "train_speed(iter/s)": 0.306946 }, { "acc": 0.72141819, "epoch": 0.15301007903563385, "grad_norm": 7.71875, "learning_rate": 9.980808589662101e-06, "loss": 1.11524725, "memory(GiB)": 141.16, "step": 13680, "train_speed(iter/s)": 0.307095 }, { "acc": 0.73007231, "epoch": 0.15323377798159238, "grad_norm": 7.78125, "learning_rate": 9.980646365643498e-06, "loss": 1.08502207, "memory(GiB)": 141.16, "step": 13700, "train_speed(iter/s)": 0.307251 }, { "acc": 0.71326723, "epoch": 0.1534574769275509, "grad_norm": 7.0625, "learning_rate": 9.980483460201283e-06, "loss": 1.1590621, "memory(GiB)": 141.16, "step": 13720, "train_speed(iter/s)": 0.307402 }, { "acc": 0.72266016, "epoch": 0.15368117587350943, "grad_norm": 7.4375, "learning_rate": 9.980319873357742e-06, "loss": 1.11954403, "memory(GiB)": 141.16, "step": 13740, "train_speed(iter/s)": 0.307525 }, { "acc": 0.72052202, "epoch": 0.15390487481946796, "grad_norm": 6.15625, "learning_rate": 9.980155605135257e-06, "loss": 1.12966347, "memory(GiB)": 141.16, "step": 13760, "train_speed(iter/s)": 0.307644 }, { "acc": 0.7249671, "epoch": 0.1541285737654265, "grad_norm": 7.0, "learning_rate": 9.979990655556303e-06, "loss": 1.10820189, "memory(GiB)": 141.16, "step": 13780, "train_speed(iter/s)": 0.307775 }, { "acc": 0.71663556, "epoch": 0.15435227271138502, "grad_norm": 7.1875, "learning_rate": 9.979825024643447e-06, "loss": 1.14037418, "memory(GiB)": 141.16, "step": 13800, "train_speed(iter/s)": 0.307911 }, { "acc": 0.72516003, "epoch": 0.15457597165734355, "grad_norm": 9.8125, "learning_rate": 9.97965871241935e-06, "loss": 1.11175356, "memory(GiB)": 141.16, "step": 13820, "train_speed(iter/s)": 0.308043 }, { "acc": 0.73708477, "epoch": 0.15479967060330208, "grad_norm": 6.59375, "learning_rate": 9.979491718906765e-06, "loss": 1.05935755, "memory(GiB)": 141.16, "step": 13840, "train_speed(iter/s)": 0.308174 }, { "acc": 0.71610408, "epoch": 0.1550233695492606, "grad_norm": 6.875, "learning_rate": 9.979324044128538e-06, "loss": 1.14885139, "memory(GiB)": 141.16, "step": 13860, "train_speed(iter/s)": 0.308324 }, { "acc": 0.72350569, "epoch": 0.15524706849521913, "grad_norm": 9.375, "learning_rate": 9.97915568810761e-06, "loss": 1.11050205, "memory(GiB)": 141.16, "step": 13880, "train_speed(iter/s)": 0.308469 }, { "acc": 0.70953107, "epoch": 0.15547076744117766, "grad_norm": 6.59375, "learning_rate": 9.978986650867019e-06, "loss": 1.17571945, "memory(GiB)": 141.16, "step": 13900, "train_speed(iter/s)": 0.308602 }, { "acc": 0.72328329, "epoch": 0.1556944663871362, "grad_norm": 7.9375, "learning_rate": 9.978816932429886e-06, "loss": 1.1197978, "memory(GiB)": 141.16, "step": 13920, "train_speed(iter/s)": 0.308733 }, { "acc": 0.72464485, "epoch": 0.15591816533309472, "grad_norm": 10.75, "learning_rate": 9.978646532819434e-06, "loss": 1.12658634, "memory(GiB)": 141.16, "step": 13940, "train_speed(iter/s)": 0.308875 }, { "acc": 0.72761612, "epoch": 0.15614186427905324, "grad_norm": 6.625, "learning_rate": 9.978475452058974e-06, "loss": 1.09994278, "memory(GiB)": 141.16, "step": 13960, "train_speed(iter/s)": 0.309009 }, { "acc": 0.71461391, "epoch": 0.15636556322501177, "grad_norm": 5.8125, "learning_rate": 9.978303690171912e-06, "loss": 1.14934464, "memory(GiB)": 141.16, "step": 13980, "train_speed(iter/s)": 0.309162 }, { "acc": 0.72396646, "epoch": 0.1565892621709703, "grad_norm": 7.1875, "learning_rate": 9.978131247181753e-06, "loss": 1.11645107, "memory(GiB)": 141.16, "step": 14000, "train_speed(iter/s)": 0.309294 }, { "epoch": 0.1565892621709703, "eval_acc": 0.6796686101379952, "eval_loss": 1.1198610067367554, "eval_runtime": 2222.051, "eval_samples_per_second": 33.88, "eval_steps_per_second": 16.94, "step": 14000 }, { "acc": 0.72925415, "epoch": 0.15681296111692883, "grad_norm": 6.125, "learning_rate": 9.977958123112082e-06, "loss": 1.09675846, "memory(GiB)": 141.16, "step": 14020, "train_speed(iter/s)": 0.294665 }, { "acc": 0.72465229, "epoch": 0.15703666006288736, "grad_norm": 6.0625, "learning_rate": 9.97778431798659e-06, "loss": 1.11787939, "memory(GiB)": 141.16, "step": 14040, "train_speed(iter/s)": 0.294792 }, { "acc": 0.72254057, "epoch": 0.15726035900884588, "grad_norm": 6.96875, "learning_rate": 9.977609831829054e-06, "loss": 1.12034063, "memory(GiB)": 141.16, "step": 14060, "train_speed(iter/s)": 0.294922 }, { "acc": 0.72131281, "epoch": 0.1574840579548044, "grad_norm": 8.1875, "learning_rate": 9.977434664663345e-06, "loss": 1.12221718, "memory(GiB)": 141.16, "step": 14080, "train_speed(iter/s)": 0.295064 }, { "acc": 0.72226791, "epoch": 0.15770775690076294, "grad_norm": 8.75, "learning_rate": 9.977258816513432e-06, "loss": 1.11769562, "memory(GiB)": 141.16, "step": 14100, "train_speed(iter/s)": 0.295209 }, { "acc": 0.73084869, "epoch": 0.15793145584672147, "grad_norm": 5.90625, "learning_rate": 9.97708228740337e-06, "loss": 1.08664198, "memory(GiB)": 141.16, "step": 14120, "train_speed(iter/s)": 0.295339 }, { "acc": 0.72485962, "epoch": 0.15815515479268002, "grad_norm": 6.625, "learning_rate": 9.976905077357315e-06, "loss": 1.09748859, "memory(GiB)": 141.16, "step": 14140, "train_speed(iter/s)": 0.295482 }, { "acc": 0.70855331, "epoch": 0.15837885373863855, "grad_norm": 8.4375, "learning_rate": 9.976727186399506e-06, "loss": 1.19063082, "memory(GiB)": 141.16, "step": 14160, "train_speed(iter/s)": 0.295624 }, { "acc": 0.7088973, "epoch": 0.15860255268459708, "grad_norm": 8.3125, "learning_rate": 9.976548614554285e-06, "loss": 1.17718906, "memory(GiB)": 141.16, "step": 14180, "train_speed(iter/s)": 0.295763 }, { "acc": 0.7179533, "epoch": 0.1588262516305556, "grad_norm": 7.5, "learning_rate": 9.976369361846082e-06, "loss": 1.15515785, "memory(GiB)": 141.16, "step": 14200, "train_speed(iter/s)": 0.295911 }, { "acc": 0.7130621, "epoch": 0.15904995057651414, "grad_norm": 6.3125, "learning_rate": 9.976189428299422e-06, "loss": 1.16320982, "memory(GiB)": 141.16, "step": 14220, "train_speed(iter/s)": 0.29604 }, { "acc": 0.72229266, "epoch": 0.15927364952247267, "grad_norm": 9.3125, "learning_rate": 9.976008813938922e-06, "loss": 1.11860161, "memory(GiB)": 141.16, "step": 14240, "train_speed(iter/s)": 0.296181 }, { "acc": 0.71181946, "epoch": 0.1594973484684312, "grad_norm": 6.15625, "learning_rate": 9.975827518789294e-06, "loss": 1.16769562, "memory(GiB)": 141.16, "step": 14260, "train_speed(iter/s)": 0.296335 }, { "acc": 0.72460651, "epoch": 0.15972104741438972, "grad_norm": 5.96875, "learning_rate": 9.975645542875338e-06, "loss": 1.11192532, "memory(GiB)": 141.16, "step": 14280, "train_speed(iter/s)": 0.296481 }, { "acc": 0.7318778, "epoch": 0.15994474636034825, "grad_norm": 7.25, "learning_rate": 9.975462886221954e-06, "loss": 1.06973705, "memory(GiB)": 141.16, "step": 14300, "train_speed(iter/s)": 0.296622 }, { "acc": 0.72843494, "epoch": 0.16016844530630678, "grad_norm": 6.15625, "learning_rate": 9.975279548854133e-06, "loss": 1.07902279, "memory(GiB)": 141.16, "step": 14320, "train_speed(iter/s)": 0.296769 }, { "acc": 0.71238832, "epoch": 0.1603921442522653, "grad_norm": 10.0625, "learning_rate": 9.975095530796954e-06, "loss": 1.16041384, "memory(GiB)": 141.16, "step": 14340, "train_speed(iter/s)": 0.296912 }, { "acc": 0.72000008, "epoch": 0.16061584319822383, "grad_norm": 8.0, "learning_rate": 9.974910832075598e-06, "loss": 1.13334312, "memory(GiB)": 141.16, "step": 14360, "train_speed(iter/s)": 0.297068 }, { "acc": 0.72905412, "epoch": 0.16083954214418236, "grad_norm": 7.1875, "learning_rate": 9.974725452715332e-06, "loss": 1.08618431, "memory(GiB)": 141.16, "step": 14380, "train_speed(iter/s)": 0.297215 }, { "acc": 0.72926111, "epoch": 0.1610632410901409, "grad_norm": 6.75, "learning_rate": 9.974539392741518e-06, "loss": 1.08895721, "memory(GiB)": 141.16, "step": 14400, "train_speed(iter/s)": 0.297354 }, { "acc": 0.71349525, "epoch": 0.16128694003609942, "grad_norm": 6.03125, "learning_rate": 9.974352652179614e-06, "loss": 1.15036926, "memory(GiB)": 141.16, "step": 14420, "train_speed(iter/s)": 0.297498 }, { "acc": 0.72073479, "epoch": 0.16151063898205795, "grad_norm": 8.5, "learning_rate": 9.974165231055166e-06, "loss": 1.10623302, "memory(GiB)": 141.16, "step": 14440, "train_speed(iter/s)": 0.297633 }, { "acc": 0.72498741, "epoch": 0.16173433792801648, "grad_norm": 7.03125, "learning_rate": 9.973977129393817e-06, "loss": 1.12340832, "memory(GiB)": 141.16, "step": 14460, "train_speed(iter/s)": 0.297765 }, { "acc": 0.7125618, "epoch": 0.161958036873975, "grad_norm": 7.9375, "learning_rate": 9.973788347221304e-06, "loss": 1.15921326, "memory(GiB)": 141.16, "step": 14480, "train_speed(iter/s)": 0.297914 }, { "acc": 0.72709751, "epoch": 0.16218173581993353, "grad_norm": 7.1875, "learning_rate": 9.97359888456345e-06, "loss": 1.09742393, "memory(GiB)": 141.16, "step": 14500, "train_speed(iter/s)": 0.298042 }, { "acc": 0.72247095, "epoch": 0.16240543476589206, "grad_norm": 6.5625, "learning_rate": 9.973408741446183e-06, "loss": 1.1243516, "memory(GiB)": 141.16, "step": 14520, "train_speed(iter/s)": 0.298185 }, { "acc": 0.7362854, "epoch": 0.1626291337118506, "grad_norm": 7.78125, "learning_rate": 9.973217917895513e-06, "loss": 1.05193882, "memory(GiB)": 141.16, "step": 14540, "train_speed(iter/s)": 0.298328 }, { "acc": 0.72790012, "epoch": 0.16285283265780912, "grad_norm": 5.40625, "learning_rate": 9.973026413937548e-06, "loss": 1.075951, "memory(GiB)": 141.16, "step": 14560, "train_speed(iter/s)": 0.298465 }, { "acc": 0.71847796, "epoch": 0.16307653160376764, "grad_norm": 7.25, "learning_rate": 9.972834229598487e-06, "loss": 1.13152065, "memory(GiB)": 141.16, "step": 14580, "train_speed(iter/s)": 0.298595 }, { "acc": 0.72359781, "epoch": 0.16330023054972617, "grad_norm": 7.5, "learning_rate": 9.972641364904627e-06, "loss": 1.10397434, "memory(GiB)": 141.16, "step": 14600, "train_speed(iter/s)": 0.298731 }, { "acc": 0.71748123, "epoch": 0.1635239294956847, "grad_norm": 7.78125, "learning_rate": 9.972447819882351e-06, "loss": 1.15126772, "memory(GiB)": 141.16, "step": 14620, "train_speed(iter/s)": 0.298857 }, { "acc": 0.72528973, "epoch": 0.16374762844164323, "grad_norm": 7.78125, "learning_rate": 9.972253594558142e-06, "loss": 1.09165611, "memory(GiB)": 141.16, "step": 14640, "train_speed(iter/s)": 0.298995 }, { "acc": 0.72463422, "epoch": 0.16397132738760176, "grad_norm": 6.9375, "learning_rate": 9.972058688958572e-06, "loss": 1.11765041, "memory(GiB)": 141.16, "step": 14660, "train_speed(iter/s)": 0.299145 }, { "acc": 0.71771641, "epoch": 0.16419502633356028, "grad_norm": 6.8125, "learning_rate": 9.971863103110306e-06, "loss": 1.13484364, "memory(GiB)": 141.16, "step": 14680, "train_speed(iter/s)": 0.299272 }, { "acc": 0.72736688, "epoch": 0.1644187252795188, "grad_norm": 5.65625, "learning_rate": 9.971666837040102e-06, "loss": 1.079035, "memory(GiB)": 141.16, "step": 14700, "train_speed(iter/s)": 0.2994 }, { "acc": 0.70917587, "epoch": 0.16464242422547734, "grad_norm": 7.96875, "learning_rate": 9.971469890774814e-06, "loss": 1.19280376, "memory(GiB)": 141.16, "step": 14720, "train_speed(iter/s)": 0.299528 }, { "acc": 0.74725595, "epoch": 0.16486612317143587, "grad_norm": 7.75, "learning_rate": 9.971272264341386e-06, "loss": 0.99958496, "memory(GiB)": 141.16, "step": 14740, "train_speed(iter/s)": 0.299668 }, { "acc": 0.71126432, "epoch": 0.1650898221173944, "grad_norm": 7.4375, "learning_rate": 9.971073957766857e-06, "loss": 1.16699467, "memory(GiB)": 141.16, "step": 14760, "train_speed(iter/s)": 0.299783 }, { "acc": 0.73250017, "epoch": 0.16531352106335295, "grad_norm": 7.25, "learning_rate": 9.970874971078358e-06, "loss": 1.07957935, "memory(GiB)": 141.16, "step": 14780, "train_speed(iter/s)": 0.299923 }, { "acc": 0.72331514, "epoch": 0.16553722000931148, "grad_norm": 6.9375, "learning_rate": 9.97067530430311e-06, "loss": 1.11158867, "memory(GiB)": 141.16, "step": 14800, "train_speed(iter/s)": 0.300054 }, { "acc": 0.72265749, "epoch": 0.16576091895527, "grad_norm": 8.0, "learning_rate": 9.970474957468435e-06, "loss": 1.11309433, "memory(GiB)": 141.16, "step": 14820, "train_speed(iter/s)": 0.300193 }, { "acc": 0.72073545, "epoch": 0.16598461790122854, "grad_norm": 6.71875, "learning_rate": 9.97027393060174e-06, "loss": 1.130019, "memory(GiB)": 141.16, "step": 14840, "train_speed(iter/s)": 0.300318 }, { "acc": 0.72005386, "epoch": 0.16620831684718707, "grad_norm": 5.5, "learning_rate": 9.970072223730532e-06, "loss": 1.11800404, "memory(GiB)": 141.16, "step": 14860, "train_speed(iter/s)": 0.300437 }, { "acc": 0.72223253, "epoch": 0.1664320157931456, "grad_norm": 8.1875, "learning_rate": 9.969869836882404e-06, "loss": 1.10039463, "memory(GiB)": 141.16, "step": 14880, "train_speed(iter/s)": 0.300563 }, { "acc": 0.72774339, "epoch": 0.16665571473910412, "grad_norm": 6.90625, "learning_rate": 9.969666770085046e-06, "loss": 1.09769535, "memory(GiB)": 141.16, "step": 14900, "train_speed(iter/s)": 0.300701 }, { "acc": 0.71504693, "epoch": 0.16687941368506265, "grad_norm": 5.125, "learning_rate": 9.969463023366241e-06, "loss": 1.13798637, "memory(GiB)": 141.16, "step": 14920, "train_speed(iter/s)": 0.300834 }, { "acc": 0.71476555, "epoch": 0.16710311263102118, "grad_norm": 8.625, "learning_rate": 9.969258596753866e-06, "loss": 1.14417915, "memory(GiB)": 141.16, "step": 14940, "train_speed(iter/s)": 0.300965 }, { "acc": 0.72120137, "epoch": 0.1673268115769797, "grad_norm": 6.9375, "learning_rate": 9.969053490275886e-06, "loss": 1.12035208, "memory(GiB)": 141.16, "step": 14960, "train_speed(iter/s)": 0.301093 }, { "acc": 0.72255759, "epoch": 0.16755051052293823, "grad_norm": 5.875, "learning_rate": 9.968847703960365e-06, "loss": 1.11000109, "memory(GiB)": 141.16, "step": 14980, "train_speed(iter/s)": 0.301225 }, { "acc": 0.72347641, "epoch": 0.16777420946889676, "grad_norm": 5.875, "learning_rate": 9.968641237835458e-06, "loss": 1.11075954, "memory(GiB)": 141.16, "step": 15000, "train_speed(iter/s)": 0.301356 }, { "acc": 0.72725325, "epoch": 0.1679979084148553, "grad_norm": 7.71875, "learning_rate": 9.968434091929411e-06, "loss": 1.09887161, "memory(GiB)": 141.16, "step": 15020, "train_speed(iter/s)": 0.301483 }, { "acc": 0.72103434, "epoch": 0.16822160736081382, "grad_norm": 8.6875, "learning_rate": 9.968226266270563e-06, "loss": 1.12554665, "memory(GiB)": 141.16, "step": 15040, "train_speed(iter/s)": 0.301618 }, { "acc": 0.72347832, "epoch": 0.16844530630677235, "grad_norm": 7.625, "learning_rate": 9.968017760887352e-06, "loss": 1.12067051, "memory(GiB)": 141.16, "step": 15060, "train_speed(iter/s)": 0.301757 }, { "acc": 0.71496611, "epoch": 0.16866900525273087, "grad_norm": 7.59375, "learning_rate": 9.967808575808301e-06, "loss": 1.14369869, "memory(GiB)": 141.16, "step": 15080, "train_speed(iter/s)": 0.301891 }, { "acc": 0.72146101, "epoch": 0.1688927041986894, "grad_norm": 7.21875, "learning_rate": 9.96759871106203e-06, "loss": 1.11766987, "memory(GiB)": 141.16, "step": 15100, "train_speed(iter/s)": 0.302031 }, { "acc": 0.71403141, "epoch": 0.16911640314464793, "grad_norm": 7.625, "learning_rate": 9.967388166677252e-06, "loss": 1.14107513, "memory(GiB)": 141.16, "step": 15120, "train_speed(iter/s)": 0.302158 }, { "acc": 0.72434492, "epoch": 0.16934010209060646, "grad_norm": 7.09375, "learning_rate": 9.967176942682773e-06, "loss": 1.11149483, "memory(GiB)": 141.16, "step": 15140, "train_speed(iter/s)": 0.302299 }, { "acc": 0.71734428, "epoch": 0.169563801036565, "grad_norm": 8.0625, "learning_rate": 9.966965039107491e-06, "loss": 1.14866638, "memory(GiB)": 141.16, "step": 15160, "train_speed(iter/s)": 0.302384 }, { "acc": 0.7160243, "epoch": 0.16978749998252352, "grad_norm": 6.65625, "learning_rate": 9.966752455980397e-06, "loss": 1.14140253, "memory(GiB)": 141.16, "step": 15180, "train_speed(iter/s)": 0.302519 }, { "acc": 0.72158933, "epoch": 0.17001119892848204, "grad_norm": 5.375, "learning_rate": 9.966539193330576e-06, "loss": 1.12454195, "memory(GiB)": 141.16, "step": 15200, "train_speed(iter/s)": 0.302659 }, { "acc": 0.71759315, "epoch": 0.17023489787444057, "grad_norm": 7.1875, "learning_rate": 9.966325251187205e-06, "loss": 1.1368206, "memory(GiB)": 141.16, "step": 15220, "train_speed(iter/s)": 0.302779 }, { "acc": 0.71862626, "epoch": 0.1704585968203991, "grad_norm": 6.5625, "learning_rate": 9.966110629579556e-06, "loss": 1.12376137, "memory(GiB)": 141.16, "step": 15240, "train_speed(iter/s)": 0.302906 }, { "acc": 0.72261934, "epoch": 0.17068229576635763, "grad_norm": 7.15625, "learning_rate": 9.965895328536987e-06, "loss": 1.10897102, "memory(GiB)": 141.16, "step": 15260, "train_speed(iter/s)": 0.303035 }, { "acc": 0.72117548, "epoch": 0.17090599471231616, "grad_norm": 6.5, "learning_rate": 9.965679348088962e-06, "loss": 1.12547073, "memory(GiB)": 141.16, "step": 15280, "train_speed(iter/s)": 0.30316 }, { "acc": 0.72404246, "epoch": 0.17112969365827468, "grad_norm": 6.8125, "learning_rate": 9.965462688265025e-06, "loss": 1.12140903, "memory(GiB)": 141.16, "step": 15300, "train_speed(iter/s)": 0.303298 }, { "acc": 0.72536678, "epoch": 0.1713533926042332, "grad_norm": 7.5, "learning_rate": 9.96524534909482e-06, "loss": 1.11517191, "memory(GiB)": 141.16, "step": 15320, "train_speed(iter/s)": 0.303441 }, { "acc": 0.72710905, "epoch": 0.17157709155019174, "grad_norm": 6.125, "learning_rate": 9.965027330608078e-06, "loss": 1.11009121, "memory(GiB)": 141.16, "step": 15340, "train_speed(iter/s)": 0.303574 }, { "acc": 0.70478587, "epoch": 0.17180079049615027, "grad_norm": 8.875, "learning_rate": 9.964808632834634e-06, "loss": 1.20360632, "memory(GiB)": 141.16, "step": 15360, "train_speed(iter/s)": 0.303701 }, { "acc": 0.72234364, "epoch": 0.1720244894421088, "grad_norm": 7.25, "learning_rate": 9.964589255804405e-06, "loss": 1.11182976, "memory(GiB)": 141.16, "step": 15380, "train_speed(iter/s)": 0.303807 }, { "acc": 0.72983952, "epoch": 0.17224818838806735, "grad_norm": 8.125, "learning_rate": 9.964369199547404e-06, "loss": 1.09280043, "memory(GiB)": 141.16, "step": 15400, "train_speed(iter/s)": 0.303918 }, { "acc": 0.7054883, "epoch": 0.17247188733402588, "grad_norm": 7.1875, "learning_rate": 9.96414846409374e-06, "loss": 1.21282368, "memory(GiB)": 141.16, "step": 15420, "train_speed(iter/s)": 0.304059 }, { "acc": 0.72615576, "epoch": 0.1726955862799844, "grad_norm": 8.6875, "learning_rate": 9.963927049473614e-06, "loss": 1.097052, "memory(GiB)": 141.16, "step": 15440, "train_speed(iter/s)": 0.304195 }, { "acc": 0.72445264, "epoch": 0.17291928522594294, "grad_norm": 6.21875, "learning_rate": 9.963704955717315e-06, "loss": 1.10573826, "memory(GiB)": 141.16, "step": 15460, "train_speed(iter/s)": 0.304333 }, { "acc": 0.71926608, "epoch": 0.17314298417190147, "grad_norm": 8.9375, "learning_rate": 9.963482182855231e-06, "loss": 1.13065901, "memory(GiB)": 141.16, "step": 15480, "train_speed(iter/s)": 0.304474 }, { "acc": 0.71022272, "epoch": 0.17336668311786, "grad_norm": 5.1875, "learning_rate": 9.963258730917839e-06, "loss": 1.17620811, "memory(GiB)": 141.16, "step": 15500, "train_speed(iter/s)": 0.304609 }, { "acc": 0.7089982, "epoch": 0.17359038206381852, "grad_norm": 6.6875, "learning_rate": 9.963034599935712e-06, "loss": 1.18241158, "memory(GiB)": 141.16, "step": 15520, "train_speed(iter/s)": 0.30474 }, { "acc": 0.73042727, "epoch": 0.17381408100977705, "grad_norm": 8.25, "learning_rate": 9.962809789939513e-06, "loss": 1.08614025, "memory(GiB)": 141.16, "step": 15540, "train_speed(iter/s)": 0.30488 }, { "acc": 0.72717171, "epoch": 0.17403777995573558, "grad_norm": 8.1875, "learning_rate": 9.962584300960001e-06, "loss": 1.09327612, "memory(GiB)": 141.16, "step": 15560, "train_speed(iter/s)": 0.305006 }, { "acc": 0.71294079, "epoch": 0.1742614789016941, "grad_norm": 9.625, "learning_rate": 9.962358133028025e-06, "loss": 1.157967, "memory(GiB)": 141.16, "step": 15580, "train_speed(iter/s)": 0.305132 }, { "acc": 0.72057037, "epoch": 0.17448517784765263, "grad_norm": 9.3125, "learning_rate": 9.962131286174529e-06, "loss": 1.12606211, "memory(GiB)": 141.16, "step": 15600, "train_speed(iter/s)": 0.305257 }, { "acc": 0.72183437, "epoch": 0.17470887679361116, "grad_norm": 6.6875, "learning_rate": 9.961903760430544e-06, "loss": 1.1094717, "memory(GiB)": 141.16, "step": 15620, "train_speed(iter/s)": 0.30539 }, { "acc": 0.72113304, "epoch": 0.1749325757395697, "grad_norm": 5.9375, "learning_rate": 9.961675555827204e-06, "loss": 1.12506323, "memory(GiB)": 141.16, "step": 15640, "train_speed(iter/s)": 0.305533 }, { "acc": 0.72244253, "epoch": 0.17515627468552822, "grad_norm": 7.25, "learning_rate": 9.961446672395731e-06, "loss": 1.11885815, "memory(GiB)": 141.16, "step": 15660, "train_speed(iter/s)": 0.305664 }, { "acc": 0.71945887, "epoch": 0.17537997363148675, "grad_norm": 6.625, "learning_rate": 9.961217110167436e-06, "loss": 1.12643852, "memory(GiB)": 141.16, "step": 15680, "train_speed(iter/s)": 0.305802 }, { "acc": 0.72880478, "epoch": 0.17560367257744527, "grad_norm": 7.03125, "learning_rate": 9.96098686917373e-06, "loss": 1.07781734, "memory(GiB)": 141.16, "step": 15700, "train_speed(iter/s)": 0.305928 }, { "acc": 0.73072233, "epoch": 0.1758273715234038, "grad_norm": 8.3125, "learning_rate": 9.96075594944611e-06, "loss": 1.08384991, "memory(GiB)": 141.16, "step": 15720, "train_speed(iter/s)": 0.306047 }, { "acc": 0.71328311, "epoch": 0.17605107046936233, "grad_norm": 6.21875, "learning_rate": 9.960524351016172e-06, "loss": 1.1664463, "memory(GiB)": 141.16, "step": 15740, "train_speed(iter/s)": 0.306185 }, { "acc": 0.72162294, "epoch": 0.17627476941532086, "grad_norm": 6.90625, "learning_rate": 9.9602920739156e-06, "loss": 1.11107445, "memory(GiB)": 141.16, "step": 15760, "train_speed(iter/s)": 0.306296 }, { "acc": 0.72314644, "epoch": 0.1764984683612794, "grad_norm": 9.0625, "learning_rate": 9.960059118176173e-06, "loss": 1.1241766, "memory(GiB)": 141.16, "step": 15780, "train_speed(iter/s)": 0.306424 }, { "acc": 0.71569624, "epoch": 0.17672216730723792, "grad_norm": 5.4375, "learning_rate": 9.959825483829762e-06, "loss": 1.14131765, "memory(GiB)": 141.16, "step": 15800, "train_speed(iter/s)": 0.306539 }, { "acc": 0.71228228, "epoch": 0.17694586625319644, "grad_norm": 9.3125, "learning_rate": 9.959591170908334e-06, "loss": 1.15533943, "memory(GiB)": 141.16, "step": 15820, "train_speed(iter/s)": 0.306672 }, { "acc": 0.71104178, "epoch": 0.17716956519915497, "grad_norm": 6.15625, "learning_rate": 9.959356179443945e-06, "loss": 1.18187017, "memory(GiB)": 141.16, "step": 15840, "train_speed(iter/s)": 0.306802 }, { "acc": 0.72367601, "epoch": 0.1773932641451135, "grad_norm": 7.75, "learning_rate": 9.959120509468744e-06, "loss": 1.11825066, "memory(GiB)": 141.16, "step": 15860, "train_speed(iter/s)": 0.306929 }, { "acc": 0.7158865, "epoch": 0.17761696309107203, "grad_norm": 7.25, "learning_rate": 9.958884161014976e-06, "loss": 1.13726349, "memory(GiB)": 141.16, "step": 15880, "train_speed(iter/s)": 0.307056 }, { "acc": 0.7225585, "epoch": 0.17784066203703056, "grad_norm": 7.34375, "learning_rate": 9.958647134114975e-06, "loss": 1.11620941, "memory(GiB)": 141.16, "step": 15900, "train_speed(iter/s)": 0.307192 }, { "acc": 0.72450943, "epoch": 0.17806436098298908, "grad_norm": 7.34375, "learning_rate": 9.958409428801172e-06, "loss": 1.09218693, "memory(GiB)": 141.16, "step": 15920, "train_speed(iter/s)": 0.307305 }, { "acc": 0.72279525, "epoch": 0.1782880599289476, "grad_norm": 8.25, "learning_rate": 9.958171045106086e-06, "loss": 1.12292862, "memory(GiB)": 141.16, "step": 15940, "train_speed(iter/s)": 0.307437 }, { "acc": 0.72847996, "epoch": 0.17851175887490614, "grad_norm": 6.40625, "learning_rate": 9.957931983062334e-06, "loss": 1.09161243, "memory(GiB)": 141.16, "step": 15960, "train_speed(iter/s)": 0.307564 }, { "acc": 0.72862811, "epoch": 0.17873545782086467, "grad_norm": 6.28125, "learning_rate": 9.957692242702621e-06, "loss": 1.07969389, "memory(GiB)": 141.16, "step": 15980, "train_speed(iter/s)": 0.307685 }, { "acc": 0.72042942, "epoch": 0.1789591567668232, "grad_norm": 6.59375, "learning_rate": 9.957451824059747e-06, "loss": 1.1242897, "memory(GiB)": 141.16, "step": 16000, "train_speed(iter/s)": 0.307798 }, { "epoch": 0.1789591567668232, "eval_acc": 0.6812712443934914, "eval_loss": 1.114004135131836, "eval_runtime": 2227.1048, "eval_samples_per_second": 33.803, "eval_steps_per_second": 16.902, "step": 16000 }, { "acc": 0.71816678, "epoch": 0.17918285571278172, "grad_norm": 6.78125, "learning_rate": 9.957210727166604e-06, "loss": 1.1535182, "memory(GiB)": 141.16, "step": 16020, "train_speed(iter/s)": 0.295009 }, { "acc": 0.72389617, "epoch": 0.17940655465874028, "grad_norm": 6.78125, "learning_rate": 9.95696895205618e-06, "loss": 1.09953499, "memory(GiB)": 141.16, "step": 16040, "train_speed(iter/s)": 0.295146 }, { "acc": 0.72293639, "epoch": 0.1796302536046988, "grad_norm": 5.9375, "learning_rate": 9.956726498761553e-06, "loss": 1.11522255, "memory(GiB)": 141.16, "step": 16060, "train_speed(iter/s)": 0.295273 }, { "acc": 0.7299902, "epoch": 0.17985395255065734, "grad_norm": 7.21875, "learning_rate": 9.95648336731589e-06, "loss": 1.09318466, "memory(GiB)": 141.16, "step": 16080, "train_speed(iter/s)": 0.295394 }, { "acc": 0.72919731, "epoch": 0.18007765149661586, "grad_norm": 6.25, "learning_rate": 9.95623955775246e-06, "loss": 1.08826294, "memory(GiB)": 141.16, "step": 16100, "train_speed(iter/s)": 0.295509 }, { "acc": 0.71190777, "epoch": 0.1803013504425744, "grad_norm": 8.0, "learning_rate": 9.955995070104618e-06, "loss": 1.16442738, "memory(GiB)": 141.16, "step": 16120, "train_speed(iter/s)": 0.295647 }, { "acc": 0.72502737, "epoch": 0.18052504938853292, "grad_norm": 7.09375, "learning_rate": 9.955749904405812e-06, "loss": 1.10613356, "memory(GiB)": 141.16, "step": 16140, "train_speed(iter/s)": 0.295782 }, { "acc": 0.72435236, "epoch": 0.18074874833449145, "grad_norm": 8.125, "learning_rate": 9.955504060689584e-06, "loss": 1.10672092, "memory(GiB)": 141.16, "step": 16160, "train_speed(iter/s)": 0.295894 }, { "acc": 0.72117891, "epoch": 0.18097244728044998, "grad_norm": 7.125, "learning_rate": 9.955257538989573e-06, "loss": 1.13306761, "memory(GiB)": 141.16, "step": 16180, "train_speed(iter/s)": 0.296006 }, { "acc": 0.71469359, "epoch": 0.1811961462264085, "grad_norm": 8.625, "learning_rate": 9.955010339339501e-06, "loss": 1.13848667, "memory(GiB)": 141.16, "step": 16200, "train_speed(iter/s)": 0.29612 }, { "acc": 0.72291498, "epoch": 0.18141984517236703, "grad_norm": 9.5, "learning_rate": 9.954762461773194e-06, "loss": 1.11717339, "memory(GiB)": 141.16, "step": 16220, "train_speed(iter/s)": 0.296246 }, { "acc": 0.72303076, "epoch": 0.18164354411832556, "grad_norm": 6.25, "learning_rate": 9.954513906324559e-06, "loss": 1.10389471, "memory(GiB)": 141.16, "step": 16240, "train_speed(iter/s)": 0.296384 }, { "acc": 0.70368519, "epoch": 0.1818672430642841, "grad_norm": 6.65625, "learning_rate": 9.954264673027606e-06, "loss": 1.19622135, "memory(GiB)": 141.16, "step": 16260, "train_speed(iter/s)": 0.296515 }, { "acc": 0.72311592, "epoch": 0.18209094201024262, "grad_norm": 6.96875, "learning_rate": 9.954014761916436e-06, "loss": 1.11283627, "memory(GiB)": 141.16, "step": 16280, "train_speed(iter/s)": 0.296643 }, { "acc": 0.73626742, "epoch": 0.18231464095620115, "grad_norm": 8.125, "learning_rate": 9.953764173025234e-06, "loss": 1.04721336, "memory(GiB)": 141.16, "step": 16300, "train_speed(iter/s)": 0.296779 }, { "acc": 0.72783709, "epoch": 0.18253833990215967, "grad_norm": 6.25, "learning_rate": 9.953512906388288e-06, "loss": 1.11086025, "memory(GiB)": 141.16, "step": 16320, "train_speed(iter/s)": 0.296876 }, { "acc": 0.7242743, "epoch": 0.1827620388481182, "grad_norm": 7.75, "learning_rate": 9.953260962039976e-06, "loss": 1.12792988, "memory(GiB)": 141.16, "step": 16340, "train_speed(iter/s)": 0.296989 }, { "acc": 0.71967869, "epoch": 0.18298573779407673, "grad_norm": 5.28125, "learning_rate": 9.953008340014764e-06, "loss": 1.14314232, "memory(GiB)": 141.16, "step": 16360, "train_speed(iter/s)": 0.297117 }, { "acc": 0.72771282, "epoch": 0.18320943674003526, "grad_norm": 6.65625, "learning_rate": 9.952755040347218e-06, "loss": 1.09070158, "memory(GiB)": 141.16, "step": 16380, "train_speed(iter/s)": 0.297246 }, { "acc": 0.71872544, "epoch": 0.1834331356859938, "grad_norm": 6.46875, "learning_rate": 9.95250106307199e-06, "loss": 1.14392138, "memory(GiB)": 141.16, "step": 16400, "train_speed(iter/s)": 0.297371 }, { "acc": 0.7353478, "epoch": 0.18365683463195231, "grad_norm": 7.78125, "learning_rate": 9.952246408223831e-06, "loss": 1.05195637, "memory(GiB)": 141.16, "step": 16420, "train_speed(iter/s)": 0.297488 }, { "acc": 0.72025442, "epoch": 0.18388053357791084, "grad_norm": 5.9375, "learning_rate": 9.951991075837576e-06, "loss": 1.1160531, "memory(GiB)": 141.16, "step": 16440, "train_speed(iter/s)": 0.297607 }, { "acc": 0.71935081, "epoch": 0.18410423252386937, "grad_norm": 6.78125, "learning_rate": 9.951735065948165e-06, "loss": 1.1400116, "memory(GiB)": 141.16, "step": 16460, "train_speed(iter/s)": 0.297731 }, { "acc": 0.73740816, "epoch": 0.1843279314698279, "grad_norm": 6.3125, "learning_rate": 9.95147837859062e-06, "loss": 1.03709164, "memory(GiB)": 141.16, "step": 16480, "train_speed(iter/s)": 0.297855 }, { "acc": 0.73059492, "epoch": 0.18455163041578643, "grad_norm": 9.0, "learning_rate": 9.951221013800059e-06, "loss": 1.07995567, "memory(GiB)": 141.16, "step": 16500, "train_speed(iter/s)": 0.29798 }, { "acc": 0.72881303, "epoch": 0.18477532936174496, "grad_norm": 6.34375, "learning_rate": 9.950962971611693e-06, "loss": 1.0870491, "memory(GiB)": 141.16, "step": 16520, "train_speed(iter/s)": 0.298099 }, { "acc": 0.73234887, "epoch": 0.18499902830770348, "grad_norm": 6.65625, "learning_rate": 9.950704252060827e-06, "loss": 1.06957817, "memory(GiB)": 141.16, "step": 16540, "train_speed(iter/s)": 0.298216 }, { "acc": 0.7207406, "epoch": 0.185222727253662, "grad_norm": 6.9375, "learning_rate": 9.950444855182859e-06, "loss": 1.12351799, "memory(GiB)": 141.16, "step": 16560, "train_speed(iter/s)": 0.298325 }, { "acc": 0.71189127, "epoch": 0.18544642619962054, "grad_norm": 6.9375, "learning_rate": 9.950184781013276e-06, "loss": 1.16317921, "memory(GiB)": 141.16, "step": 16580, "train_speed(iter/s)": 0.298432 }, { "acc": 0.73841715, "epoch": 0.18567012514557907, "grad_norm": 7.25, "learning_rate": 9.94992402958766e-06, "loss": 1.04635487, "memory(GiB)": 141.16, "step": 16600, "train_speed(iter/s)": 0.298552 }, { "acc": 0.73327937, "epoch": 0.1858938240915376, "grad_norm": 7.71875, "learning_rate": 9.949662600941687e-06, "loss": 1.06211777, "memory(GiB)": 141.16, "step": 16620, "train_speed(iter/s)": 0.298678 }, { "acc": 0.71049862, "epoch": 0.18611752303749612, "grad_norm": 6.40625, "learning_rate": 9.949400495111124e-06, "loss": 1.17261419, "memory(GiB)": 141.16, "step": 16640, "train_speed(iter/s)": 0.298808 }, { "acc": 0.7046051, "epoch": 0.18634122198345468, "grad_norm": 9.3125, "learning_rate": 9.949137712131828e-06, "loss": 1.19179468, "memory(GiB)": 141.16, "step": 16660, "train_speed(iter/s)": 0.29892 }, { "acc": 0.73566146, "epoch": 0.1865649209294132, "grad_norm": 7.3125, "learning_rate": 9.948874252039754e-06, "loss": 1.05681744, "memory(GiB)": 141.16, "step": 16680, "train_speed(iter/s)": 0.299048 }, { "acc": 0.71533389, "epoch": 0.18678861987537174, "grad_norm": 7.625, "learning_rate": 9.948610114870946e-06, "loss": 1.14865036, "memory(GiB)": 141.16, "step": 16700, "train_speed(iter/s)": 0.29917 }, { "acc": 0.72692022, "epoch": 0.18701231882133026, "grad_norm": 7.71875, "learning_rate": 9.948345300661543e-06, "loss": 1.0931736, "memory(GiB)": 141.16, "step": 16720, "train_speed(iter/s)": 0.299294 }, { "acc": 0.72572489, "epoch": 0.1872360177672888, "grad_norm": 7.0, "learning_rate": 9.948079809447776e-06, "loss": 1.10155792, "memory(GiB)": 141.16, "step": 16740, "train_speed(iter/s)": 0.299405 }, { "acc": 0.72982388, "epoch": 0.18745971671324732, "grad_norm": 8.4375, "learning_rate": 9.947813641265965e-06, "loss": 1.08497772, "memory(GiB)": 141.16, "step": 16760, "train_speed(iter/s)": 0.299521 }, { "acc": 0.71516604, "epoch": 0.18768341565920585, "grad_norm": 6.125, "learning_rate": 9.947546796152529e-06, "loss": 1.12758961, "memory(GiB)": 141.16, "step": 16780, "train_speed(iter/s)": 0.29964 }, { "acc": 0.72619567, "epoch": 0.18790711460516438, "grad_norm": 7.53125, "learning_rate": 9.947279274143973e-06, "loss": 1.09589844, "memory(GiB)": 141.16, "step": 16800, "train_speed(iter/s)": 0.299762 }, { "acc": 0.71871223, "epoch": 0.1881308135511229, "grad_norm": 6.34375, "learning_rate": 9.9470110752769e-06, "loss": 1.13804016, "memory(GiB)": 141.16, "step": 16820, "train_speed(iter/s)": 0.299883 }, { "acc": 0.71467519, "epoch": 0.18835451249708143, "grad_norm": 6.875, "learning_rate": 9.946742199588002e-06, "loss": 1.17253571, "memory(GiB)": 141.16, "step": 16840, "train_speed(iter/s)": 0.300009 }, { "acc": 0.72487116, "epoch": 0.18857821144303996, "grad_norm": 5.84375, "learning_rate": 9.946472647114066e-06, "loss": 1.09284821, "memory(GiB)": 141.16, "step": 16860, "train_speed(iter/s)": 0.300126 }, { "acc": 0.72395844, "epoch": 0.1888019103889985, "grad_norm": 7.9375, "learning_rate": 9.946202417891972e-06, "loss": 1.11515093, "memory(GiB)": 141.16, "step": 16880, "train_speed(iter/s)": 0.300248 }, { "acc": 0.72732916, "epoch": 0.18902560933495702, "grad_norm": 7.40625, "learning_rate": 9.94593151195869e-06, "loss": 1.11209583, "memory(GiB)": 141.16, "step": 16900, "train_speed(iter/s)": 0.300365 }, { "acc": 0.71250401, "epoch": 0.18924930828091555, "grad_norm": 6.125, "learning_rate": 9.945659929351282e-06, "loss": 1.16959858, "memory(GiB)": 141.16, "step": 16920, "train_speed(iter/s)": 0.300479 }, { "acc": 0.72248564, "epoch": 0.18947300722687407, "grad_norm": 8.375, "learning_rate": 9.945387670106905e-06, "loss": 1.10008354, "memory(GiB)": 141.16, "step": 16940, "train_speed(iter/s)": 0.300582 }, { "acc": 0.71304903, "epoch": 0.1896967061728326, "grad_norm": 7.96875, "learning_rate": 9.94511473426281e-06, "loss": 1.1737421, "memory(GiB)": 141.16, "step": 16960, "train_speed(iter/s)": 0.300706 }, { "acc": 0.706353, "epoch": 0.18992040511879113, "grad_norm": 7.65625, "learning_rate": 9.944841121856337e-06, "loss": 1.2030653, "memory(GiB)": 141.16, "step": 16980, "train_speed(iter/s)": 0.300818 }, { "acc": 0.7235857, "epoch": 0.19014410406474966, "grad_norm": 6.46875, "learning_rate": 9.944566832924922e-06, "loss": 1.10254364, "memory(GiB)": 141.16, "step": 17000, "train_speed(iter/s)": 0.300938 }, { "acc": 0.71485486, "epoch": 0.1903678030107082, "grad_norm": 8.0625, "learning_rate": 9.944291867506089e-06, "loss": 1.12692461, "memory(GiB)": 141.16, "step": 17020, "train_speed(iter/s)": 0.301056 }, { "acc": 0.71332717, "epoch": 0.19059150195666671, "grad_norm": 7.6875, "learning_rate": 9.944016225637458e-06, "loss": 1.17031765, "memory(GiB)": 141.16, "step": 17040, "train_speed(iter/s)": 0.301168 }, { "acc": 0.72385855, "epoch": 0.19081520090262524, "grad_norm": 6.8125, "learning_rate": 9.943739907356743e-06, "loss": 1.10537357, "memory(GiB)": 141.16, "step": 17060, "train_speed(iter/s)": 0.301277 }, { "acc": 0.71997051, "epoch": 0.19103889984858377, "grad_norm": 6.40625, "learning_rate": 9.943462912701743e-06, "loss": 1.12663612, "memory(GiB)": 141.16, "step": 17080, "train_speed(iter/s)": 0.301378 }, { "acc": 0.72808952, "epoch": 0.1912625987945423, "grad_norm": 7.0625, "learning_rate": 9.943185241710361e-06, "loss": 1.08877087, "memory(GiB)": 141.16, "step": 17100, "train_speed(iter/s)": 0.301495 }, { "acc": 0.72654319, "epoch": 0.19148629774050083, "grad_norm": 7.25, "learning_rate": 9.942906894420582e-06, "loss": 1.10343523, "memory(GiB)": 141.16, "step": 17120, "train_speed(iter/s)": 0.301612 }, { "acc": 0.72934661, "epoch": 0.19170999668645936, "grad_norm": 6.46875, "learning_rate": 9.94262787087049e-06, "loss": 1.07208195, "memory(GiB)": 141.16, "step": 17140, "train_speed(iter/s)": 0.301728 }, { "acc": 0.73087158, "epoch": 0.19193369563241788, "grad_norm": 6.8125, "learning_rate": 9.942348171098258e-06, "loss": 1.082621, "memory(GiB)": 141.16, "step": 17160, "train_speed(iter/s)": 0.301846 }, { "acc": 0.72167192, "epoch": 0.1921573945783764, "grad_norm": 6.875, "learning_rate": 9.942067795142154e-06, "loss": 1.11652145, "memory(GiB)": 141.16, "step": 17180, "train_speed(iter/s)": 0.301957 }, { "acc": 0.72336569, "epoch": 0.19238109352433494, "grad_norm": 8.875, "learning_rate": 9.941786743040537e-06, "loss": 1.10160351, "memory(GiB)": 141.16, "step": 17200, "train_speed(iter/s)": 0.302077 }, { "acc": 0.73141031, "epoch": 0.19260479247029347, "grad_norm": 8.125, "learning_rate": 9.941505014831862e-06, "loss": 1.07871361, "memory(GiB)": 141.16, "step": 17220, "train_speed(iter/s)": 0.302192 }, { "acc": 0.71173906, "epoch": 0.192828491416252, "grad_norm": 7.25, "learning_rate": 9.941222610554668e-06, "loss": 1.15055542, "memory(GiB)": 141.16, "step": 17240, "train_speed(iter/s)": 0.302308 }, { "acc": 0.72077599, "epoch": 0.19305219036221052, "grad_norm": 7.3125, "learning_rate": 9.940939530247595e-06, "loss": 1.12180672, "memory(GiB)": 141.16, "step": 17260, "train_speed(iter/s)": 0.302431 }, { "acc": 0.71676598, "epoch": 0.19327588930816908, "grad_norm": 4.65625, "learning_rate": 9.940655773949372e-06, "loss": 1.14675846, "memory(GiB)": 141.16, "step": 17280, "train_speed(iter/s)": 0.302546 }, { "acc": 0.72046041, "epoch": 0.1934995882541276, "grad_norm": 7.25, "learning_rate": 9.94037134169882e-06, "loss": 1.13683062, "memory(GiB)": 141.16, "step": 17300, "train_speed(iter/s)": 0.302663 }, { "acc": 0.72532163, "epoch": 0.19372328720008614, "grad_norm": 8.125, "learning_rate": 9.940086233534856e-06, "loss": 1.10167542, "memory(GiB)": 141.16, "step": 17320, "train_speed(iter/s)": 0.302778 }, { "acc": 0.72584715, "epoch": 0.19394698614604466, "grad_norm": 5.6875, "learning_rate": 9.939800449496484e-06, "loss": 1.10181713, "memory(GiB)": 141.16, "step": 17340, "train_speed(iter/s)": 0.302887 }, { "acc": 0.72262764, "epoch": 0.1941706850920032, "grad_norm": 7.65625, "learning_rate": 9.939513989622805e-06, "loss": 1.11047487, "memory(GiB)": 141.16, "step": 17360, "train_speed(iter/s)": 0.303005 }, { "acc": 0.73103495, "epoch": 0.19439438403796172, "grad_norm": 8.0, "learning_rate": 9.939226853953009e-06, "loss": 1.08749838, "memory(GiB)": 141.16, "step": 17380, "train_speed(iter/s)": 0.303121 }, { "acc": 0.72277303, "epoch": 0.19461808298392025, "grad_norm": 6.875, "learning_rate": 9.938939042526382e-06, "loss": 1.12375565, "memory(GiB)": 141.16, "step": 17400, "train_speed(iter/s)": 0.303238 }, { "acc": 0.72486887, "epoch": 0.19484178192987878, "grad_norm": 6.53125, "learning_rate": 9.9386505553823e-06, "loss": 1.10445843, "memory(GiB)": 141.16, "step": 17420, "train_speed(iter/s)": 0.303345 }, { "acc": 0.73545418, "epoch": 0.1950654808758373, "grad_norm": 7.03125, "learning_rate": 9.938361392560235e-06, "loss": 1.04284945, "memory(GiB)": 141.16, "step": 17440, "train_speed(iter/s)": 0.303471 }, { "acc": 0.72278471, "epoch": 0.19528917982179583, "grad_norm": 8.0, "learning_rate": 9.938071554099745e-06, "loss": 1.11397858, "memory(GiB)": 141.16, "step": 17460, "train_speed(iter/s)": 0.303581 }, { "acc": 0.71833286, "epoch": 0.19551287876775436, "grad_norm": 6.375, "learning_rate": 9.937781040040484e-06, "loss": 1.13489075, "memory(GiB)": 141.16, "step": 17480, "train_speed(iter/s)": 0.303698 }, { "acc": 0.72084179, "epoch": 0.1957365777137129, "grad_norm": 6.75, "learning_rate": 9.9374898504222e-06, "loss": 1.11294231, "memory(GiB)": 141.16, "step": 17500, "train_speed(iter/s)": 0.303819 }, { "acc": 0.72994156, "epoch": 0.19596027665967142, "grad_norm": 7.90625, "learning_rate": 9.937197985284732e-06, "loss": 1.07663069, "memory(GiB)": 141.16, "step": 17520, "train_speed(iter/s)": 0.30393 }, { "acc": 0.72403436, "epoch": 0.19618397560562995, "grad_norm": 6.84375, "learning_rate": 9.93690544466801e-06, "loss": 1.11101065, "memory(GiB)": 141.16, "step": 17540, "train_speed(iter/s)": 0.304051 }, { "acc": 0.70956545, "epoch": 0.19640767455158847, "grad_norm": 6.78125, "learning_rate": 9.936612228612058e-06, "loss": 1.1716898, "memory(GiB)": 141.16, "step": 17560, "train_speed(iter/s)": 0.30416 }, { "acc": 0.7138855, "epoch": 0.196631373497547, "grad_norm": 6.59375, "learning_rate": 9.936318337156993e-06, "loss": 1.14376965, "memory(GiB)": 141.16, "step": 17580, "train_speed(iter/s)": 0.304276 }, { "acc": 0.72560902, "epoch": 0.19685507244350553, "grad_norm": 6.75, "learning_rate": 9.936023770343024e-06, "loss": 1.08866596, "memory(GiB)": 141.16, "step": 17600, "train_speed(iter/s)": 0.304387 }, { "acc": 0.71971426, "epoch": 0.19707877138946406, "grad_norm": 7.625, "learning_rate": 9.935728528210451e-06, "loss": 1.12233868, "memory(GiB)": 141.16, "step": 17620, "train_speed(iter/s)": 0.304506 }, { "acc": 0.72354913, "epoch": 0.1973024703354226, "grad_norm": 7.09375, "learning_rate": 9.935432610799667e-06, "loss": 1.12187262, "memory(GiB)": 141.16, "step": 17640, "train_speed(iter/s)": 0.304615 }, { "acc": 0.73821239, "epoch": 0.19752616928138111, "grad_norm": 7.75, "learning_rate": 9.93513601815116e-06, "loss": 1.04963379, "memory(GiB)": 141.16, "step": 17660, "train_speed(iter/s)": 0.304734 }, { "acc": 0.73771062, "epoch": 0.19774986822733964, "grad_norm": 7.125, "learning_rate": 9.934838750305504e-06, "loss": 1.04407778, "memory(GiB)": 141.16, "step": 17680, "train_speed(iter/s)": 0.304839 }, { "acc": 0.71869869, "epoch": 0.19797356717329817, "grad_norm": 11.0625, "learning_rate": 9.934540807303372e-06, "loss": 1.13294353, "memory(GiB)": 141.16, "step": 17700, "train_speed(iter/s)": 0.304944 }, { "acc": 0.7263341, "epoch": 0.1981972661192567, "grad_norm": 7.96875, "learning_rate": 9.934242189185527e-06, "loss": 1.09494762, "memory(GiB)": 141.16, "step": 17720, "train_speed(iter/s)": 0.305049 }, { "acc": 0.71868639, "epoch": 0.19842096506521523, "grad_norm": 7.53125, "learning_rate": 9.933942895992825e-06, "loss": 1.14309778, "memory(GiB)": 141.16, "step": 17740, "train_speed(iter/s)": 0.305158 }, { "acc": 0.7163981, "epoch": 0.19864466401117375, "grad_norm": 7.9375, "learning_rate": 9.933642927766215e-06, "loss": 1.14948025, "memory(GiB)": 141.16, "step": 17760, "train_speed(iter/s)": 0.305277 }, { "acc": 0.72353296, "epoch": 0.19886836295713228, "grad_norm": 5.5625, "learning_rate": 9.93334228454673e-06, "loss": 1.09951916, "memory(GiB)": 141.16, "step": 17780, "train_speed(iter/s)": 0.305386 }, { "acc": 0.71577177, "epoch": 0.1990920619030908, "grad_norm": 7.46875, "learning_rate": 9.933040966375508e-06, "loss": 1.15553703, "memory(GiB)": 141.16, "step": 17800, "train_speed(iter/s)": 0.305481 }, { "acc": 0.72436261, "epoch": 0.19931576084904934, "grad_norm": 10.9375, "learning_rate": 9.932738973293773e-06, "loss": 1.11323318, "memory(GiB)": 141.16, "step": 17820, "train_speed(iter/s)": 0.305588 }, { "acc": 0.72432766, "epoch": 0.19953945979500787, "grad_norm": 7.375, "learning_rate": 9.932436305342842e-06, "loss": 1.11268902, "memory(GiB)": 141.16, "step": 17840, "train_speed(iter/s)": 0.30571 }, { "acc": 0.72157063, "epoch": 0.1997631587409664, "grad_norm": 7.5625, "learning_rate": 9.932132962564121e-06, "loss": 1.12621889, "memory(GiB)": 141.16, "step": 17860, "train_speed(iter/s)": 0.30582 }, { "acc": 0.72996421, "epoch": 0.19998685768692492, "grad_norm": 4.9375, "learning_rate": 9.931828944999116e-06, "loss": 1.07579699, "memory(GiB)": 141.16, "step": 17880, "train_speed(iter/s)": 0.305929 }, { "acc": 0.74055223, "epoch": 0.20021055663288345, "grad_norm": 7.9375, "learning_rate": 9.931524252689419e-06, "loss": 1.04095097, "memory(GiB)": 141.16, "step": 17900, "train_speed(iter/s)": 0.306039 }, { "acc": 0.72002401, "epoch": 0.200434255578842, "grad_norm": 4.875, "learning_rate": 9.931218885676718e-06, "loss": 1.1431694, "memory(GiB)": 141.16, "step": 17920, "train_speed(iter/s)": 0.306141 }, { "acc": 0.7183444, "epoch": 0.20065795452480054, "grad_norm": 7.5, "learning_rate": 9.93091284400279e-06, "loss": 1.14678631, "memory(GiB)": 141.16, "step": 17940, "train_speed(iter/s)": 0.306235 }, { "acc": 0.72533131, "epoch": 0.20088165347075906, "grad_norm": 5.59375, "learning_rate": 9.930606127709503e-06, "loss": 1.11164036, "memory(GiB)": 141.16, "step": 17960, "train_speed(iter/s)": 0.306349 }, { "acc": 0.70172768, "epoch": 0.2011053524167176, "grad_norm": 7.03125, "learning_rate": 9.930298736838826e-06, "loss": 1.20744019, "memory(GiB)": 141.16, "step": 17980, "train_speed(iter/s)": 0.306454 }, { "acc": 0.71204877, "epoch": 0.20132905136267612, "grad_norm": 6.4375, "learning_rate": 9.92999067143281e-06, "loss": 1.17145519, "memory(GiB)": 141.16, "step": 18000, "train_speed(iter/s)": 0.306564 }, { "epoch": 0.20132905136267612, "eval_acc": 0.6826130007504922, "eval_loss": 1.1091314554214478, "eval_runtime": 2253.1342, "eval_samples_per_second": 33.413, "eval_steps_per_second": 16.707, "step": 18000 }, { "acc": 0.72833214, "epoch": 0.20155275030863465, "grad_norm": 8.25, "learning_rate": 9.929681931533605e-06, "loss": 1.08331432, "memory(GiB)": 141.16, "step": 18020, "train_speed(iter/s)": 0.295092 }, { "acc": 0.72190104, "epoch": 0.20177644925459318, "grad_norm": 6.71875, "learning_rate": 9.92937251718345e-06, "loss": 1.12279501, "memory(GiB)": 141.16, "step": 18040, "train_speed(iter/s)": 0.29521 }, { "acc": 0.72164507, "epoch": 0.2020001482005517, "grad_norm": 6.375, "learning_rate": 9.929062428424678e-06, "loss": 1.12924709, "memory(GiB)": 141.16, "step": 18060, "train_speed(iter/s)": 0.295334 }, { "acc": 0.72606163, "epoch": 0.20222384714651023, "grad_norm": 8.25, "learning_rate": 9.928751665299714e-06, "loss": 1.09584255, "memory(GiB)": 141.16, "step": 18080, "train_speed(iter/s)": 0.295457 }, { "acc": 0.72282171, "epoch": 0.20244754609246876, "grad_norm": 8.9375, "learning_rate": 9.928440227851072e-06, "loss": 1.11117229, "memory(GiB)": 141.16, "step": 18100, "train_speed(iter/s)": 0.295578 }, { "acc": 0.73618445, "epoch": 0.2026712450384273, "grad_norm": 5.9375, "learning_rate": 9.928128116121365e-06, "loss": 1.05527134, "memory(GiB)": 141.16, "step": 18120, "train_speed(iter/s)": 0.295689 }, { "acc": 0.71661434, "epoch": 0.20289494398438582, "grad_norm": 5.71875, "learning_rate": 9.927815330153291e-06, "loss": 1.14721298, "memory(GiB)": 141.16, "step": 18140, "train_speed(iter/s)": 0.295782 }, { "acc": 0.72408009, "epoch": 0.20311864293034435, "grad_norm": 5.9375, "learning_rate": 9.927501869989648e-06, "loss": 1.11041584, "memory(GiB)": 141.16, "step": 18160, "train_speed(iter/s)": 0.295883 }, { "acc": 0.72851925, "epoch": 0.20334234187630287, "grad_norm": 8.25, "learning_rate": 9.927187735673315e-06, "loss": 1.09026718, "memory(GiB)": 141.16, "step": 18180, "train_speed(iter/s)": 0.296002 }, { "acc": 0.70662222, "epoch": 0.2035660408222614, "grad_norm": 6.5625, "learning_rate": 9.926872927247277e-06, "loss": 1.18034363, "memory(GiB)": 141.16, "step": 18200, "train_speed(iter/s)": 0.296115 }, { "acc": 0.72740016, "epoch": 0.20378973976821993, "grad_norm": 8.5625, "learning_rate": 9.926557444754601e-06, "loss": 1.08046417, "memory(GiB)": 141.16, "step": 18220, "train_speed(iter/s)": 0.296234 }, { "acc": 0.71547155, "epoch": 0.20401343871417846, "grad_norm": 7.21875, "learning_rate": 9.92624128823845e-06, "loss": 1.15291662, "memory(GiB)": 141.16, "step": 18240, "train_speed(iter/s)": 0.296351 }, { "acc": 0.723944, "epoch": 0.20423713766013699, "grad_norm": 7.34375, "learning_rate": 9.925924457742078e-06, "loss": 1.10083084, "memory(GiB)": 141.16, "step": 18260, "train_speed(iter/s)": 0.296454 }, { "acc": 0.71940904, "epoch": 0.20446083660609551, "grad_norm": 6.1875, "learning_rate": 9.925606953308831e-06, "loss": 1.12056313, "memory(GiB)": 141.16, "step": 18280, "train_speed(iter/s)": 0.296557 }, { "acc": 0.71788898, "epoch": 0.20468453555205404, "grad_norm": 7.625, "learning_rate": 9.925288774982151e-06, "loss": 1.14450798, "memory(GiB)": 141.16, "step": 18300, "train_speed(iter/s)": 0.296661 }, { "acc": 0.72655649, "epoch": 0.20490823449801257, "grad_norm": 7.59375, "learning_rate": 9.92496992280557e-06, "loss": 1.09715176, "memory(GiB)": 141.16, "step": 18320, "train_speed(iter/s)": 0.296769 }, { "acc": 0.73521733, "epoch": 0.2051319334439711, "grad_norm": 6.875, "learning_rate": 9.924650396822706e-06, "loss": 1.06428614, "memory(GiB)": 141.16, "step": 18340, "train_speed(iter/s)": 0.296873 }, { "acc": 0.69970665, "epoch": 0.20535563238992963, "grad_norm": 6.84375, "learning_rate": 9.92433019707728e-06, "loss": 1.21981907, "memory(GiB)": 141.16, "step": 18360, "train_speed(iter/s)": 0.296981 }, { "acc": 0.72361145, "epoch": 0.20557933133588815, "grad_norm": 8.625, "learning_rate": 9.924009323613098e-06, "loss": 1.10498295, "memory(GiB)": 141.16, "step": 18380, "train_speed(iter/s)": 0.297092 }, { "acc": 0.73077583, "epoch": 0.20580303028184668, "grad_norm": 7.125, "learning_rate": 9.92368777647406e-06, "loss": 1.06765995, "memory(GiB)": 141.16, "step": 18400, "train_speed(iter/s)": 0.297193 }, { "acc": 0.73308382, "epoch": 0.2060267292278052, "grad_norm": 5.5, "learning_rate": 9.923365555704159e-06, "loss": 1.06468468, "memory(GiB)": 141.16, "step": 18420, "train_speed(iter/s)": 0.297304 }, { "acc": 0.72645488, "epoch": 0.20625042817376374, "grad_norm": 7.0625, "learning_rate": 9.923042661347477e-06, "loss": 1.11405296, "memory(GiB)": 141.16, "step": 18440, "train_speed(iter/s)": 0.297405 }, { "acc": 0.71946621, "epoch": 0.20647412711972227, "grad_norm": 8.25, "learning_rate": 9.922719093448194e-06, "loss": 1.14490223, "memory(GiB)": 141.16, "step": 18460, "train_speed(iter/s)": 0.297516 }, { "acc": 0.72825661, "epoch": 0.2066978260656808, "grad_norm": 7.03125, "learning_rate": 9.92239485205058e-06, "loss": 1.09916973, "memory(GiB)": 141.16, "step": 18480, "train_speed(iter/s)": 0.297612 }, { "acc": 0.71986785, "epoch": 0.20692152501163932, "grad_norm": 7.28125, "learning_rate": 9.922069937198987e-06, "loss": 1.12638416, "memory(GiB)": 141.16, "step": 18500, "train_speed(iter/s)": 0.297701 }, { "acc": 0.7227972, "epoch": 0.20714522395759785, "grad_norm": 7.59375, "learning_rate": 9.921744348937878e-06, "loss": 1.1218092, "memory(GiB)": 141.16, "step": 18520, "train_speed(iter/s)": 0.29781 }, { "acc": 0.730584, "epoch": 0.2073689229035564, "grad_norm": 7.1875, "learning_rate": 9.921418087311794e-06, "loss": 1.07103519, "memory(GiB)": 141.16, "step": 18540, "train_speed(iter/s)": 0.297923 }, { "acc": 0.73656034, "epoch": 0.20759262184951494, "grad_norm": 6.125, "learning_rate": 9.92109115236537e-06, "loss": 1.0454855, "memory(GiB)": 141.16, "step": 18560, "train_speed(iter/s)": 0.29804 }, { "acc": 0.72880473, "epoch": 0.20781632079547346, "grad_norm": 6.625, "learning_rate": 9.920763544143339e-06, "loss": 1.09460602, "memory(GiB)": 141.16, "step": 18580, "train_speed(iter/s)": 0.298161 }, { "acc": 0.71509953, "epoch": 0.208040019741432, "grad_norm": 6.15625, "learning_rate": 9.920435262690523e-06, "loss": 1.12758989, "memory(GiB)": 141.16, "step": 18600, "train_speed(iter/s)": 0.298263 }, { "acc": 0.72325487, "epoch": 0.20826371868739052, "grad_norm": 6.625, "learning_rate": 9.92010630805183e-06, "loss": 1.09900875, "memory(GiB)": 141.16, "step": 18620, "train_speed(iter/s)": 0.298367 }, { "acc": 0.7183857, "epoch": 0.20848741763334905, "grad_norm": 6.59375, "learning_rate": 9.919776680272272e-06, "loss": 1.15351915, "memory(GiB)": 141.16, "step": 18640, "train_speed(iter/s)": 0.298479 }, { "acc": 0.72274246, "epoch": 0.20871111657930758, "grad_norm": 7.5, "learning_rate": 9.919446379396946e-06, "loss": 1.0911622, "memory(GiB)": 141.16, "step": 18660, "train_speed(iter/s)": 0.298577 }, { "acc": 0.72715406, "epoch": 0.2089348155252661, "grad_norm": 7.71875, "learning_rate": 9.919115405471039e-06, "loss": 1.09646263, "memory(GiB)": 141.16, "step": 18680, "train_speed(iter/s)": 0.298691 }, { "acc": 0.71745391, "epoch": 0.20915851447122463, "grad_norm": 6.96875, "learning_rate": 9.918783758539833e-06, "loss": 1.13350315, "memory(GiB)": 141.16, "step": 18700, "train_speed(iter/s)": 0.298797 }, { "acc": 0.72093925, "epoch": 0.20938221341718316, "grad_norm": 7.4375, "learning_rate": 9.918451438648705e-06, "loss": 1.13519497, "memory(GiB)": 141.16, "step": 18720, "train_speed(iter/s)": 0.298908 }, { "acc": 0.74060373, "epoch": 0.2096059123631417, "grad_norm": 7.15625, "learning_rate": 9.918118445843117e-06, "loss": 1.02593107, "memory(GiB)": 141.16, "step": 18740, "train_speed(iter/s)": 0.299018 }, { "acc": 0.72531567, "epoch": 0.20982961130910022, "grad_norm": 7.625, "learning_rate": 9.91778478016863e-06, "loss": 1.11183472, "memory(GiB)": 141.16, "step": 18760, "train_speed(iter/s)": 0.299116 }, { "acc": 0.73813295, "epoch": 0.21005331025505874, "grad_norm": 7.65625, "learning_rate": 9.917450441670895e-06, "loss": 1.05803347, "memory(GiB)": 141.16, "step": 18780, "train_speed(iter/s)": 0.29922 }, { "acc": 0.71246257, "epoch": 0.21027700920101727, "grad_norm": 6.15625, "learning_rate": 9.917115430395651e-06, "loss": 1.16730299, "memory(GiB)": 141.16, "step": 18800, "train_speed(iter/s)": 0.29933 }, { "acc": 0.72177916, "epoch": 0.2105007081469758, "grad_norm": 8.0625, "learning_rate": 9.916779746388737e-06, "loss": 1.11973362, "memory(GiB)": 141.16, "step": 18820, "train_speed(iter/s)": 0.299428 }, { "acc": 0.71726465, "epoch": 0.21072440709293433, "grad_norm": 6.46875, "learning_rate": 9.916443389696076e-06, "loss": 1.15150452, "memory(GiB)": 141.16, "step": 18840, "train_speed(iter/s)": 0.29951 }, { "acc": 0.71975155, "epoch": 0.21094810603889286, "grad_norm": 7.40625, "learning_rate": 9.916106360363687e-06, "loss": 1.11638336, "memory(GiB)": 141.16, "step": 18860, "train_speed(iter/s)": 0.29962 }, { "acc": 0.72952824, "epoch": 0.21117180498485139, "grad_norm": 7.25, "learning_rate": 9.915768658437678e-06, "loss": 1.07998638, "memory(GiB)": 141.16, "step": 18880, "train_speed(iter/s)": 0.29973 }, { "acc": 0.71962886, "epoch": 0.2113955039308099, "grad_norm": 7.21875, "learning_rate": 9.915430283964259e-06, "loss": 1.12973919, "memory(GiB)": 141.16, "step": 18900, "train_speed(iter/s)": 0.299834 }, { "acc": 0.72694683, "epoch": 0.21161920287676844, "grad_norm": 7.625, "learning_rate": 9.915091236989715e-06, "loss": 1.08345118, "memory(GiB)": 141.16, "step": 18920, "train_speed(iter/s)": 0.299936 }, { "acc": 0.72823672, "epoch": 0.21184290182272697, "grad_norm": 6.15625, "learning_rate": 9.914751517560439e-06, "loss": 1.09325085, "memory(GiB)": 141.16, "step": 18940, "train_speed(iter/s)": 0.300048 }, { "acc": 0.72669439, "epoch": 0.2120666007686855, "grad_norm": 9.5625, "learning_rate": 9.914411125722908e-06, "loss": 1.09478226, "memory(GiB)": 141.16, "step": 18960, "train_speed(iter/s)": 0.300162 }, { "acc": 0.72366099, "epoch": 0.21229029971464403, "grad_norm": 8.625, "learning_rate": 9.91407006152369e-06, "loss": 1.10771599, "memory(GiB)": 141.16, "step": 18980, "train_speed(iter/s)": 0.300273 }, { "acc": 0.72282553, "epoch": 0.21251399866060255, "grad_norm": 6.34375, "learning_rate": 9.91372832500945e-06, "loss": 1.09631119, "memory(GiB)": 141.16, "step": 19000, "train_speed(iter/s)": 0.300361 }, { "acc": 0.70988703, "epoch": 0.21273769760656108, "grad_norm": 8.125, "learning_rate": 9.913385916226941e-06, "loss": 1.16839113, "memory(GiB)": 141.16, "step": 19020, "train_speed(iter/s)": 0.300463 }, { "acc": 0.72433052, "epoch": 0.2129613965525196, "grad_norm": 7.03125, "learning_rate": 9.913042835223012e-06, "loss": 1.12136536, "memory(GiB)": 141.16, "step": 19040, "train_speed(iter/s)": 0.300558 }, { "acc": 0.72362771, "epoch": 0.21318509549847814, "grad_norm": 5.9375, "learning_rate": 9.912699082044599e-06, "loss": 1.11931458, "memory(GiB)": 141.16, "step": 19060, "train_speed(iter/s)": 0.300658 }, { "acc": 0.72195177, "epoch": 0.21340879444443667, "grad_norm": 6.21875, "learning_rate": 9.912354656738731e-06, "loss": 1.12407341, "memory(GiB)": 141.16, "step": 19080, "train_speed(iter/s)": 0.300764 }, { "acc": 0.71489763, "epoch": 0.2136324933903952, "grad_norm": 5.78125, "learning_rate": 9.912009559352536e-06, "loss": 1.153409, "memory(GiB)": 141.16, "step": 19100, "train_speed(iter/s)": 0.300869 }, { "acc": 0.71498489, "epoch": 0.21385619233635372, "grad_norm": 6.75, "learning_rate": 9.911663789933222e-06, "loss": 1.14638309, "memory(GiB)": 141.16, "step": 19120, "train_speed(iter/s)": 0.300979 }, { "acc": 0.72545304, "epoch": 0.21407989128231225, "grad_norm": 7.5, "learning_rate": 9.911317348528097e-06, "loss": 1.09348164, "memory(GiB)": 141.16, "step": 19140, "train_speed(iter/s)": 0.301076 }, { "acc": 0.72317486, "epoch": 0.21430359022827078, "grad_norm": 9.4375, "learning_rate": 9.910970235184561e-06, "loss": 1.12024584, "memory(GiB)": 141.16, "step": 19160, "train_speed(iter/s)": 0.301186 }, { "acc": 0.72680483, "epoch": 0.21452728917422934, "grad_norm": 6.5625, "learning_rate": 9.910622449950102e-06, "loss": 1.11005135, "memory(GiB)": 141.16, "step": 19180, "train_speed(iter/s)": 0.301286 }, { "acc": 0.71328125, "epoch": 0.21475098812018786, "grad_norm": 7.4375, "learning_rate": 9.910273992872305e-06, "loss": 1.1674099, "memory(GiB)": 141.16, "step": 19200, "train_speed(iter/s)": 0.301375 }, { "acc": 0.73077259, "epoch": 0.2149746870661464, "grad_norm": 5.84375, "learning_rate": 9.90992486399884e-06, "loss": 1.07286873, "memory(GiB)": 141.16, "step": 19220, "train_speed(iter/s)": 0.301474 }, { "acc": 0.72702026, "epoch": 0.21519838601210492, "grad_norm": 7.375, "learning_rate": 9.909575063377474e-06, "loss": 1.07442875, "memory(GiB)": 141.16, "step": 19240, "train_speed(iter/s)": 0.30158 }, { "acc": 0.72603531, "epoch": 0.21542208495806345, "grad_norm": 8.0625, "learning_rate": 9.909224591056068e-06, "loss": 1.09785461, "memory(GiB)": 141.16, "step": 19260, "train_speed(iter/s)": 0.301669 }, { "acc": 0.71599889, "epoch": 0.21564578390402198, "grad_norm": 7.125, "learning_rate": 9.908873447082567e-06, "loss": 1.135009, "memory(GiB)": 141.16, "step": 19280, "train_speed(iter/s)": 0.301766 }, { "acc": 0.72355175, "epoch": 0.2158694828499805, "grad_norm": 6.96875, "learning_rate": 9.908521631505015e-06, "loss": 1.09966307, "memory(GiB)": 141.16, "step": 19300, "train_speed(iter/s)": 0.301868 }, { "acc": 0.73447399, "epoch": 0.21609318179593903, "grad_norm": 6.28125, "learning_rate": 9.908169144371544e-06, "loss": 1.08259029, "memory(GiB)": 141.16, "step": 19320, "train_speed(iter/s)": 0.301975 }, { "acc": 0.73058934, "epoch": 0.21631688074189756, "grad_norm": 7.3125, "learning_rate": 9.90781598573038e-06, "loss": 1.09433384, "memory(GiB)": 141.16, "step": 19340, "train_speed(iter/s)": 0.302061 }, { "acc": 0.72723169, "epoch": 0.2165405796878561, "grad_norm": 8.8125, "learning_rate": 9.907462155629841e-06, "loss": 1.10168552, "memory(GiB)": 141.16, "step": 19360, "train_speed(iter/s)": 0.302157 }, { "acc": 0.72494998, "epoch": 0.21676427863381462, "grad_norm": 6.84375, "learning_rate": 9.907107654118337e-06, "loss": 1.11175518, "memory(GiB)": 141.16, "step": 19380, "train_speed(iter/s)": 0.302251 }, { "acc": 0.71862388, "epoch": 0.21698797757977314, "grad_norm": 8.3125, "learning_rate": 9.906752481244366e-06, "loss": 1.11575975, "memory(GiB)": 141.16, "step": 19400, "train_speed(iter/s)": 0.302345 }, { "acc": 0.72052603, "epoch": 0.21721167652573167, "grad_norm": 6.90625, "learning_rate": 9.906396637056522e-06, "loss": 1.11081896, "memory(GiB)": 141.16, "step": 19420, "train_speed(iter/s)": 0.30243 }, { "acc": 0.72166429, "epoch": 0.2174353754716902, "grad_norm": 5.78125, "learning_rate": 9.906040121603488e-06, "loss": 1.12087803, "memory(GiB)": 141.16, "step": 19440, "train_speed(iter/s)": 0.302533 }, { "acc": 0.72929163, "epoch": 0.21765907441764873, "grad_norm": 6.65625, "learning_rate": 9.905682934934042e-06, "loss": 1.08907032, "memory(GiB)": 141.16, "step": 19460, "train_speed(iter/s)": 0.302639 }, { "acc": 0.72156754, "epoch": 0.21788277336360726, "grad_norm": 5.4375, "learning_rate": 9.905325077097054e-06, "loss": 1.12756195, "memory(GiB)": 141.16, "step": 19480, "train_speed(iter/s)": 0.30275 }, { "acc": 0.72039175, "epoch": 0.21810647230956579, "grad_norm": 8.25, "learning_rate": 9.904966548141481e-06, "loss": 1.14608374, "memory(GiB)": 141.16, "step": 19500, "train_speed(iter/s)": 0.302849 }, { "acc": 0.74067583, "epoch": 0.2183301712555243, "grad_norm": 6.5, "learning_rate": 9.904607348116378e-06, "loss": 1.04585476, "memory(GiB)": 141.16, "step": 19520, "train_speed(iter/s)": 0.302955 }, { "acc": 0.71377077, "epoch": 0.21855387020148284, "grad_norm": 5.75, "learning_rate": 9.904247477070883e-06, "loss": 1.16937122, "memory(GiB)": 141.16, "step": 19540, "train_speed(iter/s)": 0.303051 }, { "acc": 0.72541351, "epoch": 0.21877756914744137, "grad_norm": 7.28125, "learning_rate": 9.90388693505424e-06, "loss": 1.12037296, "memory(GiB)": 141.16, "step": 19560, "train_speed(iter/s)": 0.303147 }, { "acc": 0.73123102, "epoch": 0.2190012680933999, "grad_norm": 6.15625, "learning_rate": 9.903525722115768e-06, "loss": 1.07483063, "memory(GiB)": 141.16, "step": 19580, "train_speed(iter/s)": 0.303238 }, { "acc": 0.71737261, "epoch": 0.21922496703935843, "grad_norm": 6.03125, "learning_rate": 9.90316383830489e-06, "loss": 1.14031305, "memory(GiB)": 141.16, "step": 19600, "train_speed(iter/s)": 0.303331 }, { "acc": 0.73050709, "epoch": 0.21944866598531695, "grad_norm": 6.40625, "learning_rate": 9.902801283671118e-06, "loss": 1.07626781, "memory(GiB)": 141.16, "step": 19620, "train_speed(iter/s)": 0.303424 }, { "acc": 0.72506738, "epoch": 0.21967236493127548, "grad_norm": 7.5625, "learning_rate": 9.902438058264052e-06, "loss": 1.10522652, "memory(GiB)": 141.16, "step": 19640, "train_speed(iter/s)": 0.303521 }, { "acc": 0.71905332, "epoch": 0.219896063877234, "grad_norm": 6.5, "learning_rate": 9.902074162133389e-06, "loss": 1.13417892, "memory(GiB)": 141.16, "step": 19660, "train_speed(iter/s)": 0.303608 }, { "acc": 0.73150764, "epoch": 0.22011976282319254, "grad_norm": 6.4375, "learning_rate": 9.901709595328913e-06, "loss": 1.06469049, "memory(GiB)": 141.16, "step": 19680, "train_speed(iter/s)": 0.303697 }, { "acc": 0.72617502, "epoch": 0.22034346176915107, "grad_norm": 8.375, "learning_rate": 9.901344357900502e-06, "loss": 1.08615351, "memory(GiB)": 141.16, "step": 19700, "train_speed(iter/s)": 0.303804 }, { "acc": 0.72843409, "epoch": 0.2205671607151096, "grad_norm": 9.375, "learning_rate": 9.900978449898127e-06, "loss": 1.07372208, "memory(GiB)": 141.16, "step": 19720, "train_speed(iter/s)": 0.30391 }, { "acc": 0.72310309, "epoch": 0.22079085966106812, "grad_norm": 7.28125, "learning_rate": 9.900611871371848e-06, "loss": 1.11699982, "memory(GiB)": 141.16, "step": 19740, "train_speed(iter/s)": 0.304004 }, { "acc": 0.72248273, "epoch": 0.22101455860702665, "grad_norm": 5.28125, "learning_rate": 9.900244622371821e-06, "loss": 1.11488247, "memory(GiB)": 141.16, "step": 19760, "train_speed(iter/s)": 0.304114 }, { "acc": 0.72517405, "epoch": 0.22123825755298518, "grad_norm": 5.875, "learning_rate": 9.899876702948288e-06, "loss": 1.10648699, "memory(GiB)": 141.16, "step": 19780, "train_speed(iter/s)": 0.304219 }, { "acc": 0.72240849, "epoch": 0.22146195649894374, "grad_norm": 7.0, "learning_rate": 9.899508113151588e-06, "loss": 1.12209997, "memory(GiB)": 141.16, "step": 19800, "train_speed(iter/s)": 0.304316 }, { "acc": 0.71062717, "epoch": 0.22168565544490226, "grad_norm": 6.65625, "learning_rate": 9.899138853032147e-06, "loss": 1.17526445, "memory(GiB)": 141.16, "step": 19820, "train_speed(iter/s)": 0.304422 }, { "acc": 0.73507586, "epoch": 0.2219093543908608, "grad_norm": 6.34375, "learning_rate": 9.898768922640485e-06, "loss": 1.04620132, "memory(GiB)": 141.16, "step": 19840, "train_speed(iter/s)": 0.304524 }, { "acc": 0.72393918, "epoch": 0.22213305333681932, "grad_norm": 5.84375, "learning_rate": 9.898398322027216e-06, "loss": 1.12596893, "memory(GiB)": 141.16, "step": 19860, "train_speed(iter/s)": 0.304628 }, { "acc": 0.72147207, "epoch": 0.22235675228277785, "grad_norm": 6.46875, "learning_rate": 9.898027051243042e-06, "loss": 1.12212429, "memory(GiB)": 141.16, "step": 19880, "train_speed(iter/s)": 0.304721 }, { "acc": 0.72353086, "epoch": 0.22258045122873638, "grad_norm": 6.6875, "learning_rate": 9.897655110338759e-06, "loss": 1.1019495, "memory(GiB)": 141.16, "step": 19900, "train_speed(iter/s)": 0.304817 }, { "acc": 0.7237628, "epoch": 0.2228041501746949, "grad_norm": 9.0, "learning_rate": 9.897282499365254e-06, "loss": 1.106847, "memory(GiB)": 141.16, "step": 19920, "train_speed(iter/s)": 0.304918 }, { "acc": 0.7313323, "epoch": 0.22302784912065343, "grad_norm": 5.75, "learning_rate": 9.896909218373503e-06, "loss": 1.07598934, "memory(GiB)": 141.16, "step": 19940, "train_speed(iter/s)": 0.305014 }, { "acc": 0.71500282, "epoch": 0.22325154806661196, "grad_norm": 6.28125, "learning_rate": 9.896535267414578e-06, "loss": 1.15273333, "memory(GiB)": 141.16, "step": 19960, "train_speed(iter/s)": 0.305107 }, { "acc": 0.71170034, "epoch": 0.2234752470125705, "grad_norm": 7.875, "learning_rate": 9.896160646539641e-06, "loss": 1.16488056, "memory(GiB)": 141.16, "step": 19980, "train_speed(iter/s)": 0.305205 }, { "acc": 0.72122035, "epoch": 0.22369894595852902, "grad_norm": 8.4375, "learning_rate": 9.895785355799947e-06, "loss": 1.13955936, "memory(GiB)": 141.16, "step": 20000, "train_speed(iter/s)": 0.305302 }, { "epoch": 0.22369894595852902, "eval_acc": 0.6836218567959778, "eval_loss": 1.1054245233535767, "eval_runtime": 2321.7935, "eval_samples_per_second": 32.425, "eval_steps_per_second": 16.212, "step": 20000 }, { "acc": 0.72348957, "epoch": 0.22392264490448754, "grad_norm": 7.71875, "learning_rate": 9.895409395246839e-06, "loss": 1.10244875, "memory(GiB)": 141.16, "step": 20020, "train_speed(iter/s)": 0.294729 }, { "acc": 0.74192324, "epoch": 0.22414634385044607, "grad_norm": 9.6875, "learning_rate": 9.895032764931753e-06, "loss": 1.02561417, "memory(GiB)": 141.16, "step": 20040, "train_speed(iter/s)": 0.294831 }, { "acc": 0.71319857, "epoch": 0.2243700427964046, "grad_norm": 7.4375, "learning_rate": 9.894655464906217e-06, "loss": 1.16505117, "memory(GiB)": 141.16, "step": 20060, "train_speed(iter/s)": 0.294931 }, { "acc": 0.72172256, "epoch": 0.22459374174236313, "grad_norm": 5.4375, "learning_rate": 9.894277495221856e-06, "loss": 1.11509895, "memory(GiB)": 141.16, "step": 20080, "train_speed(iter/s)": 0.295021 }, { "acc": 0.70846381, "epoch": 0.22481744068832166, "grad_norm": 7.28125, "learning_rate": 9.893898855930378e-06, "loss": 1.17386265, "memory(GiB)": 141.16, "step": 20100, "train_speed(iter/s)": 0.29512 }, { "acc": 0.72317896, "epoch": 0.22504113963428019, "grad_norm": 7.6875, "learning_rate": 9.893519547083584e-06, "loss": 1.10344334, "memory(GiB)": 141.16, "step": 20120, "train_speed(iter/s)": 0.295204 }, { "acc": 0.73984742, "epoch": 0.2252648385802387, "grad_norm": 6.3125, "learning_rate": 9.893139568733374e-06, "loss": 1.04163857, "memory(GiB)": 141.16, "step": 20140, "train_speed(iter/s)": 0.295304 }, { "acc": 0.71990376, "epoch": 0.22548853752619724, "grad_norm": 6.59375, "learning_rate": 9.892758920931732e-06, "loss": 1.10833435, "memory(GiB)": 141.16, "step": 20160, "train_speed(iter/s)": 0.295413 }, { "acc": 0.73653817, "epoch": 0.22571223647215577, "grad_norm": 6.75, "learning_rate": 9.892377603730733e-06, "loss": 1.04661303, "memory(GiB)": 141.16, "step": 20180, "train_speed(iter/s)": 0.295496 }, { "acc": 0.72274189, "epoch": 0.2259359354181143, "grad_norm": 7.6875, "learning_rate": 9.891995617182552e-06, "loss": 1.11215982, "memory(GiB)": 141.16, "step": 20200, "train_speed(iter/s)": 0.295605 }, { "acc": 0.72140908, "epoch": 0.22615963436407283, "grad_norm": 6.1875, "learning_rate": 9.891612961339447e-06, "loss": 1.12632561, "memory(GiB)": 141.16, "step": 20220, "train_speed(iter/s)": 0.29571 }, { "acc": 0.73017187, "epoch": 0.22638333331003135, "grad_norm": 6.1875, "learning_rate": 9.891229636253773e-06, "loss": 1.07792339, "memory(GiB)": 141.16, "step": 20240, "train_speed(iter/s)": 0.295802 }, { "acc": 0.72182074, "epoch": 0.22660703225598988, "grad_norm": 6.09375, "learning_rate": 9.890845641977972e-06, "loss": 1.13621321, "memory(GiB)": 141.16, "step": 20260, "train_speed(iter/s)": 0.2959 }, { "acc": 0.73270769, "epoch": 0.2268307312019484, "grad_norm": 6.65625, "learning_rate": 9.89046097856458e-06, "loss": 1.07391376, "memory(GiB)": 141.16, "step": 20280, "train_speed(iter/s)": 0.295995 }, { "acc": 0.7215517, "epoch": 0.22705443014790694, "grad_norm": 7.03125, "learning_rate": 9.890075646066226e-06, "loss": 1.10885, "memory(GiB)": 141.16, "step": 20300, "train_speed(iter/s)": 0.296095 }, { "acc": 0.72020912, "epoch": 0.22727812909386547, "grad_norm": 6.4375, "learning_rate": 9.88968964453563e-06, "loss": 1.13460808, "memory(GiB)": 141.16, "step": 20320, "train_speed(iter/s)": 0.296194 }, { "acc": 0.72872677, "epoch": 0.227501828039824, "grad_norm": 9.5625, "learning_rate": 9.8893029740256e-06, "loss": 1.1006506, "memory(GiB)": 141.16, "step": 20340, "train_speed(iter/s)": 0.29629 }, { "acc": 0.72457571, "epoch": 0.22772552698578252, "grad_norm": 6.65625, "learning_rate": 9.888915634589036e-06, "loss": 1.10527029, "memory(GiB)": 141.16, "step": 20360, "train_speed(iter/s)": 0.296384 }, { "acc": 0.73136096, "epoch": 0.22794922593174105, "grad_norm": 5.875, "learning_rate": 9.888527626278937e-06, "loss": 1.08387871, "memory(GiB)": 141.16, "step": 20380, "train_speed(iter/s)": 0.296473 }, { "acc": 0.72045269, "epoch": 0.22817292487769958, "grad_norm": 6.9375, "learning_rate": 9.888138949148387e-06, "loss": 1.10534477, "memory(GiB)": 141.16, "step": 20400, "train_speed(iter/s)": 0.296573 }, { "acc": 0.72985525, "epoch": 0.22839662382365813, "grad_norm": 6.96875, "learning_rate": 9.887749603250559e-06, "loss": 1.0706274, "memory(GiB)": 141.16, "step": 20420, "train_speed(iter/s)": 0.296617 }, { "acc": 0.7317338, "epoch": 0.22862032276961666, "grad_norm": 6.28125, "learning_rate": 9.887359588638724e-06, "loss": 1.06988239, "memory(GiB)": 141.16, "step": 20440, "train_speed(iter/s)": 0.296709 }, { "acc": 0.72735767, "epoch": 0.2288440217155752, "grad_norm": 7.625, "learning_rate": 9.886968905366239e-06, "loss": 1.08126984, "memory(GiB)": 141.16, "step": 20460, "train_speed(iter/s)": 0.296811 }, { "acc": 0.72278099, "epoch": 0.22906772066153372, "grad_norm": 6.28125, "learning_rate": 9.886577553486557e-06, "loss": 1.10859327, "memory(GiB)": 141.16, "step": 20480, "train_speed(iter/s)": 0.296898 }, { "acc": 0.72114115, "epoch": 0.22929141960749225, "grad_norm": 7.65625, "learning_rate": 9.886185533053224e-06, "loss": 1.11605844, "memory(GiB)": 141.16, "step": 20500, "train_speed(iter/s)": 0.296986 }, { "acc": 0.72127843, "epoch": 0.22951511855345078, "grad_norm": 7.15625, "learning_rate": 9.885792844119868e-06, "loss": 1.13021145, "memory(GiB)": 141.16, "step": 20520, "train_speed(iter/s)": 0.29708 }, { "acc": 0.72580628, "epoch": 0.2297388174994093, "grad_norm": 7.28125, "learning_rate": 9.885399486740216e-06, "loss": 1.10601006, "memory(GiB)": 141.16, "step": 20540, "train_speed(iter/s)": 0.297166 }, { "acc": 0.72456837, "epoch": 0.22996251644536783, "grad_norm": 5.84375, "learning_rate": 9.885005460968088e-06, "loss": 1.10871487, "memory(GiB)": 141.16, "step": 20560, "train_speed(iter/s)": 0.29726 }, { "acc": 0.72118645, "epoch": 0.23018621539132636, "grad_norm": 6.09375, "learning_rate": 9.884610766857388e-06, "loss": 1.11048946, "memory(GiB)": 141.16, "step": 20580, "train_speed(iter/s)": 0.297362 }, { "acc": 0.73754802, "epoch": 0.2304099143372849, "grad_norm": 8.0625, "learning_rate": 9.884215404462119e-06, "loss": 1.0627182, "memory(GiB)": 141.16, "step": 20600, "train_speed(iter/s)": 0.297459 }, { "acc": 0.72126503, "epoch": 0.23063361328324342, "grad_norm": 6.34375, "learning_rate": 9.883819373836372e-06, "loss": 1.10562038, "memory(GiB)": 141.16, "step": 20620, "train_speed(iter/s)": 0.297556 }, { "acc": 0.72353506, "epoch": 0.23085731222920194, "grad_norm": 6.90625, "learning_rate": 9.883422675034328e-06, "loss": 1.11194382, "memory(GiB)": 141.16, "step": 20640, "train_speed(iter/s)": 0.297656 }, { "acc": 0.7153924, "epoch": 0.23108101117516047, "grad_norm": 7.5625, "learning_rate": 9.88302530811026e-06, "loss": 1.14684896, "memory(GiB)": 141.16, "step": 20660, "train_speed(iter/s)": 0.297757 }, { "acc": 0.73524914, "epoch": 0.231304710121119, "grad_norm": 7.4375, "learning_rate": 9.882627273118538e-06, "loss": 1.06214638, "memory(GiB)": 141.16, "step": 20680, "train_speed(iter/s)": 0.297821 }, { "acc": 0.71650577, "epoch": 0.23152840906707753, "grad_norm": 9.5625, "learning_rate": 9.882228570113616e-06, "loss": 1.14772491, "memory(GiB)": 141.16, "step": 20700, "train_speed(iter/s)": 0.297905 }, { "acc": 0.73777485, "epoch": 0.23175210801303606, "grad_norm": 7.875, "learning_rate": 9.881829199150041e-06, "loss": 1.0654357, "memory(GiB)": 141.16, "step": 20720, "train_speed(iter/s)": 0.298001 }, { "acc": 0.71884193, "epoch": 0.23197580695899458, "grad_norm": 6.6875, "learning_rate": 9.881429160282455e-06, "loss": 1.12305756, "memory(GiB)": 141.16, "step": 20740, "train_speed(iter/s)": 0.298101 }, { "acc": 0.72572651, "epoch": 0.2321995059049531, "grad_norm": 6.25, "learning_rate": 9.881028453565588e-06, "loss": 1.11415653, "memory(GiB)": 141.16, "step": 20760, "train_speed(iter/s)": 0.298199 }, { "acc": 0.72203207, "epoch": 0.23242320485091164, "grad_norm": 6.9375, "learning_rate": 9.880627079054263e-06, "loss": 1.10888243, "memory(GiB)": 141.16, "step": 20780, "train_speed(iter/s)": 0.298299 }, { "acc": 0.7233264, "epoch": 0.23264690379687017, "grad_norm": 8.1875, "learning_rate": 9.880225036803393e-06, "loss": 1.12878304, "memory(GiB)": 141.16, "step": 20800, "train_speed(iter/s)": 0.298388 }, { "acc": 0.71897683, "epoch": 0.2328706027428287, "grad_norm": 6.5625, "learning_rate": 9.879822326867983e-06, "loss": 1.13372469, "memory(GiB)": 141.16, "step": 20820, "train_speed(iter/s)": 0.298483 }, { "acc": 0.73372869, "epoch": 0.23309430168878723, "grad_norm": 7.8125, "learning_rate": 9.879418949303131e-06, "loss": 1.0685317, "memory(GiB)": 141.16, "step": 20840, "train_speed(iter/s)": 0.298564 }, { "acc": 0.73368635, "epoch": 0.23331800063474575, "grad_norm": 8.25, "learning_rate": 9.879014904164023e-06, "loss": 1.06149578, "memory(GiB)": 141.16, "step": 20860, "train_speed(iter/s)": 0.298662 }, { "acc": 0.72135954, "epoch": 0.23354169958070428, "grad_norm": 6.75, "learning_rate": 9.878610191505938e-06, "loss": 1.125914, "memory(GiB)": 141.16, "step": 20880, "train_speed(iter/s)": 0.29875 }, { "acc": 0.71620407, "epoch": 0.2337653985266628, "grad_norm": 6.03125, "learning_rate": 9.878204811384248e-06, "loss": 1.1622468, "memory(GiB)": 141.16, "step": 20900, "train_speed(iter/s)": 0.298836 }, { "acc": 0.71155381, "epoch": 0.23398909747262134, "grad_norm": 6.3125, "learning_rate": 9.877798763854415e-06, "loss": 1.16849527, "memory(GiB)": 141.16, "step": 20920, "train_speed(iter/s)": 0.298937 }, { "acc": 0.72563343, "epoch": 0.23421279641857987, "grad_norm": 8.9375, "learning_rate": 9.877392048971992e-06, "loss": 1.11003208, "memory(GiB)": 141.16, "step": 20940, "train_speed(iter/s)": 0.299032 }, { "acc": 0.71714363, "epoch": 0.2344364953645384, "grad_norm": 7.8125, "learning_rate": 9.876984666792622e-06, "loss": 1.13825874, "memory(GiB)": 141.16, "step": 20960, "train_speed(iter/s)": 0.299125 }, { "acc": 0.72581367, "epoch": 0.23466019431049692, "grad_norm": 7.59375, "learning_rate": 9.87657661737204e-06, "loss": 1.09762793, "memory(GiB)": 141.16, "step": 20980, "train_speed(iter/s)": 0.299221 }, { "acc": 0.71515217, "epoch": 0.23488389325645545, "grad_norm": 7.6875, "learning_rate": 9.876167900766077e-06, "loss": 1.14656067, "memory(GiB)": 141.16, "step": 21000, "train_speed(iter/s)": 0.299323 }, { "acc": 0.73317971, "epoch": 0.23510759220241398, "grad_norm": 6.125, "learning_rate": 9.875758517030647e-06, "loss": 1.07711821, "memory(GiB)": 141.16, "step": 21020, "train_speed(iter/s)": 0.299411 }, { "acc": 0.72896719, "epoch": 0.2353312911483725, "grad_norm": 6.03125, "learning_rate": 9.875348466221762e-06, "loss": 1.07754936, "memory(GiB)": 141.16, "step": 21040, "train_speed(iter/s)": 0.299502 }, { "acc": 0.73757095, "epoch": 0.23555499009433106, "grad_norm": 7.84375, "learning_rate": 9.87493774839552e-06, "loss": 1.05024271, "memory(GiB)": 141.16, "step": 21060, "train_speed(iter/s)": 0.299582 }, { "acc": 0.71873226, "epoch": 0.2357786890402896, "grad_norm": 6.6875, "learning_rate": 9.874526363608116e-06, "loss": 1.13200397, "memory(GiB)": 141.16, "step": 21080, "train_speed(iter/s)": 0.299667 }, { "acc": 0.73248315, "epoch": 0.23600238798624812, "grad_norm": 7.21875, "learning_rate": 9.874114311915833e-06, "loss": 1.0788887, "memory(GiB)": 141.16, "step": 21100, "train_speed(iter/s)": 0.299754 }, { "acc": 0.71648455, "epoch": 0.23622608693220665, "grad_norm": 6.25, "learning_rate": 9.873701593375044e-06, "loss": 1.1436883, "memory(GiB)": 141.16, "step": 21120, "train_speed(iter/s)": 0.299861 }, { "acc": 0.72393484, "epoch": 0.23644978587816518, "grad_norm": 7.6875, "learning_rate": 9.873288208042218e-06, "loss": 1.11193085, "memory(GiB)": 141.16, "step": 21140, "train_speed(iter/s)": 0.299954 }, { "acc": 0.72787638, "epoch": 0.2366734848241237, "grad_norm": 7.75, "learning_rate": 9.872874155973908e-06, "loss": 1.1073595, "memory(GiB)": 141.16, "step": 21160, "train_speed(iter/s)": 0.300054 }, { "acc": 0.71221218, "epoch": 0.23689718377008223, "grad_norm": 8.0625, "learning_rate": 9.872459437226764e-06, "loss": 1.17577038, "memory(GiB)": 141.16, "step": 21180, "train_speed(iter/s)": 0.300151 }, { "acc": 0.72195673, "epoch": 0.23712088271604076, "grad_norm": 6.3125, "learning_rate": 9.872044051857527e-06, "loss": 1.13162813, "memory(GiB)": 141.16, "step": 21200, "train_speed(iter/s)": 0.300248 }, { "acc": 0.7265202, "epoch": 0.2373445816619993, "grad_norm": 7.5625, "learning_rate": 9.871627999923025e-06, "loss": 1.11034365, "memory(GiB)": 141.16, "step": 21220, "train_speed(iter/s)": 0.300337 }, { "acc": 0.73103657, "epoch": 0.23756828060795782, "grad_norm": 8.6875, "learning_rate": 9.871211281480181e-06, "loss": 1.07517853, "memory(GiB)": 141.16, "step": 21240, "train_speed(iter/s)": 0.300435 }, { "acc": 0.71773281, "epoch": 0.23779197955391634, "grad_norm": 8.1875, "learning_rate": 9.870793896586009e-06, "loss": 1.13457279, "memory(GiB)": 141.16, "step": 21260, "train_speed(iter/s)": 0.300534 }, { "acc": 0.73187413, "epoch": 0.23801567849987487, "grad_norm": 6.40625, "learning_rate": 9.87037584529761e-06, "loss": 1.06514883, "memory(GiB)": 141.16, "step": 21280, "train_speed(iter/s)": 0.300626 }, { "acc": 0.72251368, "epoch": 0.2382393774458334, "grad_norm": 6.09375, "learning_rate": 9.869957127672185e-06, "loss": 1.1186615, "memory(GiB)": 141.16, "step": 21300, "train_speed(iter/s)": 0.300716 }, { "acc": 0.73039436, "epoch": 0.23846307639179193, "grad_norm": 6.34375, "learning_rate": 9.869537743767014e-06, "loss": 1.10445509, "memory(GiB)": 141.16, "step": 21320, "train_speed(iter/s)": 0.300805 }, { "acc": 0.71121168, "epoch": 0.23868677533775046, "grad_norm": 6.9375, "learning_rate": 9.86911769363948e-06, "loss": 1.16115208, "memory(GiB)": 141.16, "step": 21340, "train_speed(iter/s)": 0.300898 }, { "acc": 0.72753506, "epoch": 0.23891047428370898, "grad_norm": 7.375, "learning_rate": 9.86869697734705e-06, "loss": 1.08215857, "memory(GiB)": 141.16, "step": 21360, "train_speed(iter/s)": 0.300989 }, { "acc": 0.7295249, "epoch": 0.2391341732296675, "grad_norm": 7.03125, "learning_rate": 9.868275594947282e-06, "loss": 1.09261627, "memory(GiB)": 141.16, "step": 21380, "train_speed(iter/s)": 0.301093 }, { "acc": 0.72316566, "epoch": 0.23935787217562604, "grad_norm": 5.84375, "learning_rate": 9.86785354649783e-06, "loss": 1.12244835, "memory(GiB)": 141.16, "step": 21400, "train_speed(iter/s)": 0.301193 }, { "acc": 0.7251596, "epoch": 0.23958157112158457, "grad_norm": 7.375, "learning_rate": 9.867430832056434e-06, "loss": 1.10051165, "memory(GiB)": 141.16, "step": 21420, "train_speed(iter/s)": 0.301299 }, { "acc": 0.72961731, "epoch": 0.2398052700675431, "grad_norm": 6.34375, "learning_rate": 9.86700745168093e-06, "loss": 1.07493467, "memory(GiB)": 141.16, "step": 21440, "train_speed(iter/s)": 0.30139 }, { "acc": 0.72507725, "epoch": 0.24002896901350163, "grad_norm": 6.15625, "learning_rate": 9.86658340542924e-06, "loss": 1.10425758, "memory(GiB)": 141.16, "step": 21460, "train_speed(iter/s)": 0.301475 }, { "acc": 0.72258868, "epoch": 0.24025266795946015, "grad_norm": 6.875, "learning_rate": 9.866158693359382e-06, "loss": 1.11291904, "memory(GiB)": 141.16, "step": 21480, "train_speed(iter/s)": 0.301565 }, { "acc": 0.71675282, "epoch": 0.24047636690541868, "grad_norm": 7.75, "learning_rate": 9.86573331552946e-06, "loss": 1.14862804, "memory(GiB)": 141.16, "step": 21500, "train_speed(iter/s)": 0.301651 }, { "acc": 0.72749071, "epoch": 0.2407000658513772, "grad_norm": 6.9375, "learning_rate": 9.865307271997674e-06, "loss": 1.09429264, "memory(GiB)": 141.16, "step": 21520, "train_speed(iter/s)": 0.301744 }, { "acc": 0.73327398, "epoch": 0.24092376479733574, "grad_norm": 8.625, "learning_rate": 9.864880562822312e-06, "loss": 1.08419552, "memory(GiB)": 141.16, "step": 21540, "train_speed(iter/s)": 0.301826 }, { "acc": 0.73461142, "epoch": 0.24114746374329427, "grad_norm": 7.4375, "learning_rate": 9.864453188061753e-06, "loss": 1.06768351, "memory(GiB)": 141.16, "step": 21560, "train_speed(iter/s)": 0.301907 }, { "acc": 0.72223864, "epoch": 0.2413711626892528, "grad_norm": 8.375, "learning_rate": 9.86402514777447e-06, "loss": 1.11480246, "memory(GiB)": 141.16, "step": 21580, "train_speed(iter/s)": 0.301993 }, { "acc": 0.72222414, "epoch": 0.24159486163521132, "grad_norm": 5.9375, "learning_rate": 9.863596442019023e-06, "loss": 1.13043194, "memory(GiB)": 141.16, "step": 21600, "train_speed(iter/s)": 0.302076 }, { "acc": 0.72399154, "epoch": 0.24181856058116985, "grad_norm": 7.75, "learning_rate": 9.863167070854064e-06, "loss": 1.09702253, "memory(GiB)": 141.16, "step": 21620, "train_speed(iter/s)": 0.302166 }, { "acc": 0.72978826, "epoch": 0.24204225952712838, "grad_norm": 9.25, "learning_rate": 9.862737034338342e-06, "loss": 1.09399967, "memory(GiB)": 141.16, "step": 21640, "train_speed(iter/s)": 0.302251 }, { "acc": 0.72348738, "epoch": 0.2422659584730869, "grad_norm": 7.96875, "learning_rate": 9.862306332530688e-06, "loss": 1.11485853, "memory(GiB)": 141.16, "step": 21660, "train_speed(iter/s)": 0.302333 }, { "acc": 0.7319777, "epoch": 0.24248965741904546, "grad_norm": 8.375, "learning_rate": 9.86187496549003e-06, "loss": 1.07359943, "memory(GiB)": 141.16, "step": 21680, "train_speed(iter/s)": 0.302427 }, { "acc": 0.72299533, "epoch": 0.242713356365004, "grad_norm": 6.375, "learning_rate": 9.861442933275384e-06, "loss": 1.10477371, "memory(GiB)": 141.16, "step": 21700, "train_speed(iter/s)": 0.30253 }, { "acc": 0.73292918, "epoch": 0.24293705531096252, "grad_norm": 5.96875, "learning_rate": 9.861010235945859e-06, "loss": 1.05975323, "memory(GiB)": 141.16, "step": 21720, "train_speed(iter/s)": 0.302622 }, { "acc": 0.73070507, "epoch": 0.24316075425692105, "grad_norm": 7.5, "learning_rate": 9.860576873560651e-06, "loss": 1.07693005, "memory(GiB)": 141.16, "step": 21740, "train_speed(iter/s)": 0.302717 }, { "acc": 0.71152515, "epoch": 0.24338445320287957, "grad_norm": 6.71875, "learning_rate": 9.860142846179057e-06, "loss": 1.17036972, "memory(GiB)": 141.16, "step": 21760, "train_speed(iter/s)": 0.302812 }, { "acc": 0.7258914, "epoch": 0.2436081521488381, "grad_norm": 5.4375, "learning_rate": 9.859708153860453e-06, "loss": 1.10598536, "memory(GiB)": 141.16, "step": 21780, "train_speed(iter/s)": 0.302887 }, { "acc": 0.72735023, "epoch": 0.24383185109479663, "grad_norm": 7.96875, "learning_rate": 9.859272796664312e-06, "loss": 1.09624825, "memory(GiB)": 141.16, "step": 21800, "train_speed(iter/s)": 0.302976 }, { "acc": 0.72514691, "epoch": 0.24405555004075516, "grad_norm": 7.03125, "learning_rate": 9.858836774650197e-06, "loss": 1.10856094, "memory(GiB)": 141.16, "step": 21820, "train_speed(iter/s)": 0.303059 }, { "acc": 0.70992508, "epoch": 0.2442792489867137, "grad_norm": 8.1875, "learning_rate": 9.858400087877764e-06, "loss": 1.17248192, "memory(GiB)": 141.16, "step": 21840, "train_speed(iter/s)": 0.303141 }, { "acc": 0.73740072, "epoch": 0.24450294793267222, "grad_norm": 6.40625, "learning_rate": 9.857962736406755e-06, "loss": 1.04998169, "memory(GiB)": 141.16, "step": 21860, "train_speed(iter/s)": 0.303235 }, { "acc": 0.72847776, "epoch": 0.24472664687863074, "grad_norm": 5.71875, "learning_rate": 9.857524720297009e-06, "loss": 1.0845787, "memory(GiB)": 141.16, "step": 21880, "train_speed(iter/s)": 0.303324 }, { "acc": 0.73134294, "epoch": 0.24495034582458927, "grad_norm": 6.59375, "learning_rate": 9.857086039608449e-06, "loss": 1.07477417, "memory(GiB)": 141.16, "step": 21900, "train_speed(iter/s)": 0.303408 }, { "acc": 0.71586246, "epoch": 0.2451740447705478, "grad_norm": 7.125, "learning_rate": 9.856646694401097e-06, "loss": 1.14196072, "memory(GiB)": 141.16, "step": 21920, "train_speed(iter/s)": 0.303487 }, { "acc": 0.72102566, "epoch": 0.24539774371650633, "grad_norm": 6.40625, "learning_rate": 9.856206684735058e-06, "loss": 1.11836319, "memory(GiB)": 141.16, "step": 21940, "train_speed(iter/s)": 0.303563 }, { "acc": 0.73039856, "epoch": 0.24562144266246486, "grad_norm": 7.96875, "learning_rate": 9.855766010670533e-06, "loss": 1.08674698, "memory(GiB)": 141.16, "step": 21960, "train_speed(iter/s)": 0.303655 }, { "acc": 0.72145371, "epoch": 0.24584514160842338, "grad_norm": 6.4375, "learning_rate": 9.855324672267815e-06, "loss": 1.11979504, "memory(GiB)": 141.16, "step": 21980, "train_speed(iter/s)": 0.303748 }, { "acc": 0.72378616, "epoch": 0.2460688405543819, "grad_norm": 7.875, "learning_rate": 9.854882669587282e-06, "loss": 1.11187372, "memory(GiB)": 141.16, "step": 22000, "train_speed(iter/s)": 0.303837 }, { "epoch": 0.2460688405543819, "eval_acc": 0.6844437306288794, "eval_loss": 1.1023231744766235, "eval_runtime": 2321.4267, "eval_samples_per_second": 32.43, "eval_steps_per_second": 16.215, "step": 22000 }, { "acc": 0.71672521, "epoch": 0.24629253950034044, "grad_norm": 7.96875, "learning_rate": 9.854440002689409e-06, "loss": 1.15044804, "memory(GiB)": 141.16, "step": 22020, "train_speed(iter/s)": 0.294285 }, { "acc": 0.72062302, "epoch": 0.24651623844629897, "grad_norm": 6.71875, "learning_rate": 9.853996671634755e-06, "loss": 1.12158976, "memory(GiB)": 141.16, "step": 22040, "train_speed(iter/s)": 0.294372 }, { "acc": 0.72333779, "epoch": 0.2467399373922575, "grad_norm": 8.0, "learning_rate": 9.85355267648398e-06, "loss": 1.11010418, "memory(GiB)": 141.16, "step": 22060, "train_speed(iter/s)": 0.294467 }, { "acc": 0.71479492, "epoch": 0.24696363633821602, "grad_norm": 8.375, "learning_rate": 9.853108017297823e-06, "loss": 1.15573645, "memory(GiB)": 141.16, "step": 22080, "train_speed(iter/s)": 0.29455 }, { "acc": 0.73399839, "epoch": 0.24718733528417455, "grad_norm": 6.25, "learning_rate": 9.852662694137123e-06, "loss": 1.07201691, "memory(GiB)": 141.16, "step": 22100, "train_speed(iter/s)": 0.294641 }, { "acc": 0.71697783, "epoch": 0.24741103423013308, "grad_norm": 9.6875, "learning_rate": 9.852216707062805e-06, "loss": 1.14669733, "memory(GiB)": 141.16, "step": 22120, "train_speed(iter/s)": 0.294731 }, { "acc": 0.71207333, "epoch": 0.2476347331760916, "grad_norm": 6.6875, "learning_rate": 9.85177005613589e-06, "loss": 1.15777779, "memory(GiB)": 141.16, "step": 22140, "train_speed(iter/s)": 0.294833 }, { "acc": 0.72013564, "epoch": 0.24785843212205014, "grad_norm": 6.4375, "learning_rate": 9.851322741417482e-06, "loss": 1.12616129, "memory(GiB)": 141.16, "step": 22160, "train_speed(iter/s)": 0.294917 }, { "acc": 0.72677922, "epoch": 0.24808213106800867, "grad_norm": 6.5625, "learning_rate": 9.850874762968781e-06, "loss": 1.10157118, "memory(GiB)": 141.16, "step": 22180, "train_speed(iter/s)": 0.295002 }, { "acc": 0.71919079, "epoch": 0.2483058300139672, "grad_norm": 5.75, "learning_rate": 9.850426120851077e-06, "loss": 1.14340277, "memory(GiB)": 141.16, "step": 22200, "train_speed(iter/s)": 0.295082 }, { "acc": 0.7153574, "epoch": 0.24852952895992572, "grad_norm": 5.59375, "learning_rate": 9.849976815125753e-06, "loss": 1.15355263, "memory(GiB)": 141.16, "step": 22220, "train_speed(iter/s)": 0.295173 }, { "acc": 0.72173929, "epoch": 0.24875322790588425, "grad_norm": 6.96875, "learning_rate": 9.849526845854278e-06, "loss": 1.12472324, "memory(GiB)": 141.16, "step": 22240, "train_speed(iter/s)": 0.295253 }, { "acc": 0.72080617, "epoch": 0.24897692685184278, "grad_norm": 5.25, "learning_rate": 9.849076213098214e-06, "loss": 1.10486555, "memory(GiB)": 141.16, "step": 22260, "train_speed(iter/s)": 0.295343 }, { "acc": 0.71219854, "epoch": 0.2492006257978013, "grad_norm": 4.75, "learning_rate": 9.848624916919213e-06, "loss": 1.1622386, "memory(GiB)": 141.16, "step": 22280, "train_speed(iter/s)": 0.295435 }, { "acc": 0.71101069, "epoch": 0.24942432474375983, "grad_norm": 6.59375, "learning_rate": 9.848172957379024e-06, "loss": 1.17005444, "memory(GiB)": 141.16, "step": 22300, "train_speed(iter/s)": 0.29552 }, { "acc": 0.72681627, "epoch": 0.2496480236897184, "grad_norm": 5.28125, "learning_rate": 9.847720334539476e-06, "loss": 1.10209503, "memory(GiB)": 141.16, "step": 22320, "train_speed(iter/s)": 0.29561 }, { "acc": 0.72156606, "epoch": 0.24987172263567692, "grad_norm": 8.1875, "learning_rate": 9.847267048462498e-06, "loss": 1.10782871, "memory(GiB)": 141.16, "step": 22340, "train_speed(iter/s)": 0.2957 }, { "acc": 0.72551165, "epoch": 0.25009542158163545, "grad_norm": 7.75, "learning_rate": 9.846813099210104e-06, "loss": 1.09348602, "memory(GiB)": 141.16, "step": 22360, "train_speed(iter/s)": 0.295788 }, { "acc": 0.74085522, "epoch": 0.250319120527594, "grad_norm": 7.96875, "learning_rate": 9.8463584868444e-06, "loss": 1.03419609, "memory(GiB)": 141.16, "step": 22380, "train_speed(iter/s)": 0.295866 }, { "acc": 0.73202538, "epoch": 0.2505428194735525, "grad_norm": 6.90625, "learning_rate": 9.845903211427586e-06, "loss": 1.07502193, "memory(GiB)": 141.16, "step": 22400, "train_speed(iter/s)": 0.295959 }, { "acc": 0.71382413, "epoch": 0.25076651841951103, "grad_norm": 6.59375, "learning_rate": 9.845447273021947e-06, "loss": 1.14991474, "memory(GiB)": 141.16, "step": 22420, "train_speed(iter/s)": 0.296054 }, { "acc": 0.71190815, "epoch": 0.25099021736546956, "grad_norm": 6.9375, "learning_rate": 9.844990671689865e-06, "loss": 1.165835, "memory(GiB)": 141.16, "step": 22440, "train_speed(iter/s)": 0.296141 }, { "acc": 0.73683023, "epoch": 0.2512139163114281, "grad_norm": 6.28125, "learning_rate": 9.844533407493808e-06, "loss": 1.06157722, "memory(GiB)": 141.16, "step": 22460, "train_speed(iter/s)": 0.296225 }, { "acc": 0.71303921, "epoch": 0.2514376152573866, "grad_norm": 4.625, "learning_rate": 9.844075480496335e-06, "loss": 1.15547657, "memory(GiB)": 141.16, "step": 22480, "train_speed(iter/s)": 0.296314 }, { "acc": 0.71817904, "epoch": 0.25166131420334514, "grad_norm": 5.3125, "learning_rate": 9.843616890760102e-06, "loss": 1.14464073, "memory(GiB)": 141.16, "step": 22500, "train_speed(iter/s)": 0.296407 }, { "acc": 0.72002172, "epoch": 0.25188501314930367, "grad_norm": 5.90625, "learning_rate": 9.843157638347844e-06, "loss": 1.11753216, "memory(GiB)": 141.16, "step": 22520, "train_speed(iter/s)": 0.29648 }, { "acc": 0.73204699, "epoch": 0.2521087120952622, "grad_norm": 6.09375, "learning_rate": 9.842697723322396e-06, "loss": 1.05731659, "memory(GiB)": 141.16, "step": 22540, "train_speed(iter/s)": 0.296567 }, { "acc": 0.72939653, "epoch": 0.2523324110412207, "grad_norm": 7.25, "learning_rate": 9.842237145746684e-06, "loss": 1.07920322, "memory(GiB)": 141.16, "step": 22560, "train_speed(iter/s)": 0.296657 }, { "acc": 0.73470368, "epoch": 0.25255610998717926, "grad_norm": 5.8125, "learning_rate": 9.841775905683717e-06, "loss": 1.05761299, "memory(GiB)": 141.16, "step": 22580, "train_speed(iter/s)": 0.296748 }, { "acc": 0.72759047, "epoch": 0.2527798089331378, "grad_norm": 7.46875, "learning_rate": 9.841314003196602e-06, "loss": 1.08433208, "memory(GiB)": 141.16, "step": 22600, "train_speed(iter/s)": 0.296837 }, { "acc": 0.7132575, "epoch": 0.2530035078790963, "grad_norm": 7.15625, "learning_rate": 9.840851438348532e-06, "loss": 1.16545811, "memory(GiB)": 141.16, "step": 22620, "train_speed(iter/s)": 0.296927 }, { "acc": 0.73086185, "epoch": 0.25322720682505484, "grad_norm": 5.96875, "learning_rate": 9.840388211202795e-06, "loss": 1.07079649, "memory(GiB)": 141.16, "step": 22640, "train_speed(iter/s)": 0.297013 }, { "acc": 0.73747835, "epoch": 0.25345090577101337, "grad_norm": 8.125, "learning_rate": 9.839924321822765e-06, "loss": 1.05505037, "memory(GiB)": 141.16, "step": 22660, "train_speed(iter/s)": 0.297104 }, { "acc": 0.73433352, "epoch": 0.2536746047169719, "grad_norm": 7.25, "learning_rate": 9.83945977027191e-06, "loss": 1.06129475, "memory(GiB)": 141.16, "step": 22680, "train_speed(iter/s)": 0.297198 }, { "acc": 0.71787691, "epoch": 0.2538983036629304, "grad_norm": 9.0625, "learning_rate": 9.838994556613785e-06, "loss": 1.12217617, "memory(GiB)": 141.16, "step": 22700, "train_speed(iter/s)": 0.297295 }, { "acc": 0.71941528, "epoch": 0.25412200260888895, "grad_norm": 5.71875, "learning_rate": 9.83852868091204e-06, "loss": 1.13719978, "memory(GiB)": 141.16, "step": 22720, "train_speed(iter/s)": 0.297378 }, { "acc": 0.72448912, "epoch": 0.2543457015548475, "grad_norm": 6.03125, "learning_rate": 9.838062143230413e-06, "loss": 1.10409222, "memory(GiB)": 141.16, "step": 22740, "train_speed(iter/s)": 0.29746 }, { "acc": 0.73173804, "epoch": 0.254569400500806, "grad_norm": 5.5625, "learning_rate": 9.837594943632734e-06, "loss": 1.08014622, "memory(GiB)": 141.16, "step": 22760, "train_speed(iter/s)": 0.297547 }, { "acc": 0.7306438, "epoch": 0.25479309944676454, "grad_norm": 5.96875, "learning_rate": 9.837127082182921e-06, "loss": 1.08182011, "memory(GiB)": 141.16, "step": 22780, "train_speed(iter/s)": 0.297641 }, { "acc": 0.72283239, "epoch": 0.25501679839272307, "grad_norm": 6.3125, "learning_rate": 9.836658558944986e-06, "loss": 1.10476303, "memory(GiB)": 141.16, "step": 22800, "train_speed(iter/s)": 0.297742 }, { "acc": 0.72203131, "epoch": 0.2552404973386816, "grad_norm": 4.90625, "learning_rate": 9.836189373983026e-06, "loss": 1.12408752, "memory(GiB)": 141.16, "step": 22820, "train_speed(iter/s)": 0.297834 }, { "acc": 0.72824655, "epoch": 0.2554641962846401, "grad_norm": 7.96875, "learning_rate": 9.835719527361236e-06, "loss": 1.08322563, "memory(GiB)": 141.16, "step": 22840, "train_speed(iter/s)": 0.297927 }, { "acc": 0.73114595, "epoch": 0.25568789523059865, "grad_norm": 7.3125, "learning_rate": 9.835249019143896e-06, "loss": 1.06570883, "memory(GiB)": 141.16, "step": 22860, "train_speed(iter/s)": 0.29802 }, { "acc": 0.72105541, "epoch": 0.2559115941765572, "grad_norm": 5.71875, "learning_rate": 9.834777849395378e-06, "loss": 1.11826982, "memory(GiB)": 141.16, "step": 22880, "train_speed(iter/s)": 0.298102 }, { "acc": 0.72074261, "epoch": 0.2561352931225157, "grad_norm": 5.96875, "learning_rate": 9.834306018180144e-06, "loss": 1.09959669, "memory(GiB)": 141.16, "step": 22900, "train_speed(iter/s)": 0.298183 }, { "acc": 0.73076038, "epoch": 0.25635899206847423, "grad_norm": 8.25, "learning_rate": 9.83383352556275e-06, "loss": 1.0764451, "memory(GiB)": 141.16, "step": 22920, "train_speed(iter/s)": 0.298265 }, { "acc": 0.72862029, "epoch": 0.25658269101443276, "grad_norm": 9.0, "learning_rate": 9.83336037160784e-06, "loss": 1.09443779, "memory(GiB)": 141.16, "step": 22940, "train_speed(iter/s)": 0.298361 }, { "acc": 0.72357063, "epoch": 0.2568063899603913, "grad_norm": 6.53125, "learning_rate": 9.832886556380144e-06, "loss": 1.11857681, "memory(GiB)": 141.16, "step": 22960, "train_speed(iter/s)": 0.298447 }, { "acc": 0.72039585, "epoch": 0.2570300889063498, "grad_norm": 6.0, "learning_rate": 9.832412079944491e-06, "loss": 1.14247532, "memory(GiB)": 141.16, "step": 22980, "train_speed(iter/s)": 0.298531 }, { "acc": 0.72771301, "epoch": 0.25725378785230835, "grad_norm": 8.125, "learning_rate": 9.831936942365794e-06, "loss": 1.09747639, "memory(GiB)": 141.16, "step": 23000, "train_speed(iter/s)": 0.298618 }, { "acc": 0.72089438, "epoch": 0.2574774867982669, "grad_norm": 5.65625, "learning_rate": 9.831461143709057e-06, "loss": 1.13263788, "memory(GiB)": 141.16, "step": 23020, "train_speed(iter/s)": 0.298707 }, { "acc": 0.72482443, "epoch": 0.2577011857442254, "grad_norm": 8.125, "learning_rate": 9.83098468403938e-06, "loss": 1.10459003, "memory(GiB)": 141.16, "step": 23040, "train_speed(iter/s)": 0.298801 }, { "acc": 0.73339648, "epoch": 0.25792488469018393, "grad_norm": 6.625, "learning_rate": 9.830507563421947e-06, "loss": 1.07380714, "memory(GiB)": 141.16, "step": 23060, "train_speed(iter/s)": 0.298882 }, { "acc": 0.73063898, "epoch": 0.25814858363614246, "grad_norm": 6.09375, "learning_rate": 9.830029781922036e-06, "loss": 1.077666, "memory(GiB)": 141.16, "step": 23080, "train_speed(iter/s)": 0.298968 }, { "acc": 0.71860209, "epoch": 0.258372282582101, "grad_norm": 6.4375, "learning_rate": 9.829551339605015e-06, "loss": 1.13461895, "memory(GiB)": 141.16, "step": 23100, "train_speed(iter/s)": 0.299049 }, { "acc": 0.73786678, "epoch": 0.2585959815280595, "grad_norm": 4.84375, "learning_rate": 9.829072236536338e-06, "loss": 1.06659489, "memory(GiB)": 141.16, "step": 23120, "train_speed(iter/s)": 0.299137 }, { "acc": 0.72422667, "epoch": 0.25881968047401804, "grad_norm": 8.5625, "learning_rate": 9.828592472781556e-06, "loss": 1.13175659, "memory(GiB)": 141.16, "step": 23140, "train_speed(iter/s)": 0.299215 }, { "acc": 0.7249814, "epoch": 0.25904337941997657, "grad_norm": 8.0625, "learning_rate": 9.828112048406308e-06, "loss": 1.1117384, "memory(GiB)": 141.16, "step": 23160, "train_speed(iter/s)": 0.299303 }, { "acc": 0.72496243, "epoch": 0.2592670783659351, "grad_norm": 6.46875, "learning_rate": 9.827630963476323e-06, "loss": 1.09347095, "memory(GiB)": 141.16, "step": 23180, "train_speed(iter/s)": 0.299394 }, { "acc": 0.73226328, "epoch": 0.25949077731189363, "grad_norm": 5.03125, "learning_rate": 9.827149218057418e-06, "loss": 1.08786869, "memory(GiB)": 141.16, "step": 23200, "train_speed(iter/s)": 0.299478 }, { "acc": 0.72891345, "epoch": 0.25971447625785216, "grad_norm": 7.125, "learning_rate": 9.826666812215504e-06, "loss": 1.08544645, "memory(GiB)": 141.16, "step": 23220, "train_speed(iter/s)": 0.299565 }, { "acc": 0.73401766, "epoch": 0.2599381752038107, "grad_norm": 7.21875, "learning_rate": 9.826183746016582e-06, "loss": 1.07440109, "memory(GiB)": 141.16, "step": 23240, "train_speed(iter/s)": 0.299645 }, { "acc": 0.72588253, "epoch": 0.26016187414976927, "grad_norm": 7.59375, "learning_rate": 9.825700019526742e-06, "loss": 1.09471664, "memory(GiB)": 141.16, "step": 23260, "train_speed(iter/s)": 0.299722 }, { "acc": 0.72359533, "epoch": 0.2603855730957278, "grad_norm": 7.0, "learning_rate": 9.825215632812163e-06, "loss": 1.10992947, "memory(GiB)": 141.16, "step": 23280, "train_speed(iter/s)": 0.299804 }, { "acc": 0.72481747, "epoch": 0.2606092720416863, "grad_norm": 8.9375, "learning_rate": 9.824730585939117e-06, "loss": 1.11853256, "memory(GiB)": 141.16, "step": 23300, "train_speed(iter/s)": 0.299881 }, { "acc": 0.72547493, "epoch": 0.26083297098764485, "grad_norm": 6.5625, "learning_rate": 9.824244878973967e-06, "loss": 1.09198322, "memory(GiB)": 141.16, "step": 23320, "train_speed(iter/s)": 0.299969 }, { "acc": 0.73597431, "epoch": 0.2610566699336034, "grad_norm": 5.9375, "learning_rate": 9.823758511983162e-06, "loss": 1.05394773, "memory(GiB)": 141.16, "step": 23340, "train_speed(iter/s)": 0.300048 }, { "acc": 0.71647291, "epoch": 0.2612803688795619, "grad_norm": 8.0625, "learning_rate": 9.823271485033246e-06, "loss": 1.14706879, "memory(GiB)": 141.16, "step": 23360, "train_speed(iter/s)": 0.300131 }, { "acc": 0.74036031, "epoch": 0.26150406782552044, "grad_norm": 7.40625, "learning_rate": 9.82278379819085e-06, "loss": 1.0341938, "memory(GiB)": 141.16, "step": 23380, "train_speed(iter/s)": 0.300221 }, { "acc": 0.73023691, "epoch": 0.26172776677147896, "grad_norm": 6.46875, "learning_rate": 9.822295451522697e-06, "loss": 1.08584843, "memory(GiB)": 141.16, "step": 23400, "train_speed(iter/s)": 0.300303 }, { "acc": 0.7135478, "epoch": 0.2619514657174375, "grad_norm": 7.25, "learning_rate": 9.821806445095598e-06, "loss": 1.15280981, "memory(GiB)": 141.16, "step": 23420, "train_speed(iter/s)": 0.300385 }, { "acc": 0.72702656, "epoch": 0.262175164663396, "grad_norm": 8.1875, "learning_rate": 9.821316778976461e-06, "loss": 1.09214764, "memory(GiB)": 141.16, "step": 23440, "train_speed(iter/s)": 0.300474 }, { "acc": 0.72662315, "epoch": 0.26239886360935455, "grad_norm": 6.875, "learning_rate": 9.820826453232275e-06, "loss": 1.08696899, "memory(GiB)": 141.16, "step": 23460, "train_speed(iter/s)": 0.300554 }, { "acc": 0.72901545, "epoch": 0.2626225625553131, "grad_norm": 8.5, "learning_rate": 9.820335467930125e-06, "loss": 1.09955807, "memory(GiB)": 141.16, "step": 23480, "train_speed(iter/s)": 0.300634 }, { "acc": 0.73519611, "epoch": 0.2628462615012716, "grad_norm": 6.65625, "learning_rate": 9.819843823137184e-06, "loss": 1.04077139, "memory(GiB)": 141.16, "step": 23500, "train_speed(iter/s)": 0.300703 }, { "acc": 0.72800531, "epoch": 0.26306996044723013, "grad_norm": 7.71875, "learning_rate": 9.819351518920714e-06, "loss": 1.09400425, "memory(GiB)": 141.16, "step": 23520, "train_speed(iter/s)": 0.300782 }, { "acc": 0.73461924, "epoch": 0.26329365939318866, "grad_norm": 9.8125, "learning_rate": 9.818858555348075e-06, "loss": 1.06243305, "memory(GiB)": 141.16, "step": 23540, "train_speed(iter/s)": 0.300852 }, { "acc": 0.73674378, "epoch": 0.2635173583391472, "grad_norm": 5.28125, "learning_rate": 9.818364932486709e-06, "loss": 1.03115349, "memory(GiB)": 141.16, "step": 23560, "train_speed(iter/s)": 0.300935 }, { "acc": 0.72983274, "epoch": 0.2637410572851057, "grad_norm": 8.6875, "learning_rate": 9.817870650404146e-06, "loss": 1.09355545, "memory(GiB)": 141.16, "step": 23580, "train_speed(iter/s)": 0.301 }, { "acc": 0.72763062, "epoch": 0.26396475623106425, "grad_norm": 9.125, "learning_rate": 9.817375709168018e-06, "loss": 1.09094429, "memory(GiB)": 141.16, "step": 23600, "train_speed(iter/s)": 0.301082 }, { "acc": 0.73082008, "epoch": 0.2641884551770228, "grad_norm": 7.0, "learning_rate": 9.816880108846037e-06, "loss": 1.10021982, "memory(GiB)": 141.16, "step": 23620, "train_speed(iter/s)": 0.301163 }, { "acc": 0.72317033, "epoch": 0.2644121541229813, "grad_norm": 5.96875, "learning_rate": 9.816383849506006e-06, "loss": 1.10262737, "memory(GiB)": 141.16, "step": 23640, "train_speed(iter/s)": 0.301242 }, { "acc": 0.72529879, "epoch": 0.26463585306893983, "grad_norm": 15.25, "learning_rate": 9.815886931215824e-06, "loss": 1.10455408, "memory(GiB)": 141.16, "step": 23660, "train_speed(iter/s)": 0.301319 }, { "acc": 0.71660404, "epoch": 0.26485955201489836, "grad_norm": 6.40625, "learning_rate": 9.815389354043474e-06, "loss": 1.13036871, "memory(GiB)": 141.16, "step": 23680, "train_speed(iter/s)": 0.301394 }, { "acc": 0.73707781, "epoch": 0.2650832509608569, "grad_norm": 8.6875, "learning_rate": 9.814891118057033e-06, "loss": 1.04268789, "memory(GiB)": 141.16, "step": 23700, "train_speed(iter/s)": 0.301466 }, { "acc": 0.72753773, "epoch": 0.2653069499068154, "grad_norm": 6.28125, "learning_rate": 9.814392223324667e-06, "loss": 1.08535957, "memory(GiB)": 141.16, "step": 23720, "train_speed(iter/s)": 0.301549 }, { "acc": 0.72346783, "epoch": 0.26553064885277394, "grad_norm": 6.75, "learning_rate": 9.81389266991463e-06, "loss": 1.10649757, "memory(GiB)": 141.16, "step": 23740, "train_speed(iter/s)": 0.30162 }, { "acc": 0.72553825, "epoch": 0.26575434779873247, "grad_norm": 6.5625, "learning_rate": 9.81339245789527e-06, "loss": 1.10256338, "memory(GiB)": 141.16, "step": 23760, "train_speed(iter/s)": 0.301705 }, { "acc": 0.73042984, "epoch": 0.265978046744691, "grad_norm": 7.5, "learning_rate": 9.812891587335023e-06, "loss": 1.08354731, "memory(GiB)": 141.16, "step": 23780, "train_speed(iter/s)": 0.301782 }, { "acc": 0.72706604, "epoch": 0.2662017456906495, "grad_norm": 6.15625, "learning_rate": 9.812390058302415e-06, "loss": 1.08309708, "memory(GiB)": 141.16, "step": 23800, "train_speed(iter/s)": 0.301868 }, { "acc": 0.72992086, "epoch": 0.26642544463660806, "grad_norm": 6.78125, "learning_rate": 9.811887870866062e-06, "loss": 1.07844276, "memory(GiB)": 141.16, "step": 23820, "train_speed(iter/s)": 0.301946 }, { "acc": 0.72855206, "epoch": 0.2666491435825666, "grad_norm": 6.65625, "learning_rate": 9.811385025094669e-06, "loss": 1.07110271, "memory(GiB)": 141.16, "step": 23840, "train_speed(iter/s)": 0.302031 }, { "acc": 0.7313272, "epoch": 0.2668728425285251, "grad_norm": 8.4375, "learning_rate": 9.810881521057035e-06, "loss": 1.0837163, "memory(GiB)": 141.16, "step": 23860, "train_speed(iter/s)": 0.30211 }, { "acc": 0.72748027, "epoch": 0.26709654147448364, "grad_norm": 6.46875, "learning_rate": 9.810377358822046e-06, "loss": 1.08699818, "memory(GiB)": 141.16, "step": 23880, "train_speed(iter/s)": 0.302185 }, { "acc": 0.72655706, "epoch": 0.26732024042044217, "grad_norm": 6.5625, "learning_rate": 9.809872538458678e-06, "loss": 1.09306126, "memory(GiB)": 141.16, "step": 23900, "train_speed(iter/s)": 0.302267 }, { "acc": 0.73930259, "epoch": 0.2675439393664007, "grad_norm": 6.78125, "learning_rate": 9.809367060035997e-06, "loss": 1.03772964, "memory(GiB)": 141.16, "step": 23920, "train_speed(iter/s)": 0.30233 }, { "acc": 0.72430902, "epoch": 0.2677676383123592, "grad_norm": 7.34375, "learning_rate": 9.80886092362316e-06, "loss": 1.10914898, "memory(GiB)": 141.16, "step": 23940, "train_speed(iter/s)": 0.302411 }, { "acc": 0.73322377, "epoch": 0.26799133725831775, "grad_norm": 8.9375, "learning_rate": 9.808354129289417e-06, "loss": 1.06441822, "memory(GiB)": 141.16, "step": 23960, "train_speed(iter/s)": 0.302488 }, { "acc": 0.7233242, "epoch": 0.2682150362042763, "grad_norm": 6.21875, "learning_rate": 9.8078466771041e-06, "loss": 1.11300049, "memory(GiB)": 141.16, "step": 23980, "train_speed(iter/s)": 0.302559 }, { "acc": 0.72565751, "epoch": 0.2684387351502348, "grad_norm": 8.25, "learning_rate": 9.807338567136637e-06, "loss": 1.08735695, "memory(GiB)": 141.16, "step": 24000, "train_speed(iter/s)": 0.302637 }, { "epoch": 0.2684387351502348, "eval_acc": 0.6851253554781795, "eval_loss": 1.0997040271759033, "eval_runtime": 2253.648, "eval_samples_per_second": 33.405, "eval_steps_per_second": 16.703, "step": 24000 }, { "acc": 0.73540993, "epoch": 0.26866243409619334, "grad_norm": 7.9375, "learning_rate": 9.806829799456547e-06, "loss": 1.06688194, "memory(GiB)": 141.16, "step": 24020, "train_speed(iter/s)": 0.294182 }, { "acc": 0.74033461, "epoch": 0.26888613304215186, "grad_norm": 8.25, "learning_rate": 9.806320374133434e-06, "loss": 1.04413967, "memory(GiB)": 141.16, "step": 24040, "train_speed(iter/s)": 0.294266 }, { "acc": 0.71126575, "epoch": 0.2691098319881104, "grad_norm": 7.59375, "learning_rate": 9.805810291236996e-06, "loss": 1.16544571, "memory(GiB)": 141.16, "step": 24060, "train_speed(iter/s)": 0.294348 }, { "acc": 0.73380709, "epoch": 0.2693335309340689, "grad_norm": 6.5625, "learning_rate": 9.805299550837018e-06, "loss": 1.05937586, "memory(GiB)": 141.16, "step": 24080, "train_speed(iter/s)": 0.294428 }, { "acc": 0.72246914, "epoch": 0.26955722988002745, "grad_norm": 7.5, "learning_rate": 9.80478815300338e-06, "loss": 1.12343731, "memory(GiB)": 141.16, "step": 24100, "train_speed(iter/s)": 0.294512 }, { "acc": 0.72660704, "epoch": 0.269780928825986, "grad_norm": 7.65625, "learning_rate": 9.804276097806045e-06, "loss": 1.09987659, "memory(GiB)": 141.16, "step": 24120, "train_speed(iter/s)": 0.294595 }, { "acc": 0.72813253, "epoch": 0.2700046277719445, "grad_norm": 7.59375, "learning_rate": 9.803763385315072e-06, "loss": 1.10203133, "memory(GiB)": 141.16, "step": 24140, "train_speed(iter/s)": 0.294674 }, { "acc": 0.73160219, "epoch": 0.27022832671790303, "grad_norm": 8.125, "learning_rate": 9.803250015600605e-06, "loss": 1.06724873, "memory(GiB)": 141.16, "step": 24160, "train_speed(iter/s)": 0.294757 }, { "acc": 0.73173003, "epoch": 0.27045202566386156, "grad_norm": 7.15625, "learning_rate": 9.802735988732882e-06, "loss": 1.07402239, "memory(GiB)": 141.16, "step": 24180, "train_speed(iter/s)": 0.294841 }, { "acc": 0.73309994, "epoch": 0.2706757246098201, "grad_norm": 10.0625, "learning_rate": 9.802221304782229e-06, "loss": 1.06311092, "memory(GiB)": 141.16, "step": 24200, "train_speed(iter/s)": 0.294912 }, { "acc": 0.72935781, "epoch": 0.2708994235557786, "grad_norm": 7.1875, "learning_rate": 9.801705963819063e-06, "loss": 1.07589884, "memory(GiB)": 141.16, "step": 24220, "train_speed(iter/s)": 0.294986 }, { "acc": 0.73182821, "epoch": 0.27112312250173715, "grad_norm": 8.375, "learning_rate": 9.801189965913886e-06, "loss": 1.08834772, "memory(GiB)": 141.16, "step": 24240, "train_speed(iter/s)": 0.295066 }, { "acc": 0.73195252, "epoch": 0.2713468214476957, "grad_norm": 7.5, "learning_rate": 9.800673311137301e-06, "loss": 1.09059582, "memory(GiB)": 141.16, "step": 24260, "train_speed(iter/s)": 0.295155 }, { "acc": 0.72842259, "epoch": 0.2715705203936542, "grad_norm": 8.1875, "learning_rate": 9.800155999559986e-06, "loss": 1.06582012, "memory(GiB)": 141.16, "step": 24280, "train_speed(iter/s)": 0.295242 }, { "acc": 0.72159305, "epoch": 0.27179421933961273, "grad_norm": 6.1875, "learning_rate": 9.799638031252723e-06, "loss": 1.11660948, "memory(GiB)": 141.16, "step": 24300, "train_speed(iter/s)": 0.295331 }, { "acc": 0.71762285, "epoch": 0.27201791828557126, "grad_norm": 6.125, "learning_rate": 9.799119406286373e-06, "loss": 1.15242519, "memory(GiB)": 141.16, "step": 24320, "train_speed(iter/s)": 0.295402 }, { "acc": 0.73328753, "epoch": 0.2722416172315298, "grad_norm": 8.6875, "learning_rate": 9.798600124731893e-06, "loss": 1.06101837, "memory(GiB)": 141.16, "step": 24340, "train_speed(iter/s)": 0.295487 }, { "acc": 0.7293416, "epoch": 0.2724653161774883, "grad_norm": 7.25, "learning_rate": 9.798080186660328e-06, "loss": 1.081353, "memory(GiB)": 141.16, "step": 24360, "train_speed(iter/s)": 0.295566 }, { "acc": 0.73550043, "epoch": 0.27268901512344684, "grad_norm": 7.6875, "learning_rate": 9.797559592142814e-06, "loss": 1.06617498, "memory(GiB)": 141.16, "step": 24380, "train_speed(iter/s)": 0.295647 }, { "acc": 0.72781668, "epoch": 0.27291271406940537, "grad_norm": 6.8125, "learning_rate": 9.797038341250574e-06, "loss": 1.08928003, "memory(GiB)": 141.16, "step": 24400, "train_speed(iter/s)": 0.295723 }, { "acc": 0.7322, "epoch": 0.2731364130153639, "grad_norm": 7.53125, "learning_rate": 9.796516434054923e-06, "loss": 1.08181553, "memory(GiB)": 141.16, "step": 24420, "train_speed(iter/s)": 0.2958 }, { "acc": 0.72281256, "epoch": 0.2733601119613224, "grad_norm": 7.21875, "learning_rate": 9.795993870627267e-06, "loss": 1.12870445, "memory(GiB)": 141.16, "step": 24440, "train_speed(iter/s)": 0.295871 }, { "acc": 0.73341436, "epoch": 0.27358381090728096, "grad_norm": 8.6875, "learning_rate": 9.795470651039099e-06, "loss": 1.06206789, "memory(GiB)": 141.16, "step": 24460, "train_speed(iter/s)": 0.29595 }, { "acc": 0.73239326, "epoch": 0.2738075098532395, "grad_norm": 6.21875, "learning_rate": 9.794946775362002e-06, "loss": 1.06193695, "memory(GiB)": 141.16, "step": 24480, "train_speed(iter/s)": 0.296023 }, { "acc": 0.72593641, "epoch": 0.274031208799198, "grad_norm": 6.46875, "learning_rate": 9.794422243667651e-06, "loss": 1.10522614, "memory(GiB)": 141.16, "step": 24500, "train_speed(iter/s)": 0.296112 }, { "acc": 0.73095179, "epoch": 0.2742549077451566, "grad_norm": 7.40625, "learning_rate": 9.79389705602781e-06, "loss": 1.0670598, "memory(GiB)": 141.16, "step": 24520, "train_speed(iter/s)": 0.296197 }, { "acc": 0.73124313, "epoch": 0.2744786066911151, "grad_norm": 7.34375, "learning_rate": 9.79337121251433e-06, "loss": 1.06660061, "memory(GiB)": 141.16, "step": 24540, "train_speed(iter/s)": 0.296275 }, { "acc": 0.72467437, "epoch": 0.27470230563707365, "grad_norm": 6.1875, "learning_rate": 9.792844713199156e-06, "loss": 1.11504517, "memory(GiB)": 141.16, "step": 24560, "train_speed(iter/s)": 0.296354 }, { "acc": 0.71873631, "epoch": 0.2749260045830322, "grad_norm": 8.1875, "learning_rate": 9.792317558154318e-06, "loss": 1.15651655, "memory(GiB)": 141.16, "step": 24580, "train_speed(iter/s)": 0.296438 }, { "acc": 0.7296648, "epoch": 0.2751497035289907, "grad_norm": 9.0, "learning_rate": 9.79178974745194e-06, "loss": 1.07785978, "memory(GiB)": 141.16, "step": 24600, "train_speed(iter/s)": 0.29653 }, { "acc": 0.71787815, "epoch": 0.27537340247494924, "grad_norm": 8.5, "learning_rate": 9.791261281164236e-06, "loss": 1.14158001, "memory(GiB)": 141.16, "step": 24620, "train_speed(iter/s)": 0.296621 }, { "acc": 0.74224391, "epoch": 0.27559710142090776, "grad_norm": 5.9375, "learning_rate": 9.790732159363505e-06, "loss": 1.02424335, "memory(GiB)": 141.16, "step": 24640, "train_speed(iter/s)": 0.296701 }, { "acc": 0.7303484, "epoch": 0.2758208003668663, "grad_norm": 7.1875, "learning_rate": 9.79020238212214e-06, "loss": 1.0715827, "memory(GiB)": 141.16, "step": 24660, "train_speed(iter/s)": 0.296775 }, { "acc": 0.70800705, "epoch": 0.2760444993128248, "grad_norm": 8.125, "learning_rate": 9.78967194951262e-06, "loss": 1.19064503, "memory(GiB)": 141.16, "step": 24680, "train_speed(iter/s)": 0.296857 }, { "acc": 0.73731618, "epoch": 0.27626819825878335, "grad_norm": 8.1875, "learning_rate": 9.789140861607518e-06, "loss": 1.04974661, "memory(GiB)": 141.16, "step": 24700, "train_speed(iter/s)": 0.296933 }, { "acc": 0.73502293, "epoch": 0.2764918972047419, "grad_norm": 8.0, "learning_rate": 9.788609118479494e-06, "loss": 1.07400579, "memory(GiB)": 141.16, "step": 24720, "train_speed(iter/s)": 0.297005 }, { "acc": 0.72287216, "epoch": 0.2767155961507004, "grad_norm": 7.09375, "learning_rate": 9.788076720201296e-06, "loss": 1.10711403, "memory(GiB)": 141.16, "step": 24740, "train_speed(iter/s)": 0.297082 }, { "acc": 0.73064895, "epoch": 0.27693929509665893, "grad_norm": 7.15625, "learning_rate": 9.787543666845766e-06, "loss": 1.08934011, "memory(GiB)": 141.16, "step": 24760, "train_speed(iter/s)": 0.297157 }, { "acc": 0.73245387, "epoch": 0.27716299404261746, "grad_norm": 7.09375, "learning_rate": 9.787009958485831e-06, "loss": 1.07844086, "memory(GiB)": 141.16, "step": 24780, "train_speed(iter/s)": 0.297227 }, { "acc": 0.72807684, "epoch": 0.277386692988576, "grad_norm": 5.625, "learning_rate": 9.786475595194514e-06, "loss": 1.08870077, "memory(GiB)": 141.16, "step": 24800, "train_speed(iter/s)": 0.297306 }, { "acc": 0.7226778, "epoch": 0.2776103919345345, "grad_norm": 6.84375, "learning_rate": 9.78594057704492e-06, "loss": 1.11806841, "memory(GiB)": 141.16, "step": 24820, "train_speed(iter/s)": 0.297386 }, { "acc": 0.73198013, "epoch": 0.27783409088049305, "grad_norm": 6.46875, "learning_rate": 9.78540490411025e-06, "loss": 1.07368202, "memory(GiB)": 141.16, "step": 24840, "train_speed(iter/s)": 0.297455 }, { "acc": 0.73571901, "epoch": 0.2780577898264516, "grad_norm": 7.34375, "learning_rate": 9.784868576463787e-06, "loss": 1.06852856, "memory(GiB)": 141.16, "step": 24860, "train_speed(iter/s)": 0.297535 }, { "acc": 0.73167281, "epoch": 0.2782814887724101, "grad_norm": 6.3125, "learning_rate": 9.784331594178913e-06, "loss": 1.0676199, "memory(GiB)": 141.16, "step": 24880, "train_speed(iter/s)": 0.297606 }, { "acc": 0.71733279, "epoch": 0.27850518771836863, "grad_norm": 7.3125, "learning_rate": 9.783793957329094e-06, "loss": 1.13586578, "memory(GiB)": 141.16, "step": 24900, "train_speed(iter/s)": 0.297685 }, { "acc": 0.71795044, "epoch": 0.27872888666432716, "grad_norm": 7.28125, "learning_rate": 9.783255665987883e-06, "loss": 1.14044113, "memory(GiB)": 141.16, "step": 24920, "train_speed(iter/s)": 0.297759 }, { "acc": 0.73445139, "epoch": 0.2789525856102857, "grad_norm": 9.375, "learning_rate": 9.78271672022893e-06, "loss": 1.06783962, "memory(GiB)": 141.16, "step": 24940, "train_speed(iter/s)": 0.297835 }, { "acc": 0.71722908, "epoch": 0.2791762845562442, "grad_norm": 7.5625, "learning_rate": 9.782177120125968e-06, "loss": 1.14041023, "memory(GiB)": 141.16, "step": 24960, "train_speed(iter/s)": 0.297921 }, { "acc": 0.7214231, "epoch": 0.27939998350220274, "grad_norm": 6.90625, "learning_rate": 9.781636865752824e-06, "loss": 1.12500887, "memory(GiB)": 141.16, "step": 24980, "train_speed(iter/s)": 0.298006 }, { "acc": 0.72864885, "epoch": 0.27962368244816127, "grad_norm": 7.5, "learning_rate": 9.78109595718341e-06, "loss": 1.07572765, "memory(GiB)": 141.16, "step": 25000, "train_speed(iter/s)": 0.298086 }, { "acc": 0.72817783, "epoch": 0.2798473813941198, "grad_norm": 7.78125, "learning_rate": 9.780554394491733e-06, "loss": 1.08011494, "memory(GiB)": 141.16, "step": 25020, "train_speed(iter/s)": 0.29816 }, { "acc": 0.71266489, "epoch": 0.2800710803400783, "grad_norm": 6.9375, "learning_rate": 9.780012177751882e-06, "loss": 1.17963495, "memory(GiB)": 141.16, "step": 25040, "train_speed(iter/s)": 0.298237 }, { "acc": 0.7227447, "epoch": 0.28029477928603685, "grad_norm": 8.1875, "learning_rate": 9.779469307038048e-06, "loss": 1.11697617, "memory(GiB)": 141.16, "step": 25060, "train_speed(iter/s)": 0.298304 }, { "acc": 0.73720589, "epoch": 0.2805184782319954, "grad_norm": 8.1875, "learning_rate": 9.778925782424495e-06, "loss": 1.04078474, "memory(GiB)": 141.16, "step": 25080, "train_speed(iter/s)": 0.298365 }, { "acc": 0.72601357, "epoch": 0.2807421771779539, "grad_norm": 6.09375, "learning_rate": 9.778381603985589e-06, "loss": 1.10028667, "memory(GiB)": 141.16, "step": 25100, "train_speed(iter/s)": 0.298444 }, { "acc": 0.71509848, "epoch": 0.28096587612391244, "grad_norm": 7.625, "learning_rate": 9.777836771795781e-06, "loss": 1.15248585, "memory(GiB)": 141.16, "step": 25120, "train_speed(iter/s)": 0.298517 }, { "acc": 0.72174006, "epoch": 0.28118957506987097, "grad_norm": 7.46875, "learning_rate": 9.777291285929611e-06, "loss": 1.10442047, "memory(GiB)": 141.16, "step": 25140, "train_speed(iter/s)": 0.298583 }, { "acc": 0.72915769, "epoch": 0.2814132740158295, "grad_norm": 9.25, "learning_rate": 9.776745146461711e-06, "loss": 1.08232756, "memory(GiB)": 141.16, "step": 25160, "train_speed(iter/s)": 0.298651 }, { "acc": 0.72911487, "epoch": 0.281636972961788, "grad_norm": 8.75, "learning_rate": 9.776198353466799e-06, "loss": 1.08823776, "memory(GiB)": 141.16, "step": 25180, "train_speed(iter/s)": 0.298731 }, { "acc": 0.73161201, "epoch": 0.28186067190774655, "grad_norm": 7.0, "learning_rate": 9.775650907019682e-06, "loss": 1.07572536, "memory(GiB)": 141.16, "step": 25200, "train_speed(iter/s)": 0.298806 }, { "acc": 0.7201859, "epoch": 0.2820843708537051, "grad_norm": 7.1875, "learning_rate": 9.775102807195264e-06, "loss": 1.13183031, "memory(GiB)": 141.16, "step": 25220, "train_speed(iter/s)": 0.298887 }, { "acc": 0.7320075, "epoch": 0.2823080697996636, "grad_norm": 6.65625, "learning_rate": 9.774554054068531e-06, "loss": 1.05882988, "memory(GiB)": 141.16, "step": 25240, "train_speed(iter/s)": 0.298962 }, { "acc": 0.72126613, "epoch": 0.28253176874562214, "grad_norm": 6.21875, "learning_rate": 9.774004647714557e-06, "loss": 1.13517637, "memory(GiB)": 141.16, "step": 25260, "train_speed(iter/s)": 0.299023 }, { "acc": 0.73801079, "epoch": 0.28275546769158066, "grad_norm": 5.75, "learning_rate": 9.773454588208513e-06, "loss": 1.05481224, "memory(GiB)": 141.16, "step": 25280, "train_speed(iter/s)": 0.299093 }, { "acc": 0.72510338, "epoch": 0.2829791666375392, "grad_norm": 8.5625, "learning_rate": 9.772903875625651e-06, "loss": 1.09651909, "memory(GiB)": 141.16, "step": 25300, "train_speed(iter/s)": 0.299166 }, { "acc": 0.73249483, "epoch": 0.2832028655834977, "grad_norm": 8.0625, "learning_rate": 9.772352510041318e-06, "loss": 1.09208107, "memory(GiB)": 141.16, "step": 25320, "train_speed(iter/s)": 0.299235 }, { "acc": 0.72540908, "epoch": 0.28342656452945625, "grad_norm": 6.4375, "learning_rate": 9.771800491530951e-06, "loss": 1.10598278, "memory(GiB)": 141.16, "step": 25340, "train_speed(iter/s)": 0.299315 }, { "acc": 0.72860203, "epoch": 0.2836502634754148, "grad_norm": 5.21875, "learning_rate": 9.77124782017007e-06, "loss": 1.07418032, "memory(GiB)": 141.16, "step": 25360, "train_speed(iter/s)": 0.299392 }, { "acc": 0.71943998, "epoch": 0.2838739624213733, "grad_norm": 6.15625, "learning_rate": 9.770694496034293e-06, "loss": 1.13578911, "memory(GiB)": 141.16, "step": 25380, "train_speed(iter/s)": 0.299463 }, { "acc": 0.71603651, "epoch": 0.28409766136733183, "grad_norm": 8.9375, "learning_rate": 9.770140519199319e-06, "loss": 1.16549435, "memory(GiB)": 141.16, "step": 25400, "train_speed(iter/s)": 0.299525 }, { "acc": 0.71559887, "epoch": 0.28432136031329036, "grad_norm": 7.21875, "learning_rate": 9.76958588974094e-06, "loss": 1.15240421, "memory(GiB)": 141.16, "step": 25420, "train_speed(iter/s)": 0.299594 }, { "acc": 0.71429329, "epoch": 0.2845450592592489, "grad_norm": 5.9375, "learning_rate": 9.769030607735038e-06, "loss": 1.16647625, "memory(GiB)": 141.16, "step": 25440, "train_speed(iter/s)": 0.299664 }, { "acc": 0.71534872, "epoch": 0.2847687582052074, "grad_norm": 7.4375, "learning_rate": 9.768474673257584e-06, "loss": 1.15168591, "memory(GiB)": 141.16, "step": 25460, "train_speed(iter/s)": 0.299739 }, { "acc": 0.7321054, "epoch": 0.28499245715116595, "grad_norm": 8.3125, "learning_rate": 9.767918086384638e-06, "loss": 1.06759329, "memory(GiB)": 141.16, "step": 25480, "train_speed(iter/s)": 0.299816 }, { "acc": 0.72256799, "epoch": 0.2852161560971245, "grad_norm": 4.40625, "learning_rate": 9.767360847192348e-06, "loss": 1.11039848, "memory(GiB)": 141.16, "step": 25500, "train_speed(iter/s)": 0.299885 }, { "acc": 0.73050127, "epoch": 0.285439855043083, "grad_norm": 6.90625, "learning_rate": 9.766802955756953e-06, "loss": 1.07884922, "memory(GiB)": 141.16, "step": 25520, "train_speed(iter/s)": 0.299955 }, { "acc": 0.72117739, "epoch": 0.28566355398904153, "grad_norm": 6.75, "learning_rate": 9.766244412154782e-06, "loss": 1.12791271, "memory(GiB)": 141.16, "step": 25540, "train_speed(iter/s)": 0.300025 }, { "acc": 0.72971001, "epoch": 0.28588725293500006, "grad_norm": 5.4375, "learning_rate": 9.765685216462249e-06, "loss": 1.07476101, "memory(GiB)": 141.16, "step": 25560, "train_speed(iter/s)": 0.300102 }, { "acc": 0.73083353, "epoch": 0.2861109518809586, "grad_norm": 7.6875, "learning_rate": 9.765125368755859e-06, "loss": 1.06986294, "memory(GiB)": 141.16, "step": 25580, "train_speed(iter/s)": 0.300178 }, { "acc": 0.72570748, "epoch": 0.2863346508269171, "grad_norm": 6.375, "learning_rate": 9.764564869112212e-06, "loss": 1.11160927, "memory(GiB)": 141.16, "step": 25600, "train_speed(iter/s)": 0.300252 }, { "acc": 0.71186261, "epoch": 0.28655834977287564, "grad_norm": 9.1875, "learning_rate": 9.764003717607988e-06, "loss": 1.17973289, "memory(GiB)": 141.16, "step": 25620, "train_speed(iter/s)": 0.300314 }, { "acc": 0.73428497, "epoch": 0.28678204871883417, "grad_norm": 6.8125, "learning_rate": 9.763441914319961e-06, "loss": 1.06829128, "memory(GiB)": 141.16, "step": 25640, "train_speed(iter/s)": 0.300388 }, { "acc": 0.72815509, "epoch": 0.2870057476647927, "grad_norm": 6.875, "learning_rate": 9.762879459324998e-06, "loss": 1.08011875, "memory(GiB)": 141.16, "step": 25660, "train_speed(iter/s)": 0.30046 }, { "acc": 0.7209342, "epoch": 0.2872294466107512, "grad_norm": 6.84375, "learning_rate": 9.762316352700045e-06, "loss": 1.12063694, "memory(GiB)": 141.16, "step": 25680, "train_speed(iter/s)": 0.300542 }, { "acc": 0.72101383, "epoch": 0.28745314555670975, "grad_norm": 6.5, "learning_rate": 9.761752594522147e-06, "loss": 1.11210203, "memory(GiB)": 141.16, "step": 25700, "train_speed(iter/s)": 0.300617 }, { "acc": 0.73455162, "epoch": 0.2876768445026683, "grad_norm": 8.1875, "learning_rate": 9.761188184868433e-06, "loss": 1.06091547, "memory(GiB)": 141.16, "step": 25720, "train_speed(iter/s)": 0.300691 }, { "acc": 0.73268604, "epoch": 0.2879005434486268, "grad_norm": 7.5625, "learning_rate": 9.760623123816122e-06, "loss": 1.06075993, "memory(GiB)": 141.16, "step": 25740, "train_speed(iter/s)": 0.300761 }, { "acc": 0.7287684, "epoch": 0.2881242423945854, "grad_norm": 7.125, "learning_rate": 9.760057411442523e-06, "loss": 1.09042311, "memory(GiB)": 141.16, "step": 25760, "train_speed(iter/s)": 0.300835 }, { "acc": 0.72176676, "epoch": 0.2883479413405439, "grad_norm": 8.5, "learning_rate": 9.759491047825034e-06, "loss": 1.11155825, "memory(GiB)": 141.16, "step": 25780, "train_speed(iter/s)": 0.3009 }, { "acc": 0.73443351, "epoch": 0.28857164028650245, "grad_norm": 7.5, "learning_rate": 9.758924033041139e-06, "loss": 1.06521292, "memory(GiB)": 141.16, "step": 25800, "train_speed(iter/s)": 0.300972 }, { "acc": 0.73017716, "epoch": 0.288795339232461, "grad_norm": 8.0625, "learning_rate": 9.758356367168416e-06, "loss": 1.08195581, "memory(GiB)": 141.16, "step": 25820, "train_speed(iter/s)": 0.301038 }, { "acc": 0.72545242, "epoch": 0.2890190381784195, "grad_norm": 7.0625, "learning_rate": 9.75778805028453e-06, "loss": 1.11279984, "memory(GiB)": 141.16, "step": 25840, "train_speed(iter/s)": 0.301105 }, { "acc": 0.73414474, "epoch": 0.28924273712437804, "grad_norm": 5.5625, "learning_rate": 9.757219082467233e-06, "loss": 1.06240864, "memory(GiB)": 141.16, "step": 25860, "train_speed(iter/s)": 0.301172 }, { "acc": 0.74021749, "epoch": 0.28946643607033656, "grad_norm": 7.90625, "learning_rate": 9.756649463794372e-06, "loss": 1.03471146, "memory(GiB)": 141.16, "step": 25880, "train_speed(iter/s)": 0.30124 }, { "acc": 0.72941647, "epoch": 0.2896901350162951, "grad_norm": 7.78125, "learning_rate": 9.756079194343875e-06, "loss": 1.08817635, "memory(GiB)": 141.16, "step": 25900, "train_speed(iter/s)": 0.301313 }, { "acc": 0.72108846, "epoch": 0.2899138339622536, "grad_norm": 7.0625, "learning_rate": 9.755508274193764e-06, "loss": 1.11984453, "memory(GiB)": 141.16, "step": 25920, "train_speed(iter/s)": 0.301383 }, { "acc": 0.72331762, "epoch": 0.29013753290821215, "grad_norm": 8.0625, "learning_rate": 9.754936703422147e-06, "loss": 1.123806, "memory(GiB)": 141.16, "step": 25940, "train_speed(iter/s)": 0.301448 }, { "acc": 0.73148203, "epoch": 0.2903612318541707, "grad_norm": 6.5, "learning_rate": 9.754364482107227e-06, "loss": 1.08092051, "memory(GiB)": 141.16, "step": 25960, "train_speed(iter/s)": 0.301513 }, { "acc": 0.71816492, "epoch": 0.2905849308001292, "grad_norm": 8.0625, "learning_rate": 9.753791610327291e-06, "loss": 1.13262501, "memory(GiB)": 141.16, "step": 25980, "train_speed(iter/s)": 0.301586 }, { "acc": 0.73220844, "epoch": 0.29080862974608773, "grad_norm": 5.71875, "learning_rate": 9.753218088160715e-06, "loss": 1.07173424, "memory(GiB)": 141.16, "step": 26000, "train_speed(iter/s)": 0.301648 }, { "epoch": 0.29080862974608773, "eval_acc": 0.6856647594776764, "eval_loss": 1.0974366664886475, "eval_runtime": 2322.9235, "eval_samples_per_second": 32.409, "eval_steps_per_second": 16.205, "step": 26000 }, { "acc": 0.72599096, "epoch": 0.29103232869204626, "grad_norm": 7.53125, "learning_rate": 9.752643915685963e-06, "loss": 1.10759659, "memory(GiB)": 141.16, "step": 26020, "train_speed(iter/s)": 0.29364 }, { "acc": 0.73435903, "epoch": 0.2912560276380048, "grad_norm": 6.21875, "learning_rate": 9.752069092981596e-06, "loss": 1.05763702, "memory(GiB)": 141.16, "step": 26040, "train_speed(iter/s)": 0.293709 }, { "acc": 0.71103106, "epoch": 0.2914797265839633, "grad_norm": 5.40625, "learning_rate": 9.751493620126254e-06, "loss": 1.17094193, "memory(GiB)": 141.16, "step": 26060, "train_speed(iter/s)": 0.293775 }, { "acc": 0.73442831, "epoch": 0.29170342552992184, "grad_norm": 6.78125, "learning_rate": 9.750917497198669e-06, "loss": 1.07122841, "memory(GiB)": 141.16, "step": 26080, "train_speed(iter/s)": 0.293845 }, { "acc": 0.72261105, "epoch": 0.2919271244758804, "grad_norm": 6.46875, "learning_rate": 9.750340724277665e-06, "loss": 1.10919428, "memory(GiB)": 141.16, "step": 26100, "train_speed(iter/s)": 0.293911 }, { "acc": 0.73660583, "epoch": 0.2921508234218389, "grad_norm": 5.84375, "learning_rate": 9.74976330144215e-06, "loss": 1.05284901, "memory(GiB)": 141.16, "step": 26120, "train_speed(iter/s)": 0.293987 }, { "acc": 0.72794042, "epoch": 0.29237452236779743, "grad_norm": 6.6875, "learning_rate": 9.749185228771128e-06, "loss": 1.09320812, "memory(GiB)": 141.16, "step": 26140, "train_speed(iter/s)": 0.294067 }, { "acc": 0.7325839, "epoch": 0.29259822131375596, "grad_norm": 6.8125, "learning_rate": 9.748606506343683e-06, "loss": 1.06947308, "memory(GiB)": 141.16, "step": 26160, "train_speed(iter/s)": 0.294138 }, { "acc": 0.72523623, "epoch": 0.2928219202597145, "grad_norm": 7.21875, "learning_rate": 9.748027134238995e-06, "loss": 1.10092964, "memory(GiB)": 141.16, "step": 26180, "train_speed(iter/s)": 0.29421 }, { "acc": 0.72581358, "epoch": 0.293045619205673, "grad_norm": 9.5, "learning_rate": 9.747447112536333e-06, "loss": 1.09725857, "memory(GiB)": 141.16, "step": 26200, "train_speed(iter/s)": 0.29428 }, { "acc": 0.7331048, "epoch": 0.29326931815163154, "grad_norm": 7.96875, "learning_rate": 9.746866441315047e-06, "loss": 1.06565685, "memory(GiB)": 141.16, "step": 26220, "train_speed(iter/s)": 0.29435 }, { "acc": 0.71608572, "epoch": 0.29349301709759007, "grad_norm": 8.6875, "learning_rate": 9.746285120654582e-06, "loss": 1.15381756, "memory(GiB)": 141.16, "step": 26240, "train_speed(iter/s)": 0.294424 }, { "acc": 0.72509394, "epoch": 0.2937167160435486, "grad_norm": 7.78125, "learning_rate": 9.745703150634475e-06, "loss": 1.10008774, "memory(GiB)": 141.16, "step": 26260, "train_speed(iter/s)": 0.2945 }, { "acc": 0.73365412, "epoch": 0.2939404149895071, "grad_norm": 8.8125, "learning_rate": 9.745120531334344e-06, "loss": 1.07044849, "memory(GiB)": 141.16, "step": 26280, "train_speed(iter/s)": 0.294565 }, { "acc": 0.73253975, "epoch": 0.29416411393546565, "grad_norm": 6.25, "learning_rate": 9.744537262833903e-06, "loss": 1.08226395, "memory(GiB)": 141.16, "step": 26300, "train_speed(iter/s)": 0.294628 }, { "acc": 0.71411939, "epoch": 0.2943878128814242, "grad_norm": 6.53125, "learning_rate": 9.743953345212946e-06, "loss": 1.15574322, "memory(GiB)": 141.16, "step": 26320, "train_speed(iter/s)": 0.294697 }, { "acc": 0.72043352, "epoch": 0.2946115118273827, "grad_norm": 6.75, "learning_rate": 9.743368778551367e-06, "loss": 1.13837719, "memory(GiB)": 141.16, "step": 26340, "train_speed(iter/s)": 0.294766 }, { "acc": 0.71971855, "epoch": 0.29483521077334124, "grad_norm": 6.75, "learning_rate": 9.74278356292914e-06, "loss": 1.12830601, "memory(GiB)": 141.16, "step": 26360, "train_speed(iter/s)": 0.294832 }, { "acc": 0.73246756, "epoch": 0.29505890971929977, "grad_norm": 6.84375, "learning_rate": 9.74219769842633e-06, "loss": 1.07082357, "memory(GiB)": 141.16, "step": 26380, "train_speed(iter/s)": 0.294899 }, { "acc": 0.7263402, "epoch": 0.2952826086652583, "grad_norm": 6.90625, "learning_rate": 9.741611185123096e-06, "loss": 1.11305714, "memory(GiB)": 141.16, "step": 26400, "train_speed(iter/s)": 0.294975 }, { "acc": 0.72924447, "epoch": 0.2955063076112168, "grad_norm": 7.84375, "learning_rate": 9.741024023099677e-06, "loss": 1.07564983, "memory(GiB)": 141.16, "step": 26420, "train_speed(iter/s)": 0.295039 }, { "acc": 0.7377264, "epoch": 0.29573000655717535, "grad_norm": 8.25, "learning_rate": 9.740436212436408e-06, "loss": 1.05196867, "memory(GiB)": 141.16, "step": 26440, "train_speed(iter/s)": 0.295116 }, { "acc": 0.72101259, "epoch": 0.2959537055031339, "grad_norm": 5.96875, "learning_rate": 9.739847753213707e-06, "loss": 1.11818218, "memory(GiB)": 141.16, "step": 26460, "train_speed(iter/s)": 0.29519 }, { "acc": 0.72687716, "epoch": 0.2961774044490924, "grad_norm": 5.34375, "learning_rate": 9.739258645512088e-06, "loss": 1.08816118, "memory(GiB)": 141.16, "step": 26480, "train_speed(iter/s)": 0.295273 }, { "acc": 0.73214493, "epoch": 0.29640110339505094, "grad_norm": 9.3125, "learning_rate": 9.738668889412145e-06, "loss": 1.08343697, "memory(GiB)": 141.16, "step": 26500, "train_speed(iter/s)": 0.295338 }, { "acc": 0.72861037, "epoch": 0.29662480234100946, "grad_norm": 7.59375, "learning_rate": 9.738078484994566e-06, "loss": 1.09426422, "memory(GiB)": 141.16, "step": 26520, "train_speed(iter/s)": 0.295415 }, { "acc": 0.7250247, "epoch": 0.296848501286968, "grad_norm": 6.6875, "learning_rate": 9.73748743234013e-06, "loss": 1.11220522, "memory(GiB)": 141.16, "step": 26540, "train_speed(iter/s)": 0.295487 }, { "acc": 0.72680483, "epoch": 0.2970722002329265, "grad_norm": 7.59375, "learning_rate": 9.736895731529696e-06, "loss": 1.10885544, "memory(GiB)": 141.16, "step": 26560, "train_speed(iter/s)": 0.29556 }, { "acc": 0.72824707, "epoch": 0.29729589917888505, "grad_norm": 7.75, "learning_rate": 9.73630338264422e-06, "loss": 1.08455238, "memory(GiB)": 141.16, "step": 26580, "train_speed(iter/s)": 0.295634 }, { "acc": 0.71023321, "epoch": 0.2975195981248436, "grad_norm": 8.4375, "learning_rate": 9.735710385764747e-06, "loss": 1.16645985, "memory(GiB)": 141.16, "step": 26600, "train_speed(iter/s)": 0.2957 }, { "acc": 0.71215305, "epoch": 0.2977432970708021, "grad_norm": 9.3125, "learning_rate": 9.735116740972401e-06, "loss": 1.17298298, "memory(GiB)": 141.16, "step": 26620, "train_speed(iter/s)": 0.295766 }, { "acc": 0.73189011, "epoch": 0.29796699601676063, "grad_norm": 5.28125, "learning_rate": 9.734522448348407e-06, "loss": 1.08513708, "memory(GiB)": 141.16, "step": 26640, "train_speed(iter/s)": 0.295831 }, { "acc": 0.72037072, "epoch": 0.29819069496271916, "grad_norm": 5.5625, "learning_rate": 9.733927507974068e-06, "loss": 1.13280087, "memory(GiB)": 141.16, "step": 26660, "train_speed(iter/s)": 0.295892 }, { "acc": 0.72385788, "epoch": 0.2984143939086777, "grad_norm": 7.9375, "learning_rate": 9.733331919930785e-06, "loss": 1.1190485, "memory(GiB)": 141.16, "step": 26680, "train_speed(iter/s)": 0.29596 }, { "acc": 0.72848158, "epoch": 0.2986380928546362, "grad_norm": 10.0625, "learning_rate": 9.732735684300039e-06, "loss": 1.10486546, "memory(GiB)": 141.16, "step": 26700, "train_speed(iter/s)": 0.296036 }, { "acc": 0.73464112, "epoch": 0.29886179180059474, "grad_norm": 6.9375, "learning_rate": 9.732138801163405e-06, "loss": 1.05961227, "memory(GiB)": 141.16, "step": 26720, "train_speed(iter/s)": 0.296121 }, { "acc": 0.73952937, "epoch": 0.2990854907465533, "grad_norm": 8.5625, "learning_rate": 9.731541270602544e-06, "loss": 1.05812521, "memory(GiB)": 141.16, "step": 26740, "train_speed(iter/s)": 0.296201 }, { "acc": 0.72816315, "epoch": 0.2993091896925118, "grad_norm": 7.15625, "learning_rate": 9.730943092699209e-06, "loss": 1.07528553, "memory(GiB)": 141.16, "step": 26760, "train_speed(iter/s)": 0.296268 }, { "acc": 0.71567345, "epoch": 0.29953288863847033, "grad_norm": 7.0, "learning_rate": 9.730344267535239e-06, "loss": 1.14648666, "memory(GiB)": 141.16, "step": 26780, "train_speed(iter/s)": 0.296342 }, { "acc": 0.72991338, "epoch": 0.29975658758442886, "grad_norm": 7.40625, "learning_rate": 9.72974479519256e-06, "loss": 1.0927124, "memory(GiB)": 141.16, "step": 26800, "train_speed(iter/s)": 0.296423 }, { "acc": 0.71325159, "epoch": 0.2999802865303874, "grad_norm": 9.1875, "learning_rate": 9.72914467575319e-06, "loss": 1.15737476, "memory(GiB)": 141.16, "step": 26820, "train_speed(iter/s)": 0.296496 }, { "acc": 0.73709526, "epoch": 0.3002039854763459, "grad_norm": 7.28125, "learning_rate": 9.728543909299233e-06, "loss": 1.06380377, "memory(GiB)": 141.16, "step": 26840, "train_speed(iter/s)": 0.296558 }, { "acc": 0.72655087, "epoch": 0.30042768442230444, "grad_norm": 6.1875, "learning_rate": 9.727942495912883e-06, "loss": 1.1039669, "memory(GiB)": 141.16, "step": 26860, "train_speed(iter/s)": 0.296631 }, { "acc": 0.72990713, "epoch": 0.30065138336826297, "grad_norm": 7.84375, "learning_rate": 9.72734043567642e-06, "loss": 1.06660347, "memory(GiB)": 141.16, "step": 26880, "train_speed(iter/s)": 0.296703 }, { "acc": 0.7325736, "epoch": 0.3008750823142215, "grad_norm": 7.65625, "learning_rate": 9.726737728672218e-06, "loss": 1.07751112, "memory(GiB)": 141.16, "step": 26900, "train_speed(iter/s)": 0.296774 }, { "acc": 0.73171606, "epoch": 0.30109878126018, "grad_norm": 8.3125, "learning_rate": 9.726134374982734e-06, "loss": 1.0657589, "memory(GiB)": 141.16, "step": 26920, "train_speed(iter/s)": 0.296842 }, { "acc": 0.71831346, "epoch": 0.30132248020613855, "grad_norm": 6.21875, "learning_rate": 9.725530374690515e-06, "loss": 1.14638557, "memory(GiB)": 141.16, "step": 26940, "train_speed(iter/s)": 0.296908 }, { "acc": 0.72910929, "epoch": 0.3015461791520971, "grad_norm": 6.03125, "learning_rate": 9.724925727878198e-06, "loss": 1.10011463, "memory(GiB)": 141.16, "step": 26960, "train_speed(iter/s)": 0.296974 }, { "acc": 0.73533516, "epoch": 0.3017698780980556, "grad_norm": 6.90625, "learning_rate": 9.724320434628505e-06, "loss": 1.05908813, "memory(GiB)": 141.16, "step": 26980, "train_speed(iter/s)": 0.297039 }, { "acc": 0.73392062, "epoch": 0.30199357704401414, "grad_norm": 4.96875, "learning_rate": 9.723714495024252e-06, "loss": 1.06314392, "memory(GiB)": 141.16, "step": 27000, "train_speed(iter/s)": 0.297106 }, { "acc": 0.72968216, "epoch": 0.3022172759899727, "grad_norm": 8.25, "learning_rate": 9.723107909148337e-06, "loss": 1.07149496, "memory(GiB)": 141.16, "step": 27020, "train_speed(iter/s)": 0.297173 }, { "acc": 0.72114859, "epoch": 0.30244097493593125, "grad_norm": 6.0, "learning_rate": 9.722500677083754e-06, "loss": 1.12831459, "memory(GiB)": 141.16, "step": 27040, "train_speed(iter/s)": 0.297243 }, { "acc": 0.73659358, "epoch": 0.3026646738818898, "grad_norm": 6.34375, "learning_rate": 9.721892798913577e-06, "loss": 1.05717573, "memory(GiB)": 141.16, "step": 27060, "train_speed(iter/s)": 0.297311 }, { "acc": 0.73182731, "epoch": 0.3028883728278483, "grad_norm": 6.21875, "learning_rate": 9.721284274720973e-06, "loss": 1.07357025, "memory(GiB)": 141.16, "step": 27080, "train_speed(iter/s)": 0.297381 }, { "acc": 0.72927179, "epoch": 0.30311207177380683, "grad_norm": 5.5625, "learning_rate": 9.720675104589197e-06, "loss": 1.07353334, "memory(GiB)": 141.16, "step": 27100, "train_speed(iter/s)": 0.297447 }, { "acc": 0.73313217, "epoch": 0.30333577071976536, "grad_norm": 7.5625, "learning_rate": 9.720065288601594e-06, "loss": 1.08160086, "memory(GiB)": 141.16, "step": 27120, "train_speed(iter/s)": 0.297512 }, { "acc": 0.72000103, "epoch": 0.3035594696657239, "grad_norm": 6.125, "learning_rate": 9.719454826841594e-06, "loss": 1.12895765, "memory(GiB)": 141.16, "step": 27140, "train_speed(iter/s)": 0.297586 }, { "acc": 0.73020697, "epoch": 0.3037831686116824, "grad_norm": 6.59375, "learning_rate": 9.718843719392716e-06, "loss": 1.08412733, "memory(GiB)": 141.16, "step": 27160, "train_speed(iter/s)": 0.297652 }, { "acc": 0.73580103, "epoch": 0.30400686755764095, "grad_norm": 7.9375, "learning_rate": 9.718231966338572e-06, "loss": 1.04652195, "memory(GiB)": 141.16, "step": 27180, "train_speed(iter/s)": 0.297729 }, { "acc": 0.7179781, "epoch": 0.3042305665035995, "grad_norm": 7.59375, "learning_rate": 9.717619567762854e-06, "loss": 1.14364958, "memory(GiB)": 141.16, "step": 27200, "train_speed(iter/s)": 0.297802 }, { "acc": 0.71876173, "epoch": 0.304454265449558, "grad_norm": 5.5, "learning_rate": 9.71700652374935e-06, "loss": 1.14028549, "memory(GiB)": 141.16, "step": 27220, "train_speed(iter/s)": 0.297867 }, { "acc": 0.72842841, "epoch": 0.30467796439551653, "grad_norm": 5.375, "learning_rate": 9.71639283438193e-06, "loss": 1.09150467, "memory(GiB)": 141.16, "step": 27240, "train_speed(iter/s)": 0.297938 }, { "acc": 0.72648211, "epoch": 0.30490166334147506, "grad_norm": 7.28125, "learning_rate": 9.71577849974456e-06, "loss": 1.11101418, "memory(GiB)": 141.16, "step": 27260, "train_speed(iter/s)": 0.298015 }, { "acc": 0.72712684, "epoch": 0.3051253622874336, "grad_norm": 7.15625, "learning_rate": 9.715163519921285e-06, "loss": 1.08964081, "memory(GiB)": 141.16, "step": 27280, "train_speed(iter/s)": 0.298082 }, { "acc": 0.72194843, "epoch": 0.3053490612333921, "grad_norm": 6.6875, "learning_rate": 9.714547894996246e-06, "loss": 1.13046064, "memory(GiB)": 141.16, "step": 27300, "train_speed(iter/s)": 0.298147 }, { "acc": 0.72775044, "epoch": 0.30557276017935064, "grad_norm": 7.03125, "learning_rate": 9.713931625053667e-06, "loss": 1.09247284, "memory(GiB)": 141.16, "step": 27320, "train_speed(iter/s)": 0.298225 }, { "acc": 0.7391139, "epoch": 0.3057964591253092, "grad_norm": 6.375, "learning_rate": 9.713314710177867e-06, "loss": 1.0390028, "memory(GiB)": 141.16, "step": 27340, "train_speed(iter/s)": 0.298285 }, { "acc": 0.71956635, "epoch": 0.3060201580712677, "grad_norm": 6.8125, "learning_rate": 9.712697150453246e-06, "loss": 1.12719526, "memory(GiB)": 141.16, "step": 27360, "train_speed(iter/s)": 0.298359 }, { "acc": 0.71466036, "epoch": 0.30624385701722623, "grad_norm": 6.65625, "learning_rate": 9.712078945964291e-06, "loss": 1.16822577, "memory(GiB)": 141.16, "step": 27380, "train_speed(iter/s)": 0.29842 }, { "acc": 0.7211134, "epoch": 0.30646755596318476, "grad_norm": 7.28125, "learning_rate": 9.711460096795589e-06, "loss": 1.12100906, "memory(GiB)": 141.16, "step": 27400, "train_speed(iter/s)": 0.298489 }, { "acc": 0.72334528, "epoch": 0.3066912549091433, "grad_norm": 8.3125, "learning_rate": 9.710840603031801e-06, "loss": 1.12039766, "memory(GiB)": 141.16, "step": 27420, "train_speed(iter/s)": 0.298553 }, { "acc": 0.73046107, "epoch": 0.3069149538551018, "grad_norm": 5.625, "learning_rate": 9.710220464757687e-06, "loss": 1.0786562, "memory(GiB)": 141.16, "step": 27440, "train_speed(iter/s)": 0.298623 }, { "acc": 0.73677282, "epoch": 0.30713865280106034, "grad_norm": 5.8125, "learning_rate": 9.709599682058087e-06, "loss": 1.05378437, "memory(GiB)": 141.16, "step": 27460, "train_speed(iter/s)": 0.298695 }, { "acc": 0.73912334, "epoch": 0.30736235174701887, "grad_norm": 6.9375, "learning_rate": 9.708978255017935e-06, "loss": 1.02680807, "memory(GiB)": 141.16, "step": 27480, "train_speed(iter/s)": 0.298762 }, { "acc": 0.72577982, "epoch": 0.3075860506929774, "grad_norm": 7.25, "learning_rate": 9.708356183722252e-06, "loss": 1.09226418, "memory(GiB)": 141.16, "step": 27500, "train_speed(iter/s)": 0.298827 }, { "acc": 0.72787218, "epoch": 0.3078097496389359, "grad_norm": 6.40625, "learning_rate": 9.707733468256145e-06, "loss": 1.10045471, "memory(GiB)": 141.16, "step": 27520, "train_speed(iter/s)": 0.298888 }, { "acc": 0.72575979, "epoch": 0.30803344858489445, "grad_norm": 7.125, "learning_rate": 9.707110108704811e-06, "loss": 1.10027857, "memory(GiB)": 141.16, "step": 27540, "train_speed(iter/s)": 0.298948 }, { "acc": 0.72577772, "epoch": 0.308257147530853, "grad_norm": 7.9375, "learning_rate": 9.706486105153532e-06, "loss": 1.11952419, "memory(GiB)": 141.16, "step": 27560, "train_speed(iter/s)": 0.299012 }, { "acc": 0.73515635, "epoch": 0.3084808464768115, "grad_norm": 6.25, "learning_rate": 9.705861457687685e-06, "loss": 1.06675968, "memory(GiB)": 141.16, "step": 27580, "train_speed(iter/s)": 0.299071 }, { "acc": 0.73356481, "epoch": 0.30870454542277004, "grad_norm": 6.1875, "learning_rate": 9.705236166392728e-06, "loss": 1.0780817, "memory(GiB)": 141.16, "step": 27600, "train_speed(iter/s)": 0.299139 }, { "acc": 0.73446217, "epoch": 0.30892824436872857, "grad_norm": 5.90625, "learning_rate": 9.704610231354208e-06, "loss": 1.06801205, "memory(GiB)": 141.16, "step": 27620, "train_speed(iter/s)": 0.299192 }, { "acc": 0.72462292, "epoch": 0.3091519433146871, "grad_norm": 6.53125, "learning_rate": 9.703983652657767e-06, "loss": 1.1019515, "memory(GiB)": 141.16, "step": 27640, "train_speed(iter/s)": 0.299264 }, { "acc": 0.7218647, "epoch": 0.3093756422606456, "grad_norm": 6.8125, "learning_rate": 9.703356430389123e-06, "loss": 1.11996975, "memory(GiB)": 141.16, "step": 27660, "train_speed(iter/s)": 0.29933 }, { "acc": 0.73074818, "epoch": 0.30959934120660415, "grad_norm": 5.78125, "learning_rate": 9.702728564634097e-06, "loss": 1.08772182, "memory(GiB)": 141.16, "step": 27680, "train_speed(iter/s)": 0.299399 }, { "acc": 0.71388621, "epoch": 0.3098230401525627, "grad_norm": 6.90625, "learning_rate": 9.702100055478583e-06, "loss": 1.15816784, "memory(GiB)": 141.16, "step": 27700, "train_speed(iter/s)": 0.29947 }, { "acc": 0.71946201, "epoch": 0.3100467390985212, "grad_norm": 7.53125, "learning_rate": 9.701470903008574e-06, "loss": 1.12221584, "memory(GiB)": 141.16, "step": 27720, "train_speed(iter/s)": 0.299541 }, { "acc": 0.73907747, "epoch": 0.31027043804447973, "grad_norm": 7.09375, "learning_rate": 9.700841107310146e-06, "loss": 1.04343872, "memory(GiB)": 141.16, "step": 27740, "train_speed(iter/s)": 0.299607 }, { "acc": 0.72594624, "epoch": 0.31049413699043826, "grad_norm": 5.8125, "learning_rate": 9.700210668469464e-06, "loss": 1.0831749, "memory(GiB)": 141.16, "step": 27760, "train_speed(iter/s)": 0.299675 }, { "acc": 0.71878986, "epoch": 0.3107178359363968, "grad_norm": 7.0625, "learning_rate": 9.699579586572781e-06, "loss": 1.14616737, "memory(GiB)": 141.16, "step": 27780, "train_speed(iter/s)": 0.299736 }, { "acc": 0.72336168, "epoch": 0.3109415348823553, "grad_norm": 6.375, "learning_rate": 9.698947861706438e-06, "loss": 1.11632156, "memory(GiB)": 141.16, "step": 27800, "train_speed(iter/s)": 0.299801 }, { "acc": 0.74179225, "epoch": 0.31116523382831385, "grad_norm": 6.3125, "learning_rate": 9.698315493956864e-06, "loss": 1.02117081, "memory(GiB)": 141.16, "step": 27820, "train_speed(iter/s)": 0.299868 }, { "acc": 0.73131218, "epoch": 0.3113889327742724, "grad_norm": 8.375, "learning_rate": 9.697682483410576e-06, "loss": 1.07319832, "memory(GiB)": 141.16, "step": 27840, "train_speed(iter/s)": 0.299941 }, { "acc": 0.73757877, "epoch": 0.3116126317202309, "grad_norm": 7.78125, "learning_rate": 9.69704883015418e-06, "loss": 1.04415178, "memory(GiB)": 141.16, "step": 27860, "train_speed(iter/s)": 0.300001 }, { "acc": 0.71980128, "epoch": 0.31183633066618943, "grad_norm": 6.90625, "learning_rate": 9.696414534274367e-06, "loss": 1.13238697, "memory(GiB)": 141.16, "step": 27880, "train_speed(iter/s)": 0.300061 }, { "acc": 0.72834959, "epoch": 0.31206002961214796, "grad_norm": 6.34375, "learning_rate": 9.695779595857918e-06, "loss": 1.09292526, "memory(GiB)": 141.16, "step": 27900, "train_speed(iter/s)": 0.300133 }, { "acc": 0.73217411, "epoch": 0.3122837285581065, "grad_norm": 7.53125, "learning_rate": 9.695144014991702e-06, "loss": 1.08707361, "memory(GiB)": 141.16, "step": 27920, "train_speed(iter/s)": 0.300202 }, { "acc": 0.72972355, "epoch": 0.312507427504065, "grad_norm": 6.9375, "learning_rate": 9.694507791762676e-06, "loss": 1.07431984, "memory(GiB)": 141.16, "step": 27940, "train_speed(iter/s)": 0.30027 }, { "acc": 0.73638997, "epoch": 0.31273112645002354, "grad_norm": 5.875, "learning_rate": 9.693870926257884e-06, "loss": 1.0541151, "memory(GiB)": 141.16, "step": 27960, "train_speed(iter/s)": 0.300325 }, { "acc": 0.72140112, "epoch": 0.3129548253959821, "grad_norm": 8.375, "learning_rate": 9.693233418564459e-06, "loss": 1.12650995, "memory(GiB)": 141.16, "step": 27980, "train_speed(iter/s)": 0.300387 }, { "acc": 0.72902365, "epoch": 0.3131785243419406, "grad_norm": 5.125, "learning_rate": 9.69259526876962e-06, "loss": 1.08629971, "memory(GiB)": 141.16, "step": 28000, "train_speed(iter/s)": 0.300455 }, { "epoch": 0.3131785243419406, "eval_acc": 0.6861826701355737, "eval_loss": 1.0953868627548218, "eval_runtime": 2318.9472, "eval_samples_per_second": 32.464, "eval_steps_per_second": 16.232, "step": 28000 }, { "acc": 0.71947351, "epoch": 0.31340222328789913, "grad_norm": 6.875, "learning_rate": 9.691956476960676e-06, "loss": 1.10867615, "memory(GiB)": 141.16, "step": 28020, "train_speed(iter/s)": 0.293078 }, { "acc": 0.72868776, "epoch": 0.31362592223385766, "grad_norm": 9.375, "learning_rate": 9.691317043225023e-06, "loss": 1.0810751, "memory(GiB)": 141.16, "step": 28040, "train_speed(iter/s)": 0.29315 }, { "acc": 0.73199282, "epoch": 0.3138496211798162, "grad_norm": 7.0625, "learning_rate": 9.690676967650144e-06, "loss": 1.06343517, "memory(GiB)": 141.16, "step": 28060, "train_speed(iter/s)": 0.293217 }, { "acc": 0.72929082, "epoch": 0.3140733201257747, "grad_norm": 7.5, "learning_rate": 9.690036250323608e-06, "loss": 1.08177528, "memory(GiB)": 141.16, "step": 28080, "train_speed(iter/s)": 0.293287 }, { "acc": 0.70666475, "epoch": 0.31429701907173324, "grad_norm": 7.3125, "learning_rate": 9.68939489133308e-06, "loss": 1.18647079, "memory(GiB)": 141.16, "step": 28100, "train_speed(iter/s)": 0.29336 }, { "acc": 0.74299526, "epoch": 0.31452071801769177, "grad_norm": 7.34375, "learning_rate": 9.688752890766302e-06, "loss": 1.02273645, "memory(GiB)": 141.16, "step": 28120, "train_speed(iter/s)": 0.293433 }, { "acc": 0.74002113, "epoch": 0.3147444169636503, "grad_norm": 6.46875, "learning_rate": 9.688110248711112e-06, "loss": 1.02174549, "memory(GiB)": 141.16, "step": 28140, "train_speed(iter/s)": 0.293499 }, { "acc": 0.73301973, "epoch": 0.3149681159096088, "grad_norm": 5.59375, "learning_rate": 9.687466965255432e-06, "loss": 1.07590132, "memory(GiB)": 141.16, "step": 28160, "train_speed(iter/s)": 0.293566 }, { "acc": 0.73428936, "epoch": 0.31519181485556735, "grad_norm": 6.6875, "learning_rate": 9.68682304048727e-06, "loss": 1.08113556, "memory(GiB)": 141.16, "step": 28180, "train_speed(iter/s)": 0.293624 }, { "acc": 0.71704259, "epoch": 0.3154155138015259, "grad_norm": 6.5, "learning_rate": 9.686178474494727e-06, "loss": 1.14192591, "memory(GiB)": 141.16, "step": 28200, "train_speed(iter/s)": 0.293682 }, { "acc": 0.73281889, "epoch": 0.3156392127474844, "grad_norm": 6.3125, "learning_rate": 9.685533267365988e-06, "loss": 1.06955242, "memory(GiB)": 141.16, "step": 28220, "train_speed(iter/s)": 0.29375 }, { "acc": 0.73517027, "epoch": 0.31586291169344294, "grad_norm": 6.25, "learning_rate": 9.684887419189327e-06, "loss": 1.05187025, "memory(GiB)": 141.16, "step": 28240, "train_speed(iter/s)": 0.293816 }, { "acc": 0.73722229, "epoch": 0.31608661063940147, "grad_norm": 6.96875, "learning_rate": 9.684240930053102e-06, "loss": 1.05606441, "memory(GiB)": 141.16, "step": 28260, "train_speed(iter/s)": 0.293885 }, { "acc": 0.72311106, "epoch": 0.31631030958536005, "grad_norm": 7.15625, "learning_rate": 9.683593800045765e-06, "loss": 1.11133785, "memory(GiB)": 141.16, "step": 28280, "train_speed(iter/s)": 0.293954 }, { "acc": 0.72249832, "epoch": 0.3165340085313186, "grad_norm": 7.40625, "learning_rate": 9.682946029255855e-06, "loss": 1.12322826, "memory(GiB)": 141.16, "step": 28300, "train_speed(iter/s)": 0.294026 }, { "acc": 0.73757811, "epoch": 0.3167577074772771, "grad_norm": 6.59375, "learning_rate": 9.682297617771992e-06, "loss": 1.06041489, "memory(GiB)": 141.16, "step": 28320, "train_speed(iter/s)": 0.294094 }, { "acc": 0.7231019, "epoch": 0.31698140642323563, "grad_norm": 6.9375, "learning_rate": 9.681648565682889e-06, "loss": 1.12996359, "memory(GiB)": 141.16, "step": 28340, "train_speed(iter/s)": 0.294165 }, { "acc": 0.71946611, "epoch": 0.31720510536919416, "grad_norm": 7.5625, "learning_rate": 9.680998873077346e-06, "loss": 1.12027102, "memory(GiB)": 141.16, "step": 28360, "train_speed(iter/s)": 0.29424 }, { "acc": 0.72450228, "epoch": 0.3174288043151527, "grad_norm": 8.0625, "learning_rate": 9.680348540044249e-06, "loss": 1.09655266, "memory(GiB)": 141.16, "step": 28380, "train_speed(iter/s)": 0.294318 }, { "acc": 0.73949013, "epoch": 0.3176525032611112, "grad_norm": 7.15625, "learning_rate": 9.679697566672577e-06, "loss": 1.04081306, "memory(GiB)": 141.16, "step": 28400, "train_speed(iter/s)": 0.294394 }, { "acc": 0.72839098, "epoch": 0.31787620220706975, "grad_norm": 8.875, "learning_rate": 9.679045953051387e-06, "loss": 1.07520714, "memory(GiB)": 141.16, "step": 28420, "train_speed(iter/s)": 0.294461 }, { "acc": 0.71799383, "epoch": 0.3180999011530283, "grad_norm": 6.21875, "learning_rate": 9.678393699269833e-06, "loss": 1.13455038, "memory(GiB)": 141.16, "step": 28440, "train_speed(iter/s)": 0.294529 }, { "acc": 0.73088207, "epoch": 0.3183236000989868, "grad_norm": 7.375, "learning_rate": 9.677740805417151e-06, "loss": 1.08313179, "memory(GiB)": 141.16, "step": 28460, "train_speed(iter/s)": 0.294592 }, { "acc": 0.72614222, "epoch": 0.31854729904494533, "grad_norm": 7.375, "learning_rate": 9.677087271582666e-06, "loss": 1.10795422, "memory(GiB)": 141.16, "step": 28480, "train_speed(iter/s)": 0.294651 }, { "acc": 0.72271662, "epoch": 0.31877099799090386, "grad_norm": 6.40625, "learning_rate": 9.676433097855793e-06, "loss": 1.12701206, "memory(GiB)": 141.16, "step": 28500, "train_speed(iter/s)": 0.294706 }, { "acc": 0.72911377, "epoch": 0.3189946969368624, "grad_norm": 8.75, "learning_rate": 9.675778284326029e-06, "loss": 1.08166075, "memory(GiB)": 141.16, "step": 28520, "train_speed(iter/s)": 0.294775 }, { "acc": 0.73550987, "epoch": 0.3192183958828209, "grad_norm": 7.625, "learning_rate": 9.675122831082963e-06, "loss": 1.05796432, "memory(GiB)": 141.16, "step": 28540, "train_speed(iter/s)": 0.29484 }, { "acc": 0.73652391, "epoch": 0.31944209482877944, "grad_norm": 8.75, "learning_rate": 9.674466738216273e-06, "loss": 1.05947008, "memory(GiB)": 141.16, "step": 28560, "train_speed(iter/s)": 0.294913 }, { "acc": 0.72247758, "epoch": 0.31966579377473797, "grad_norm": 6.9375, "learning_rate": 9.673810005815718e-06, "loss": 1.13740711, "memory(GiB)": 141.16, "step": 28580, "train_speed(iter/s)": 0.294967 }, { "acc": 0.73274064, "epoch": 0.3198894927206965, "grad_norm": 8.3125, "learning_rate": 9.67315263397115e-06, "loss": 1.0658947, "memory(GiB)": 141.16, "step": 28600, "train_speed(iter/s)": 0.29503 }, { "acc": 0.73768978, "epoch": 0.32011319166665503, "grad_norm": 7.125, "learning_rate": 9.672494622772509e-06, "loss": 1.04998083, "memory(GiB)": 141.16, "step": 28620, "train_speed(iter/s)": 0.295094 }, { "acc": 0.7238029, "epoch": 0.32033689061261356, "grad_norm": 6.9375, "learning_rate": 9.671835972309815e-06, "loss": 1.12340374, "memory(GiB)": 141.16, "step": 28640, "train_speed(iter/s)": 0.295156 }, { "acc": 0.71511078, "epoch": 0.3205605895585721, "grad_norm": 8.75, "learning_rate": 9.671176682673186e-06, "loss": 1.14217358, "memory(GiB)": 141.16, "step": 28660, "train_speed(iter/s)": 0.295223 }, { "acc": 0.73408937, "epoch": 0.3207842885045306, "grad_norm": 6.96875, "learning_rate": 9.67051675395282e-06, "loss": 1.0640007, "memory(GiB)": 141.16, "step": 28680, "train_speed(iter/s)": 0.29529 }, { "acc": 0.72243671, "epoch": 0.32100798745048914, "grad_norm": 8.9375, "learning_rate": 9.669856186239004e-06, "loss": 1.10928164, "memory(GiB)": 141.16, "step": 28700, "train_speed(iter/s)": 0.295356 }, { "acc": 0.73480349, "epoch": 0.32123168639644767, "grad_norm": 5.375, "learning_rate": 9.669194979622117e-06, "loss": 1.07506037, "memory(GiB)": 141.16, "step": 28720, "train_speed(iter/s)": 0.295416 }, { "acc": 0.7311285, "epoch": 0.3214553853424062, "grad_norm": 5.21875, "learning_rate": 9.668533134192615e-06, "loss": 1.07557001, "memory(GiB)": 141.16, "step": 28740, "train_speed(iter/s)": 0.295489 }, { "acc": 0.71713009, "epoch": 0.3216790842883647, "grad_norm": 5.71875, "learning_rate": 9.667870650041053e-06, "loss": 1.13246899, "memory(GiB)": 141.16, "step": 28760, "train_speed(iter/s)": 0.295565 }, { "acc": 0.71489992, "epoch": 0.32190278323432325, "grad_norm": 8.125, "learning_rate": 9.667207527258067e-06, "loss": 1.15788546, "memory(GiB)": 141.16, "step": 28780, "train_speed(iter/s)": 0.29562 }, { "acc": 0.7297122, "epoch": 0.3221264821802818, "grad_norm": 7.0, "learning_rate": 9.666543765934381e-06, "loss": 1.09577274, "memory(GiB)": 141.16, "step": 28800, "train_speed(iter/s)": 0.295685 }, { "acc": 0.71449656, "epoch": 0.3223501811262403, "grad_norm": 6.25, "learning_rate": 9.66587936616081e-06, "loss": 1.15723038, "memory(GiB)": 141.16, "step": 28820, "train_speed(iter/s)": 0.295753 }, { "acc": 0.72081227, "epoch": 0.32257388007219884, "grad_norm": 4.65625, "learning_rate": 9.665214328028249e-06, "loss": 1.12801609, "memory(GiB)": 141.16, "step": 28840, "train_speed(iter/s)": 0.295818 }, { "acc": 0.71192465, "epoch": 0.32279757901815737, "grad_norm": 6.5, "learning_rate": 9.664548651627686e-06, "loss": 1.1540576, "memory(GiB)": 141.16, "step": 28860, "train_speed(iter/s)": 0.295878 }, { "acc": 0.72396679, "epoch": 0.3230212779641159, "grad_norm": 7.625, "learning_rate": 9.663882337050197e-06, "loss": 1.10534039, "memory(GiB)": 141.16, "step": 28880, "train_speed(iter/s)": 0.295948 }, { "acc": 0.7259963, "epoch": 0.3232449769100744, "grad_norm": 8.0, "learning_rate": 9.663215384386942e-06, "loss": 1.09861927, "memory(GiB)": 141.16, "step": 28900, "train_speed(iter/s)": 0.296012 }, { "acc": 0.73095894, "epoch": 0.32346867585603295, "grad_norm": 7.21875, "learning_rate": 9.662547793729169e-06, "loss": 1.08483849, "memory(GiB)": 141.16, "step": 28920, "train_speed(iter/s)": 0.296076 }, { "acc": 0.73152609, "epoch": 0.3236923748019915, "grad_norm": 8.5, "learning_rate": 9.661879565168213e-06, "loss": 1.07423954, "memory(GiB)": 141.16, "step": 28940, "train_speed(iter/s)": 0.296144 }, { "acc": 0.72060452, "epoch": 0.32391607374795, "grad_norm": 7.96875, "learning_rate": 9.661210698795502e-06, "loss": 1.12766457, "memory(GiB)": 141.16, "step": 28960, "train_speed(iter/s)": 0.296214 }, { "acc": 0.73405762, "epoch": 0.32413977269390853, "grad_norm": 6.625, "learning_rate": 9.660541194702541e-06, "loss": 1.07121553, "memory(GiB)": 141.16, "step": 28980, "train_speed(iter/s)": 0.296286 }, { "acc": 0.73265028, "epoch": 0.32436347163986706, "grad_norm": 6.96875, "learning_rate": 9.659871052980931e-06, "loss": 1.07605782, "memory(GiB)": 141.16, "step": 29000, "train_speed(iter/s)": 0.296355 }, { "acc": 0.72828808, "epoch": 0.3245871705858256, "grad_norm": 6.25, "learning_rate": 9.659200273722358e-06, "loss": 1.08726254, "memory(GiB)": 141.16, "step": 29020, "train_speed(iter/s)": 0.29642 }, { "acc": 0.73092513, "epoch": 0.3248108695317841, "grad_norm": 7.21875, "learning_rate": 9.65852885701859e-06, "loss": 1.08286133, "memory(GiB)": 141.16, "step": 29040, "train_speed(iter/s)": 0.296484 }, { "acc": 0.72821717, "epoch": 0.32503456847774265, "grad_norm": 9.8125, "learning_rate": 9.657856802961488e-06, "loss": 1.08268681, "memory(GiB)": 141.16, "step": 29060, "train_speed(iter/s)": 0.29655 }, { "acc": 0.73310528, "epoch": 0.3252582674237012, "grad_norm": 8.0, "learning_rate": 9.657184111643e-06, "loss": 1.08139229, "memory(GiB)": 141.16, "step": 29080, "train_speed(iter/s)": 0.296623 }, { "acc": 0.73436832, "epoch": 0.3254819663696597, "grad_norm": 7.03125, "learning_rate": 9.656510783155159e-06, "loss": 1.06082392, "memory(GiB)": 141.16, "step": 29100, "train_speed(iter/s)": 0.296677 }, { "acc": 0.72960005, "epoch": 0.32570566531561823, "grad_norm": 6.8125, "learning_rate": 9.655836817590087e-06, "loss": 1.07722683, "memory(GiB)": 141.16, "step": 29120, "train_speed(iter/s)": 0.296744 }, { "acc": 0.7155117, "epoch": 0.32592936426157676, "grad_norm": 7.21875, "learning_rate": 9.655162215039991e-06, "loss": 1.16973038, "memory(GiB)": 141.16, "step": 29140, "train_speed(iter/s)": 0.296809 }, { "acc": 0.7159729, "epoch": 0.3261530632075353, "grad_norm": 6.75, "learning_rate": 9.654486975597165e-06, "loss": 1.13857355, "memory(GiB)": 141.16, "step": 29160, "train_speed(iter/s)": 0.296877 }, { "acc": 0.72132215, "epoch": 0.3263767621534938, "grad_norm": 8.125, "learning_rate": 9.653811099353994e-06, "loss": 1.11507702, "memory(GiB)": 141.16, "step": 29180, "train_speed(iter/s)": 0.296946 }, { "acc": 0.7123147, "epoch": 0.32660046109945234, "grad_norm": 6.78125, "learning_rate": 9.653134586402946e-06, "loss": 1.17384701, "memory(GiB)": 141.16, "step": 29200, "train_speed(iter/s)": 0.297015 }, { "acc": 0.73246632, "epoch": 0.32682416004541087, "grad_norm": 6.84375, "learning_rate": 9.652457436836577e-06, "loss": 1.06310349, "memory(GiB)": 141.16, "step": 29220, "train_speed(iter/s)": 0.297086 }, { "acc": 0.72293243, "epoch": 0.3270478589913694, "grad_norm": 6.96875, "learning_rate": 9.651779650747533e-06, "loss": 1.12059574, "memory(GiB)": 141.16, "step": 29240, "train_speed(iter/s)": 0.297158 }, { "acc": 0.73638496, "epoch": 0.32727155793732793, "grad_norm": 6.21875, "learning_rate": 9.651101228228543e-06, "loss": 1.06431942, "memory(GiB)": 141.16, "step": 29260, "train_speed(iter/s)": 0.297229 }, { "acc": 0.72553477, "epoch": 0.32749525688328646, "grad_norm": 6.3125, "learning_rate": 9.650422169372427e-06, "loss": 1.08509426, "memory(GiB)": 141.16, "step": 29280, "train_speed(iter/s)": 0.297296 }, { "acc": 0.72913208, "epoch": 0.327718955829245, "grad_norm": 7.84375, "learning_rate": 9.649742474272085e-06, "loss": 1.08764801, "memory(GiB)": 141.16, "step": 29300, "train_speed(iter/s)": 0.297368 }, { "acc": 0.74310246, "epoch": 0.3279426547752035, "grad_norm": 7.3125, "learning_rate": 9.649062143020515e-06, "loss": 1.0290494, "memory(GiB)": 141.16, "step": 29320, "train_speed(iter/s)": 0.297438 }, { "acc": 0.7304255, "epoch": 0.32816635372116204, "grad_norm": 7.625, "learning_rate": 9.648381175710792e-06, "loss": 1.07859859, "memory(GiB)": 141.16, "step": 29340, "train_speed(iter/s)": 0.297504 }, { "acc": 0.72952304, "epoch": 0.32839005266712057, "grad_norm": 7.125, "learning_rate": 9.647699572436085e-06, "loss": 1.09465981, "memory(GiB)": 141.16, "step": 29360, "train_speed(iter/s)": 0.297571 }, { "acc": 0.72754712, "epoch": 0.3286137516130791, "grad_norm": 6.71875, "learning_rate": 9.647017333289646e-06, "loss": 1.08712749, "memory(GiB)": 141.16, "step": 29380, "train_speed(iter/s)": 0.297639 }, { "acc": 0.73106241, "epoch": 0.3288374505590376, "grad_norm": 7.09375, "learning_rate": 9.646334458364813e-06, "loss": 1.08102417, "memory(GiB)": 141.16, "step": 29400, "train_speed(iter/s)": 0.297712 }, { "acc": 0.72631335, "epoch": 0.32906114950499615, "grad_norm": 6.375, "learning_rate": 9.645650947755014e-06, "loss": 1.08721085, "memory(GiB)": 141.16, "step": 29420, "train_speed(iter/s)": 0.297778 }, { "acc": 0.73967819, "epoch": 0.3292848484509547, "grad_norm": 7.28125, "learning_rate": 9.644966801553765e-06, "loss": 1.04736023, "memory(GiB)": 141.16, "step": 29440, "train_speed(iter/s)": 0.297845 }, { "acc": 0.71231632, "epoch": 0.3295085473969132, "grad_norm": 6.59375, "learning_rate": 9.644282019854665e-06, "loss": 1.1667099, "memory(GiB)": 141.16, "step": 29460, "train_speed(iter/s)": 0.297917 }, { "acc": 0.72458162, "epoch": 0.32973224634287174, "grad_norm": 10.0, "learning_rate": 9.643596602751404e-06, "loss": 1.11211319, "memory(GiB)": 141.16, "step": 29480, "train_speed(iter/s)": 0.297987 }, { "acc": 0.71608591, "epoch": 0.32995594528883027, "grad_norm": 8.25, "learning_rate": 9.642910550337754e-06, "loss": 1.11943502, "memory(GiB)": 141.16, "step": 29500, "train_speed(iter/s)": 0.298058 }, { "acc": 0.71706285, "epoch": 0.3301796442347888, "grad_norm": 8.125, "learning_rate": 9.642223862707578e-06, "loss": 1.14708996, "memory(GiB)": 141.16, "step": 29520, "train_speed(iter/s)": 0.298117 }, { "acc": 0.73790927, "epoch": 0.3304033431807474, "grad_norm": 5.8125, "learning_rate": 9.641536539954826e-06, "loss": 1.05765591, "memory(GiB)": 141.16, "step": 29540, "train_speed(iter/s)": 0.298181 }, { "acc": 0.73814869, "epoch": 0.3306270421267059, "grad_norm": 6.6875, "learning_rate": 9.640848582173533e-06, "loss": 1.04640017, "memory(GiB)": 141.16, "step": 29560, "train_speed(iter/s)": 0.298246 }, { "acc": 0.71627598, "epoch": 0.33085074107266443, "grad_norm": 6.3125, "learning_rate": 9.64015998945782e-06, "loss": 1.14773655, "memory(GiB)": 141.16, "step": 29580, "train_speed(iter/s)": 0.298306 }, { "acc": 0.72882957, "epoch": 0.33107444001862296, "grad_norm": 7.375, "learning_rate": 9.639470761901897e-06, "loss": 1.08544331, "memory(GiB)": 141.16, "step": 29600, "train_speed(iter/s)": 0.298371 }, { "acc": 0.73317118, "epoch": 0.3312981389645815, "grad_norm": 6.84375, "learning_rate": 9.63878089960006e-06, "loss": 1.07906532, "memory(GiB)": 141.16, "step": 29620, "train_speed(iter/s)": 0.298434 }, { "acc": 0.72472906, "epoch": 0.33152183791054, "grad_norm": 6.78125, "learning_rate": 9.638090402646694e-06, "loss": 1.10668831, "memory(GiB)": 141.16, "step": 29640, "train_speed(iter/s)": 0.298497 }, { "acc": 0.73699508, "epoch": 0.33174553685649855, "grad_norm": 7.4375, "learning_rate": 9.637399271136267e-06, "loss": 1.04495192, "memory(GiB)": 141.16, "step": 29660, "train_speed(iter/s)": 0.29856 }, { "acc": 0.73179178, "epoch": 0.3319692358024571, "grad_norm": 7.4375, "learning_rate": 9.636707505163334e-06, "loss": 1.08185558, "memory(GiB)": 141.16, "step": 29680, "train_speed(iter/s)": 0.298625 }, { "acc": 0.73301239, "epoch": 0.3321929347484156, "grad_norm": 6.96875, "learning_rate": 9.636015104822543e-06, "loss": 1.09034061, "memory(GiB)": 141.16, "step": 29700, "train_speed(iter/s)": 0.298681 }, { "acc": 0.72918792, "epoch": 0.33241663369437413, "grad_norm": 6.03125, "learning_rate": 9.63532207020862e-06, "loss": 1.07587261, "memory(GiB)": 141.16, "step": 29720, "train_speed(iter/s)": 0.298742 }, { "acc": 0.7210969, "epoch": 0.33264033264033266, "grad_norm": 7.59375, "learning_rate": 9.634628401416385e-06, "loss": 1.13627377, "memory(GiB)": 141.16, "step": 29740, "train_speed(iter/s)": 0.298799 }, { "acc": 0.72695789, "epoch": 0.3328640315862912, "grad_norm": 7.9375, "learning_rate": 9.63393409854074e-06, "loss": 1.11120396, "memory(GiB)": 141.16, "step": 29760, "train_speed(iter/s)": 0.298869 }, { "acc": 0.72205877, "epoch": 0.3330877305322497, "grad_norm": 8.75, "learning_rate": 9.633239161676678e-06, "loss": 1.10980034, "memory(GiB)": 141.16, "step": 29780, "train_speed(iter/s)": 0.298934 }, { "acc": 0.71643667, "epoch": 0.33331142947820824, "grad_norm": 5.75, "learning_rate": 9.632543590919272e-06, "loss": 1.14636478, "memory(GiB)": 141.16, "step": 29800, "train_speed(iter/s)": 0.298995 }, { "acc": 0.72766905, "epoch": 0.33353512842416677, "grad_norm": 7.1875, "learning_rate": 9.63184738636369e-06, "loss": 1.10175133, "memory(GiB)": 141.16, "step": 29820, "train_speed(iter/s)": 0.29906 }, { "acc": 0.73438158, "epoch": 0.3337588273701253, "grad_norm": 9.9375, "learning_rate": 9.63115054810518e-06, "loss": 1.07016582, "memory(GiB)": 141.16, "step": 29840, "train_speed(iter/s)": 0.299119 }, { "acc": 0.73522701, "epoch": 0.3339825263160838, "grad_norm": 6.3125, "learning_rate": 9.63045307623908e-06, "loss": 1.05966663, "memory(GiB)": 141.16, "step": 29860, "train_speed(iter/s)": 0.299181 }, { "acc": 0.72763805, "epoch": 0.33420622526204236, "grad_norm": 5.625, "learning_rate": 9.629754970860815e-06, "loss": 1.07602863, "memory(GiB)": 141.16, "step": 29880, "train_speed(iter/s)": 0.299243 }, { "acc": 0.71319618, "epoch": 0.3344299242080009, "grad_norm": 8.5, "learning_rate": 9.629056232065896e-06, "loss": 1.15483265, "memory(GiB)": 141.16, "step": 29900, "train_speed(iter/s)": 0.299301 }, { "acc": 0.73507419, "epoch": 0.3346536231539594, "grad_norm": 7.75, "learning_rate": 9.62835685994992e-06, "loss": 1.05198421, "memory(GiB)": 141.16, "step": 29920, "train_speed(iter/s)": 0.299364 }, { "acc": 0.71924343, "epoch": 0.33487732209991794, "grad_norm": 5.5625, "learning_rate": 9.627656854608572e-06, "loss": 1.12761898, "memory(GiB)": 141.16, "step": 29940, "train_speed(iter/s)": 0.299432 }, { "acc": 0.71498709, "epoch": 0.33510102104587647, "grad_norm": 7.96875, "learning_rate": 9.626956216137622e-06, "loss": 1.1374424, "memory(GiB)": 141.16, "step": 29960, "train_speed(iter/s)": 0.299492 }, { "acc": 0.72988081, "epoch": 0.335324719991835, "grad_norm": 7.09375, "learning_rate": 9.626254944632927e-06, "loss": 1.07317257, "memory(GiB)": 141.16, "step": 29980, "train_speed(iter/s)": 0.299549 }, { "acc": 0.73658128, "epoch": 0.3355484189377935, "grad_norm": 5.59375, "learning_rate": 9.625553040190429e-06, "loss": 1.05723743, "memory(GiB)": 141.16, "step": 30000, "train_speed(iter/s)": 0.299611 }, { "epoch": 0.3355484189377935, "eval_acc": 0.6865739869833168, "eval_loss": 1.0937881469726562, "eval_runtime": 2320.623, "eval_samples_per_second": 32.441, "eval_steps_per_second": 16.221, "step": 30000 }, { "acc": 0.73738213, "epoch": 0.33577211788375205, "grad_norm": 9.9375, "learning_rate": 9.624850502906163e-06, "loss": 1.05219154, "memory(GiB)": 141.16, "step": 30020, "train_speed(iter/s)": 0.292753 }, { "acc": 0.74377127, "epoch": 0.3359958168297106, "grad_norm": 8.4375, "learning_rate": 9.624147332876244e-06, "loss": 1.03255348, "memory(GiB)": 141.16, "step": 30040, "train_speed(iter/s)": 0.292815 }, { "acc": 0.73496618, "epoch": 0.3362195157756691, "grad_norm": 6.71875, "learning_rate": 9.623443530196874e-06, "loss": 1.06172962, "memory(GiB)": 141.16, "step": 30060, "train_speed(iter/s)": 0.292882 }, { "acc": 0.71496692, "epoch": 0.33644321472162764, "grad_norm": 8.6875, "learning_rate": 9.622739094964347e-06, "loss": 1.14690218, "memory(GiB)": 141.16, "step": 30080, "train_speed(iter/s)": 0.292953 }, { "acc": 0.71876955, "epoch": 0.33666691366758617, "grad_norm": 5.6875, "learning_rate": 9.622034027275035e-06, "loss": 1.14214573, "memory(GiB)": 141.16, "step": 30100, "train_speed(iter/s)": 0.293022 }, { "acc": 0.72637043, "epoch": 0.3368906126135447, "grad_norm": 6.84375, "learning_rate": 9.621328327225406e-06, "loss": 1.11122904, "memory(GiB)": 141.16, "step": 30120, "train_speed(iter/s)": 0.293082 }, { "acc": 0.72921095, "epoch": 0.3371143115595032, "grad_norm": 8.6875, "learning_rate": 9.620621994912004e-06, "loss": 1.09046326, "memory(GiB)": 141.16, "step": 30140, "train_speed(iter/s)": 0.293148 }, { "acc": 0.72962246, "epoch": 0.33733801050546175, "grad_norm": 9.4375, "learning_rate": 9.619915030431475e-06, "loss": 1.08166714, "memory(GiB)": 141.16, "step": 30160, "train_speed(iter/s)": 0.293206 }, { "acc": 0.7349123, "epoch": 0.3375617094514203, "grad_norm": 6.0625, "learning_rate": 9.619207433880532e-06, "loss": 1.04926586, "memory(GiB)": 141.16, "step": 30180, "train_speed(iter/s)": 0.293265 }, { "acc": 0.73009491, "epoch": 0.3377854083973788, "grad_norm": 7.28125, "learning_rate": 9.61849920535599e-06, "loss": 1.08394623, "memory(GiB)": 141.16, "step": 30200, "train_speed(iter/s)": 0.293333 }, { "acc": 0.73376417, "epoch": 0.33800910734333733, "grad_norm": 7.0, "learning_rate": 9.617790344954743e-06, "loss": 1.06365852, "memory(GiB)": 141.16, "step": 30220, "train_speed(iter/s)": 0.293399 }, { "acc": 0.73010125, "epoch": 0.33823280628929586, "grad_norm": 6.5625, "learning_rate": 9.617080852773772e-06, "loss": 1.0985981, "memory(GiB)": 141.16, "step": 30240, "train_speed(iter/s)": 0.293466 }, { "acc": 0.73730507, "epoch": 0.3384565052352544, "grad_norm": 8.875, "learning_rate": 9.61637072891015e-06, "loss": 1.0437645, "memory(GiB)": 141.16, "step": 30260, "train_speed(iter/s)": 0.293525 }, { "acc": 0.74031343, "epoch": 0.3386802041812129, "grad_norm": 7.3125, "learning_rate": 9.615659973461027e-06, "loss": 1.03765907, "memory(GiB)": 141.16, "step": 30280, "train_speed(iter/s)": 0.293586 }, { "acc": 0.72598372, "epoch": 0.33890390312717145, "grad_norm": 7.0625, "learning_rate": 9.614948586523646e-06, "loss": 1.10707722, "memory(GiB)": 141.16, "step": 30300, "train_speed(iter/s)": 0.293655 }, { "acc": 0.72216072, "epoch": 0.33912760207313, "grad_norm": 7.875, "learning_rate": 9.614236568195336e-06, "loss": 1.11897011, "memory(GiB)": 141.16, "step": 30320, "train_speed(iter/s)": 0.293719 }, { "acc": 0.73011513, "epoch": 0.3393513010190885, "grad_norm": 6.78125, "learning_rate": 9.613523918573513e-06, "loss": 1.08435612, "memory(GiB)": 141.16, "step": 30340, "train_speed(iter/s)": 0.293787 }, { "acc": 0.72863617, "epoch": 0.33957499996504703, "grad_norm": 5.90625, "learning_rate": 9.612810637755671e-06, "loss": 1.10077019, "memory(GiB)": 141.16, "step": 30360, "train_speed(iter/s)": 0.293849 }, { "acc": 0.71975899, "epoch": 0.33979869891100556, "grad_norm": 7.59375, "learning_rate": 9.612096725839407e-06, "loss": 1.13104115, "memory(GiB)": 141.16, "step": 30380, "train_speed(iter/s)": 0.293914 }, { "acc": 0.73299656, "epoch": 0.3400223978569641, "grad_norm": 5.71875, "learning_rate": 9.611382182922386e-06, "loss": 1.06585875, "memory(GiB)": 141.16, "step": 30400, "train_speed(iter/s)": 0.293985 }, { "acc": 0.73647275, "epoch": 0.3402460968029226, "grad_norm": 6.96875, "learning_rate": 9.610667009102371e-06, "loss": 1.06249409, "memory(GiB)": 141.16, "step": 30420, "train_speed(iter/s)": 0.294044 }, { "acc": 0.73984127, "epoch": 0.34046979574888114, "grad_norm": 10.1875, "learning_rate": 9.609951204477206e-06, "loss": 1.04006634, "memory(GiB)": 141.16, "step": 30440, "train_speed(iter/s)": 0.294107 }, { "acc": 0.72576885, "epoch": 0.34069349469483967, "grad_norm": 7.78125, "learning_rate": 9.609234769144826e-06, "loss": 1.11623287, "memory(GiB)": 141.16, "step": 30460, "train_speed(iter/s)": 0.294162 }, { "acc": 0.73048143, "epoch": 0.3409171936407982, "grad_norm": 7.0625, "learning_rate": 9.608517703203249e-06, "loss": 1.05153856, "memory(GiB)": 141.16, "step": 30480, "train_speed(iter/s)": 0.294221 }, { "acc": 0.73586264, "epoch": 0.3411408925867567, "grad_norm": 9.0625, "learning_rate": 9.607800006750578e-06, "loss": 1.0746418, "memory(GiB)": 141.16, "step": 30500, "train_speed(iter/s)": 0.294289 }, { "acc": 0.71840382, "epoch": 0.34136459153271526, "grad_norm": 7.46875, "learning_rate": 9.607081679885006e-06, "loss": 1.1428134, "memory(GiB)": 141.16, "step": 30520, "train_speed(iter/s)": 0.294351 }, { "acc": 0.72971582, "epoch": 0.3415882904786738, "grad_norm": 7.0, "learning_rate": 9.60636272270481e-06, "loss": 1.08723507, "memory(GiB)": 141.16, "step": 30540, "train_speed(iter/s)": 0.294419 }, { "acc": 0.7534934, "epoch": 0.3418119894246323, "grad_norm": 5.625, "learning_rate": 9.605643135308354e-06, "loss": 0.96438217, "memory(GiB)": 141.16, "step": 30560, "train_speed(iter/s)": 0.294475 }, { "acc": 0.73698864, "epoch": 0.34203568837059084, "grad_norm": 8.6875, "learning_rate": 9.604922917794087e-06, "loss": 1.05172539, "memory(GiB)": 141.16, "step": 30580, "train_speed(iter/s)": 0.294536 }, { "acc": 0.72648687, "epoch": 0.34225938731654937, "grad_norm": 6.46875, "learning_rate": 9.604202070260545e-06, "loss": 1.1117794, "memory(GiB)": 141.16, "step": 30600, "train_speed(iter/s)": 0.294591 }, { "acc": 0.72156296, "epoch": 0.3424830862625079, "grad_norm": 7.28125, "learning_rate": 9.603480592806351e-06, "loss": 1.12336845, "memory(GiB)": 141.16, "step": 30620, "train_speed(iter/s)": 0.294659 }, { "acc": 0.71741905, "epoch": 0.3427067852084664, "grad_norm": 5.90625, "learning_rate": 9.602758485530213e-06, "loss": 1.14872103, "memory(GiB)": 141.16, "step": 30640, "train_speed(iter/s)": 0.294716 }, { "acc": 0.73463793, "epoch": 0.34293048415442495, "grad_norm": 7.53125, "learning_rate": 9.602035748530925e-06, "loss": 1.05734215, "memory(GiB)": 141.16, "step": 30660, "train_speed(iter/s)": 0.294774 }, { "acc": 0.73274827, "epoch": 0.3431541831003835, "grad_norm": 6.5, "learning_rate": 9.601312381907368e-06, "loss": 1.07569027, "memory(GiB)": 141.16, "step": 30680, "train_speed(iter/s)": 0.294829 }, { "acc": 0.72996411, "epoch": 0.343377882046342, "grad_norm": 6.96875, "learning_rate": 9.600588385758511e-06, "loss": 1.072859, "memory(GiB)": 141.16, "step": 30700, "train_speed(iter/s)": 0.294895 }, { "acc": 0.7378109, "epoch": 0.34360158099230054, "grad_norm": 7.375, "learning_rate": 9.599863760183403e-06, "loss": 1.05569134, "memory(GiB)": 141.16, "step": 30720, "train_speed(iter/s)": 0.294959 }, { "acc": 0.73000026, "epoch": 0.34382527993825907, "grad_norm": 5.46875, "learning_rate": 9.599138505281187e-06, "loss": 1.0991641, "memory(GiB)": 141.16, "step": 30740, "train_speed(iter/s)": 0.295021 }, { "acc": 0.71198664, "epoch": 0.3440489788842176, "grad_norm": 6.4375, "learning_rate": 9.598412621151087e-06, "loss": 1.17362652, "memory(GiB)": 141.16, "step": 30760, "train_speed(iter/s)": 0.295083 }, { "acc": 0.72443991, "epoch": 0.3442726778301761, "grad_norm": 6.96875, "learning_rate": 9.597686107892412e-06, "loss": 1.11801033, "memory(GiB)": 141.16, "step": 30780, "train_speed(iter/s)": 0.295153 }, { "acc": 0.73642631, "epoch": 0.3444963767761347, "grad_norm": 7.0, "learning_rate": 9.596958965604563e-06, "loss": 1.04270153, "memory(GiB)": 141.16, "step": 30800, "train_speed(iter/s)": 0.295217 }, { "acc": 0.71574378, "epoch": 0.34472007572209323, "grad_norm": 6.875, "learning_rate": 9.596231194387022e-06, "loss": 1.16943474, "memory(GiB)": 141.16, "step": 30820, "train_speed(iter/s)": 0.295273 }, { "acc": 0.7176609, "epoch": 0.34494377466805176, "grad_norm": 6.25, "learning_rate": 9.595502794339358e-06, "loss": 1.13499298, "memory(GiB)": 141.16, "step": 30840, "train_speed(iter/s)": 0.295335 }, { "acc": 0.73022795, "epoch": 0.3451674736140103, "grad_norm": 7.0625, "learning_rate": 9.594773765561227e-06, "loss": 1.08573914, "memory(GiB)": 141.16, "step": 30860, "train_speed(iter/s)": 0.295397 }, { "acc": 0.72952099, "epoch": 0.3453911725599688, "grad_norm": 7.71875, "learning_rate": 9.594044108152369e-06, "loss": 1.0660759, "memory(GiB)": 141.16, "step": 30880, "train_speed(iter/s)": 0.295456 }, { "acc": 0.71984301, "epoch": 0.34561487150592735, "grad_norm": 7.0, "learning_rate": 9.593313822212614e-06, "loss": 1.14117422, "memory(GiB)": 141.16, "step": 30900, "train_speed(iter/s)": 0.295519 }, { "acc": 0.71911144, "epoch": 0.3458385704518859, "grad_norm": 8.0, "learning_rate": 9.592582907841874e-06, "loss": 1.13449287, "memory(GiB)": 141.16, "step": 30920, "train_speed(iter/s)": 0.295588 }, { "acc": 0.72739735, "epoch": 0.3460622693978444, "grad_norm": 7.78125, "learning_rate": 9.59185136514015e-06, "loss": 1.08962593, "memory(GiB)": 141.16, "step": 30940, "train_speed(iter/s)": 0.295651 }, { "acc": 0.7383091, "epoch": 0.34628596834380293, "grad_norm": 8.4375, "learning_rate": 9.591119194207527e-06, "loss": 1.04327068, "memory(GiB)": 141.16, "step": 30960, "train_speed(iter/s)": 0.295707 }, { "acc": 0.72381182, "epoch": 0.34650966728976146, "grad_norm": 6.6875, "learning_rate": 9.590386395144174e-06, "loss": 1.10362463, "memory(GiB)": 141.16, "step": 30980, "train_speed(iter/s)": 0.295763 }, { "acc": 0.71508417, "epoch": 0.34673336623572, "grad_norm": 8.125, "learning_rate": 9.589652968050353e-06, "loss": 1.16369867, "memory(GiB)": 141.16, "step": 31000, "train_speed(iter/s)": 0.295827 }, { "acc": 0.72440557, "epoch": 0.3469570651816785, "grad_norm": 7.90625, "learning_rate": 9.588918913026402e-06, "loss": 1.10074129, "memory(GiB)": 141.16, "step": 31020, "train_speed(iter/s)": 0.295886 }, { "acc": 0.72645941, "epoch": 0.34718076412763704, "grad_norm": 7.4375, "learning_rate": 9.588184230172754e-06, "loss": 1.10937109, "memory(GiB)": 141.16, "step": 31040, "train_speed(iter/s)": 0.295947 }, { "acc": 0.72993507, "epoch": 0.34740446307359557, "grad_norm": 6.6875, "learning_rate": 9.587448919589924e-06, "loss": 1.08901062, "memory(GiB)": 141.16, "step": 31060, "train_speed(iter/s)": 0.296003 }, { "acc": 0.72856288, "epoch": 0.3476281620195541, "grad_norm": 6.65625, "learning_rate": 9.586712981378512e-06, "loss": 1.08929605, "memory(GiB)": 141.16, "step": 31080, "train_speed(iter/s)": 0.29606 }, { "acc": 0.72957778, "epoch": 0.3478518609655126, "grad_norm": 6.6875, "learning_rate": 9.585976415639205e-06, "loss": 1.08696699, "memory(GiB)": 141.16, "step": 31100, "train_speed(iter/s)": 0.296128 }, { "acc": 0.73249068, "epoch": 0.34807555991147116, "grad_norm": 5.6875, "learning_rate": 9.585239222472773e-06, "loss": 1.05093584, "memory(GiB)": 141.16, "step": 31120, "train_speed(iter/s)": 0.296192 }, { "acc": 0.73062315, "epoch": 0.3482992588574297, "grad_norm": 6.4375, "learning_rate": 9.58450140198008e-06, "loss": 1.0661973, "memory(GiB)": 141.16, "step": 31140, "train_speed(iter/s)": 0.296258 }, { "acc": 0.72489662, "epoch": 0.3485229578033882, "grad_norm": 6.25, "learning_rate": 9.583762954262066e-06, "loss": 1.10534544, "memory(GiB)": 141.16, "step": 31160, "train_speed(iter/s)": 0.296312 }, { "acc": 0.72998219, "epoch": 0.34874665674934674, "grad_norm": 8.3125, "learning_rate": 9.583023879419764e-06, "loss": 1.0794529, "memory(GiB)": 141.16, "step": 31180, "train_speed(iter/s)": 0.296373 }, { "acc": 0.72777653, "epoch": 0.34897035569530527, "grad_norm": 8.1875, "learning_rate": 9.582284177554288e-06, "loss": 1.08523464, "memory(GiB)": 141.16, "step": 31200, "train_speed(iter/s)": 0.296436 }, { "acc": 0.73518219, "epoch": 0.3491940546412638, "grad_norm": 8.75, "learning_rate": 9.581543848766841e-06, "loss": 1.06583996, "memory(GiB)": 141.16, "step": 31220, "train_speed(iter/s)": 0.296499 }, { "acc": 0.72166109, "epoch": 0.3494177535872223, "grad_norm": 6.46875, "learning_rate": 9.58080289315871e-06, "loss": 1.12338905, "memory(GiB)": 141.16, "step": 31240, "train_speed(iter/s)": 0.296562 }, { "acc": 0.72999482, "epoch": 0.34964145253318085, "grad_norm": 7.4375, "learning_rate": 9.580061310831268e-06, "loss": 1.06654243, "memory(GiB)": 141.16, "step": 31260, "train_speed(iter/s)": 0.296622 }, { "acc": 0.73660712, "epoch": 0.3498651514791394, "grad_norm": 7.5, "learning_rate": 9.579319101885975e-06, "loss": 1.0459137, "memory(GiB)": 141.16, "step": 31280, "train_speed(iter/s)": 0.296683 }, { "acc": 0.73974495, "epoch": 0.3500888504250979, "grad_norm": 8.3125, "learning_rate": 9.578576266424376e-06, "loss": 1.0452137, "memory(GiB)": 141.16, "step": 31300, "train_speed(iter/s)": 0.29674 }, { "acc": 0.71987314, "epoch": 0.35031254937105644, "grad_norm": 7.3125, "learning_rate": 9.5778328045481e-06, "loss": 1.13556728, "memory(GiB)": 141.16, "step": 31320, "train_speed(iter/s)": 0.296792 }, { "acc": 0.73485928, "epoch": 0.35053624831701496, "grad_norm": 9.0625, "learning_rate": 9.577088716358864e-06, "loss": 1.05163498, "memory(GiB)": 141.16, "step": 31340, "train_speed(iter/s)": 0.296862 }, { "acc": 0.72985258, "epoch": 0.3507599472629735, "grad_norm": 6.40625, "learning_rate": 9.57634400195847e-06, "loss": 1.06118813, "memory(GiB)": 141.16, "step": 31360, "train_speed(iter/s)": 0.296923 }, { "acc": 0.73460779, "epoch": 0.350983646208932, "grad_norm": 8.125, "learning_rate": 9.575598661448804e-06, "loss": 1.06400871, "memory(GiB)": 141.16, "step": 31380, "train_speed(iter/s)": 0.29699 }, { "acc": 0.7308713, "epoch": 0.35120734515489055, "grad_norm": 6.53125, "learning_rate": 9.574852694931843e-06, "loss": 1.09387054, "memory(GiB)": 141.16, "step": 31400, "train_speed(iter/s)": 0.297052 }, { "acc": 0.7243259, "epoch": 0.3514310441008491, "grad_norm": 7.96875, "learning_rate": 9.574106102509643e-06, "loss": 1.09875736, "memory(GiB)": 141.16, "step": 31420, "train_speed(iter/s)": 0.297111 }, { "acc": 0.71787658, "epoch": 0.3516547430468076, "grad_norm": 6.65625, "learning_rate": 9.573358884284349e-06, "loss": 1.12966976, "memory(GiB)": 141.16, "step": 31440, "train_speed(iter/s)": 0.297169 }, { "acc": 0.73081589, "epoch": 0.35187844199276613, "grad_norm": 7.84375, "learning_rate": 9.572611040358191e-06, "loss": 1.05459938, "memory(GiB)": 141.16, "step": 31460, "train_speed(iter/s)": 0.297234 }, { "acc": 0.73520956, "epoch": 0.35210214093872466, "grad_norm": 8.25, "learning_rate": 9.571862570833486e-06, "loss": 1.06876965, "memory(GiB)": 141.16, "step": 31480, "train_speed(iter/s)": 0.297296 }, { "acc": 0.72939949, "epoch": 0.3523258398846832, "grad_norm": 5.1875, "learning_rate": 9.571113475812635e-06, "loss": 1.10312901, "memory(GiB)": 141.16, "step": 31500, "train_speed(iter/s)": 0.297356 }, { "acc": 0.71985278, "epoch": 0.3525495388306417, "grad_norm": 6.53125, "learning_rate": 9.570363755398122e-06, "loss": 1.12881908, "memory(GiB)": 141.16, "step": 31520, "train_speed(iter/s)": 0.297416 }, { "acc": 0.73201609, "epoch": 0.35277323777660025, "grad_norm": 6.75, "learning_rate": 9.569613409692523e-06, "loss": 1.08282509, "memory(GiB)": 141.16, "step": 31540, "train_speed(iter/s)": 0.297477 }, { "acc": 0.72511053, "epoch": 0.3529969367225588, "grad_norm": 8.5625, "learning_rate": 9.568862438798495e-06, "loss": 1.09895248, "memory(GiB)": 141.16, "step": 31560, "train_speed(iter/s)": 0.297535 }, { "acc": 0.72903404, "epoch": 0.3532206356685173, "grad_norm": 7.96875, "learning_rate": 9.568110842818779e-06, "loss": 1.08591175, "memory(GiB)": 141.16, "step": 31580, "train_speed(iter/s)": 0.297586 }, { "acc": 0.72464929, "epoch": 0.35344433461447583, "grad_norm": 7.6875, "learning_rate": 9.567358621856209e-06, "loss": 1.10393162, "memory(GiB)": 141.16, "step": 31600, "train_speed(iter/s)": 0.297641 }, { "acc": 0.72877913, "epoch": 0.35366803356043436, "grad_norm": 8.25, "learning_rate": 9.566605776013695e-06, "loss": 1.09494572, "memory(GiB)": 141.16, "step": 31620, "train_speed(iter/s)": 0.297701 }, { "acc": 0.72667785, "epoch": 0.3538917325063929, "grad_norm": 6.90625, "learning_rate": 9.565852305394239e-06, "loss": 1.08972635, "memory(GiB)": 141.16, "step": 31640, "train_speed(iter/s)": 0.297743 }, { "acc": 0.72056131, "epoch": 0.3541154314523514, "grad_norm": 7.90625, "learning_rate": 9.565098210100928e-06, "loss": 1.12133236, "memory(GiB)": 141.16, "step": 31660, "train_speed(iter/s)": 0.297805 }, { "acc": 0.73074894, "epoch": 0.35433913039830994, "grad_norm": 7.625, "learning_rate": 9.564343490236932e-06, "loss": 1.06958447, "memory(GiB)": 141.16, "step": 31680, "train_speed(iter/s)": 0.29786 }, { "acc": 0.73299379, "epoch": 0.35456282934426847, "grad_norm": 7.0625, "learning_rate": 9.563588145905504e-06, "loss": 1.06525822, "memory(GiB)": 141.16, "step": 31700, "train_speed(iter/s)": 0.297923 }, { "acc": 0.73152943, "epoch": 0.354786528290227, "grad_norm": 7.4375, "learning_rate": 9.562832177209992e-06, "loss": 1.07681913, "memory(GiB)": 141.16, "step": 31720, "train_speed(iter/s)": 0.297984 }, { "acc": 0.72657232, "epoch": 0.3550102272361855, "grad_norm": 8.8125, "learning_rate": 9.562075584253821e-06, "loss": 1.09708977, "memory(GiB)": 141.16, "step": 31740, "train_speed(iter/s)": 0.298041 }, { "acc": 0.71725292, "epoch": 0.35523392618214406, "grad_norm": 6.625, "learning_rate": 9.5613183671405e-06, "loss": 1.14863691, "memory(GiB)": 141.16, "step": 31760, "train_speed(iter/s)": 0.298103 }, { "acc": 0.74352093, "epoch": 0.3554576251281026, "grad_norm": 6.5625, "learning_rate": 9.560560525973632e-06, "loss": 1.0123292, "memory(GiB)": 141.16, "step": 31780, "train_speed(iter/s)": 0.298162 }, { "acc": 0.73422256, "epoch": 0.3556813240740611, "grad_norm": 5.96875, "learning_rate": 9.559802060856898e-06, "loss": 1.07165546, "memory(GiB)": 141.16, "step": 31800, "train_speed(iter/s)": 0.298226 }, { "acc": 0.72828217, "epoch": 0.35590502302001964, "grad_norm": 8.1875, "learning_rate": 9.559042971894067e-06, "loss": 1.09216623, "memory(GiB)": 141.16, "step": 31820, "train_speed(iter/s)": 0.298281 }, { "acc": 0.72135839, "epoch": 0.35612872196597817, "grad_norm": 6.0, "learning_rate": 9.558283259188993e-06, "loss": 1.12231064, "memory(GiB)": 141.16, "step": 31840, "train_speed(iter/s)": 0.298341 }, { "acc": 0.73784957, "epoch": 0.3563524209119367, "grad_norm": 9.3125, "learning_rate": 9.55752292284562e-06, "loss": 1.03257599, "memory(GiB)": 141.16, "step": 31860, "train_speed(iter/s)": 0.298402 }, { "acc": 0.72943697, "epoch": 0.3565761198578952, "grad_norm": 8.1875, "learning_rate": 9.556761962967964e-06, "loss": 1.09385338, "memory(GiB)": 141.16, "step": 31880, "train_speed(iter/s)": 0.298461 }, { "acc": 0.72335014, "epoch": 0.35679981880385375, "grad_norm": 5.90625, "learning_rate": 9.556000379660145e-06, "loss": 1.10487347, "memory(GiB)": 141.16, "step": 31900, "train_speed(iter/s)": 0.29851 }, { "acc": 0.72923498, "epoch": 0.3570235177498123, "grad_norm": 7.96875, "learning_rate": 9.555238173026351e-06, "loss": 1.10598707, "memory(GiB)": 141.16, "step": 31920, "train_speed(iter/s)": 0.29857 }, { "acc": 0.72933393, "epoch": 0.3572472166957708, "grad_norm": 7.96875, "learning_rate": 9.554475343170867e-06, "loss": 1.07048922, "memory(GiB)": 141.16, "step": 31940, "train_speed(iter/s)": 0.298639 }, { "acc": 0.72664781, "epoch": 0.35747091564172934, "grad_norm": 6.25, "learning_rate": 9.553711890198056e-06, "loss": 1.10828552, "memory(GiB)": 141.16, "step": 31960, "train_speed(iter/s)": 0.298693 }, { "acc": 0.73289604, "epoch": 0.35769461458768786, "grad_norm": 9.25, "learning_rate": 9.55294781421237e-06, "loss": 1.06367273, "memory(GiB)": 141.16, "step": 31980, "train_speed(iter/s)": 0.298747 }, { "acc": 0.73483181, "epoch": 0.3579183135336464, "grad_norm": 7.3125, "learning_rate": 9.55218311531835e-06, "loss": 1.05742092, "memory(GiB)": 141.16, "step": 32000, "train_speed(iter/s)": 0.298807 }, { "epoch": 0.3579183135336464, "eval_acc": 0.6868951546909317, "eval_loss": 1.0922348499298096, "eval_runtime": 2318.3873, "eval_samples_per_second": 32.472, "eval_steps_per_second": 16.236, "step": 32000 }, { "acc": 0.72787008, "epoch": 0.3581420124796049, "grad_norm": 6.25, "learning_rate": 9.551417793620613e-06, "loss": 1.09640274, "memory(GiB)": 141.16, "step": 32020, "train_speed(iter/s)": 0.292409 }, { "acc": 0.72275496, "epoch": 0.35836571142556345, "grad_norm": 8.8125, "learning_rate": 9.550651849223865e-06, "loss": 1.11179695, "memory(GiB)": 141.16, "step": 32040, "train_speed(iter/s)": 0.292472 }, { "acc": 0.73745599, "epoch": 0.35858941037152203, "grad_norm": 7.25, "learning_rate": 9.549885282232903e-06, "loss": 1.03940754, "memory(GiB)": 141.16, "step": 32060, "train_speed(iter/s)": 0.29253 }, { "acc": 0.7421546, "epoch": 0.35881310931748056, "grad_norm": 8.1875, "learning_rate": 9.549118092752599e-06, "loss": 1.02978086, "memory(GiB)": 141.16, "step": 32080, "train_speed(iter/s)": 0.29259 }, { "acc": 0.72200985, "epoch": 0.3590368082634391, "grad_norm": 5.78125, "learning_rate": 9.54835028088792e-06, "loss": 1.11553764, "memory(GiB)": 141.16, "step": 32100, "train_speed(iter/s)": 0.292653 }, { "acc": 0.72171297, "epoch": 0.3592605072093976, "grad_norm": 7.59375, "learning_rate": 9.54758184674391e-06, "loss": 1.12265167, "memory(GiB)": 141.16, "step": 32120, "train_speed(iter/s)": 0.292716 }, { "acc": 0.73304863, "epoch": 0.35948420615535615, "grad_norm": 6.5, "learning_rate": 9.546812790425704e-06, "loss": 1.0780611, "memory(GiB)": 141.16, "step": 32140, "train_speed(iter/s)": 0.292779 }, { "acc": 0.7308526, "epoch": 0.3597079051013147, "grad_norm": 7.3125, "learning_rate": 9.54604311203852e-06, "loss": 1.07535534, "memory(GiB)": 141.16, "step": 32160, "train_speed(iter/s)": 0.292839 }, { "acc": 0.71931477, "epoch": 0.3599316040472732, "grad_norm": 7.15625, "learning_rate": 9.54527281168766e-06, "loss": 1.13315754, "memory(GiB)": 141.16, "step": 32180, "train_speed(iter/s)": 0.292898 }, { "acc": 0.72567587, "epoch": 0.36015530299323173, "grad_norm": 6.9375, "learning_rate": 9.544501889478513e-06, "loss": 1.09115419, "memory(GiB)": 141.16, "step": 32200, "train_speed(iter/s)": 0.292957 }, { "acc": 0.72037392, "epoch": 0.36037900193919026, "grad_norm": 5.25, "learning_rate": 9.54373034551655e-06, "loss": 1.11615314, "memory(GiB)": 141.16, "step": 32220, "train_speed(iter/s)": 0.293021 }, { "acc": 0.73756905, "epoch": 0.3606027008851488, "grad_norm": 8.25, "learning_rate": 9.542958179907331e-06, "loss": 1.04748869, "memory(GiB)": 141.16, "step": 32240, "train_speed(iter/s)": 0.293081 }, { "acc": 0.72965269, "epoch": 0.3608263998311073, "grad_norm": 7.75, "learning_rate": 9.542185392756501e-06, "loss": 1.0980875, "memory(GiB)": 141.16, "step": 32260, "train_speed(iter/s)": 0.293146 }, { "acc": 0.71891241, "epoch": 0.36105009877706584, "grad_norm": 5.78125, "learning_rate": 9.541411984169785e-06, "loss": 1.1461647, "memory(GiB)": 141.16, "step": 32280, "train_speed(iter/s)": 0.2932 }, { "acc": 0.72874322, "epoch": 0.36127379772302437, "grad_norm": 5.25, "learning_rate": 9.540637954253e-06, "loss": 1.08954945, "memory(GiB)": 141.16, "step": 32300, "train_speed(iter/s)": 0.293259 }, { "acc": 0.73475704, "epoch": 0.3614974966689829, "grad_norm": 7.25, "learning_rate": 9.53986330311204e-06, "loss": 1.05697489, "memory(GiB)": 141.16, "step": 32320, "train_speed(iter/s)": 0.293321 }, { "acc": 0.73477135, "epoch": 0.3617211956149414, "grad_norm": 10.4375, "learning_rate": 9.539088030852891e-06, "loss": 1.06109371, "memory(GiB)": 141.16, "step": 32340, "train_speed(iter/s)": 0.293375 }, { "acc": 0.73551474, "epoch": 0.36194489456089995, "grad_norm": 8.625, "learning_rate": 9.538312137581621e-06, "loss": 1.06035099, "memory(GiB)": 141.16, "step": 32360, "train_speed(iter/s)": 0.293438 }, { "acc": 0.73727522, "epoch": 0.3621685935068585, "grad_norm": 6.75, "learning_rate": 9.537535623404384e-06, "loss": 1.07306309, "memory(GiB)": 141.16, "step": 32380, "train_speed(iter/s)": 0.293501 }, { "acc": 0.7231842, "epoch": 0.362392292452817, "grad_norm": 7.59375, "learning_rate": 9.536758488427415e-06, "loss": 1.11877747, "memory(GiB)": 141.16, "step": 32400, "train_speed(iter/s)": 0.29356 }, { "acc": 0.71813354, "epoch": 0.36261599139877554, "grad_norm": 8.1875, "learning_rate": 9.535980732757042e-06, "loss": 1.15147419, "memory(GiB)": 141.16, "step": 32420, "train_speed(iter/s)": 0.293613 }, { "acc": 0.72894192, "epoch": 0.36283969034473407, "grad_norm": 6.78125, "learning_rate": 9.53520235649967e-06, "loss": 1.08814278, "memory(GiB)": 141.16, "step": 32440, "train_speed(iter/s)": 0.293673 }, { "acc": 0.72178221, "epoch": 0.3630633892906926, "grad_norm": 7.28125, "learning_rate": 9.534423359761792e-06, "loss": 1.13102627, "memory(GiB)": 141.16, "step": 32460, "train_speed(iter/s)": 0.29373 }, { "acc": 0.71897869, "epoch": 0.3632870882366511, "grad_norm": 7.1875, "learning_rate": 9.533643742649988e-06, "loss": 1.13208017, "memory(GiB)": 141.16, "step": 32480, "train_speed(iter/s)": 0.293778 }, { "acc": 0.71785626, "epoch": 0.36351078718260965, "grad_norm": 7.15625, "learning_rate": 9.532863505270917e-06, "loss": 1.14858932, "memory(GiB)": 141.16, "step": 32500, "train_speed(iter/s)": 0.293838 }, { "acc": 0.73579721, "epoch": 0.3637344861285682, "grad_norm": 6.28125, "learning_rate": 9.532082647731332e-06, "loss": 1.06702061, "memory(GiB)": 141.16, "step": 32520, "train_speed(iter/s)": 0.293906 }, { "acc": 0.73640976, "epoch": 0.3639581850745267, "grad_norm": 7.625, "learning_rate": 9.531301170138059e-06, "loss": 1.06876259, "memory(GiB)": 141.16, "step": 32540, "train_speed(iter/s)": 0.293967 }, { "acc": 0.71943526, "epoch": 0.36418188402048524, "grad_norm": 6.75, "learning_rate": 9.53051907259802e-06, "loss": 1.11978817, "memory(GiB)": 141.16, "step": 32560, "train_speed(iter/s)": 0.294028 }, { "acc": 0.73648233, "epoch": 0.36440558296644376, "grad_norm": 6.96875, "learning_rate": 9.529736355218215e-06, "loss": 1.04526854, "memory(GiB)": 141.16, "step": 32580, "train_speed(iter/s)": 0.294093 }, { "acc": 0.71778522, "epoch": 0.3646292819124023, "grad_norm": 9.25, "learning_rate": 9.528953018105734e-06, "loss": 1.13411083, "memory(GiB)": 141.16, "step": 32600, "train_speed(iter/s)": 0.294157 }, { "acc": 0.73150024, "epoch": 0.3648529808583608, "grad_norm": 7.96875, "learning_rate": 9.528169061367745e-06, "loss": 1.07755051, "memory(GiB)": 141.16, "step": 32620, "train_speed(iter/s)": 0.294224 }, { "acc": 0.72970467, "epoch": 0.36507667980431935, "grad_norm": 7.6875, "learning_rate": 9.527384485111506e-06, "loss": 1.09502468, "memory(GiB)": 141.16, "step": 32640, "train_speed(iter/s)": 0.294286 }, { "acc": 0.72202125, "epoch": 0.3653003787502779, "grad_norm": 7.375, "learning_rate": 9.52659928944436e-06, "loss": 1.10825558, "memory(GiB)": 141.16, "step": 32660, "train_speed(iter/s)": 0.294357 }, { "acc": 0.73787942, "epoch": 0.3655240776962364, "grad_norm": 5.5625, "learning_rate": 9.525813474473728e-06, "loss": 1.04990215, "memory(GiB)": 141.16, "step": 32680, "train_speed(iter/s)": 0.294417 }, { "acc": 0.74683995, "epoch": 0.36574777664219493, "grad_norm": 6.71875, "learning_rate": 9.525027040307127e-06, "loss": 1.00989742, "memory(GiB)": 141.16, "step": 32700, "train_speed(iter/s)": 0.294474 }, { "acc": 0.71981115, "epoch": 0.36597147558815346, "grad_norm": 6.8125, "learning_rate": 9.524239987052148e-06, "loss": 1.12087803, "memory(GiB)": 141.16, "step": 32720, "train_speed(iter/s)": 0.294529 }, { "acc": 0.72489882, "epoch": 0.366195174534112, "grad_norm": 6.0625, "learning_rate": 9.523452314816473e-06, "loss": 1.11598549, "memory(GiB)": 141.16, "step": 32740, "train_speed(iter/s)": 0.294588 }, { "acc": 0.71590328, "epoch": 0.3664188734800705, "grad_norm": 6.09375, "learning_rate": 9.522664023707864e-06, "loss": 1.1283102, "memory(GiB)": 141.16, "step": 32760, "train_speed(iter/s)": 0.294645 }, { "acc": 0.72779222, "epoch": 0.36664257242602905, "grad_norm": 9.375, "learning_rate": 9.521875113834175e-06, "loss": 1.10623846, "memory(GiB)": 141.16, "step": 32780, "train_speed(iter/s)": 0.294704 }, { "acc": 0.72927999, "epoch": 0.3668662713719876, "grad_norm": 6.0625, "learning_rate": 9.521085585303338e-06, "loss": 1.09177361, "memory(GiB)": 141.16, "step": 32800, "train_speed(iter/s)": 0.294761 }, { "acc": 0.73552275, "epoch": 0.3670899703179461, "grad_norm": 7.8125, "learning_rate": 9.52029543822337e-06, "loss": 1.0543335, "memory(GiB)": 141.16, "step": 32820, "train_speed(iter/s)": 0.294816 }, { "acc": 0.72318563, "epoch": 0.36731366926390463, "grad_norm": 7.25, "learning_rate": 9.519504672702378e-06, "loss": 1.13271523, "memory(GiB)": 141.16, "step": 32840, "train_speed(iter/s)": 0.294868 }, { "acc": 0.72336159, "epoch": 0.36753736820986316, "grad_norm": 8.75, "learning_rate": 9.518713288848547e-06, "loss": 1.12014904, "memory(GiB)": 141.16, "step": 32860, "train_speed(iter/s)": 0.294933 }, { "acc": 0.72936039, "epoch": 0.3677610671558217, "grad_norm": 7.15625, "learning_rate": 9.517921286770151e-06, "loss": 1.0804904, "memory(GiB)": 141.16, "step": 32880, "train_speed(iter/s)": 0.294994 }, { "acc": 0.73694358, "epoch": 0.3679847661017802, "grad_norm": 6.96875, "learning_rate": 9.517128666575548e-06, "loss": 1.05276136, "memory(GiB)": 141.16, "step": 32900, "train_speed(iter/s)": 0.295062 }, { "acc": 0.72851939, "epoch": 0.36820846504773874, "grad_norm": 6.0625, "learning_rate": 9.516335428373177e-06, "loss": 1.08134127, "memory(GiB)": 141.16, "step": 32920, "train_speed(iter/s)": 0.295129 }, { "acc": 0.73440475, "epoch": 0.36843216399369727, "grad_norm": 7.4375, "learning_rate": 9.515541572271567e-06, "loss": 1.06070747, "memory(GiB)": 141.16, "step": 32940, "train_speed(iter/s)": 0.295193 }, { "acc": 0.73752103, "epoch": 0.3686558629396558, "grad_norm": 7.4375, "learning_rate": 9.514747098379329e-06, "loss": 1.03112125, "memory(GiB)": 141.16, "step": 32960, "train_speed(iter/s)": 0.295255 }, { "acc": 0.73046131, "epoch": 0.3688795618856143, "grad_norm": 8.125, "learning_rate": 9.513952006805157e-06, "loss": 1.08606062, "memory(GiB)": 141.16, "step": 32980, "train_speed(iter/s)": 0.295314 }, { "acc": 0.73844442, "epoch": 0.36910326083157285, "grad_norm": 8.4375, "learning_rate": 9.51315629765783e-06, "loss": 1.0418251, "memory(GiB)": 141.16, "step": 33000, "train_speed(iter/s)": 0.295367 }, { "acc": 0.73410473, "epoch": 0.3693269597775314, "grad_norm": 7.375, "learning_rate": 9.512359971046214e-06, "loss": 1.07399654, "memory(GiB)": 141.16, "step": 33020, "train_speed(iter/s)": 0.29542 }, { "acc": 0.72618442, "epoch": 0.3695506587234899, "grad_norm": 8.5, "learning_rate": 9.511563027079258e-06, "loss": 1.08105087, "memory(GiB)": 141.16, "step": 33040, "train_speed(iter/s)": 0.295475 }, { "acc": 0.71924334, "epoch": 0.36977435766944844, "grad_norm": 6.5625, "learning_rate": 9.510765465865995e-06, "loss": 1.11341877, "memory(GiB)": 141.16, "step": 33060, "train_speed(iter/s)": 0.295529 }, { "acc": 0.72174239, "epoch": 0.36999805661540697, "grad_norm": 7.75, "learning_rate": 9.509967287515542e-06, "loss": 1.10078468, "memory(GiB)": 141.16, "step": 33080, "train_speed(iter/s)": 0.295586 }, { "acc": 0.71324692, "epoch": 0.3702217555613655, "grad_norm": 6.46875, "learning_rate": 9.509168492137102e-06, "loss": 1.15640421, "memory(GiB)": 141.16, "step": 33100, "train_speed(iter/s)": 0.295651 }, { "acc": 0.73003035, "epoch": 0.370445454507324, "grad_norm": 7.6875, "learning_rate": 9.50836907983996e-06, "loss": 1.07222614, "memory(GiB)": 141.16, "step": 33120, "train_speed(iter/s)": 0.295702 }, { "acc": 0.72203631, "epoch": 0.37066915345328255, "grad_norm": 6.5625, "learning_rate": 9.507569050733491e-06, "loss": 1.11684532, "memory(GiB)": 141.16, "step": 33140, "train_speed(iter/s)": 0.29576 }, { "acc": 0.71721725, "epoch": 0.3708928523992411, "grad_norm": 5.8125, "learning_rate": 9.506768404927147e-06, "loss": 1.13414583, "memory(GiB)": 141.16, "step": 33160, "train_speed(iter/s)": 0.295817 }, { "acc": 0.72813616, "epoch": 0.3711165513451996, "grad_norm": 8.0625, "learning_rate": 9.505967142530468e-06, "loss": 1.0836134, "memory(GiB)": 141.16, "step": 33180, "train_speed(iter/s)": 0.295874 }, { "acc": 0.71731825, "epoch": 0.37134025029115814, "grad_norm": 6.28125, "learning_rate": 9.505165263653078e-06, "loss": 1.13756075, "memory(GiB)": 141.16, "step": 33200, "train_speed(iter/s)": 0.295931 }, { "acc": 0.71914716, "epoch": 0.37156394923711666, "grad_norm": 6.34375, "learning_rate": 9.504362768404689e-06, "loss": 1.11012287, "memory(GiB)": 141.16, "step": 33220, "train_speed(iter/s)": 0.29599 }, { "acc": 0.72462564, "epoch": 0.3717876481830752, "grad_norm": 8.25, "learning_rate": 9.503559656895089e-06, "loss": 1.09621506, "memory(GiB)": 141.16, "step": 33240, "train_speed(iter/s)": 0.296054 }, { "acc": 0.73054972, "epoch": 0.3720113471290337, "grad_norm": 7.8125, "learning_rate": 9.502755929234158e-06, "loss": 1.08769798, "memory(GiB)": 141.16, "step": 33260, "train_speed(iter/s)": 0.296115 }, { "acc": 0.72974148, "epoch": 0.37223504607499225, "grad_norm": 5.84375, "learning_rate": 9.501951585531856e-06, "loss": 1.08259602, "memory(GiB)": 141.16, "step": 33280, "train_speed(iter/s)": 0.296167 }, { "acc": 0.73146038, "epoch": 0.37245874502095083, "grad_norm": 8.625, "learning_rate": 9.50114662589823e-06, "loss": 1.07485094, "memory(GiB)": 141.16, "step": 33300, "train_speed(iter/s)": 0.296218 }, { "acc": 0.72893715, "epoch": 0.37268244396690936, "grad_norm": 7.375, "learning_rate": 9.500341050443409e-06, "loss": 1.09198341, "memory(GiB)": 141.16, "step": 33320, "train_speed(iter/s)": 0.296273 }, { "acc": 0.73320775, "epoch": 0.3729061429128679, "grad_norm": 5.6875, "learning_rate": 9.499534859277607e-06, "loss": 1.06208305, "memory(GiB)": 141.16, "step": 33340, "train_speed(iter/s)": 0.296328 }, { "acc": 0.73221931, "epoch": 0.3731298418588264, "grad_norm": 7.0, "learning_rate": 9.498728052511124e-06, "loss": 1.06263332, "memory(GiB)": 141.16, "step": 33360, "train_speed(iter/s)": 0.296378 }, { "acc": 0.72726717, "epoch": 0.37335354080478494, "grad_norm": 8.25, "learning_rate": 9.497920630254342e-06, "loss": 1.09853859, "memory(GiB)": 141.16, "step": 33380, "train_speed(iter/s)": 0.296436 }, { "acc": 0.72894859, "epoch": 0.3735772397507435, "grad_norm": 8.4375, "learning_rate": 9.497112592617727e-06, "loss": 1.08231878, "memory(GiB)": 141.16, "step": 33400, "train_speed(iter/s)": 0.296494 }, { "acc": 0.73177781, "epoch": 0.373800938696702, "grad_norm": 7.3125, "learning_rate": 9.49630393971183e-06, "loss": 1.06628933, "memory(GiB)": 141.16, "step": 33420, "train_speed(iter/s)": 0.296559 }, { "acc": 0.73900347, "epoch": 0.37402463764266053, "grad_norm": 6.5, "learning_rate": 9.495494671647289e-06, "loss": 1.02944908, "memory(GiB)": 141.16, "step": 33440, "train_speed(iter/s)": 0.296611 }, { "acc": 0.74152894, "epoch": 0.37424833658861906, "grad_norm": 7.9375, "learning_rate": 9.494684788534821e-06, "loss": 1.02197037, "memory(GiB)": 141.16, "step": 33460, "train_speed(iter/s)": 0.296669 }, { "acc": 0.72661681, "epoch": 0.3744720355345776, "grad_norm": 7.40625, "learning_rate": 9.493874290485229e-06, "loss": 1.08732214, "memory(GiB)": 141.16, "step": 33480, "train_speed(iter/s)": 0.296727 }, { "acc": 0.7245862, "epoch": 0.3746957344805361, "grad_norm": 7.0625, "learning_rate": 9.493063177609403e-06, "loss": 1.08469772, "memory(GiB)": 141.16, "step": 33500, "train_speed(iter/s)": 0.296786 }, { "acc": 0.71269608, "epoch": 0.37491943342649464, "grad_norm": 6.3125, "learning_rate": 9.492251450018313e-06, "loss": 1.16679192, "memory(GiB)": 141.16, "step": 33520, "train_speed(iter/s)": 0.296838 }, { "acc": 0.72980289, "epoch": 0.37514313237245317, "grad_norm": 8.3125, "learning_rate": 9.491439107823015e-06, "loss": 1.09403381, "memory(GiB)": 141.16, "step": 33540, "train_speed(iter/s)": 0.2969 }, { "acc": 0.72515726, "epoch": 0.3753668313184117, "grad_norm": 7.5625, "learning_rate": 9.49062615113465e-06, "loss": 1.11629181, "memory(GiB)": 141.16, "step": 33560, "train_speed(iter/s)": 0.296957 }, { "acc": 0.72519007, "epoch": 0.3755905302643702, "grad_norm": 5.875, "learning_rate": 9.489812580064442e-06, "loss": 1.08772221, "memory(GiB)": 141.16, "step": 33580, "train_speed(iter/s)": 0.29702 }, { "acc": 0.71923151, "epoch": 0.37581422921032875, "grad_norm": 6.96875, "learning_rate": 9.488998394723699e-06, "loss": 1.14167976, "memory(GiB)": 141.16, "step": 33600, "train_speed(iter/s)": 0.297076 }, { "acc": 0.72726955, "epoch": 0.3760379281562873, "grad_norm": 8.75, "learning_rate": 9.488183595223811e-06, "loss": 1.09426346, "memory(GiB)": 141.16, "step": 33620, "train_speed(iter/s)": 0.297132 }, { "acc": 0.73002949, "epoch": 0.3762616271022458, "grad_norm": 6.84375, "learning_rate": 9.487368181676259e-06, "loss": 1.0990099, "memory(GiB)": 141.16, "step": 33640, "train_speed(iter/s)": 0.297188 }, { "acc": 0.73666849, "epoch": 0.37648532604820434, "grad_norm": 6.84375, "learning_rate": 9.4865521541926e-06, "loss": 1.05509977, "memory(GiB)": 141.16, "step": 33660, "train_speed(iter/s)": 0.297242 }, { "acc": 0.7312417, "epoch": 0.37670902499416287, "grad_norm": 5.9375, "learning_rate": 9.48573551288448e-06, "loss": 1.06835499, "memory(GiB)": 141.16, "step": 33680, "train_speed(iter/s)": 0.297298 }, { "acc": 0.7346427, "epoch": 0.3769327239401214, "grad_norm": 7.0625, "learning_rate": 9.484918257863623e-06, "loss": 1.05382719, "memory(GiB)": 141.16, "step": 33700, "train_speed(iter/s)": 0.297356 }, { "acc": 0.74189911, "epoch": 0.3771564228860799, "grad_norm": 7.53125, "learning_rate": 9.484100389241844e-06, "loss": 1.02937737, "memory(GiB)": 141.16, "step": 33720, "train_speed(iter/s)": 0.297413 }, { "acc": 0.72398472, "epoch": 0.37738012183203845, "grad_norm": 8.5625, "learning_rate": 9.483281907131042e-06, "loss": 1.10404873, "memory(GiB)": 141.16, "step": 33740, "train_speed(iter/s)": 0.297474 }, { "acc": 0.73424158, "epoch": 0.377603820777997, "grad_norm": 10.25, "learning_rate": 9.482462811643191e-06, "loss": 1.06809292, "memory(GiB)": 141.16, "step": 33760, "train_speed(iter/s)": 0.297529 }, { "acc": 0.72598019, "epoch": 0.3778275197239555, "grad_norm": 6.84375, "learning_rate": 9.481643102890361e-06, "loss": 1.10772991, "memory(GiB)": 141.16, "step": 33780, "train_speed(iter/s)": 0.297583 }, { "acc": 0.72901812, "epoch": 0.37805121866991404, "grad_norm": 6.4375, "learning_rate": 9.480822780984695e-06, "loss": 1.09637642, "memory(GiB)": 141.16, "step": 33800, "train_speed(iter/s)": 0.297635 }, { "acc": 0.71879435, "epoch": 0.37827491761587256, "grad_norm": 8.5625, "learning_rate": 9.480001846038429e-06, "loss": 1.12681999, "memory(GiB)": 141.16, "step": 33820, "train_speed(iter/s)": 0.297702 }, { "acc": 0.72074981, "epoch": 0.3784986165618311, "grad_norm": 6.6875, "learning_rate": 9.479180298163876e-06, "loss": 1.11550179, "memory(GiB)": 141.16, "step": 33840, "train_speed(iter/s)": 0.297756 }, { "acc": 0.72563896, "epoch": 0.3787223155077896, "grad_norm": 5.09375, "learning_rate": 9.478358137473433e-06, "loss": 1.09518757, "memory(GiB)": 141.16, "step": 33860, "train_speed(iter/s)": 0.297811 }, { "acc": 0.72634296, "epoch": 0.37894601445374815, "grad_norm": 6.53125, "learning_rate": 9.477535364079588e-06, "loss": 1.10469894, "memory(GiB)": 141.16, "step": 33880, "train_speed(iter/s)": 0.297866 }, { "acc": 0.7367836, "epoch": 0.3791697133997067, "grad_norm": 7.5625, "learning_rate": 9.476711978094908e-06, "loss": 1.04484434, "memory(GiB)": 141.16, "step": 33900, "train_speed(iter/s)": 0.297914 }, { "acc": 0.72284393, "epoch": 0.3793934123456652, "grad_norm": 6.03125, "learning_rate": 9.475887979632041e-06, "loss": 1.11562099, "memory(GiB)": 141.16, "step": 33920, "train_speed(iter/s)": 0.297965 }, { "acc": 0.72960358, "epoch": 0.37961711129162373, "grad_norm": 6.3125, "learning_rate": 9.475063368803724e-06, "loss": 1.0821207, "memory(GiB)": 141.16, "step": 33940, "train_speed(iter/s)": 0.29802 }, { "acc": 0.73682837, "epoch": 0.37984081023758226, "grad_norm": 5.34375, "learning_rate": 9.474238145722775e-06, "loss": 1.05498066, "memory(GiB)": 141.16, "step": 33960, "train_speed(iter/s)": 0.298079 }, { "acc": 0.73839245, "epoch": 0.3800645091835408, "grad_norm": 6.53125, "learning_rate": 9.473412310502095e-06, "loss": 1.04816074, "memory(GiB)": 141.16, "step": 33980, "train_speed(iter/s)": 0.298137 }, { "acc": 0.72384553, "epoch": 0.3802882081294993, "grad_norm": 5.0, "learning_rate": 9.472585863254672e-06, "loss": 1.11227789, "memory(GiB)": 141.16, "step": 34000, "train_speed(iter/s)": 0.29819 }, { "epoch": 0.3802882081294993, "eval_acc": 0.6872426468123403, "eval_loss": 1.090966820716858, "eval_runtime": 2318.6555, "eval_samples_per_second": 32.468, "eval_steps_per_second": 16.234, "step": 34000 }, { "acc": 0.7415102, "epoch": 0.38051190707545784, "grad_norm": 7.59375, "learning_rate": 9.471758804093574e-06, "loss": 1.03424587, "memory(GiB)": 141.16, "step": 34020, "train_speed(iter/s)": 0.292178 }, { "acc": 0.72293415, "epoch": 0.3807356060214164, "grad_norm": 4.8125, "learning_rate": 9.470931133131957e-06, "loss": 1.11382599, "memory(GiB)": 141.16, "step": 34040, "train_speed(iter/s)": 0.292237 }, { "acc": 0.73179092, "epoch": 0.3809593049673749, "grad_norm": 5.8125, "learning_rate": 9.470102850483055e-06, "loss": 1.08935909, "memory(GiB)": 141.16, "step": 34060, "train_speed(iter/s)": 0.292295 }, { "acc": 0.72877378, "epoch": 0.38118300391333343, "grad_norm": 6.875, "learning_rate": 9.46927395626019e-06, "loss": 1.09507999, "memory(GiB)": 141.16, "step": 34080, "train_speed(iter/s)": 0.292348 }, { "acc": 0.73999023, "epoch": 0.38140670285929196, "grad_norm": 6.9375, "learning_rate": 9.468444450576768e-06, "loss": 1.03851814, "memory(GiB)": 141.16, "step": 34100, "train_speed(iter/s)": 0.292405 }, { "acc": 0.74307728, "epoch": 0.3816304018052505, "grad_norm": 7.09375, "learning_rate": 9.467614333546278e-06, "loss": 1.01876278, "memory(GiB)": 141.16, "step": 34120, "train_speed(iter/s)": 0.292466 }, { "acc": 0.73676443, "epoch": 0.381854100751209, "grad_norm": 6.53125, "learning_rate": 9.46678360528229e-06, "loss": 1.06319523, "memory(GiB)": 141.16, "step": 34140, "train_speed(iter/s)": 0.292525 }, { "acc": 0.72595844, "epoch": 0.38207779969716754, "grad_norm": 7.34375, "learning_rate": 9.465952265898458e-06, "loss": 1.09755344, "memory(GiB)": 141.16, "step": 34160, "train_speed(iter/s)": 0.292579 }, { "acc": 0.73279371, "epoch": 0.38230149864312607, "grad_norm": 6.125, "learning_rate": 9.465120315508522e-06, "loss": 1.08720446, "memory(GiB)": 141.16, "step": 34180, "train_speed(iter/s)": 0.292623 }, { "acc": 0.73303943, "epoch": 0.3825251975890846, "grad_norm": 6.25, "learning_rate": 9.464287754226308e-06, "loss": 1.06536045, "memory(GiB)": 141.16, "step": 34200, "train_speed(iter/s)": 0.292679 }, { "acc": 0.72568369, "epoch": 0.3827488965350431, "grad_norm": 7.875, "learning_rate": 9.463454582165719e-06, "loss": 1.10910931, "memory(GiB)": 141.16, "step": 34220, "train_speed(iter/s)": 0.292736 }, { "acc": 0.73339944, "epoch": 0.38297259548100165, "grad_norm": 8.75, "learning_rate": 9.462620799440746e-06, "loss": 1.06640778, "memory(GiB)": 141.16, "step": 34240, "train_speed(iter/s)": 0.292791 }, { "acc": 0.72528257, "epoch": 0.3831962944269602, "grad_norm": 9.0625, "learning_rate": 9.461786406165463e-06, "loss": 1.11594305, "memory(GiB)": 141.16, "step": 34260, "train_speed(iter/s)": 0.292842 }, { "acc": 0.72584715, "epoch": 0.3834199933729187, "grad_norm": 8.0625, "learning_rate": 9.460951402454024e-06, "loss": 1.11560097, "memory(GiB)": 141.16, "step": 34280, "train_speed(iter/s)": 0.292904 }, { "acc": 0.71656737, "epoch": 0.38364369231887724, "grad_norm": 7.53125, "learning_rate": 9.460115788420672e-06, "loss": 1.14953356, "memory(GiB)": 141.16, "step": 34300, "train_speed(iter/s)": 0.292955 }, { "acc": 0.72978134, "epoch": 0.38386739126483577, "grad_norm": 7.03125, "learning_rate": 9.45927956417973e-06, "loss": 1.0714715, "memory(GiB)": 141.16, "step": 34320, "train_speed(iter/s)": 0.293009 }, { "acc": 0.7180027, "epoch": 0.3840910902107943, "grad_norm": 6.875, "learning_rate": 9.458442729845608e-06, "loss": 1.14632034, "memory(GiB)": 141.16, "step": 34340, "train_speed(iter/s)": 0.293063 }, { "acc": 0.72415657, "epoch": 0.3843147891567528, "grad_norm": 5.90625, "learning_rate": 9.457605285532792e-06, "loss": 1.11506596, "memory(GiB)": 141.16, "step": 34360, "train_speed(iter/s)": 0.293111 }, { "acc": 0.74149828, "epoch": 0.38453848810271135, "grad_norm": 6.4375, "learning_rate": 9.45676723135586e-06, "loss": 1.03046341, "memory(GiB)": 141.16, "step": 34380, "train_speed(iter/s)": 0.293165 }, { "acc": 0.73377705, "epoch": 0.3847621870486699, "grad_norm": 6.0625, "learning_rate": 9.455928567429469e-06, "loss": 1.05343723, "memory(GiB)": 141.16, "step": 34400, "train_speed(iter/s)": 0.293219 }, { "acc": 0.71778421, "epoch": 0.3849858859946284, "grad_norm": 6.65625, "learning_rate": 9.45508929386836e-06, "loss": 1.14047203, "memory(GiB)": 141.16, "step": 34420, "train_speed(iter/s)": 0.293272 }, { "acc": 0.72466154, "epoch": 0.38520958494058694, "grad_norm": 5.84375, "learning_rate": 9.454249410787358e-06, "loss": 1.10724154, "memory(GiB)": 141.16, "step": 34440, "train_speed(iter/s)": 0.293334 }, { "acc": 0.71043329, "epoch": 0.38543328388654546, "grad_norm": 9.4375, "learning_rate": 9.45340891830137e-06, "loss": 1.18088083, "memory(GiB)": 141.16, "step": 34460, "train_speed(iter/s)": 0.293388 }, { "acc": 0.72897353, "epoch": 0.385656982832504, "grad_norm": 7.625, "learning_rate": 9.452567816525388e-06, "loss": 1.08291035, "memory(GiB)": 141.16, "step": 34480, "train_speed(iter/s)": 0.293443 }, { "acc": 0.73521328, "epoch": 0.3858806817784625, "grad_norm": 6.8125, "learning_rate": 9.451726105574489e-06, "loss": 1.05052109, "memory(GiB)": 141.16, "step": 34500, "train_speed(iter/s)": 0.2935 }, { "acc": 0.73835363, "epoch": 0.38610438072442105, "grad_norm": 8.3125, "learning_rate": 9.450883785563827e-06, "loss": 1.04503365, "memory(GiB)": 141.16, "step": 34520, "train_speed(iter/s)": 0.293562 }, { "acc": 0.72427502, "epoch": 0.3863280796703796, "grad_norm": 6.6875, "learning_rate": 9.450040856608647e-06, "loss": 1.1097681, "memory(GiB)": 141.16, "step": 34540, "train_speed(iter/s)": 0.293608 }, { "acc": 0.73458285, "epoch": 0.38655177861633816, "grad_norm": 7.6875, "learning_rate": 9.44919731882427e-06, "loss": 1.06918926, "memory(GiB)": 141.16, "step": 34560, "train_speed(iter/s)": 0.293658 }, { "acc": 0.73868437, "epoch": 0.3867754775622967, "grad_norm": 7.125, "learning_rate": 9.448353172326106e-06, "loss": 1.04756269, "memory(GiB)": 141.16, "step": 34580, "train_speed(iter/s)": 0.293711 }, { "acc": 0.72719479, "epoch": 0.3869991765082552, "grad_norm": 7.53125, "learning_rate": 9.447508417229649e-06, "loss": 1.10343895, "memory(GiB)": 141.16, "step": 34600, "train_speed(iter/s)": 0.293763 }, { "acc": 0.72280083, "epoch": 0.38722287545421374, "grad_norm": 6.0, "learning_rate": 9.446663053650468e-06, "loss": 1.11111937, "memory(GiB)": 141.16, "step": 34620, "train_speed(iter/s)": 0.293815 }, { "acc": 0.73577852, "epoch": 0.38744657440017227, "grad_norm": 6.78125, "learning_rate": 9.445817081704226e-06, "loss": 1.05065145, "memory(GiB)": 141.16, "step": 34640, "train_speed(iter/s)": 0.293873 }, { "acc": 0.72523656, "epoch": 0.3876702733461308, "grad_norm": 6.8125, "learning_rate": 9.444970501506661e-06, "loss": 1.10347652, "memory(GiB)": 141.16, "step": 34660, "train_speed(iter/s)": 0.293923 }, { "acc": 0.73579431, "epoch": 0.38789397229208933, "grad_norm": 7.84375, "learning_rate": 9.4441233131736e-06, "loss": 1.05626507, "memory(GiB)": 141.16, "step": 34680, "train_speed(iter/s)": 0.293977 }, { "acc": 0.73244877, "epoch": 0.38811767123804786, "grad_norm": 6.875, "learning_rate": 9.443275516820944e-06, "loss": 1.08498402, "memory(GiB)": 141.16, "step": 34700, "train_speed(iter/s)": 0.294026 }, { "acc": 0.73605194, "epoch": 0.3883413701840064, "grad_norm": 7.125, "learning_rate": 9.442427112564692e-06, "loss": 1.04874125, "memory(GiB)": 141.16, "step": 34720, "train_speed(iter/s)": 0.294081 }, { "acc": 0.73094625, "epoch": 0.3885650691299649, "grad_norm": 8.75, "learning_rate": 9.441578100520914e-06, "loss": 1.09213448, "memory(GiB)": 141.16, "step": 34740, "train_speed(iter/s)": 0.294136 }, { "acc": 0.7251574, "epoch": 0.38878876807592344, "grad_norm": 6.96875, "learning_rate": 9.440728480805765e-06, "loss": 1.11617546, "memory(GiB)": 141.16, "step": 34760, "train_speed(iter/s)": 0.294193 }, { "acc": 0.73425856, "epoch": 0.38901246702188197, "grad_norm": 7.0625, "learning_rate": 9.439878253535488e-06, "loss": 1.07271233, "memory(GiB)": 141.16, "step": 34780, "train_speed(iter/s)": 0.294246 }, { "acc": 0.7369194, "epoch": 0.3892361659678405, "grad_norm": 10.25, "learning_rate": 9.439027418826406e-06, "loss": 1.06444321, "memory(GiB)": 141.16, "step": 34800, "train_speed(iter/s)": 0.294304 }, { "acc": 0.73000827, "epoch": 0.389459864913799, "grad_norm": 7.21875, "learning_rate": 9.438175976794926e-06, "loss": 1.07658863, "memory(GiB)": 141.16, "step": 34820, "train_speed(iter/s)": 0.29436 }, { "acc": 0.73845034, "epoch": 0.38968356385975755, "grad_norm": 7.40625, "learning_rate": 9.437323927557534e-06, "loss": 1.05360985, "memory(GiB)": 141.16, "step": 34840, "train_speed(iter/s)": 0.294415 }, { "acc": 0.72667255, "epoch": 0.3899072628057161, "grad_norm": 6.9375, "learning_rate": 9.436471271230804e-06, "loss": 1.09145279, "memory(GiB)": 141.16, "step": 34860, "train_speed(iter/s)": 0.294474 }, { "acc": 0.71671209, "epoch": 0.3901309617516746, "grad_norm": 6.46875, "learning_rate": 9.435618007931395e-06, "loss": 1.13057556, "memory(GiB)": 141.16, "step": 34880, "train_speed(iter/s)": 0.294529 }, { "acc": 0.71591907, "epoch": 0.39035466069763314, "grad_norm": 7.71875, "learning_rate": 9.434764137776043e-06, "loss": 1.13363981, "memory(GiB)": 141.16, "step": 34900, "train_speed(iter/s)": 0.294588 }, { "acc": 0.71764045, "epoch": 0.39057835964359167, "grad_norm": 6.28125, "learning_rate": 9.433909660881568e-06, "loss": 1.14090691, "memory(GiB)": 141.16, "step": 34920, "train_speed(iter/s)": 0.294643 }, { "acc": 0.72047858, "epoch": 0.3908020585895502, "grad_norm": 8.0625, "learning_rate": 9.433054577364876e-06, "loss": 1.13173313, "memory(GiB)": 141.16, "step": 34940, "train_speed(iter/s)": 0.294703 }, { "acc": 0.73257542, "epoch": 0.3910257575355087, "grad_norm": 10.625, "learning_rate": 9.432198887342956e-06, "loss": 1.08295622, "memory(GiB)": 141.16, "step": 34960, "train_speed(iter/s)": 0.294753 }, { "acc": 0.72709246, "epoch": 0.39124945648146725, "grad_norm": 11.75, "learning_rate": 9.431342590932877e-06, "loss": 1.09880695, "memory(GiB)": 141.16, "step": 34980, "train_speed(iter/s)": 0.294811 }, { "acc": 0.73504238, "epoch": 0.3914731554274258, "grad_norm": 6.8125, "learning_rate": 9.430485688251793e-06, "loss": 1.07187185, "memory(GiB)": 141.16, "step": 35000, "train_speed(iter/s)": 0.294867 }, { "acc": 0.72170992, "epoch": 0.3916968543733843, "grad_norm": 7.375, "learning_rate": 9.42962817941694e-06, "loss": 1.10969582, "memory(GiB)": 141.16, "step": 35020, "train_speed(iter/s)": 0.294917 }, { "acc": 0.73130226, "epoch": 0.39192055331934283, "grad_norm": 8.4375, "learning_rate": 9.428770064545638e-06, "loss": 1.05487595, "memory(GiB)": 141.16, "step": 35040, "train_speed(iter/s)": 0.294975 }, { "acc": 0.72894154, "epoch": 0.39214425226530136, "grad_norm": 5.3125, "learning_rate": 9.427911343755291e-06, "loss": 1.09938297, "memory(GiB)": 141.16, "step": 35060, "train_speed(iter/s)": 0.295026 }, { "acc": 0.71748781, "epoch": 0.3923679512112599, "grad_norm": 6.9375, "learning_rate": 9.427052017163381e-06, "loss": 1.1401619, "memory(GiB)": 141.16, "step": 35080, "train_speed(iter/s)": 0.295076 }, { "acc": 0.7198576, "epoch": 0.3925916501572184, "grad_norm": 6.03125, "learning_rate": 9.42619208488748e-06, "loss": 1.14387407, "memory(GiB)": 141.16, "step": 35100, "train_speed(iter/s)": 0.295127 }, { "acc": 0.73163567, "epoch": 0.39281534910317695, "grad_norm": 8.125, "learning_rate": 9.425331547045235e-06, "loss": 1.08072071, "memory(GiB)": 141.16, "step": 35120, "train_speed(iter/s)": 0.295185 }, { "acc": 0.73382568, "epoch": 0.3930390480491355, "grad_norm": 8.75, "learning_rate": 9.424470403754382e-06, "loss": 1.08560352, "memory(GiB)": 141.16, "step": 35140, "train_speed(iter/s)": 0.295242 }, { "acc": 0.73767476, "epoch": 0.393262746995094, "grad_norm": 8.125, "learning_rate": 9.423608655132738e-06, "loss": 1.05347357, "memory(GiB)": 141.16, "step": 35160, "train_speed(iter/s)": 0.295298 }, { "acc": 0.72867589, "epoch": 0.39348644594105253, "grad_norm": 5.1875, "learning_rate": 9.422746301298203e-06, "loss": 1.09050989, "memory(GiB)": 141.16, "step": 35180, "train_speed(iter/s)": 0.295358 }, { "acc": 0.72866325, "epoch": 0.39371014488701106, "grad_norm": 5.875, "learning_rate": 9.421883342368758e-06, "loss": 1.10997105, "memory(GiB)": 141.16, "step": 35200, "train_speed(iter/s)": 0.295411 }, { "acc": 0.73406296, "epoch": 0.3939338438329696, "grad_norm": 7.375, "learning_rate": 9.421019778462468e-06, "loss": 1.04878588, "memory(GiB)": 141.16, "step": 35220, "train_speed(iter/s)": 0.295466 }, { "acc": 0.71722307, "epoch": 0.3941575427789281, "grad_norm": 9.5, "learning_rate": 9.420155609697482e-06, "loss": 1.15139055, "memory(GiB)": 141.16, "step": 35240, "train_speed(iter/s)": 0.295516 }, { "acc": 0.73054333, "epoch": 0.39438124172488664, "grad_norm": 7.84375, "learning_rate": 9.419290836192027e-06, "loss": 1.08481312, "memory(GiB)": 141.16, "step": 35260, "train_speed(iter/s)": 0.295572 }, { "acc": 0.72559986, "epoch": 0.3946049406708452, "grad_norm": 6.78125, "learning_rate": 9.418425458064423e-06, "loss": 1.10179825, "memory(GiB)": 141.16, "step": 35280, "train_speed(iter/s)": 0.295617 }, { "acc": 0.71300411, "epoch": 0.3948286396168037, "grad_norm": 7.0625, "learning_rate": 9.41755947543306e-06, "loss": 1.17067213, "memory(GiB)": 141.16, "step": 35300, "train_speed(iter/s)": 0.295668 }, { "acc": 0.72941623, "epoch": 0.39505233856276223, "grad_norm": 5.03125, "learning_rate": 9.416692888416421e-06, "loss": 1.08912849, "memory(GiB)": 141.16, "step": 35320, "train_speed(iter/s)": 0.295722 }, { "acc": 0.72631865, "epoch": 0.39527603750872076, "grad_norm": 6.96875, "learning_rate": 9.415825697133065e-06, "loss": 1.09824257, "memory(GiB)": 141.16, "step": 35340, "train_speed(iter/s)": 0.295772 }, { "acc": 0.73429542, "epoch": 0.3954997364546793, "grad_norm": 8.6875, "learning_rate": 9.414957901701637e-06, "loss": 1.06895142, "memory(GiB)": 141.16, "step": 35360, "train_speed(iter/s)": 0.295821 }, { "acc": 0.72261276, "epoch": 0.3957234354006378, "grad_norm": 7.1875, "learning_rate": 9.414089502240864e-06, "loss": 1.1204977, "memory(GiB)": 141.16, "step": 35380, "train_speed(iter/s)": 0.295871 }, { "acc": 0.72316546, "epoch": 0.39594713434659634, "grad_norm": 5.8125, "learning_rate": 9.413220498869556e-06, "loss": 1.13040314, "memory(GiB)": 141.16, "step": 35400, "train_speed(iter/s)": 0.295923 }, { "acc": 0.71482801, "epoch": 0.39617083329255487, "grad_norm": 6.84375, "learning_rate": 9.412350891706603e-06, "loss": 1.15695763, "memory(GiB)": 141.16, "step": 35420, "train_speed(iter/s)": 0.295977 }, { "acc": 0.73037968, "epoch": 0.3963945322385134, "grad_norm": 7.96875, "learning_rate": 9.411480680870982e-06, "loss": 1.06191711, "memory(GiB)": 141.16, "step": 35440, "train_speed(iter/s)": 0.296028 }, { "acc": 0.72004213, "epoch": 0.3966182311844719, "grad_norm": 6.9375, "learning_rate": 9.410609866481748e-06, "loss": 1.13229837, "memory(GiB)": 141.16, "step": 35460, "train_speed(iter/s)": 0.296083 }, { "acc": 0.72449365, "epoch": 0.39684193013043045, "grad_norm": 7.375, "learning_rate": 9.409738448658044e-06, "loss": 1.11188669, "memory(GiB)": 141.16, "step": 35480, "train_speed(iter/s)": 0.29613 }, { "acc": 0.72453308, "epoch": 0.397065629076389, "grad_norm": 7.625, "learning_rate": 9.408866427519088e-06, "loss": 1.12706337, "memory(GiB)": 141.16, "step": 35500, "train_speed(iter/s)": 0.296182 }, { "acc": 0.71843691, "epoch": 0.3972893280223475, "grad_norm": 7.125, "learning_rate": 9.40799380318419e-06, "loss": 1.14066639, "memory(GiB)": 141.16, "step": 35520, "train_speed(iter/s)": 0.296236 }, { "acc": 0.73895359, "epoch": 0.39751302696830604, "grad_norm": 5.625, "learning_rate": 9.407120575772733e-06, "loss": 1.03588562, "memory(GiB)": 141.16, "step": 35540, "train_speed(iter/s)": 0.296284 }, { "acc": 0.73940363, "epoch": 0.39773672591426457, "grad_norm": 6.75, "learning_rate": 9.40624674540419e-06, "loss": 1.0480772, "memory(GiB)": 141.16, "step": 35560, "train_speed(iter/s)": 0.296336 }, { "acc": 0.73637066, "epoch": 0.3979604248602231, "grad_norm": 6.15625, "learning_rate": 9.405372312198113e-06, "loss": 1.05746746, "memory(GiB)": 141.16, "step": 35580, "train_speed(iter/s)": 0.29639 }, { "acc": 0.71593513, "epoch": 0.3981841238061816, "grad_norm": 5.625, "learning_rate": 9.404497276274136e-06, "loss": 1.14910479, "memory(GiB)": 141.16, "step": 35600, "train_speed(iter/s)": 0.296446 }, { "acc": 0.73662429, "epoch": 0.39840782275214015, "grad_norm": 8.25, "learning_rate": 9.403621637751977e-06, "loss": 1.0478138, "memory(GiB)": 141.16, "step": 35620, "train_speed(iter/s)": 0.296491 }, { "acc": 0.71815729, "epoch": 0.3986315216980987, "grad_norm": 8.125, "learning_rate": 9.402745396751434e-06, "loss": 1.14212418, "memory(GiB)": 141.16, "step": 35640, "train_speed(iter/s)": 0.296548 }, { "acc": 0.73801513, "epoch": 0.3988552206440572, "grad_norm": 6.625, "learning_rate": 9.401868553392393e-06, "loss": 1.06474333, "memory(GiB)": 141.16, "step": 35660, "train_speed(iter/s)": 0.296606 }, { "acc": 0.72348728, "epoch": 0.39907891959001573, "grad_norm": 8.1875, "learning_rate": 9.400991107794816e-06, "loss": 1.11679554, "memory(GiB)": 141.16, "step": 35680, "train_speed(iter/s)": 0.296663 }, { "acc": 0.73183436, "epoch": 0.39930261853597426, "grad_norm": 7.46875, "learning_rate": 9.40011306007875e-06, "loss": 1.07044849, "memory(GiB)": 141.16, "step": 35700, "train_speed(iter/s)": 0.296719 }, { "acc": 0.71128106, "epoch": 0.3995263174819328, "grad_norm": 7.8125, "learning_rate": 9.399234410364326e-06, "loss": 1.17472401, "memory(GiB)": 141.16, "step": 35720, "train_speed(iter/s)": 0.29677 }, { "acc": 0.73100076, "epoch": 0.3997500164278913, "grad_norm": 8.5, "learning_rate": 9.398355158771755e-06, "loss": 1.07325802, "memory(GiB)": 141.16, "step": 35740, "train_speed(iter/s)": 0.296828 }, { "acc": 0.72387896, "epoch": 0.39997371537384985, "grad_norm": 7.75, "learning_rate": 9.397475305421332e-06, "loss": 1.11115818, "memory(GiB)": 141.16, "step": 35760, "train_speed(iter/s)": 0.296877 }, { "acc": 0.72593532, "epoch": 0.4001974143198084, "grad_norm": 6.78125, "learning_rate": 9.396594850433432e-06, "loss": 1.08992138, "memory(GiB)": 141.16, "step": 35780, "train_speed(iter/s)": 0.296935 }, { "acc": 0.72980132, "epoch": 0.4004211132657669, "grad_norm": 5.84375, "learning_rate": 9.395713793928514e-06, "loss": 1.08266888, "memory(GiB)": 141.16, "step": 35800, "train_speed(iter/s)": 0.296987 }, { "acc": 0.73924537, "epoch": 0.4006448122117255, "grad_norm": 7.84375, "learning_rate": 9.394832136027121e-06, "loss": 1.03882656, "memory(GiB)": 141.16, "step": 35820, "train_speed(iter/s)": 0.297037 }, { "acc": 0.72865872, "epoch": 0.400868511157684, "grad_norm": 7.0625, "learning_rate": 9.393949876849875e-06, "loss": 1.08358192, "memory(GiB)": 141.16, "step": 35840, "train_speed(iter/s)": 0.2971 }, { "acc": 0.72425842, "epoch": 0.40109221010364254, "grad_norm": 8.1875, "learning_rate": 9.393067016517483e-06, "loss": 1.10927763, "memory(GiB)": 141.16, "step": 35860, "train_speed(iter/s)": 0.297157 }, { "acc": 0.72040319, "epoch": 0.40131590904960107, "grad_norm": 9.1875, "learning_rate": 9.39218355515073e-06, "loss": 1.13143167, "memory(GiB)": 141.16, "step": 35880, "train_speed(iter/s)": 0.297207 }, { "acc": 0.73473864, "epoch": 0.4015396079955596, "grad_norm": 6.40625, "learning_rate": 9.391299492870488e-06, "loss": 1.06010113, "memory(GiB)": 141.16, "step": 35900, "train_speed(iter/s)": 0.297253 }, { "acc": 0.7254735, "epoch": 0.40176330694151813, "grad_norm": 12.0, "learning_rate": 9.39041482979771e-06, "loss": 1.10965958, "memory(GiB)": 141.16, "step": 35920, "train_speed(iter/s)": 0.297302 }, { "acc": 0.72378764, "epoch": 0.40198700588747666, "grad_norm": 6.21875, "learning_rate": 9.389529566053428e-06, "loss": 1.12100668, "memory(GiB)": 141.16, "step": 35940, "train_speed(iter/s)": 0.29735 }, { "acc": 0.72621608, "epoch": 0.4022107048334352, "grad_norm": 5.8125, "learning_rate": 9.388643701758761e-06, "loss": 1.10455017, "memory(GiB)": 141.16, "step": 35960, "train_speed(iter/s)": 0.297405 }, { "acc": 0.72822514, "epoch": 0.4024344037793937, "grad_norm": 6.0625, "learning_rate": 9.387757237034909e-06, "loss": 1.09376345, "memory(GiB)": 141.16, "step": 35980, "train_speed(iter/s)": 0.297456 }, { "acc": 0.73118515, "epoch": 0.40265810272535224, "grad_norm": 7.21875, "learning_rate": 9.386870172003151e-06, "loss": 1.08443623, "memory(GiB)": 141.16, "step": 36000, "train_speed(iter/s)": 0.297507 }, { "epoch": 0.40265810272535224, "eval_acc": 0.6875528213658801, "eval_loss": 1.0896645784378052, "eval_runtime": 2324.121, "eval_samples_per_second": 32.392, "eval_steps_per_second": 16.196, "step": 36000 }, { "acc": 0.72341619, "epoch": 0.40288180167131077, "grad_norm": 6.34375, "learning_rate": 9.385982506784851e-06, "loss": 1.10024738, "memory(GiB)": 141.16, "step": 36020, "train_speed(iter/s)": 0.291842 }, { "acc": 0.73396316, "epoch": 0.4031055006172693, "grad_norm": 5.625, "learning_rate": 9.385094241501453e-06, "loss": 1.06521854, "memory(GiB)": 141.16, "step": 36040, "train_speed(iter/s)": 0.291895 }, { "acc": 0.71048021, "epoch": 0.4033291995632278, "grad_norm": 6.8125, "learning_rate": 9.384205376274486e-06, "loss": 1.16380939, "memory(GiB)": 141.16, "step": 36060, "train_speed(iter/s)": 0.291949 }, { "acc": 0.73337946, "epoch": 0.40355289850918635, "grad_norm": 6.65625, "learning_rate": 9.383315911225557e-06, "loss": 1.06413975, "memory(GiB)": 141.16, "step": 36080, "train_speed(iter/s)": 0.292005 }, { "acc": 0.73515787, "epoch": 0.4037765974551449, "grad_norm": 7.90625, "learning_rate": 9.382425846476362e-06, "loss": 1.07720566, "memory(GiB)": 141.16, "step": 36100, "train_speed(iter/s)": 0.292061 }, { "acc": 0.73184619, "epoch": 0.4040002964011034, "grad_norm": 5.53125, "learning_rate": 9.381535182148671e-06, "loss": 1.06967487, "memory(GiB)": 141.16, "step": 36120, "train_speed(iter/s)": 0.292116 }, { "acc": 0.72490063, "epoch": 0.40422399534706194, "grad_norm": 7.25, "learning_rate": 9.38064391836434e-06, "loss": 1.10762224, "memory(GiB)": 141.16, "step": 36140, "train_speed(iter/s)": 0.29217 }, { "acc": 0.7280695, "epoch": 0.40444769429302047, "grad_norm": 7.0625, "learning_rate": 9.379752055245306e-06, "loss": 1.08948402, "memory(GiB)": 141.16, "step": 36160, "train_speed(iter/s)": 0.292216 }, { "acc": 0.7166995, "epoch": 0.404671393238979, "grad_norm": 6.28125, "learning_rate": 9.378859592913592e-06, "loss": 1.14104881, "memory(GiB)": 141.16, "step": 36180, "train_speed(iter/s)": 0.292274 }, { "acc": 0.73492599, "epoch": 0.4048950921849375, "grad_norm": 7.125, "learning_rate": 9.377966531491297e-06, "loss": 1.06712837, "memory(GiB)": 141.16, "step": 36200, "train_speed(iter/s)": 0.292327 }, { "acc": 0.73259916, "epoch": 0.40511879113089605, "grad_norm": 6.84375, "learning_rate": 9.377072871100603e-06, "loss": 1.08124981, "memory(GiB)": 141.16, "step": 36220, "train_speed(iter/s)": 0.292379 }, { "acc": 0.73655748, "epoch": 0.4053424900768546, "grad_norm": 8.6875, "learning_rate": 9.37617861186378e-06, "loss": 1.05581789, "memory(GiB)": 141.16, "step": 36240, "train_speed(iter/s)": 0.292418 }, { "acc": 0.72447557, "epoch": 0.4055661890228131, "grad_norm": 4.78125, "learning_rate": 9.37528375390317e-06, "loss": 1.09703465, "memory(GiB)": 141.16, "step": 36260, "train_speed(iter/s)": 0.292469 }, { "acc": 0.72800541, "epoch": 0.40578988796877163, "grad_norm": 7.625, "learning_rate": 9.374388297341208e-06, "loss": 1.09943781, "memory(GiB)": 141.16, "step": 36280, "train_speed(iter/s)": 0.292523 }, { "acc": 0.73167763, "epoch": 0.40601358691473016, "grad_norm": 8.8125, "learning_rate": 9.3734922423004e-06, "loss": 1.07640858, "memory(GiB)": 141.16, "step": 36300, "train_speed(iter/s)": 0.292576 }, { "acc": 0.7373188, "epoch": 0.4062372858606887, "grad_norm": 9.3125, "learning_rate": 9.372595588903345e-06, "loss": 1.05337286, "memory(GiB)": 141.16, "step": 36320, "train_speed(iter/s)": 0.292632 }, { "acc": 0.73385458, "epoch": 0.4064609848066472, "grad_norm": 7.75, "learning_rate": 9.371698337272712e-06, "loss": 1.06926079, "memory(GiB)": 141.16, "step": 36340, "train_speed(iter/s)": 0.292684 }, { "acc": 0.71846142, "epoch": 0.40668468375260575, "grad_norm": 5.40625, "learning_rate": 9.370800487531261e-06, "loss": 1.14463921, "memory(GiB)": 141.16, "step": 36360, "train_speed(iter/s)": 0.292726 }, { "acc": 0.72562847, "epoch": 0.4069083826985643, "grad_norm": 6.90625, "learning_rate": 9.369902039801831e-06, "loss": 1.09935875, "memory(GiB)": 141.16, "step": 36380, "train_speed(iter/s)": 0.292781 }, { "acc": 0.72126055, "epoch": 0.4071320816445228, "grad_norm": 6.53125, "learning_rate": 9.369002994207341e-06, "loss": 1.11584473, "memory(GiB)": 141.16, "step": 36400, "train_speed(iter/s)": 0.292836 }, { "acc": 0.71669245, "epoch": 0.40735578059048133, "grad_norm": 7.84375, "learning_rate": 9.368103350870794e-06, "loss": 1.13691044, "memory(GiB)": 141.16, "step": 36420, "train_speed(iter/s)": 0.292883 }, { "acc": 0.72906961, "epoch": 0.40757947953643986, "grad_norm": 7.5625, "learning_rate": 9.367203109915275e-06, "loss": 1.0808424, "memory(GiB)": 141.16, "step": 36440, "train_speed(iter/s)": 0.292928 }, { "acc": 0.75131669, "epoch": 0.4078031784823984, "grad_norm": 6.09375, "learning_rate": 9.366302271463947e-06, "loss": 0.97489586, "memory(GiB)": 141.16, "step": 36460, "train_speed(iter/s)": 0.292985 }, { "acc": 0.73151188, "epoch": 0.4080268774283569, "grad_norm": 7.40625, "learning_rate": 9.365400835640061e-06, "loss": 1.06286716, "memory(GiB)": 141.16, "step": 36480, "train_speed(iter/s)": 0.293041 }, { "acc": 0.72911043, "epoch": 0.40825057637431544, "grad_norm": 5.78125, "learning_rate": 9.364498802566944e-06, "loss": 1.07450447, "memory(GiB)": 141.16, "step": 36500, "train_speed(iter/s)": 0.293096 }, { "acc": 0.73570089, "epoch": 0.40847427532027397, "grad_norm": 6.71875, "learning_rate": 9.363596172368008e-06, "loss": 1.055896, "memory(GiB)": 141.16, "step": 36520, "train_speed(iter/s)": 0.293153 }, { "acc": 0.74419823, "epoch": 0.4086979742662325, "grad_norm": 5.90625, "learning_rate": 9.362692945166745e-06, "loss": 1.01232414, "memory(GiB)": 141.16, "step": 36540, "train_speed(iter/s)": 0.293202 }, { "acc": 0.71570678, "epoch": 0.40892167321219103, "grad_norm": 7.28125, "learning_rate": 9.36178912108673e-06, "loss": 1.16379118, "memory(GiB)": 141.16, "step": 36560, "train_speed(iter/s)": 0.293259 }, { "acc": 0.72960701, "epoch": 0.40914537215814956, "grad_norm": 6.125, "learning_rate": 9.36088470025162e-06, "loss": 1.08067141, "memory(GiB)": 141.16, "step": 36580, "train_speed(iter/s)": 0.29332 }, { "acc": 0.72343736, "epoch": 0.4093690711041081, "grad_norm": 8.0, "learning_rate": 9.35997968278515e-06, "loss": 1.11809597, "memory(GiB)": 141.16, "step": 36600, "train_speed(iter/s)": 0.293366 }, { "acc": 0.72713223, "epoch": 0.4095927700500666, "grad_norm": 7.6875, "learning_rate": 9.359074068811141e-06, "loss": 1.09772568, "memory(GiB)": 141.16, "step": 36620, "train_speed(iter/s)": 0.29342 }, { "acc": 0.73255925, "epoch": 0.40981646899602514, "grad_norm": 7.9375, "learning_rate": 9.358167858453495e-06, "loss": 1.0641571, "memory(GiB)": 141.16, "step": 36640, "train_speed(iter/s)": 0.293469 }, { "acc": 0.7354269, "epoch": 0.41004016794198367, "grad_norm": 6.15625, "learning_rate": 9.357261051836193e-06, "loss": 1.06846542, "memory(GiB)": 141.16, "step": 36660, "train_speed(iter/s)": 0.293519 }, { "acc": 0.72814145, "epoch": 0.4102638668879422, "grad_norm": 6.375, "learning_rate": 9.356353649083298e-06, "loss": 1.09531288, "memory(GiB)": 141.16, "step": 36680, "train_speed(iter/s)": 0.293569 }, { "acc": 0.73618593, "epoch": 0.4104875658339007, "grad_norm": 7.3125, "learning_rate": 9.35544565031896e-06, "loss": 1.06923075, "memory(GiB)": 141.16, "step": 36700, "train_speed(iter/s)": 0.293621 }, { "acc": 0.7141778, "epoch": 0.41071126477985925, "grad_norm": 7.375, "learning_rate": 9.354537055667401e-06, "loss": 1.15631418, "memory(GiB)": 141.16, "step": 36720, "train_speed(iter/s)": 0.293673 }, { "acc": 0.73333473, "epoch": 0.4109349637258178, "grad_norm": 5.375, "learning_rate": 9.353627865252933e-06, "loss": 1.06623096, "memory(GiB)": 141.16, "step": 36740, "train_speed(iter/s)": 0.293724 }, { "acc": 0.73150587, "epoch": 0.4111586626717763, "grad_norm": 7.9375, "learning_rate": 9.352718079199946e-06, "loss": 1.07694435, "memory(GiB)": 141.16, "step": 36760, "train_speed(iter/s)": 0.293778 }, { "acc": 0.72013721, "epoch": 0.41138236161773484, "grad_norm": 6.625, "learning_rate": 9.35180769763291e-06, "loss": 1.12510834, "memory(GiB)": 141.16, "step": 36780, "train_speed(iter/s)": 0.293834 }, { "acc": 0.71611857, "epoch": 0.41160606056369337, "grad_norm": 6.625, "learning_rate": 9.350896720676378e-06, "loss": 1.13842087, "memory(GiB)": 141.16, "step": 36800, "train_speed(iter/s)": 0.293885 }, { "acc": 0.72448111, "epoch": 0.4118297595096519, "grad_norm": 7.40625, "learning_rate": 9.34998514845499e-06, "loss": 1.09826078, "memory(GiB)": 141.16, "step": 36820, "train_speed(iter/s)": 0.293936 }, { "acc": 0.72912598, "epoch": 0.4120534584556104, "grad_norm": 5.96875, "learning_rate": 9.349072981093455e-06, "loss": 1.08457184, "memory(GiB)": 141.16, "step": 36840, "train_speed(iter/s)": 0.293988 }, { "acc": 0.72810335, "epoch": 0.41227715740156895, "grad_norm": 6.375, "learning_rate": 9.348160218716574e-06, "loss": 1.08836336, "memory(GiB)": 141.16, "step": 36860, "train_speed(iter/s)": 0.294038 }, { "acc": 0.73557348, "epoch": 0.4125008563475275, "grad_norm": 6.46875, "learning_rate": 9.347246861449226e-06, "loss": 1.06164875, "memory(GiB)": 141.16, "step": 36880, "train_speed(iter/s)": 0.294089 }, { "acc": 0.74201546, "epoch": 0.412724555293486, "grad_norm": 6.71875, "learning_rate": 9.346332909416371e-06, "loss": 1.03847847, "memory(GiB)": 141.16, "step": 36900, "train_speed(iter/s)": 0.294144 }, { "acc": 0.73773975, "epoch": 0.41294825423944453, "grad_norm": 7.125, "learning_rate": 9.34541836274305e-06, "loss": 1.04631691, "memory(GiB)": 141.16, "step": 36920, "train_speed(iter/s)": 0.294198 }, { "acc": 0.71944027, "epoch": 0.41317195318540306, "grad_norm": 5.65625, "learning_rate": 9.344503221554386e-06, "loss": 1.13702126, "memory(GiB)": 141.16, "step": 36940, "train_speed(iter/s)": 0.294248 }, { "acc": 0.73032923, "epoch": 0.4133956521313616, "grad_norm": 7.8125, "learning_rate": 9.343587485975586e-06, "loss": 1.08302345, "memory(GiB)": 141.16, "step": 36960, "train_speed(iter/s)": 0.294293 }, { "acc": 0.7191268, "epoch": 0.4136193510773201, "grad_norm": 5.8125, "learning_rate": 9.342671156131933e-06, "loss": 1.11292286, "memory(GiB)": 141.16, "step": 36980, "train_speed(iter/s)": 0.294345 }, { "acc": 0.72987976, "epoch": 0.41384305002327865, "grad_norm": 7.1875, "learning_rate": 9.341754232148795e-06, "loss": 1.085182, "memory(GiB)": 141.16, "step": 37000, "train_speed(iter/s)": 0.294396 }, { "acc": 0.71493912, "epoch": 0.4140667489692372, "grad_norm": 6.53125, "learning_rate": 9.340836714151618e-06, "loss": 1.17745905, "memory(GiB)": 141.16, "step": 37020, "train_speed(iter/s)": 0.294446 }, { "acc": 0.73940125, "epoch": 0.4142904479151957, "grad_norm": 6.4375, "learning_rate": 9.339918602265936e-06, "loss": 1.03275452, "memory(GiB)": 141.16, "step": 37040, "train_speed(iter/s)": 0.294502 }, { "acc": 0.7366663, "epoch": 0.41451414686115423, "grad_norm": 8.125, "learning_rate": 9.338999896617357e-06, "loss": 1.05730295, "memory(GiB)": 141.16, "step": 37060, "train_speed(iter/s)": 0.294542 }, { "acc": 0.73440351, "epoch": 0.4147378458071128, "grad_norm": 6.53125, "learning_rate": 9.338080597331573e-06, "loss": 1.04503574, "memory(GiB)": 141.16, "step": 37080, "train_speed(iter/s)": 0.294594 }, { "acc": 0.72424011, "epoch": 0.41496154475307134, "grad_norm": 6.90625, "learning_rate": 9.337160704534358e-06, "loss": 1.10447578, "memory(GiB)": 141.16, "step": 37100, "train_speed(iter/s)": 0.294641 }, { "acc": 0.72961311, "epoch": 0.41518524369902987, "grad_norm": 7.15625, "learning_rate": 9.336240218351567e-06, "loss": 1.07662239, "memory(GiB)": 141.16, "step": 37120, "train_speed(iter/s)": 0.294684 }, { "acc": 0.72526317, "epoch": 0.4154089426449884, "grad_norm": 7.84375, "learning_rate": 9.335319138909133e-06, "loss": 1.10489082, "memory(GiB)": 141.16, "step": 37140, "train_speed(iter/s)": 0.294732 }, { "acc": 0.72651949, "epoch": 0.4156326415909469, "grad_norm": 4.53125, "learning_rate": 9.334397466333078e-06, "loss": 1.10048323, "memory(GiB)": 141.16, "step": 37160, "train_speed(iter/s)": 0.29478 }, { "acc": 0.72909694, "epoch": 0.41585634053690546, "grad_norm": 6.9375, "learning_rate": 9.333475200749495e-06, "loss": 1.08907928, "memory(GiB)": 141.16, "step": 37180, "train_speed(iter/s)": 0.29483 }, { "acc": 0.7217773, "epoch": 0.416080039482864, "grad_norm": 6.4375, "learning_rate": 9.332552342284564e-06, "loss": 1.12167034, "memory(GiB)": 141.16, "step": 37200, "train_speed(iter/s)": 0.294881 }, { "acc": 0.71981592, "epoch": 0.4163037384288225, "grad_norm": 7.59375, "learning_rate": 9.331628891064548e-06, "loss": 1.1358099, "memory(GiB)": 141.16, "step": 37220, "train_speed(iter/s)": 0.294934 }, { "acc": 0.74210615, "epoch": 0.41652743737478104, "grad_norm": 5.71875, "learning_rate": 9.330704847215784e-06, "loss": 1.01756477, "memory(GiB)": 141.16, "step": 37240, "train_speed(iter/s)": 0.294984 }, { "acc": 0.72610054, "epoch": 0.41675113632073957, "grad_norm": 6.09375, "learning_rate": 9.329780210864699e-06, "loss": 1.11485538, "memory(GiB)": 141.16, "step": 37260, "train_speed(iter/s)": 0.295034 }, { "acc": 0.72907786, "epoch": 0.4169748352666981, "grad_norm": 7.625, "learning_rate": 9.328854982137795e-06, "loss": 1.08748674, "memory(GiB)": 141.16, "step": 37280, "train_speed(iter/s)": 0.295084 }, { "acc": 0.73430996, "epoch": 0.4171985342126566, "grad_norm": 7.125, "learning_rate": 9.327929161161652e-06, "loss": 1.07284851, "memory(GiB)": 141.16, "step": 37300, "train_speed(iter/s)": 0.295131 }, { "acc": 0.72484207, "epoch": 0.41742223315861515, "grad_norm": 8.25, "learning_rate": 9.327002748062944e-06, "loss": 1.09185801, "memory(GiB)": 141.16, "step": 37320, "train_speed(iter/s)": 0.295176 }, { "acc": 0.73210907, "epoch": 0.4176459321045737, "grad_norm": 6.15625, "learning_rate": 9.326075742968411e-06, "loss": 1.06679335, "memory(GiB)": 141.16, "step": 37340, "train_speed(iter/s)": 0.295225 }, { "acc": 0.7320446, "epoch": 0.4178696310505322, "grad_norm": 8.5, "learning_rate": 9.325148146004881e-06, "loss": 1.06950855, "memory(GiB)": 141.16, "step": 37360, "train_speed(iter/s)": 0.295281 }, { "acc": 0.71850224, "epoch": 0.41809332999649074, "grad_norm": 6.03125, "learning_rate": 9.324219957299266e-06, "loss": 1.14993782, "memory(GiB)": 141.16, "step": 37380, "train_speed(iter/s)": 0.295329 }, { "acc": 0.72905874, "epoch": 0.41831702894244926, "grad_norm": 7.28125, "learning_rate": 9.323291176978552e-06, "loss": 1.08663387, "memory(GiB)": 141.16, "step": 37400, "train_speed(iter/s)": 0.295381 }, { "acc": 0.73406534, "epoch": 0.4185407278884078, "grad_norm": 9.9375, "learning_rate": 9.322361805169813e-06, "loss": 1.05998507, "memory(GiB)": 141.16, "step": 37420, "train_speed(iter/s)": 0.295436 }, { "acc": 0.72016497, "epoch": 0.4187644268343663, "grad_norm": 5.53125, "learning_rate": 9.321431842000198e-06, "loss": 1.12542553, "memory(GiB)": 141.16, "step": 37440, "train_speed(iter/s)": 0.295488 }, { "acc": 0.7283885, "epoch": 0.41898812578032485, "grad_norm": 7.03125, "learning_rate": 9.320501287596936e-06, "loss": 1.09128361, "memory(GiB)": 141.16, "step": 37460, "train_speed(iter/s)": 0.29554 }, { "acc": 0.73576212, "epoch": 0.4192118247262834, "grad_norm": 7.9375, "learning_rate": 9.319570142087349e-06, "loss": 1.05997219, "memory(GiB)": 141.16, "step": 37480, "train_speed(iter/s)": 0.295586 }, { "acc": 0.73597002, "epoch": 0.4194355236722419, "grad_norm": 6.8125, "learning_rate": 9.318638405598821e-06, "loss": 1.05634995, "memory(GiB)": 141.16, "step": 37500, "train_speed(iter/s)": 0.295637 }, { "acc": 0.73132381, "epoch": 0.41965922261820043, "grad_norm": 6.375, "learning_rate": 9.317706078258835e-06, "loss": 1.07137527, "memory(GiB)": 141.16, "step": 37520, "train_speed(iter/s)": 0.295687 }, { "acc": 0.72728872, "epoch": 0.41988292156415896, "grad_norm": 4.9375, "learning_rate": 9.31677316019494e-06, "loss": 1.08829508, "memory(GiB)": 141.16, "step": 37540, "train_speed(iter/s)": 0.295728 }, { "acc": 0.73269253, "epoch": 0.4201066205101175, "grad_norm": 8.25, "learning_rate": 9.315839651534778e-06, "loss": 1.07438622, "memory(GiB)": 141.16, "step": 37560, "train_speed(iter/s)": 0.295782 }, { "acc": 0.73209729, "epoch": 0.420330319456076, "grad_norm": 6.96875, "learning_rate": 9.314905552406064e-06, "loss": 1.07374792, "memory(GiB)": 141.16, "step": 37580, "train_speed(iter/s)": 0.295827 }, { "acc": 0.73158078, "epoch": 0.42055401840203455, "grad_norm": 7.84375, "learning_rate": 9.313970862936596e-06, "loss": 1.06990547, "memory(GiB)": 141.16, "step": 37600, "train_speed(iter/s)": 0.295882 }, { "acc": 0.73179455, "epoch": 0.4207777173479931, "grad_norm": 6.90625, "learning_rate": 9.313035583254253e-06, "loss": 1.07031364, "memory(GiB)": 141.16, "step": 37620, "train_speed(iter/s)": 0.295947 }, { "acc": 0.74511366, "epoch": 0.4210014162939516, "grad_norm": 7.375, "learning_rate": 9.312099713486994e-06, "loss": 1.01544399, "memory(GiB)": 141.16, "step": 37640, "train_speed(iter/s)": 0.296 }, { "acc": 0.72913294, "epoch": 0.42122511523991013, "grad_norm": 7.78125, "learning_rate": 9.311163253762862e-06, "loss": 1.08038464, "memory(GiB)": 141.16, "step": 37660, "train_speed(iter/s)": 0.296054 }, { "acc": 0.71568909, "epoch": 0.42144881418586866, "grad_norm": 6.3125, "learning_rate": 9.310226204209974e-06, "loss": 1.15164146, "memory(GiB)": 141.16, "step": 37680, "train_speed(iter/s)": 0.296098 }, { "acc": 0.72114124, "epoch": 0.4216725131318272, "grad_norm": 6.8125, "learning_rate": 9.309288564956535e-06, "loss": 1.10500145, "memory(GiB)": 141.16, "step": 37700, "train_speed(iter/s)": 0.296141 }, { "acc": 0.73089952, "epoch": 0.4218962120777857, "grad_norm": 6.6875, "learning_rate": 9.308350336130828e-06, "loss": 1.0864006, "memory(GiB)": 141.16, "step": 37720, "train_speed(iter/s)": 0.296192 }, { "acc": 0.71868267, "epoch": 0.42211991102374424, "grad_norm": 6.78125, "learning_rate": 9.30741151786121e-06, "loss": 1.13733597, "memory(GiB)": 141.16, "step": 37740, "train_speed(iter/s)": 0.29624 }, { "acc": 0.73408546, "epoch": 0.42234360996970277, "grad_norm": 7.59375, "learning_rate": 9.306472110276132e-06, "loss": 1.05953398, "memory(GiB)": 141.16, "step": 37760, "train_speed(iter/s)": 0.296286 }, { "acc": 0.73760071, "epoch": 0.4225673089156613, "grad_norm": 7.125, "learning_rate": 9.305532113504116e-06, "loss": 1.04399872, "memory(GiB)": 141.16, "step": 37780, "train_speed(iter/s)": 0.296331 }, { "acc": 0.72986412, "epoch": 0.4227910078616198, "grad_norm": 6.71875, "learning_rate": 9.304591527673768e-06, "loss": 1.08306618, "memory(GiB)": 141.16, "step": 37800, "train_speed(iter/s)": 0.296385 }, { "acc": 0.7421752, "epoch": 0.42301470680757836, "grad_norm": 5.3125, "learning_rate": 9.303650352913769e-06, "loss": 1.02858334, "memory(GiB)": 141.16, "step": 37820, "train_speed(iter/s)": 0.296431 }, { "acc": 0.73141665, "epoch": 0.4232384057535369, "grad_norm": 9.375, "learning_rate": 9.30270858935289e-06, "loss": 1.07609711, "memory(GiB)": 141.16, "step": 37840, "train_speed(iter/s)": 0.296478 }, { "acc": 0.73117361, "epoch": 0.4234621046994954, "grad_norm": 7.75, "learning_rate": 9.301766237119975e-06, "loss": 1.06360111, "memory(GiB)": 141.16, "step": 37860, "train_speed(iter/s)": 0.296534 }, { "acc": 0.72558813, "epoch": 0.42368580364545394, "grad_norm": 7.4375, "learning_rate": 9.300823296343955e-06, "loss": 1.10759401, "memory(GiB)": 141.16, "step": 37880, "train_speed(iter/s)": 0.296581 }, { "acc": 0.72083445, "epoch": 0.42390950259141247, "grad_norm": 4.9375, "learning_rate": 9.299879767153834e-06, "loss": 1.12532692, "memory(GiB)": 141.16, "step": 37900, "train_speed(iter/s)": 0.296629 }, { "acc": 0.72818708, "epoch": 0.424133201537371, "grad_norm": 5.1875, "learning_rate": 9.2989356496787e-06, "loss": 1.10657234, "memory(GiB)": 141.16, "step": 37920, "train_speed(iter/s)": 0.296679 }, { "acc": 0.72544556, "epoch": 0.4243569004833295, "grad_norm": 6.34375, "learning_rate": 9.297990944047724e-06, "loss": 1.10789471, "memory(GiB)": 141.16, "step": 37940, "train_speed(iter/s)": 0.296728 }, { "acc": 0.72651377, "epoch": 0.42458059942928805, "grad_norm": 7.09375, "learning_rate": 9.297045650390155e-06, "loss": 1.10219994, "memory(GiB)": 141.16, "step": 37960, "train_speed(iter/s)": 0.296783 }, { "acc": 0.7306386, "epoch": 0.4248042983752466, "grad_norm": 6.5, "learning_rate": 9.29609976883532e-06, "loss": 1.06528368, "memory(GiB)": 141.16, "step": 37980, "train_speed(iter/s)": 0.296839 }, { "acc": 0.72990274, "epoch": 0.4250279973212051, "grad_norm": 8.375, "learning_rate": 9.295153299512634e-06, "loss": 1.09134607, "memory(GiB)": 141.16, "step": 38000, "train_speed(iter/s)": 0.296888 }, { "epoch": 0.4250279973212051, "eval_acc": 0.6878322348066723, "eval_loss": 1.0885381698608398, "eval_runtime": 2317.908, "eval_samples_per_second": 32.479, "eval_steps_per_second": 16.24, "step": 38000 }, { "acc": 0.74226704, "epoch": 0.42525169626716364, "grad_norm": 7.9375, "learning_rate": 9.294206242551584e-06, "loss": 1.0125351, "memory(GiB)": 141.16, "step": 38020, "train_speed(iter/s)": 0.291545 }, { "acc": 0.72715993, "epoch": 0.42547539521312217, "grad_norm": 8.4375, "learning_rate": 9.29325859808174e-06, "loss": 1.09303055, "memory(GiB)": 141.16, "step": 38040, "train_speed(iter/s)": 0.291597 }, { "acc": 0.72850847, "epoch": 0.4256990941590807, "grad_norm": 6.59375, "learning_rate": 9.292310366232757e-06, "loss": 1.10038071, "memory(GiB)": 141.16, "step": 38060, "train_speed(iter/s)": 0.291649 }, { "acc": 0.727878, "epoch": 0.4259227931050392, "grad_norm": 6.03125, "learning_rate": 9.291361547134365e-06, "loss": 1.0942565, "memory(GiB)": 141.16, "step": 38080, "train_speed(iter/s)": 0.291701 }, { "acc": 0.73191519, "epoch": 0.42614649205099775, "grad_norm": 7.28125, "learning_rate": 9.290412140916373e-06, "loss": 1.08472538, "memory(GiB)": 141.16, "step": 38100, "train_speed(iter/s)": 0.291748 }, { "acc": 0.72856588, "epoch": 0.4263701909969563, "grad_norm": 8.4375, "learning_rate": 9.28946214770868e-06, "loss": 1.08381643, "memory(GiB)": 141.16, "step": 38120, "train_speed(iter/s)": 0.291793 }, { "acc": 0.73358159, "epoch": 0.4265938899429148, "grad_norm": 7.0, "learning_rate": 9.28851156764125e-06, "loss": 1.0672719, "memory(GiB)": 141.16, "step": 38140, "train_speed(iter/s)": 0.291844 }, { "acc": 0.73215981, "epoch": 0.42681758888887333, "grad_norm": 6.96875, "learning_rate": 9.287560400844142e-06, "loss": 1.06615791, "memory(GiB)": 141.16, "step": 38160, "train_speed(iter/s)": 0.291886 }, { "acc": 0.72991114, "epoch": 0.42704128783483186, "grad_norm": 9.875, "learning_rate": 9.286608647447489e-06, "loss": 1.07936611, "memory(GiB)": 141.16, "step": 38180, "train_speed(iter/s)": 0.291936 }, { "acc": 0.73592634, "epoch": 0.4272649867807904, "grad_norm": 5.8125, "learning_rate": 9.285656307581503e-06, "loss": 1.07078857, "memory(GiB)": 141.16, "step": 38200, "train_speed(iter/s)": 0.291991 }, { "acc": 0.72920418, "epoch": 0.4274886857267489, "grad_norm": 7.53125, "learning_rate": 9.284703381376479e-06, "loss": 1.10257607, "memory(GiB)": 141.16, "step": 38220, "train_speed(iter/s)": 0.292042 }, { "acc": 0.73113651, "epoch": 0.42771238467270745, "grad_norm": 8.875, "learning_rate": 9.283749868962787e-06, "loss": 1.0677866, "memory(GiB)": 141.16, "step": 38240, "train_speed(iter/s)": 0.292095 }, { "acc": 0.73374472, "epoch": 0.427936083618666, "grad_norm": 9.125, "learning_rate": 9.282795770470888e-06, "loss": 1.04833412, "memory(GiB)": 141.16, "step": 38260, "train_speed(iter/s)": 0.292152 }, { "acc": 0.73218641, "epoch": 0.4281597825646245, "grad_norm": 6.46875, "learning_rate": 9.281841086031309e-06, "loss": 1.07918453, "memory(GiB)": 141.16, "step": 38280, "train_speed(iter/s)": 0.292201 }, { "acc": 0.72468972, "epoch": 0.42838348151058303, "grad_norm": 6.125, "learning_rate": 9.280885815774669e-06, "loss": 1.10947952, "memory(GiB)": 141.16, "step": 38300, "train_speed(iter/s)": 0.292249 }, { "acc": 0.73529596, "epoch": 0.42860718045654156, "grad_norm": 5.59375, "learning_rate": 9.279929959831662e-06, "loss": 1.07033005, "memory(GiB)": 141.16, "step": 38320, "train_speed(iter/s)": 0.292301 }, { "acc": 0.73735127, "epoch": 0.42883087940250014, "grad_norm": 10.0, "learning_rate": 9.27897351833306e-06, "loss": 1.04860897, "memory(GiB)": 141.16, "step": 38340, "train_speed(iter/s)": 0.292353 }, { "acc": 0.72133703, "epoch": 0.42905457834845867, "grad_norm": 5.6875, "learning_rate": 9.278016491409722e-06, "loss": 1.1093132, "memory(GiB)": 141.16, "step": 38360, "train_speed(iter/s)": 0.292402 }, { "acc": 0.73511491, "epoch": 0.4292782772944172, "grad_norm": 5.9375, "learning_rate": 9.27705887919258e-06, "loss": 1.07026072, "memory(GiB)": 141.16, "step": 38380, "train_speed(iter/s)": 0.292449 }, { "acc": 0.72409201, "epoch": 0.4295019762403757, "grad_norm": 6.03125, "learning_rate": 9.276100681812651e-06, "loss": 1.11028032, "memory(GiB)": 141.16, "step": 38400, "train_speed(iter/s)": 0.292497 }, { "acc": 0.7249898, "epoch": 0.42972567518633425, "grad_norm": 8.1875, "learning_rate": 9.27514189940103e-06, "loss": 1.11389465, "memory(GiB)": 141.16, "step": 38420, "train_speed(iter/s)": 0.292549 }, { "acc": 0.74128485, "epoch": 0.4299493741322928, "grad_norm": 5.6875, "learning_rate": 9.274182532088888e-06, "loss": 1.04408998, "memory(GiB)": 141.16, "step": 38440, "train_speed(iter/s)": 0.2926 }, { "acc": 0.73869653, "epoch": 0.4301730730782513, "grad_norm": 6.21875, "learning_rate": 9.273222580007488e-06, "loss": 1.03642807, "memory(GiB)": 141.16, "step": 38460, "train_speed(iter/s)": 0.29265 }, { "acc": 0.73346252, "epoch": 0.43039677202420984, "grad_norm": 6.6875, "learning_rate": 9.272262043288158e-06, "loss": 1.07686987, "memory(GiB)": 141.16, "step": 38480, "train_speed(iter/s)": 0.292701 }, { "acc": 0.72357159, "epoch": 0.43062047097016837, "grad_norm": 5.9375, "learning_rate": 9.271300922062315e-06, "loss": 1.11413078, "memory(GiB)": 141.16, "step": 38500, "train_speed(iter/s)": 0.292753 }, { "acc": 0.72727571, "epoch": 0.4308441699161269, "grad_norm": 8.3125, "learning_rate": 9.270339216461457e-06, "loss": 1.11309757, "memory(GiB)": 141.16, "step": 38520, "train_speed(iter/s)": 0.292803 }, { "acc": 0.72063789, "epoch": 0.4310678688620854, "grad_norm": 7.5, "learning_rate": 9.269376926617155e-06, "loss": 1.12588043, "memory(GiB)": 141.16, "step": 38540, "train_speed(iter/s)": 0.292852 }, { "acc": 0.73638983, "epoch": 0.43129156780804395, "grad_norm": 9.375, "learning_rate": 9.268414052661068e-06, "loss": 1.05636292, "memory(GiB)": 141.16, "step": 38560, "train_speed(iter/s)": 0.292905 }, { "acc": 0.72757759, "epoch": 0.4315152667540025, "grad_norm": 7.9375, "learning_rate": 9.267450594724926e-06, "loss": 1.09211769, "memory(GiB)": 141.16, "step": 38580, "train_speed(iter/s)": 0.292959 }, { "acc": 0.73111525, "epoch": 0.431738965699961, "grad_norm": 6.375, "learning_rate": 9.26648655294055e-06, "loss": 1.07610703, "memory(GiB)": 141.16, "step": 38600, "train_speed(iter/s)": 0.293014 }, { "acc": 0.72225399, "epoch": 0.43196266464591954, "grad_norm": 10.1875, "learning_rate": 9.265521927439829e-06, "loss": 1.12764854, "memory(GiB)": 141.16, "step": 38620, "train_speed(iter/s)": 0.29306 }, { "acc": 0.73651123, "epoch": 0.43218636359187806, "grad_norm": 6.40625, "learning_rate": 9.264556718354742e-06, "loss": 1.04345732, "memory(GiB)": 141.16, "step": 38640, "train_speed(iter/s)": 0.293103 }, { "acc": 0.73414469, "epoch": 0.4324100625378366, "grad_norm": 6.65625, "learning_rate": 9.26359092581734e-06, "loss": 1.04305382, "memory(GiB)": 141.16, "step": 38660, "train_speed(iter/s)": 0.293151 }, { "acc": 0.72468367, "epoch": 0.4326337614837951, "grad_norm": 8.3125, "learning_rate": 9.262624549959759e-06, "loss": 1.0924305, "memory(GiB)": 141.16, "step": 38680, "train_speed(iter/s)": 0.293204 }, { "acc": 0.74320593, "epoch": 0.43285746042975365, "grad_norm": 7.875, "learning_rate": 9.261657590914213e-06, "loss": 1.01284504, "memory(GiB)": 141.16, "step": 38700, "train_speed(iter/s)": 0.293256 }, { "acc": 0.73514576, "epoch": 0.4330811593757122, "grad_norm": 7.875, "learning_rate": 9.260690048812995e-06, "loss": 1.06311398, "memory(GiB)": 141.16, "step": 38720, "train_speed(iter/s)": 0.2933 }, { "acc": 0.74228516, "epoch": 0.4333048583216707, "grad_norm": 6.65625, "learning_rate": 9.259721923788479e-06, "loss": 1.02105541, "memory(GiB)": 141.16, "step": 38740, "train_speed(iter/s)": 0.293348 }, { "acc": 0.73250213, "epoch": 0.43352855726762923, "grad_norm": 6.8125, "learning_rate": 9.258753215973117e-06, "loss": 1.0700325, "memory(GiB)": 141.16, "step": 38760, "train_speed(iter/s)": 0.293402 }, { "acc": 0.73152785, "epoch": 0.43375225621358776, "grad_norm": 6.4375, "learning_rate": 9.257783925499447e-06, "loss": 1.06561108, "memory(GiB)": 141.16, "step": 38780, "train_speed(iter/s)": 0.293457 }, { "acc": 0.70845261, "epoch": 0.4339759551595463, "grad_norm": 7.21875, "learning_rate": 9.256814052500074e-06, "loss": 1.19226265, "memory(GiB)": 141.16, "step": 38800, "train_speed(iter/s)": 0.293502 }, { "acc": 0.73042612, "epoch": 0.4341996541055048, "grad_norm": 6.03125, "learning_rate": 9.255843597107697e-06, "loss": 1.0762989, "memory(GiB)": 141.16, "step": 38820, "train_speed(iter/s)": 0.293554 }, { "acc": 0.73352532, "epoch": 0.43442335305146335, "grad_norm": 6.46875, "learning_rate": 9.254872559455086e-06, "loss": 1.06192093, "memory(GiB)": 141.16, "step": 38840, "train_speed(iter/s)": 0.293601 }, { "acc": 0.72896242, "epoch": 0.4346470519974219, "grad_norm": 7.6875, "learning_rate": 9.253900939675092e-06, "loss": 1.08412418, "memory(GiB)": 141.16, "step": 38860, "train_speed(iter/s)": 0.293652 }, { "acc": 0.72757473, "epoch": 0.4348707509433804, "grad_norm": 9.3125, "learning_rate": 9.252928737900649e-06, "loss": 1.09306736, "memory(GiB)": 141.16, "step": 38880, "train_speed(iter/s)": 0.293703 }, { "acc": 0.73781919, "epoch": 0.43509444988933893, "grad_norm": 7.46875, "learning_rate": 9.251955954264764e-06, "loss": 1.05161524, "memory(GiB)": 141.16, "step": 38900, "train_speed(iter/s)": 0.293753 }, { "acc": 0.73251715, "epoch": 0.43531814883529746, "grad_norm": 7.40625, "learning_rate": 9.25098258890053e-06, "loss": 1.09301186, "memory(GiB)": 141.16, "step": 38920, "train_speed(iter/s)": 0.293804 }, { "acc": 0.72683592, "epoch": 0.435541847781256, "grad_norm": 8.4375, "learning_rate": 9.250008641941119e-06, "loss": 1.09178696, "memory(GiB)": 141.16, "step": 38940, "train_speed(iter/s)": 0.293855 }, { "acc": 0.74776258, "epoch": 0.4357655467272145, "grad_norm": 7.875, "learning_rate": 9.249034113519778e-06, "loss": 1.00648956, "memory(GiB)": 141.16, "step": 38960, "train_speed(iter/s)": 0.293907 }, { "acc": 0.7287879, "epoch": 0.43598924567317304, "grad_norm": 7.46875, "learning_rate": 9.248059003769839e-06, "loss": 1.08006954, "memory(GiB)": 141.16, "step": 38980, "train_speed(iter/s)": 0.293958 }, { "acc": 0.73980188, "epoch": 0.43621294461913157, "grad_norm": 7.8125, "learning_rate": 9.247083312824707e-06, "loss": 1.04018192, "memory(GiB)": 141.16, "step": 39000, "train_speed(iter/s)": 0.294017 }, { "acc": 0.72980909, "epoch": 0.4364366435650901, "grad_norm": 7.5, "learning_rate": 9.246107040817876e-06, "loss": 1.09413424, "memory(GiB)": 141.16, "step": 39020, "train_speed(iter/s)": 0.29407 }, { "acc": 0.73415041, "epoch": 0.4366603425110486, "grad_norm": 6.875, "learning_rate": 9.24513018788291e-06, "loss": 1.0825634, "memory(GiB)": 141.16, "step": 39040, "train_speed(iter/s)": 0.29412 }, { "acc": 0.71714344, "epoch": 0.43688404145700716, "grad_norm": 7.4375, "learning_rate": 9.244152754153454e-06, "loss": 1.13142271, "memory(GiB)": 141.16, "step": 39060, "train_speed(iter/s)": 0.29417 }, { "acc": 0.72053132, "epoch": 0.4371077404029657, "grad_norm": 8.125, "learning_rate": 9.243174739763242e-06, "loss": 1.13247204, "memory(GiB)": 141.16, "step": 39080, "train_speed(iter/s)": 0.294213 }, { "acc": 0.72512836, "epoch": 0.4373314393489242, "grad_norm": 8.0625, "learning_rate": 9.242196144846076e-06, "loss": 1.10934057, "memory(GiB)": 141.16, "step": 39100, "train_speed(iter/s)": 0.294265 }, { "acc": 0.73001003, "epoch": 0.43755513829488274, "grad_norm": 6.59375, "learning_rate": 9.241216969535842e-06, "loss": 1.08927193, "memory(GiB)": 141.16, "step": 39120, "train_speed(iter/s)": 0.294315 }, { "acc": 0.7175384, "epoch": 0.43777883724084127, "grad_norm": 7.3125, "learning_rate": 9.240237213966507e-06, "loss": 1.14092302, "memory(GiB)": 141.16, "step": 39140, "train_speed(iter/s)": 0.294362 }, { "acc": 0.74723215, "epoch": 0.4380025361867998, "grad_norm": 8.5625, "learning_rate": 9.239256878272113e-06, "loss": 0.9879921, "memory(GiB)": 141.16, "step": 39160, "train_speed(iter/s)": 0.294415 }, { "acc": 0.71380463, "epoch": 0.4382262351327583, "grad_norm": 6.0625, "learning_rate": 9.238275962586785e-06, "loss": 1.15617018, "memory(GiB)": 141.16, "step": 39180, "train_speed(iter/s)": 0.294467 }, { "acc": 0.73426528, "epoch": 0.43844993407871685, "grad_norm": 7.65625, "learning_rate": 9.237294467044727e-06, "loss": 1.05774708, "memory(GiB)": 141.16, "step": 39200, "train_speed(iter/s)": 0.294516 }, { "acc": 0.72413092, "epoch": 0.4386736330246754, "grad_norm": 8.0, "learning_rate": 9.23631239178022e-06, "loss": 1.10779552, "memory(GiB)": 141.16, "step": 39220, "train_speed(iter/s)": 0.294564 }, { "acc": 0.71650114, "epoch": 0.4388973319706339, "grad_norm": 7.15625, "learning_rate": 9.235329736927628e-06, "loss": 1.14099541, "memory(GiB)": 141.16, "step": 39240, "train_speed(iter/s)": 0.294616 }, { "acc": 0.71665525, "epoch": 0.43912103091659244, "grad_norm": 5.78125, "learning_rate": 9.23434650262139e-06, "loss": 1.15371084, "memory(GiB)": 141.16, "step": 39260, "train_speed(iter/s)": 0.294669 }, { "acc": 0.72528896, "epoch": 0.43934472986255096, "grad_norm": 5.75, "learning_rate": 9.233362688996028e-06, "loss": 1.08266201, "memory(GiB)": 141.16, "step": 39280, "train_speed(iter/s)": 0.294719 }, { "acc": 0.7415822, "epoch": 0.4395684288085095, "grad_norm": 8.0, "learning_rate": 9.232378296186142e-06, "loss": 1.02355728, "memory(GiB)": 141.16, "step": 39300, "train_speed(iter/s)": 0.294765 }, { "acc": 0.7139348, "epoch": 0.439792127754468, "grad_norm": 5.65625, "learning_rate": 9.23139332432641e-06, "loss": 1.15462017, "memory(GiB)": 141.16, "step": 39320, "train_speed(iter/s)": 0.294812 }, { "acc": 0.73313875, "epoch": 0.44001582670042655, "grad_norm": 6.5625, "learning_rate": 9.23040777355159e-06, "loss": 1.07599983, "memory(GiB)": 141.16, "step": 39340, "train_speed(iter/s)": 0.294858 }, { "acc": 0.73091369, "epoch": 0.4402395256463851, "grad_norm": 8.875, "learning_rate": 9.229421643996521e-06, "loss": 1.08028679, "memory(GiB)": 141.16, "step": 39360, "train_speed(iter/s)": 0.294907 }, { "acc": 0.7300982, "epoch": 0.4404632245923436, "grad_norm": 7.96875, "learning_rate": 9.22843493579612e-06, "loss": 1.07968035, "memory(GiB)": 141.16, "step": 39380, "train_speed(iter/s)": 0.294957 }, { "acc": 0.72686014, "epoch": 0.44068692353830213, "grad_norm": 6.625, "learning_rate": 9.227447649085379e-06, "loss": 1.09481239, "memory(GiB)": 141.16, "step": 39400, "train_speed(iter/s)": 0.295004 }, { "acc": 0.72577066, "epoch": 0.44091062248426066, "grad_norm": 6.375, "learning_rate": 9.226459783999378e-06, "loss": 1.11727238, "memory(GiB)": 141.16, "step": 39420, "train_speed(iter/s)": 0.295056 }, { "acc": 0.72827377, "epoch": 0.4411343214302192, "grad_norm": 8.6875, "learning_rate": 9.225471340673267e-06, "loss": 1.07815046, "memory(GiB)": 141.16, "step": 39440, "train_speed(iter/s)": 0.29511 }, { "acc": 0.72438536, "epoch": 0.4413580203761777, "grad_norm": 6.6875, "learning_rate": 9.224482319242281e-06, "loss": 1.11551743, "memory(GiB)": 141.16, "step": 39460, "train_speed(iter/s)": 0.295161 }, { "acc": 0.72449856, "epoch": 0.44158171932213625, "grad_norm": 9.1875, "learning_rate": 9.223492719841732e-06, "loss": 1.10101128, "memory(GiB)": 141.16, "step": 39480, "train_speed(iter/s)": 0.295208 }, { "acc": 0.7407999, "epoch": 0.4418054182680948, "grad_norm": 5.125, "learning_rate": 9.22250254260701e-06, "loss": 1.04416809, "memory(GiB)": 141.16, "step": 39500, "train_speed(iter/s)": 0.295252 }, { "acc": 0.73330526, "epoch": 0.4420291172140533, "grad_norm": 5.03125, "learning_rate": 9.22151178767359e-06, "loss": 1.07749214, "memory(GiB)": 141.16, "step": 39520, "train_speed(iter/s)": 0.295307 }, { "acc": 0.7281743, "epoch": 0.44225281616001183, "grad_norm": 6.125, "learning_rate": 9.220520455177016e-06, "loss": 1.08968163, "memory(GiB)": 141.16, "step": 39540, "train_speed(iter/s)": 0.295357 }, { "acc": 0.74078302, "epoch": 0.44247651510597036, "grad_norm": 7.25, "learning_rate": 9.219528545252918e-06, "loss": 1.01826344, "memory(GiB)": 141.16, "step": 39560, "train_speed(iter/s)": 0.295402 }, { "acc": 0.73645835, "epoch": 0.4427002140519289, "grad_norm": 7.84375, "learning_rate": 9.218536058037004e-06, "loss": 1.05667696, "memory(GiB)": 141.16, "step": 39580, "train_speed(iter/s)": 0.295449 }, { "acc": 0.72823229, "epoch": 0.44292391299788747, "grad_norm": 6.0625, "learning_rate": 9.217542993665061e-06, "loss": 1.0873188, "memory(GiB)": 141.16, "step": 39600, "train_speed(iter/s)": 0.295495 }, { "acc": 0.73524561, "epoch": 0.443147611943846, "grad_norm": 8.9375, "learning_rate": 9.216549352272954e-06, "loss": 1.07092361, "memory(GiB)": 141.16, "step": 39620, "train_speed(iter/s)": 0.295544 }, { "acc": 0.73453102, "epoch": 0.4433713108898045, "grad_norm": 7.78125, "learning_rate": 9.215555133996628e-06, "loss": 1.05012035, "memory(GiB)": 141.16, "step": 39640, "train_speed(iter/s)": 0.295597 }, { "acc": 0.72877407, "epoch": 0.44359500983576305, "grad_norm": 8.1875, "learning_rate": 9.214560338972105e-06, "loss": 1.07750597, "memory(GiB)": 141.16, "step": 39660, "train_speed(iter/s)": 0.295645 }, { "acc": 0.73834057, "epoch": 0.4438187087817216, "grad_norm": 8.375, "learning_rate": 9.213564967335488e-06, "loss": 1.04593945, "memory(GiB)": 141.16, "step": 39680, "train_speed(iter/s)": 0.295691 }, { "acc": 0.72860832, "epoch": 0.4440424077276801, "grad_norm": 8.3125, "learning_rate": 9.212569019222956e-06, "loss": 1.08029947, "memory(GiB)": 141.16, "step": 39700, "train_speed(iter/s)": 0.295744 }, { "acc": 0.72227736, "epoch": 0.44426610667363864, "grad_norm": 5.53125, "learning_rate": 9.211572494770772e-06, "loss": 1.11445971, "memory(GiB)": 141.16, "step": 39720, "train_speed(iter/s)": 0.295798 }, { "acc": 0.73453426, "epoch": 0.44448980561959717, "grad_norm": 7.0, "learning_rate": 9.210575394115273e-06, "loss": 1.08299332, "memory(GiB)": 141.16, "step": 39740, "train_speed(iter/s)": 0.295849 }, { "acc": 0.72561007, "epoch": 0.4447135045655557, "grad_norm": 7.125, "learning_rate": 9.209577717392879e-06, "loss": 1.09632378, "memory(GiB)": 141.16, "step": 39760, "train_speed(iter/s)": 0.295902 }, { "acc": 0.72010069, "epoch": 0.4449372035115142, "grad_norm": 8.4375, "learning_rate": 9.208579464740083e-06, "loss": 1.12538595, "memory(GiB)": 141.16, "step": 39780, "train_speed(iter/s)": 0.295956 }, { "acc": 0.72714615, "epoch": 0.44516090245747275, "grad_norm": 6.4375, "learning_rate": 9.207580636293462e-06, "loss": 1.08427753, "memory(GiB)": 141.16, "step": 39800, "train_speed(iter/s)": 0.296004 }, { "acc": 0.73100452, "epoch": 0.4453846014034313, "grad_norm": 6.9375, "learning_rate": 9.206581232189668e-06, "loss": 1.06700535, "memory(GiB)": 141.16, "step": 39820, "train_speed(iter/s)": 0.296053 }, { "acc": 0.72139292, "epoch": 0.4456083003493898, "grad_norm": 6.6875, "learning_rate": 9.205581252565438e-06, "loss": 1.1116087, "memory(GiB)": 141.16, "step": 39840, "train_speed(iter/s)": 0.296103 }, { "acc": 0.72316418, "epoch": 0.44583199929534834, "grad_norm": 7.71875, "learning_rate": 9.20458069755758e-06, "loss": 1.1045433, "memory(GiB)": 141.16, "step": 39860, "train_speed(iter/s)": 0.296155 }, { "acc": 0.72800288, "epoch": 0.44605569824130686, "grad_norm": 5.96875, "learning_rate": 9.203579567302987e-06, "loss": 1.09350166, "memory(GiB)": 141.16, "step": 39880, "train_speed(iter/s)": 0.296203 }, { "acc": 0.73822136, "epoch": 0.4462793971872654, "grad_norm": 6.84375, "learning_rate": 9.202577861938624e-06, "loss": 1.05183439, "memory(GiB)": 141.16, "step": 39900, "train_speed(iter/s)": 0.296249 }, { "acc": 0.73586745, "epoch": 0.4465030961332239, "grad_norm": 7.40625, "learning_rate": 9.201575581601541e-06, "loss": 1.05481071, "memory(GiB)": 141.16, "step": 39920, "train_speed(iter/s)": 0.296299 }, { "acc": 0.72859731, "epoch": 0.44672679507918245, "grad_norm": 6.625, "learning_rate": 9.200572726428865e-06, "loss": 1.0862174, "memory(GiB)": 141.16, "step": 39940, "train_speed(iter/s)": 0.296349 }, { "acc": 0.72353783, "epoch": 0.446950494025141, "grad_norm": 6.5, "learning_rate": 9.1995692965578e-06, "loss": 1.11455898, "memory(GiB)": 141.16, "step": 39960, "train_speed(iter/s)": 0.296394 }, { "acc": 0.71702003, "epoch": 0.4471741929710995, "grad_norm": 7.875, "learning_rate": 9.198565292125627e-06, "loss": 1.1360815, "memory(GiB)": 141.16, "step": 39980, "train_speed(iter/s)": 0.296445 }, { "acc": 0.73253365, "epoch": 0.44739789191705803, "grad_norm": 6.28125, "learning_rate": 9.197560713269713e-06, "loss": 1.07112169, "memory(GiB)": 141.16, "step": 40000, "train_speed(iter/s)": 0.296484 }, { "epoch": 0.44739789191705803, "eval_acc": 0.6880096041715223, "eval_loss": 1.0877375602722168, "eval_runtime": 2318.0514, "eval_samples_per_second": 32.477, "eval_steps_per_second": 16.239, "step": 40000 }, { "acc": 0.71923656, "epoch": 0.44762159086301656, "grad_norm": 6.3125, "learning_rate": 9.196555560127499e-06, "loss": 1.13852959, "memory(GiB)": 141.16, "step": 40020, "train_speed(iter/s)": 0.291405 }, { "acc": 0.72608595, "epoch": 0.4478452898089751, "grad_norm": 8.0, "learning_rate": 9.195549832836497e-06, "loss": 1.09704227, "memory(GiB)": 141.16, "step": 40040, "train_speed(iter/s)": 0.291456 }, { "acc": 0.73904343, "epoch": 0.4480689887549336, "grad_norm": 6.90625, "learning_rate": 9.194543531534312e-06, "loss": 1.05719738, "memory(GiB)": 141.16, "step": 40060, "train_speed(iter/s)": 0.2915 }, { "acc": 0.72009954, "epoch": 0.44829268770089215, "grad_norm": 5.21875, "learning_rate": 9.193536656358617e-06, "loss": 1.14116297, "memory(GiB)": 141.16, "step": 40080, "train_speed(iter/s)": 0.291549 }, { "acc": 0.71864891, "epoch": 0.4485163866468507, "grad_norm": 7.28125, "learning_rate": 9.19252920744717e-06, "loss": 1.14500942, "memory(GiB)": 141.16, "step": 40100, "train_speed(iter/s)": 0.291593 }, { "acc": 0.73000898, "epoch": 0.4487400855928092, "grad_norm": 6.09375, "learning_rate": 9.1915211849378e-06, "loss": 1.07062893, "memory(GiB)": 141.16, "step": 40120, "train_speed(iter/s)": 0.291643 }, { "acc": 0.73664417, "epoch": 0.44896378453876773, "grad_norm": 7.75, "learning_rate": 9.190512588968423e-06, "loss": 1.05776939, "memory(GiB)": 141.16, "step": 40140, "train_speed(iter/s)": 0.291688 }, { "acc": 0.72141151, "epoch": 0.44918748348472626, "grad_norm": 7.0, "learning_rate": 9.189503419677026e-06, "loss": 1.10338726, "memory(GiB)": 141.16, "step": 40160, "train_speed(iter/s)": 0.291739 }, { "acc": 0.73655663, "epoch": 0.4494111824306848, "grad_norm": 5.9375, "learning_rate": 9.18849367720168e-06, "loss": 1.06579285, "memory(GiB)": 141.16, "step": 40180, "train_speed(iter/s)": 0.291784 }, { "acc": 0.72667632, "epoch": 0.4496348813766433, "grad_norm": 7.65625, "learning_rate": 9.187483361680534e-06, "loss": 1.11150646, "memory(GiB)": 141.16, "step": 40200, "train_speed(iter/s)": 0.291827 }, { "acc": 0.73284674, "epoch": 0.44985858032260184, "grad_norm": 6.53125, "learning_rate": 9.186472473251808e-06, "loss": 1.08488455, "memory(GiB)": 141.16, "step": 40220, "train_speed(iter/s)": 0.291873 }, { "acc": 0.7200479, "epoch": 0.45008227926856037, "grad_norm": 6.625, "learning_rate": 9.18546101205381e-06, "loss": 1.13264885, "memory(GiB)": 141.16, "step": 40240, "train_speed(iter/s)": 0.291919 }, { "acc": 0.72755966, "epoch": 0.4503059782145189, "grad_norm": 5.3125, "learning_rate": 9.184448978224923e-06, "loss": 1.09839821, "memory(GiB)": 141.16, "step": 40260, "train_speed(iter/s)": 0.291968 }, { "acc": 0.71585951, "epoch": 0.4505296771604774, "grad_norm": 8.9375, "learning_rate": 9.183436371903605e-06, "loss": 1.13700848, "memory(GiB)": 141.16, "step": 40280, "train_speed(iter/s)": 0.292016 }, { "acc": 0.73552256, "epoch": 0.45075337610643595, "grad_norm": 5.875, "learning_rate": 9.182423193228397e-06, "loss": 1.07145329, "memory(GiB)": 141.16, "step": 40300, "train_speed(iter/s)": 0.292066 }, { "acc": 0.73024178, "epoch": 0.4509770750523945, "grad_norm": 6.75, "learning_rate": 9.181409442337913e-06, "loss": 1.08584557, "memory(GiB)": 141.16, "step": 40320, "train_speed(iter/s)": 0.292115 }, { "acc": 0.7364501, "epoch": 0.451200773998353, "grad_norm": 6.71875, "learning_rate": 9.180395119370853e-06, "loss": 1.05017757, "memory(GiB)": 141.16, "step": 40340, "train_speed(iter/s)": 0.292167 }, { "acc": 0.72069497, "epoch": 0.45142447294431154, "grad_norm": 6.78125, "learning_rate": 9.179380224465988e-06, "loss": 1.12836123, "memory(GiB)": 141.16, "step": 40360, "train_speed(iter/s)": 0.29221 }, { "acc": 0.71507902, "epoch": 0.45164817189027007, "grad_norm": 7.09375, "learning_rate": 9.178364757762173e-06, "loss": 1.14615145, "memory(GiB)": 141.16, "step": 40380, "train_speed(iter/s)": 0.292258 }, { "acc": 0.73081532, "epoch": 0.4518718708362286, "grad_norm": 8.5625, "learning_rate": 9.177348719398335e-06, "loss": 1.05081244, "memory(GiB)": 141.16, "step": 40400, "train_speed(iter/s)": 0.292308 }, { "acc": 0.73671618, "epoch": 0.4520955697821871, "grad_norm": 7.625, "learning_rate": 9.176332109513486e-06, "loss": 1.05025387, "memory(GiB)": 141.16, "step": 40420, "train_speed(iter/s)": 0.292355 }, { "acc": 0.73492007, "epoch": 0.45231926872814565, "grad_norm": 6.53125, "learning_rate": 9.17531492824671e-06, "loss": 1.06651573, "memory(GiB)": 141.16, "step": 40440, "train_speed(iter/s)": 0.2924 }, { "acc": 0.72296028, "epoch": 0.4525429676741042, "grad_norm": 6.34375, "learning_rate": 9.174297175737173e-06, "loss": 1.12183113, "memory(GiB)": 141.16, "step": 40460, "train_speed(iter/s)": 0.292451 }, { "acc": 0.71619468, "epoch": 0.4527666666200627, "grad_norm": 8.6875, "learning_rate": 9.173278852124117e-06, "loss": 1.15239315, "memory(GiB)": 141.16, "step": 40480, "train_speed(iter/s)": 0.292497 }, { "acc": 0.72070503, "epoch": 0.45299036556602124, "grad_norm": 7.4375, "learning_rate": 9.172259957546865e-06, "loss": 1.12843609, "memory(GiB)": 141.16, "step": 40500, "train_speed(iter/s)": 0.292542 }, { "acc": 0.72846456, "epoch": 0.45321406451197976, "grad_norm": 5.875, "learning_rate": 9.171240492144815e-06, "loss": 1.07666445, "memory(GiB)": 141.16, "step": 40520, "train_speed(iter/s)": 0.29259 }, { "acc": 0.73587713, "epoch": 0.4534377634579383, "grad_norm": 5.65625, "learning_rate": 9.170220456057444e-06, "loss": 1.04839249, "memory(GiB)": 141.16, "step": 40540, "train_speed(iter/s)": 0.292636 }, { "acc": 0.73555894, "epoch": 0.4536614624038968, "grad_norm": 7.1875, "learning_rate": 9.16919984942431e-06, "loss": 1.04579725, "memory(GiB)": 141.16, "step": 40560, "train_speed(iter/s)": 0.292684 }, { "acc": 0.72540817, "epoch": 0.45388516134985535, "grad_norm": 7.59375, "learning_rate": 9.168178672385045e-06, "loss": 1.09767456, "memory(GiB)": 141.16, "step": 40580, "train_speed(iter/s)": 0.292738 }, { "acc": 0.73158283, "epoch": 0.4541088602958139, "grad_norm": 6.15625, "learning_rate": 9.16715692507936e-06, "loss": 1.08337231, "memory(GiB)": 141.16, "step": 40600, "train_speed(iter/s)": 0.292786 }, { "acc": 0.73991146, "epoch": 0.4543325592417724, "grad_norm": 6.03125, "learning_rate": 9.166134607647045e-06, "loss": 1.03329544, "memory(GiB)": 141.16, "step": 40620, "train_speed(iter/s)": 0.292832 }, { "acc": 0.73401365, "epoch": 0.45455625818773093, "grad_norm": 8.375, "learning_rate": 9.165111720227968e-06, "loss": 1.0886363, "memory(GiB)": 141.16, "step": 40640, "train_speed(iter/s)": 0.292875 }, { "acc": 0.71697693, "epoch": 0.45477995713368946, "grad_norm": 6.9375, "learning_rate": 9.164088262962074e-06, "loss": 1.12353506, "memory(GiB)": 141.16, "step": 40660, "train_speed(iter/s)": 0.292928 }, { "acc": 0.73228121, "epoch": 0.455003656079648, "grad_norm": 8.125, "learning_rate": 9.163064235989388e-06, "loss": 1.07260685, "memory(GiB)": 141.16, "step": 40680, "train_speed(iter/s)": 0.29298 }, { "acc": 0.72316122, "epoch": 0.4552273550256065, "grad_norm": 6.625, "learning_rate": 9.162039639450012e-06, "loss": 1.10825996, "memory(GiB)": 141.16, "step": 40700, "train_speed(iter/s)": 0.293031 }, { "acc": 0.73202314, "epoch": 0.45545105397156505, "grad_norm": 9.0625, "learning_rate": 9.161014473484122e-06, "loss": 1.08365717, "memory(GiB)": 141.16, "step": 40720, "train_speed(iter/s)": 0.29307 }, { "acc": 0.72330418, "epoch": 0.4556747529175236, "grad_norm": 6.84375, "learning_rate": 9.159988738231978e-06, "loss": 1.11188469, "memory(GiB)": 141.16, "step": 40740, "train_speed(iter/s)": 0.293097 }, { "acc": 0.73288922, "epoch": 0.4558984518634821, "grad_norm": 5.875, "learning_rate": 9.158962433833914e-06, "loss": 1.06119852, "memory(GiB)": 141.16, "step": 40760, "train_speed(iter/s)": 0.293142 }, { "acc": 0.71754513, "epoch": 0.45612215080944063, "grad_norm": 7.375, "learning_rate": 9.157935560430344e-06, "loss": 1.1211791, "memory(GiB)": 141.16, "step": 40780, "train_speed(iter/s)": 0.293192 }, { "acc": 0.73451872, "epoch": 0.45634584975539916, "grad_norm": 6.3125, "learning_rate": 9.156908118161759e-06, "loss": 1.06881847, "memory(GiB)": 141.16, "step": 40800, "train_speed(iter/s)": 0.293242 }, { "acc": 0.7355113, "epoch": 0.4565695487013577, "grad_norm": 7.03125, "learning_rate": 9.155880107168728e-06, "loss": 1.04429245, "memory(GiB)": 141.16, "step": 40820, "train_speed(iter/s)": 0.293291 }, { "acc": 0.7273715, "epoch": 0.45679324764731627, "grad_norm": 9.8125, "learning_rate": 9.154851527591897e-06, "loss": 1.09287205, "memory(GiB)": 141.16, "step": 40840, "train_speed(iter/s)": 0.293339 }, { "acc": 0.7184433, "epoch": 0.4570169465932748, "grad_norm": 7.0625, "learning_rate": 9.15382237957199e-06, "loss": 1.12300653, "memory(GiB)": 141.16, "step": 40860, "train_speed(iter/s)": 0.293392 }, { "acc": 0.71532984, "epoch": 0.4572406455392333, "grad_norm": 7.8125, "learning_rate": 9.15279266324981e-06, "loss": 1.14656181, "memory(GiB)": 141.16, "step": 40880, "train_speed(iter/s)": 0.29343 }, { "acc": 0.73173914, "epoch": 0.45746434448519185, "grad_norm": 6.5, "learning_rate": 9.151762378766236e-06, "loss": 1.06807795, "memory(GiB)": 141.16, "step": 40900, "train_speed(iter/s)": 0.293479 }, { "acc": 0.7318099, "epoch": 0.4576880434311504, "grad_norm": 8.1875, "learning_rate": 9.150731526262226e-06, "loss": 1.08561993, "memory(GiB)": 141.16, "step": 40920, "train_speed(iter/s)": 0.293521 }, { "acc": 0.74025664, "epoch": 0.4579117423771089, "grad_norm": 6.3125, "learning_rate": 9.149700105878818e-06, "loss": 1.02762289, "memory(GiB)": 141.16, "step": 40940, "train_speed(iter/s)": 0.293564 }, { "acc": 0.72076349, "epoch": 0.45813544132306744, "grad_norm": 6.78125, "learning_rate": 9.148668117757121e-06, "loss": 1.1207037, "memory(GiB)": 141.16, "step": 40960, "train_speed(iter/s)": 0.293614 }, { "acc": 0.72050405, "epoch": 0.45835914026902597, "grad_norm": 7.53125, "learning_rate": 9.147635562038327e-06, "loss": 1.13600206, "memory(GiB)": 141.16, "step": 40980, "train_speed(iter/s)": 0.293657 }, { "acc": 0.72840796, "epoch": 0.4585828392149845, "grad_norm": 6.59375, "learning_rate": 9.146602438863705e-06, "loss": 1.0850852, "memory(GiB)": 141.16, "step": 41000, "train_speed(iter/s)": 0.293704 }, { "acc": 0.73385334, "epoch": 0.458806538160943, "grad_norm": 7.375, "learning_rate": 9.1455687483746e-06, "loss": 1.06882706, "memory(GiB)": 141.16, "step": 41020, "train_speed(iter/s)": 0.293752 }, { "acc": 0.72499785, "epoch": 0.45903023710690155, "grad_norm": 7.65625, "learning_rate": 9.144534490712438e-06, "loss": 1.10280933, "memory(GiB)": 141.16, "step": 41040, "train_speed(iter/s)": 0.293799 }, { "acc": 0.73604364, "epoch": 0.4592539360528601, "grad_norm": 6.0, "learning_rate": 9.143499666018719e-06, "loss": 1.05961704, "memory(GiB)": 141.16, "step": 41060, "train_speed(iter/s)": 0.293848 }, { "acc": 0.72360196, "epoch": 0.4594776349988186, "grad_norm": 7.5625, "learning_rate": 9.142464274435018e-06, "loss": 1.12010345, "memory(GiB)": 141.16, "step": 41080, "train_speed(iter/s)": 0.293892 }, { "acc": 0.73527393, "epoch": 0.45970133394477714, "grad_norm": 6.0, "learning_rate": 9.141428316102998e-06, "loss": 1.05748081, "memory(GiB)": 141.16, "step": 41100, "train_speed(iter/s)": 0.29394 }, { "acc": 0.7307126, "epoch": 0.45992503289073566, "grad_norm": 8.375, "learning_rate": 9.140391791164389e-06, "loss": 1.08681297, "memory(GiB)": 141.16, "step": 41120, "train_speed(iter/s)": 0.293991 }, { "acc": 0.73651104, "epoch": 0.4601487318366942, "grad_norm": 7.90625, "learning_rate": 9.139354699761003e-06, "loss": 1.06413021, "memory(GiB)": 141.16, "step": 41140, "train_speed(iter/s)": 0.294035 }, { "acc": 0.7212182, "epoch": 0.4603724307826527, "grad_norm": 8.0625, "learning_rate": 9.138317042034728e-06, "loss": 1.11677866, "memory(GiB)": 141.16, "step": 41160, "train_speed(iter/s)": 0.294078 }, { "acc": 0.72388906, "epoch": 0.46059612972861125, "grad_norm": 7.8125, "learning_rate": 9.137278818127532e-06, "loss": 1.12574272, "memory(GiB)": 141.16, "step": 41180, "train_speed(iter/s)": 0.294123 }, { "acc": 0.72476387, "epoch": 0.4608198286745698, "grad_norm": 6.53125, "learning_rate": 9.13624002818146e-06, "loss": 1.09921227, "memory(GiB)": 141.16, "step": 41200, "train_speed(iter/s)": 0.294173 }, { "acc": 0.71587949, "epoch": 0.4610435276205283, "grad_norm": 8.8125, "learning_rate": 9.135200672338631e-06, "loss": 1.14335871, "memory(GiB)": 141.16, "step": 41220, "train_speed(iter/s)": 0.29422 }, { "acc": 0.72335224, "epoch": 0.46126722656648683, "grad_norm": 6.65625, "learning_rate": 9.134160750741243e-06, "loss": 1.13134537, "memory(GiB)": 141.16, "step": 41240, "train_speed(iter/s)": 0.294268 }, { "acc": 0.74015574, "epoch": 0.46149092551244536, "grad_norm": 6.875, "learning_rate": 9.133120263531576e-06, "loss": 1.03272581, "memory(GiB)": 141.16, "step": 41260, "train_speed(iter/s)": 0.294309 }, { "acc": 0.73414116, "epoch": 0.4617146244584039, "grad_norm": 7.9375, "learning_rate": 9.132079210851979e-06, "loss": 1.0640996, "memory(GiB)": 141.16, "step": 41280, "train_speed(iter/s)": 0.294354 }, { "acc": 0.71350632, "epoch": 0.4619383234043624, "grad_norm": 6.78125, "learning_rate": 9.131037592844884e-06, "loss": 1.15697327, "memory(GiB)": 141.16, "step": 41300, "train_speed(iter/s)": 0.294401 }, { "acc": 0.73884921, "epoch": 0.46216202235032094, "grad_norm": 8.5625, "learning_rate": 9.129995409652803e-06, "loss": 1.02956161, "memory(GiB)": 141.16, "step": 41320, "train_speed(iter/s)": 0.294449 }, { "acc": 0.73146853, "epoch": 0.4623857212962795, "grad_norm": 7.0, "learning_rate": 9.128952661418317e-06, "loss": 1.06239719, "memory(GiB)": 141.16, "step": 41340, "train_speed(iter/s)": 0.294499 }, { "acc": 0.73825388, "epoch": 0.462609420242238, "grad_norm": 6.4375, "learning_rate": 9.12790934828409e-06, "loss": 1.03649979, "memory(GiB)": 141.16, "step": 41360, "train_speed(iter/s)": 0.294548 }, { "acc": 0.73831477, "epoch": 0.46283311918819653, "grad_norm": 7.65625, "learning_rate": 9.126865470392864e-06, "loss": 1.05770569, "memory(GiB)": 141.16, "step": 41380, "train_speed(iter/s)": 0.294582 }, { "acc": 0.72790332, "epoch": 0.46305681813415506, "grad_norm": 7.875, "learning_rate": 9.125821027887454e-06, "loss": 1.08347082, "memory(GiB)": 141.16, "step": 41400, "train_speed(iter/s)": 0.294636 }, { "acc": 0.7271718, "epoch": 0.4632805170801136, "grad_norm": 8.0, "learning_rate": 9.124776020910757e-06, "loss": 1.09797897, "memory(GiB)": 141.16, "step": 41420, "train_speed(iter/s)": 0.294687 }, { "acc": 0.72985029, "epoch": 0.4635042160260721, "grad_norm": 6.8125, "learning_rate": 9.123730449605743e-06, "loss": 1.07701588, "memory(GiB)": 141.16, "step": 41440, "train_speed(iter/s)": 0.294727 }, { "acc": 0.7158648, "epoch": 0.46372791497203064, "grad_norm": 8.25, "learning_rate": 9.122684314115461e-06, "loss": 1.17107925, "memory(GiB)": 141.16, "step": 41460, "train_speed(iter/s)": 0.294775 }, { "acc": 0.73937063, "epoch": 0.46395161391798917, "grad_norm": 7.8125, "learning_rate": 9.121637614583041e-06, "loss": 1.03979473, "memory(GiB)": 141.16, "step": 41480, "train_speed(iter/s)": 0.294825 }, { "acc": 0.73394761, "epoch": 0.4641753128639477, "grad_norm": 7.1875, "learning_rate": 9.12059035115168e-06, "loss": 1.06422129, "memory(GiB)": 141.16, "step": 41500, "train_speed(iter/s)": 0.294869 }, { "acc": 0.72969861, "epoch": 0.4643990118099062, "grad_norm": 6.28125, "learning_rate": 9.119542523964665e-06, "loss": 1.07468824, "memory(GiB)": 141.16, "step": 41520, "train_speed(iter/s)": 0.29491 }, { "acc": 0.73463326, "epoch": 0.46462271075586475, "grad_norm": 6.71875, "learning_rate": 9.118494133165349e-06, "loss": 1.0650629, "memory(GiB)": 141.16, "step": 41540, "train_speed(iter/s)": 0.294954 }, { "acc": 0.73516836, "epoch": 0.4648464097018233, "grad_norm": 6.59375, "learning_rate": 9.11744517889717e-06, "loss": 1.05903511, "memory(GiB)": 141.16, "step": 41560, "train_speed(iter/s)": 0.295004 }, { "acc": 0.7240284, "epoch": 0.4650701086477818, "grad_norm": 8.5, "learning_rate": 9.11639566130364e-06, "loss": 1.10679817, "memory(GiB)": 141.16, "step": 41580, "train_speed(iter/s)": 0.29505 }, { "acc": 0.73141785, "epoch": 0.46529380759374034, "grad_norm": 7.6875, "learning_rate": 9.115345580528342e-06, "loss": 1.066506, "memory(GiB)": 141.16, "step": 41600, "train_speed(iter/s)": 0.295094 }, { "acc": 0.72070379, "epoch": 0.46551750653969887, "grad_norm": 6.59375, "learning_rate": 9.114294936714951e-06, "loss": 1.13285532, "memory(GiB)": 141.16, "step": 41620, "train_speed(iter/s)": 0.295139 }, { "acc": 0.71979113, "epoch": 0.4657412054856574, "grad_norm": 7.90625, "learning_rate": 9.113243730007204e-06, "loss": 1.13262072, "memory(GiB)": 141.16, "step": 41640, "train_speed(iter/s)": 0.295185 }, { "acc": 0.71964464, "epoch": 0.4659649044316159, "grad_norm": 7.375, "learning_rate": 9.112191960548924e-06, "loss": 1.1477354, "memory(GiB)": 141.16, "step": 41660, "train_speed(iter/s)": 0.295227 }, { "acc": 0.72336044, "epoch": 0.46618860337757445, "grad_norm": 7.625, "learning_rate": 9.111139628484005e-06, "loss": 1.11645813, "memory(GiB)": 141.16, "step": 41680, "train_speed(iter/s)": 0.295273 }, { "acc": 0.72906542, "epoch": 0.466412302323533, "grad_norm": 8.9375, "learning_rate": 9.110086733956425e-06, "loss": 1.07498531, "memory(GiB)": 141.16, "step": 41700, "train_speed(iter/s)": 0.295327 }, { "acc": 0.72967849, "epoch": 0.4666360012694915, "grad_norm": 8.5, "learning_rate": 9.109033277110233e-06, "loss": 1.08689013, "memory(GiB)": 141.16, "step": 41720, "train_speed(iter/s)": 0.295375 }, { "acc": 0.73028698, "epoch": 0.46685970021545004, "grad_norm": 6.15625, "learning_rate": 9.107979258089556e-06, "loss": 1.07974834, "memory(GiB)": 141.16, "step": 41740, "train_speed(iter/s)": 0.295424 }, { "acc": 0.7288558, "epoch": 0.46708339916140856, "grad_norm": 6.4375, "learning_rate": 9.106924677038601e-06, "loss": 1.08034182, "memory(GiB)": 141.16, "step": 41760, "train_speed(iter/s)": 0.295471 }, { "acc": 0.7303277, "epoch": 0.4673070981073671, "grad_norm": 7.96875, "learning_rate": 9.105869534101648e-06, "loss": 1.08395424, "memory(GiB)": 141.16, "step": 41780, "train_speed(iter/s)": 0.295519 }, { "acc": 0.73222561, "epoch": 0.4675307970533256, "grad_norm": 8.0, "learning_rate": 9.104813829423056e-06, "loss": 1.07869987, "memory(GiB)": 141.16, "step": 41800, "train_speed(iter/s)": 0.295564 }, { "acc": 0.74225082, "epoch": 0.46775449599928415, "grad_norm": 6.8125, "learning_rate": 9.103757563147261e-06, "loss": 1.02722816, "memory(GiB)": 141.16, "step": 41820, "train_speed(iter/s)": 0.295604 }, { "acc": 0.73882322, "epoch": 0.4679781949452427, "grad_norm": 6.6875, "learning_rate": 9.102700735418777e-06, "loss": 1.03994856, "memory(GiB)": 141.16, "step": 41840, "train_speed(iter/s)": 0.295649 }, { "acc": 0.73041244, "epoch": 0.4682018938912012, "grad_norm": 5.875, "learning_rate": 9.10164334638219e-06, "loss": 1.07882748, "memory(GiB)": 141.16, "step": 41860, "train_speed(iter/s)": 0.295698 }, { "acc": 0.71726437, "epoch": 0.46842559283715973, "grad_norm": 7.34375, "learning_rate": 9.100585396182166e-06, "loss": 1.135112, "memory(GiB)": 141.16, "step": 41880, "train_speed(iter/s)": 0.295737 }, { "acc": 0.7443306, "epoch": 0.46864929178311826, "grad_norm": 6.375, "learning_rate": 9.099526884963451e-06, "loss": 1.01585426, "memory(GiB)": 141.16, "step": 41900, "train_speed(iter/s)": 0.295783 }, { "acc": 0.72749138, "epoch": 0.4688729907290768, "grad_norm": 6.09375, "learning_rate": 9.09846781287086e-06, "loss": 1.09126854, "memory(GiB)": 141.16, "step": 41920, "train_speed(iter/s)": 0.29583 }, { "acc": 0.72947569, "epoch": 0.4690966896750353, "grad_norm": 8.125, "learning_rate": 9.097408180049295e-06, "loss": 1.08336849, "memory(GiB)": 141.16, "step": 41940, "train_speed(iter/s)": 0.295875 }, { "acc": 0.7376215, "epoch": 0.46932038862099384, "grad_norm": 7.15625, "learning_rate": 9.096347986643723e-06, "loss": 1.04688663, "memory(GiB)": 141.16, "step": 41960, "train_speed(iter/s)": 0.295921 }, { "acc": 0.72159491, "epoch": 0.4695440875669524, "grad_norm": 7.09375, "learning_rate": 9.095287232799196e-06, "loss": 1.13473396, "memory(GiB)": 141.16, "step": 41980, "train_speed(iter/s)": 0.295963 }, { "acc": 0.73290157, "epoch": 0.4697677865129109, "grad_norm": 6.09375, "learning_rate": 9.094225918660842e-06, "loss": 1.07460299, "memory(GiB)": 141.16, "step": 42000, "train_speed(iter/s)": 0.296007 }, { "epoch": 0.4697677865129109, "eval_acc": 0.6882532282407522, "eval_loss": 1.0867210626602173, "eval_runtime": 2318.7308, "eval_samples_per_second": 32.467, "eval_steps_per_second": 16.234, "step": 42000 }, { "acc": 0.74088688, "epoch": 0.46999148545886943, "grad_norm": 7.21875, "learning_rate": 9.093164044373862e-06, "loss": 1.03209143, "memory(GiB)": 141.16, "step": 42020, "train_speed(iter/s)": 0.291193 }, { "acc": 0.72290993, "epoch": 0.47021518440482796, "grad_norm": 6.6875, "learning_rate": 9.092101610083534e-06, "loss": 1.12040062, "memory(GiB)": 141.16, "step": 42040, "train_speed(iter/s)": 0.291239 }, { "acc": 0.72198391, "epoch": 0.4704388833507865, "grad_norm": 6.53125, "learning_rate": 9.091038615935217e-06, "loss": 1.10833015, "memory(GiB)": 141.16, "step": 42060, "train_speed(iter/s)": 0.291291 }, { "acc": 0.72451239, "epoch": 0.470662582296745, "grad_norm": 6.5625, "learning_rate": 9.089975062074345e-06, "loss": 1.11497326, "memory(GiB)": 141.16, "step": 42080, "train_speed(iter/s)": 0.291339 }, { "acc": 0.73162947, "epoch": 0.4708862812427036, "grad_norm": 7.5625, "learning_rate": 9.088910948646424e-06, "loss": 1.06812649, "memory(GiB)": 141.16, "step": 42100, "train_speed(iter/s)": 0.291387 }, { "acc": 0.73172932, "epoch": 0.4711099801886621, "grad_norm": 7.96875, "learning_rate": 9.08784627579704e-06, "loss": 1.07498055, "memory(GiB)": 141.16, "step": 42120, "train_speed(iter/s)": 0.291429 }, { "acc": 0.72722306, "epoch": 0.47133367913462065, "grad_norm": 5.625, "learning_rate": 9.086781043671857e-06, "loss": 1.08809204, "memory(GiB)": 141.16, "step": 42140, "train_speed(iter/s)": 0.291477 }, { "acc": 0.72973905, "epoch": 0.4715573780805792, "grad_norm": 7.875, "learning_rate": 9.085715252416616e-06, "loss": 1.0780736, "memory(GiB)": 141.16, "step": 42160, "train_speed(iter/s)": 0.291527 }, { "acc": 0.72475567, "epoch": 0.4717810770265377, "grad_norm": 6.34375, "learning_rate": 9.084648902177127e-06, "loss": 1.10909176, "memory(GiB)": 141.16, "step": 42180, "train_speed(iter/s)": 0.291568 }, { "acc": 0.73693705, "epoch": 0.47200477597249624, "grad_norm": 9.375, "learning_rate": 9.083581993099287e-06, "loss": 1.05558147, "memory(GiB)": 141.16, "step": 42200, "train_speed(iter/s)": 0.291612 }, { "acc": 0.73371906, "epoch": 0.47222847491845477, "grad_norm": 5.75, "learning_rate": 9.082514525329063e-06, "loss": 1.08984594, "memory(GiB)": 141.16, "step": 42220, "train_speed(iter/s)": 0.291649 }, { "acc": 0.71801357, "epoch": 0.4724521738644133, "grad_norm": 7.28125, "learning_rate": 9.081446499012498e-06, "loss": 1.12964325, "memory(GiB)": 141.16, "step": 42240, "train_speed(iter/s)": 0.291695 }, { "acc": 0.71824102, "epoch": 0.4726758728103718, "grad_norm": 4.78125, "learning_rate": 9.080377914295714e-06, "loss": 1.14218273, "memory(GiB)": 141.16, "step": 42260, "train_speed(iter/s)": 0.291743 }, { "acc": 0.74428844, "epoch": 0.47289957175633035, "grad_norm": 9.125, "learning_rate": 9.07930877132491e-06, "loss": 0.9983614, "memory(GiB)": 141.16, "step": 42280, "train_speed(iter/s)": 0.291793 }, { "acc": 0.71848354, "epoch": 0.4731232707022889, "grad_norm": 8.4375, "learning_rate": 9.07823907024636e-06, "loss": 1.13016529, "memory(GiB)": 141.16, "step": 42300, "train_speed(iter/s)": 0.291839 }, { "acc": 0.72422571, "epoch": 0.4733469696482474, "grad_norm": 8.5, "learning_rate": 9.077168811206414e-06, "loss": 1.10963068, "memory(GiB)": 141.16, "step": 42320, "train_speed(iter/s)": 0.291879 }, { "acc": 0.7201601, "epoch": 0.47357066859420593, "grad_norm": 7.1875, "learning_rate": 9.076097994351499e-06, "loss": 1.13612566, "memory(GiB)": 141.16, "step": 42340, "train_speed(iter/s)": 0.291924 }, { "acc": 0.73964262, "epoch": 0.47379436754016446, "grad_norm": 7.28125, "learning_rate": 9.075026619828116e-06, "loss": 1.02760382, "memory(GiB)": 141.16, "step": 42360, "train_speed(iter/s)": 0.291976 }, { "acc": 0.72759771, "epoch": 0.474018066486123, "grad_norm": 6.78125, "learning_rate": 9.073954687782846e-06, "loss": 1.09918156, "memory(GiB)": 141.16, "step": 42380, "train_speed(iter/s)": 0.292021 }, { "acc": 0.73304029, "epoch": 0.4742417654320815, "grad_norm": 8.875, "learning_rate": 9.072882198362345e-06, "loss": 1.06386681, "memory(GiB)": 141.16, "step": 42400, "train_speed(iter/s)": 0.292069 }, { "acc": 0.73418226, "epoch": 0.47446546437804005, "grad_norm": 8.5, "learning_rate": 9.071809151713341e-06, "loss": 1.0674551, "memory(GiB)": 141.16, "step": 42420, "train_speed(iter/s)": 0.292118 }, { "acc": 0.72948027, "epoch": 0.4746891633239986, "grad_norm": 7.84375, "learning_rate": 9.070735547982651e-06, "loss": 1.08645611, "memory(GiB)": 141.16, "step": 42440, "train_speed(iter/s)": 0.292164 }, { "acc": 0.72509451, "epoch": 0.4749128622699571, "grad_norm": 5.625, "learning_rate": 9.06966138731715e-06, "loss": 1.10627937, "memory(GiB)": 141.16, "step": 42460, "train_speed(iter/s)": 0.292211 }, { "acc": 0.73762579, "epoch": 0.47513656121591563, "grad_norm": 5.65625, "learning_rate": 9.068586669863804e-06, "loss": 1.04451685, "memory(GiB)": 141.16, "step": 42480, "train_speed(iter/s)": 0.292256 }, { "acc": 0.7262146, "epoch": 0.47536026016187416, "grad_norm": 11.8125, "learning_rate": 9.067511395769649e-06, "loss": 1.10254059, "memory(GiB)": 141.16, "step": 42500, "train_speed(iter/s)": 0.292312 }, { "acc": 0.72174282, "epoch": 0.4755839591078327, "grad_norm": 6.875, "learning_rate": 9.066435565181795e-06, "loss": 1.10953445, "memory(GiB)": 141.16, "step": 42520, "train_speed(iter/s)": 0.292361 }, { "acc": 0.7276412, "epoch": 0.4758076580537912, "grad_norm": 6.9375, "learning_rate": 9.065359178247434e-06, "loss": 1.09845219, "memory(GiB)": 141.16, "step": 42540, "train_speed(iter/s)": 0.292408 }, { "acc": 0.72292447, "epoch": 0.47603135699974974, "grad_norm": 9.4375, "learning_rate": 9.06428223511383e-06, "loss": 1.12225342, "memory(GiB)": 141.16, "step": 42560, "train_speed(iter/s)": 0.292453 }, { "acc": 0.74051304, "epoch": 0.47625505594570827, "grad_norm": 6.65625, "learning_rate": 9.063204735928323e-06, "loss": 1.02366667, "memory(GiB)": 141.16, "step": 42580, "train_speed(iter/s)": 0.292499 }, { "acc": 0.73968582, "epoch": 0.4764787548916668, "grad_norm": 5.03125, "learning_rate": 9.062126680838332e-06, "loss": 1.03540487, "memory(GiB)": 141.16, "step": 42600, "train_speed(iter/s)": 0.292539 }, { "acc": 0.7303566, "epoch": 0.47670245383762533, "grad_norm": 6.25, "learning_rate": 9.06104806999135e-06, "loss": 1.09025421, "memory(GiB)": 141.16, "step": 42620, "train_speed(iter/s)": 0.292579 }, { "acc": 0.7262598, "epoch": 0.47692615278358386, "grad_norm": 8.3125, "learning_rate": 9.059968903534948e-06, "loss": 1.10296774, "memory(GiB)": 141.16, "step": 42640, "train_speed(iter/s)": 0.292625 }, { "acc": 0.71468029, "epoch": 0.4771498517295424, "grad_norm": 7.5625, "learning_rate": 9.058889181616768e-06, "loss": 1.14958048, "memory(GiB)": 141.16, "step": 42660, "train_speed(iter/s)": 0.29267 }, { "acc": 0.73521719, "epoch": 0.4773735506755009, "grad_norm": 7.625, "learning_rate": 9.057808904384534e-06, "loss": 1.06460323, "memory(GiB)": 141.16, "step": 42680, "train_speed(iter/s)": 0.292719 }, { "acc": 0.72855873, "epoch": 0.47759724962145944, "grad_norm": 7.65625, "learning_rate": 9.056728071986041e-06, "loss": 1.07361164, "memory(GiB)": 141.16, "step": 42700, "train_speed(iter/s)": 0.292763 }, { "acc": 0.73217554, "epoch": 0.47782094856741797, "grad_norm": 5.96875, "learning_rate": 9.055646684569164e-06, "loss": 1.0752491, "memory(GiB)": 141.16, "step": 42720, "train_speed(iter/s)": 0.292814 }, { "acc": 0.71752081, "epoch": 0.4780446475133765, "grad_norm": 9.8125, "learning_rate": 9.054564742281853e-06, "loss": 1.14193668, "memory(GiB)": 141.16, "step": 42740, "train_speed(iter/s)": 0.292858 }, { "acc": 0.72720995, "epoch": 0.478268346459335, "grad_norm": 6.8125, "learning_rate": 9.053482245272132e-06, "loss": 1.10323505, "memory(GiB)": 141.16, "step": 42760, "train_speed(iter/s)": 0.292903 }, { "acc": 0.7271369, "epoch": 0.47849204540529355, "grad_norm": 5.4375, "learning_rate": 9.052399193688102e-06, "loss": 1.10250111, "memory(GiB)": 141.16, "step": 42780, "train_speed(iter/s)": 0.29295 }, { "acc": 0.72279215, "epoch": 0.4787157443512521, "grad_norm": 6.3125, "learning_rate": 9.05131558767794e-06, "loss": 1.11118374, "memory(GiB)": 141.16, "step": 42800, "train_speed(iter/s)": 0.293 }, { "acc": 0.73638086, "epoch": 0.4789394432972106, "grad_norm": 7.09375, "learning_rate": 9.0502314273899e-06, "loss": 1.04777012, "memory(GiB)": 141.16, "step": 42820, "train_speed(iter/s)": 0.293045 }, { "acc": 0.73727336, "epoch": 0.47916314224316914, "grad_norm": 7.75, "learning_rate": 9.049146712972308e-06, "loss": 1.04645786, "memory(GiB)": 141.16, "step": 42840, "train_speed(iter/s)": 0.29309 }, { "acc": 0.72129135, "epoch": 0.47938684118912767, "grad_norm": 9.0625, "learning_rate": 9.048061444573571e-06, "loss": 1.12281809, "memory(GiB)": 141.16, "step": 42860, "train_speed(iter/s)": 0.29313 }, { "acc": 0.73107224, "epoch": 0.4796105401350862, "grad_norm": 5.21875, "learning_rate": 9.046975622342167e-06, "loss": 1.06816168, "memory(GiB)": 141.16, "step": 42880, "train_speed(iter/s)": 0.293181 }, { "acc": 0.72103901, "epoch": 0.4798342390810447, "grad_norm": 7.71875, "learning_rate": 9.045889246426654e-06, "loss": 1.12918549, "memory(GiB)": 141.16, "step": 42900, "train_speed(iter/s)": 0.293229 }, { "acc": 0.72346315, "epoch": 0.48005793802700325, "grad_norm": 6.25, "learning_rate": 9.044802316975662e-06, "loss": 1.11508446, "memory(GiB)": 141.16, "step": 42920, "train_speed(iter/s)": 0.293277 }, { "acc": 0.73056917, "epoch": 0.4802816369729618, "grad_norm": 7.28125, "learning_rate": 9.043714834137902e-06, "loss": 1.07768402, "memory(GiB)": 141.16, "step": 42940, "train_speed(iter/s)": 0.293323 }, { "acc": 0.75163546, "epoch": 0.4805053359189203, "grad_norm": 7.4375, "learning_rate": 9.042626798062152e-06, "loss": 0.98492355, "memory(GiB)": 141.16, "step": 42960, "train_speed(iter/s)": 0.293368 }, { "acc": 0.72392321, "epoch": 0.48072903486487883, "grad_norm": 6.03125, "learning_rate": 9.041538208897277e-06, "loss": 1.10283298, "memory(GiB)": 141.16, "step": 42980, "train_speed(iter/s)": 0.293414 }, { "acc": 0.72288675, "epoch": 0.48095273381083736, "grad_norm": 5.09375, "learning_rate": 9.040449066792205e-06, "loss": 1.112644, "memory(GiB)": 141.16, "step": 43000, "train_speed(iter/s)": 0.293459 }, { "acc": 0.72781038, "epoch": 0.4811764327567959, "grad_norm": 6.75, "learning_rate": 9.039359371895951e-06, "loss": 1.10267525, "memory(GiB)": 141.16, "step": 43020, "train_speed(iter/s)": 0.293502 }, { "acc": 0.72971621, "epoch": 0.4814001317027544, "grad_norm": 4.875, "learning_rate": 9.038269124357598e-06, "loss": 1.08734779, "memory(GiB)": 141.16, "step": 43040, "train_speed(iter/s)": 0.29355 }, { "acc": 0.72352238, "epoch": 0.48162383064871295, "grad_norm": 6.46875, "learning_rate": 9.03717832432631e-06, "loss": 1.12227211, "memory(GiB)": 141.16, "step": 43060, "train_speed(iter/s)": 0.293592 }, { "acc": 0.72305903, "epoch": 0.4818475295946715, "grad_norm": 7.625, "learning_rate": 9.036086971951321e-06, "loss": 1.1267704, "memory(GiB)": 141.16, "step": 43080, "train_speed(iter/s)": 0.293634 }, { "acc": 0.73347392, "epoch": 0.48207122854063, "grad_norm": 6.71875, "learning_rate": 9.034995067381946e-06, "loss": 1.05212326, "memory(GiB)": 141.16, "step": 43100, "train_speed(iter/s)": 0.293683 }, { "acc": 0.72459927, "epoch": 0.48229492748658853, "grad_norm": 6.1875, "learning_rate": 9.033902610767573e-06, "loss": 1.10280371, "memory(GiB)": 141.16, "step": 43120, "train_speed(iter/s)": 0.29373 }, { "acc": 0.72987652, "epoch": 0.48251862643254706, "grad_norm": 7.375, "learning_rate": 9.032809602257663e-06, "loss": 1.08226566, "memory(GiB)": 141.16, "step": 43140, "train_speed(iter/s)": 0.293775 }, { "acc": 0.7381144, "epoch": 0.4827423253785056, "grad_norm": 8.0, "learning_rate": 9.03171604200176e-06, "loss": 1.04227524, "memory(GiB)": 141.16, "step": 43160, "train_speed(iter/s)": 0.293823 }, { "acc": 0.73285999, "epoch": 0.4829660243244641, "grad_norm": 7.875, "learning_rate": 9.030621930149475e-06, "loss": 1.0711132, "memory(GiB)": 141.16, "step": 43180, "train_speed(iter/s)": 0.293857 }, { "acc": 0.72941427, "epoch": 0.48318972327042264, "grad_norm": 7.9375, "learning_rate": 9.029527266850499e-06, "loss": 1.08575706, "memory(GiB)": 141.16, "step": 43200, "train_speed(iter/s)": 0.293902 }, { "acc": 0.71751003, "epoch": 0.4834134222163812, "grad_norm": 8.25, "learning_rate": 9.028432052254598e-06, "loss": 1.13451738, "memory(GiB)": 141.16, "step": 43220, "train_speed(iter/s)": 0.293947 }, { "acc": 0.72662687, "epoch": 0.4836371211623397, "grad_norm": 7.75, "learning_rate": 9.027336286511613e-06, "loss": 1.09909658, "memory(GiB)": 141.16, "step": 43240, "train_speed(iter/s)": 0.293992 }, { "acc": 0.72776561, "epoch": 0.48386082010829823, "grad_norm": 7.0, "learning_rate": 9.026239969771459e-06, "loss": 1.08055382, "memory(GiB)": 141.16, "step": 43260, "train_speed(iter/s)": 0.294038 }, { "acc": 0.72961397, "epoch": 0.48408451905425676, "grad_norm": 8.125, "learning_rate": 9.025143102184129e-06, "loss": 1.07695122, "memory(GiB)": 141.16, "step": 43280, "train_speed(iter/s)": 0.294084 }, { "acc": 0.72973633, "epoch": 0.4843082180002153, "grad_norm": 6.65625, "learning_rate": 9.024045683899692e-06, "loss": 1.07756538, "memory(GiB)": 141.16, "step": 43300, "train_speed(iter/s)": 0.294126 }, { "acc": 0.73334575, "epoch": 0.4845319169461738, "grad_norm": 8.375, "learning_rate": 9.022947715068287e-06, "loss": 1.07340174, "memory(GiB)": 141.16, "step": 43320, "train_speed(iter/s)": 0.294171 }, { "acc": 0.72243052, "epoch": 0.48475561589213234, "grad_norm": 6.09375, "learning_rate": 9.021849195840133e-06, "loss": 1.12047462, "memory(GiB)": 141.16, "step": 43340, "train_speed(iter/s)": 0.294221 }, { "acc": 0.72492437, "epoch": 0.4849793148380909, "grad_norm": 7.09375, "learning_rate": 9.020750126365523e-06, "loss": 1.11279907, "memory(GiB)": 141.16, "step": 43360, "train_speed(iter/s)": 0.294259 }, { "acc": 0.72539692, "epoch": 0.48520301378404945, "grad_norm": 7.25, "learning_rate": 9.019650506794828e-06, "loss": 1.10621166, "memory(GiB)": 141.16, "step": 43380, "train_speed(iter/s)": 0.294304 }, { "acc": 0.72421842, "epoch": 0.485426712730008, "grad_norm": 6.4375, "learning_rate": 9.018550337278486e-06, "loss": 1.1100174, "memory(GiB)": 141.16, "step": 43400, "train_speed(iter/s)": 0.294349 }, { "acc": 0.72404842, "epoch": 0.4856504116759665, "grad_norm": 9.0625, "learning_rate": 9.017449617967024e-06, "loss": 1.10676441, "memory(GiB)": 141.16, "step": 43420, "train_speed(iter/s)": 0.294395 }, { "acc": 0.71112466, "epoch": 0.48587411062192504, "grad_norm": 8.6875, "learning_rate": 9.016348349011029e-06, "loss": 1.15998678, "memory(GiB)": 141.16, "step": 43440, "train_speed(iter/s)": 0.294437 }, { "acc": 0.7215414, "epoch": 0.48609780956788357, "grad_norm": 5.03125, "learning_rate": 9.015246530561174e-06, "loss": 1.11362457, "memory(GiB)": 141.16, "step": 43460, "train_speed(iter/s)": 0.294476 }, { "acc": 0.71905241, "epoch": 0.4863215085138421, "grad_norm": 6.96875, "learning_rate": 9.014144162768202e-06, "loss": 1.11789808, "memory(GiB)": 141.16, "step": 43480, "train_speed(iter/s)": 0.294519 }, { "acc": 0.7277916, "epoch": 0.4865452074598006, "grad_norm": 6.875, "learning_rate": 9.013041245782934e-06, "loss": 1.08840036, "memory(GiB)": 141.16, "step": 43500, "train_speed(iter/s)": 0.294563 }, { "acc": 0.72832985, "epoch": 0.48676890640575915, "grad_norm": 6.5625, "learning_rate": 9.011937779756263e-06, "loss": 1.08640976, "memory(GiB)": 141.16, "step": 43520, "train_speed(iter/s)": 0.294606 }, { "acc": 0.72628465, "epoch": 0.4869926053517177, "grad_norm": 7.125, "learning_rate": 9.01083376483916e-06, "loss": 1.11806173, "memory(GiB)": 141.16, "step": 43540, "train_speed(iter/s)": 0.294646 }, { "acc": 0.72055597, "epoch": 0.4872163042976762, "grad_norm": 6.53125, "learning_rate": 9.00972920118267e-06, "loss": 1.14421263, "memory(GiB)": 141.16, "step": 43560, "train_speed(iter/s)": 0.294683 }, { "acc": 0.7256566, "epoch": 0.48744000324363473, "grad_norm": 7.65625, "learning_rate": 9.008624088937913e-06, "loss": 1.11319561, "memory(GiB)": 141.16, "step": 43580, "train_speed(iter/s)": 0.294724 }, { "acc": 0.72406306, "epoch": 0.48766370218959326, "grad_norm": 6.25, "learning_rate": 9.007518428256086e-06, "loss": 1.11033936, "memory(GiB)": 141.16, "step": 43600, "train_speed(iter/s)": 0.294768 }, { "acc": 0.73610153, "epoch": 0.4878874011355518, "grad_norm": 6.4375, "learning_rate": 9.006412219288456e-06, "loss": 1.05611725, "memory(GiB)": 141.16, "step": 43620, "train_speed(iter/s)": 0.294814 }, { "acc": 0.72482743, "epoch": 0.4881111000815103, "grad_norm": 6.8125, "learning_rate": 9.005305462186369e-06, "loss": 1.0922842, "memory(GiB)": 141.16, "step": 43640, "train_speed(iter/s)": 0.294867 }, { "acc": 0.72531524, "epoch": 0.48833479902746885, "grad_norm": 9.4375, "learning_rate": 9.004198157101248e-06, "loss": 1.09289513, "memory(GiB)": 141.16, "step": 43660, "train_speed(iter/s)": 0.294917 }, { "acc": 0.73766327, "epoch": 0.4885584979734274, "grad_norm": 7.28125, "learning_rate": 9.003090304184583e-06, "loss": 1.05140944, "memory(GiB)": 141.16, "step": 43680, "train_speed(iter/s)": 0.294963 }, { "acc": 0.71919899, "epoch": 0.4887821969193859, "grad_norm": 7.15625, "learning_rate": 9.001981903587949e-06, "loss": 1.14393978, "memory(GiB)": 141.16, "step": 43700, "train_speed(iter/s)": 0.295006 }, { "acc": 0.73235655, "epoch": 0.48900589586534443, "grad_norm": 6.625, "learning_rate": 9.000872955462987e-06, "loss": 1.06555748, "memory(GiB)": 141.16, "step": 43720, "train_speed(iter/s)": 0.295051 }, { "acc": 0.7206912, "epoch": 0.48922959481130296, "grad_norm": 5.625, "learning_rate": 8.999763459961422e-06, "loss": 1.11652412, "memory(GiB)": 141.16, "step": 43740, "train_speed(iter/s)": 0.295095 }, { "acc": 0.73438549, "epoch": 0.4894532937572615, "grad_norm": 6.0625, "learning_rate": 8.998653417235044e-06, "loss": 1.06037998, "memory(GiB)": 141.16, "step": 43760, "train_speed(iter/s)": 0.295139 }, { "acc": 0.73283358, "epoch": 0.48967699270322, "grad_norm": 9.25, "learning_rate": 8.997542827435723e-06, "loss": 1.08671112, "memory(GiB)": 141.16, "step": 43780, "train_speed(iter/s)": 0.295183 }, { "acc": 0.72989378, "epoch": 0.48990069164917854, "grad_norm": 7.125, "learning_rate": 8.996431690715408e-06, "loss": 1.07042742, "memory(GiB)": 141.16, "step": 43800, "train_speed(iter/s)": 0.295224 }, { "acc": 0.7276916, "epoch": 0.49012439059513707, "grad_norm": 6.75, "learning_rate": 8.995320007226114e-06, "loss": 1.09704847, "memory(GiB)": 141.16, "step": 43820, "train_speed(iter/s)": 0.295269 }, { "acc": 0.71575065, "epoch": 0.4903480895410956, "grad_norm": 7.9375, "learning_rate": 8.994207777119937e-06, "loss": 1.13837261, "memory(GiB)": 141.16, "step": 43840, "train_speed(iter/s)": 0.295317 }, { "acc": 0.72762232, "epoch": 0.49057178848705413, "grad_norm": 7.625, "learning_rate": 8.993095000549047e-06, "loss": 1.09141254, "memory(GiB)": 141.16, "step": 43860, "train_speed(iter/s)": 0.295361 }, { "acc": 0.72681003, "epoch": 0.49079548743301266, "grad_norm": 7.59375, "learning_rate": 8.991981677665685e-06, "loss": 1.10618343, "memory(GiB)": 141.16, "step": 43880, "train_speed(iter/s)": 0.295401 }, { "acc": 0.73693256, "epoch": 0.4910191863789712, "grad_norm": 7.1875, "learning_rate": 8.990867808622172e-06, "loss": 1.04267788, "memory(GiB)": 141.16, "step": 43900, "train_speed(iter/s)": 0.295442 }, { "acc": 0.72332244, "epoch": 0.4912428853249297, "grad_norm": 7.09375, "learning_rate": 8.989753393570899e-06, "loss": 1.08366051, "memory(GiB)": 141.16, "step": 43920, "train_speed(iter/s)": 0.295487 }, { "acc": 0.73807187, "epoch": 0.49146658427088824, "grad_norm": 7.21875, "learning_rate": 8.988638432664336e-06, "loss": 1.05223026, "memory(GiB)": 141.16, "step": 43940, "train_speed(iter/s)": 0.295527 }, { "acc": 0.73288078, "epoch": 0.49169028321684677, "grad_norm": 8.5, "learning_rate": 8.987522926055023e-06, "loss": 1.06993341, "memory(GiB)": 141.16, "step": 43960, "train_speed(iter/s)": 0.295572 }, { "acc": 0.73165522, "epoch": 0.4919139821628053, "grad_norm": 7.0, "learning_rate": 8.986406873895581e-06, "loss": 1.06100826, "memory(GiB)": 141.16, "step": 43980, "train_speed(iter/s)": 0.295615 }, { "acc": 0.72642069, "epoch": 0.4921376811087638, "grad_norm": 6.96875, "learning_rate": 8.985290276338698e-06, "loss": 1.10428505, "memory(GiB)": 141.16, "step": 44000, "train_speed(iter/s)": 0.295655 }, { "epoch": 0.4921376811087638, "eval_acc": 0.6884350343045563, "eval_loss": 1.0859544277191162, "eval_runtime": 2316.9949, "eval_samples_per_second": 32.492, "eval_steps_per_second": 16.246, "step": 44000 }, { "acc": 0.72770605, "epoch": 0.49236138005472235, "grad_norm": 11.0625, "learning_rate": 8.984173133537144e-06, "loss": 1.08684845, "memory(GiB)": 141.16, "step": 44020, "train_speed(iter/s)": 0.29107 }, { "acc": 0.72552061, "epoch": 0.4925850790006809, "grad_norm": 7.03125, "learning_rate": 8.983055445643758e-06, "loss": 1.11582928, "memory(GiB)": 141.16, "step": 44040, "train_speed(iter/s)": 0.291115 }, { "acc": 0.7132957, "epoch": 0.4928087779466394, "grad_norm": 6.125, "learning_rate": 8.981937212811455e-06, "loss": 1.15674038, "memory(GiB)": 141.16, "step": 44060, "train_speed(iter/s)": 0.291157 }, { "acc": 0.73422365, "epoch": 0.49303247689259794, "grad_norm": 7.28125, "learning_rate": 8.980818435193226e-06, "loss": 1.06721563, "memory(GiB)": 141.16, "step": 44080, "train_speed(iter/s)": 0.2912 }, { "acc": 0.74194708, "epoch": 0.49325617583855647, "grad_norm": 6.5625, "learning_rate": 8.979699112942137e-06, "loss": 1.03898983, "memory(GiB)": 141.16, "step": 44100, "train_speed(iter/s)": 0.291243 }, { "acc": 0.73346624, "epoch": 0.493479874784515, "grad_norm": 7.3125, "learning_rate": 8.978579246211327e-06, "loss": 1.07563725, "memory(GiB)": 141.16, "step": 44120, "train_speed(iter/s)": 0.291282 }, { "acc": 0.71188383, "epoch": 0.4937035737304735, "grad_norm": 5.96875, "learning_rate": 8.977458835154008e-06, "loss": 1.18381462, "memory(GiB)": 141.16, "step": 44140, "train_speed(iter/s)": 0.291324 }, { "acc": 0.7398859, "epoch": 0.49392727267643205, "grad_norm": 5.125, "learning_rate": 8.97633787992347e-06, "loss": 1.04385662, "memory(GiB)": 141.16, "step": 44160, "train_speed(iter/s)": 0.291369 }, { "acc": 0.73014393, "epoch": 0.4941509716223906, "grad_norm": 8.0, "learning_rate": 8.975216380673075e-06, "loss": 1.07125015, "memory(GiB)": 141.16, "step": 44180, "train_speed(iter/s)": 0.291413 }, { "acc": 0.72418823, "epoch": 0.4943746705683491, "grad_norm": 6.0, "learning_rate": 8.974094337556261e-06, "loss": 1.1360218, "memory(GiB)": 141.16, "step": 44200, "train_speed(iter/s)": 0.291456 }, { "acc": 0.73338799, "epoch": 0.49459836951430763, "grad_norm": 5.0, "learning_rate": 8.972971750726537e-06, "loss": 1.08409929, "memory(GiB)": 141.16, "step": 44220, "train_speed(iter/s)": 0.2915 }, { "acc": 0.7328248, "epoch": 0.49482206846026616, "grad_norm": 6.3125, "learning_rate": 8.971848620337492e-06, "loss": 1.07574902, "memory(GiB)": 141.16, "step": 44240, "train_speed(iter/s)": 0.291544 }, { "acc": 0.72622299, "epoch": 0.4950457674062247, "grad_norm": 7.28125, "learning_rate": 8.970724946542784e-06, "loss": 1.11629295, "memory(GiB)": 141.16, "step": 44260, "train_speed(iter/s)": 0.29159 }, { "acc": 0.73334756, "epoch": 0.4952694663521832, "grad_norm": 6.0625, "learning_rate": 8.969600729496148e-06, "loss": 1.06068096, "memory(GiB)": 141.16, "step": 44280, "train_speed(iter/s)": 0.291637 }, { "acc": 0.71917992, "epoch": 0.49549316529814175, "grad_norm": 7.84375, "learning_rate": 8.968475969351395e-06, "loss": 1.1466341, "memory(GiB)": 141.16, "step": 44300, "train_speed(iter/s)": 0.291682 }, { "acc": 0.72672377, "epoch": 0.4957168642441003, "grad_norm": 8.375, "learning_rate": 8.967350666262406e-06, "loss": 1.10045137, "memory(GiB)": 141.16, "step": 44320, "train_speed(iter/s)": 0.291728 }, { "acc": 0.71887121, "epoch": 0.4959405631900588, "grad_norm": 8.0625, "learning_rate": 8.966224820383139e-06, "loss": 1.13313942, "memory(GiB)": 141.16, "step": 44340, "train_speed(iter/s)": 0.291778 }, { "acc": 0.72559052, "epoch": 0.49616426213601733, "grad_norm": 7.5625, "learning_rate": 8.965098431867627e-06, "loss": 1.11378174, "memory(GiB)": 141.16, "step": 44360, "train_speed(iter/s)": 0.291822 }, { "acc": 0.72820034, "epoch": 0.49638796108197586, "grad_norm": 6.03125, "learning_rate": 8.963971500869975e-06, "loss": 1.09418602, "memory(GiB)": 141.16, "step": 44380, "train_speed(iter/s)": 0.291861 }, { "acc": 0.73763323, "epoch": 0.4966116600279344, "grad_norm": 6.15625, "learning_rate": 8.962844027544363e-06, "loss": 1.05417252, "memory(GiB)": 141.16, "step": 44400, "train_speed(iter/s)": 0.291903 }, { "acc": 0.72249956, "epoch": 0.4968353589738929, "grad_norm": 5.8125, "learning_rate": 8.961716012045047e-06, "loss": 1.10626564, "memory(GiB)": 141.16, "step": 44420, "train_speed(iter/s)": 0.291945 }, { "acc": 0.71979179, "epoch": 0.49705905791985144, "grad_norm": 8.5625, "learning_rate": 8.960587454526353e-06, "loss": 1.13410549, "memory(GiB)": 141.16, "step": 44440, "train_speed(iter/s)": 0.291985 }, { "acc": 0.73659735, "epoch": 0.49728275686580997, "grad_norm": 8.4375, "learning_rate": 8.959458355142688e-06, "loss": 1.04171343, "memory(GiB)": 141.16, "step": 44460, "train_speed(iter/s)": 0.29203 }, { "acc": 0.72281122, "epoch": 0.4975064558117685, "grad_norm": 8.125, "learning_rate": 8.958328714048522e-06, "loss": 1.1329298, "memory(GiB)": 141.16, "step": 44480, "train_speed(iter/s)": 0.292075 }, { "acc": 0.72163095, "epoch": 0.49773015475772703, "grad_norm": 6.6875, "learning_rate": 8.957198531398414e-06, "loss": 1.10531883, "memory(GiB)": 141.16, "step": 44500, "train_speed(iter/s)": 0.292115 }, { "acc": 0.73021107, "epoch": 0.49795385370368556, "grad_norm": 7.375, "learning_rate": 8.956067807346984e-06, "loss": 1.07343121, "memory(GiB)": 141.16, "step": 44520, "train_speed(iter/s)": 0.292159 }, { "acc": 0.74034185, "epoch": 0.4981775526496441, "grad_norm": 6.21875, "learning_rate": 8.954936542048934e-06, "loss": 1.0267951, "memory(GiB)": 141.16, "step": 44540, "train_speed(iter/s)": 0.292202 }, { "acc": 0.72499084, "epoch": 0.4984012515956026, "grad_norm": 6.65625, "learning_rate": 8.953804735659034e-06, "loss": 1.09395876, "memory(GiB)": 141.16, "step": 44560, "train_speed(iter/s)": 0.292245 }, { "acc": 0.71578398, "epoch": 0.49862495054156114, "grad_norm": 7.65625, "learning_rate": 8.952672388332136e-06, "loss": 1.14574966, "memory(GiB)": 141.16, "step": 44580, "train_speed(iter/s)": 0.292287 }, { "acc": 0.72294192, "epoch": 0.49884864948751967, "grad_norm": 7.84375, "learning_rate": 8.951539500223156e-06, "loss": 1.11369543, "memory(GiB)": 141.16, "step": 44600, "train_speed(iter/s)": 0.292331 }, { "acc": 0.7199245, "epoch": 0.49907234843347825, "grad_norm": 5.9375, "learning_rate": 8.950406071487095e-06, "loss": 1.1312994, "memory(GiB)": 141.16, "step": 44620, "train_speed(iter/s)": 0.292373 }, { "acc": 0.71463656, "epoch": 0.4992960473794368, "grad_norm": 6.625, "learning_rate": 8.949272102279016e-06, "loss": 1.16599016, "memory(GiB)": 141.16, "step": 44640, "train_speed(iter/s)": 0.292417 }, { "acc": 0.73200808, "epoch": 0.4995197463253953, "grad_norm": 6.34375, "learning_rate": 8.948137592754064e-06, "loss": 1.06844273, "memory(GiB)": 141.16, "step": 44660, "train_speed(iter/s)": 0.292462 }, { "acc": 0.73081646, "epoch": 0.49974344527135384, "grad_norm": 4.78125, "learning_rate": 8.947002543067462e-06, "loss": 1.07006397, "memory(GiB)": 141.16, "step": 44680, "train_speed(iter/s)": 0.292506 }, { "acc": 0.73723907, "epoch": 0.49996714421731236, "grad_norm": 5.65625, "learning_rate": 8.945866953374494e-06, "loss": 1.06970186, "memory(GiB)": 141.16, "step": 44700, "train_speed(iter/s)": 0.292549 }, { "acc": 0.72702975, "epoch": 0.5001908431632709, "grad_norm": 8.0, "learning_rate": 8.944730823830527e-06, "loss": 1.10818024, "memory(GiB)": 141.16, "step": 44720, "train_speed(iter/s)": 0.292591 }, { "acc": 0.73682156, "epoch": 0.5004145421092294, "grad_norm": 6.1875, "learning_rate": 8.943594154591e-06, "loss": 1.05468044, "memory(GiB)": 141.16, "step": 44740, "train_speed(iter/s)": 0.292636 }, { "acc": 0.72090468, "epoch": 0.500638241055188, "grad_norm": 7.71875, "learning_rate": 8.942456945811427e-06, "loss": 1.12607117, "memory(GiB)": 141.16, "step": 44760, "train_speed(iter/s)": 0.292681 }, { "acc": 0.74277754, "epoch": 0.5008619400011465, "grad_norm": 7.28125, "learning_rate": 8.941319197647394e-06, "loss": 1.02697716, "memory(GiB)": 141.16, "step": 44780, "train_speed(iter/s)": 0.292723 }, { "acc": 0.7345623, "epoch": 0.501085638947105, "grad_norm": 7.6875, "learning_rate": 8.940180910254556e-06, "loss": 1.06110945, "memory(GiB)": 141.16, "step": 44800, "train_speed(iter/s)": 0.292768 }, { "acc": 0.73727865, "epoch": 0.5013093378930635, "grad_norm": 6.59375, "learning_rate": 8.939042083788655e-06, "loss": 1.04359827, "memory(GiB)": 141.16, "step": 44820, "train_speed(iter/s)": 0.292806 }, { "acc": 0.73383899, "epoch": 0.5015330368390221, "grad_norm": 6.4375, "learning_rate": 8.937902718405495e-06, "loss": 1.06115665, "memory(GiB)": 141.16, "step": 44840, "train_speed(iter/s)": 0.292847 }, { "acc": 0.72993512, "epoch": 0.5017567357849806, "grad_norm": 9.0625, "learning_rate": 8.936762814260954e-06, "loss": 1.09653912, "memory(GiB)": 141.16, "step": 44860, "train_speed(iter/s)": 0.292888 }, { "acc": 0.74215231, "epoch": 0.5019804347309391, "grad_norm": 7.0, "learning_rate": 8.935622371510995e-06, "loss": 1.04128428, "memory(GiB)": 141.16, "step": 44880, "train_speed(iter/s)": 0.292926 }, { "acc": 0.72895708, "epoch": 0.5022041336768976, "grad_norm": 7.0, "learning_rate": 8.93448139031164e-06, "loss": 1.08707418, "memory(GiB)": 141.16, "step": 44900, "train_speed(iter/s)": 0.292973 }, { "acc": 0.72943211, "epoch": 0.5024278326228562, "grad_norm": 6.125, "learning_rate": 8.933339870818996e-06, "loss": 1.09071522, "memory(GiB)": 141.16, "step": 44920, "train_speed(iter/s)": 0.293017 }, { "acc": 0.73433876, "epoch": 0.5026515315688147, "grad_norm": 5.875, "learning_rate": 8.932197813189237e-06, "loss": 1.07995663, "memory(GiB)": 141.16, "step": 44940, "train_speed(iter/s)": 0.29306 }, { "acc": 0.72744198, "epoch": 0.5028752305147732, "grad_norm": 6.78125, "learning_rate": 8.931055217578612e-06, "loss": 1.08962154, "memory(GiB)": 141.16, "step": 44960, "train_speed(iter/s)": 0.293101 }, { "acc": 0.72536087, "epoch": 0.5030989294607318, "grad_norm": 7.1875, "learning_rate": 8.929912084143447e-06, "loss": 1.11438885, "memory(GiB)": 141.16, "step": 44980, "train_speed(iter/s)": 0.293141 }, { "acc": 0.73188705, "epoch": 0.5033226284066903, "grad_norm": 7.53125, "learning_rate": 8.928768413040135e-06, "loss": 1.07821102, "memory(GiB)": 141.16, "step": 45000, "train_speed(iter/s)": 0.293188 }, { "acc": 0.72662826, "epoch": 0.5035463273526488, "grad_norm": 5.8125, "learning_rate": 8.927624204425152e-06, "loss": 1.09475212, "memory(GiB)": 141.16, "step": 45020, "train_speed(iter/s)": 0.293231 }, { "acc": 0.74470377, "epoch": 0.5037700262986073, "grad_norm": 8.3125, "learning_rate": 8.926479458455037e-06, "loss": 1.02925453, "memory(GiB)": 141.16, "step": 45040, "train_speed(iter/s)": 0.293274 }, { "acc": 0.7321228, "epoch": 0.5039937252445659, "grad_norm": 6.96875, "learning_rate": 8.925334175286411e-06, "loss": 1.08727551, "memory(GiB)": 141.16, "step": 45060, "train_speed(iter/s)": 0.293318 }, { "acc": 0.73107367, "epoch": 0.5042174241905244, "grad_norm": 5.1875, "learning_rate": 8.924188355075963e-06, "loss": 1.08351393, "memory(GiB)": 141.16, "step": 45080, "train_speed(iter/s)": 0.293359 }, { "acc": 0.72983136, "epoch": 0.5044411231364829, "grad_norm": 7.4375, "learning_rate": 8.923041997980459e-06, "loss": 1.06845236, "memory(GiB)": 141.16, "step": 45100, "train_speed(iter/s)": 0.293404 }, { "acc": 0.73152685, "epoch": 0.5046648220824415, "grad_norm": 5.96875, "learning_rate": 8.921895104156734e-06, "loss": 1.08333492, "memory(GiB)": 141.16, "step": 45120, "train_speed(iter/s)": 0.293448 }, { "acc": 0.72360768, "epoch": 0.5048885210284, "grad_norm": 7.0625, "learning_rate": 8.920747673761705e-06, "loss": 1.11735582, "memory(GiB)": 141.16, "step": 45140, "train_speed(iter/s)": 0.293488 }, { "acc": 0.72828283, "epoch": 0.5051122199743585, "grad_norm": 7.71875, "learning_rate": 8.919599706952354e-06, "loss": 1.08442135, "memory(GiB)": 141.16, "step": 45160, "train_speed(iter/s)": 0.293533 }, { "acc": 0.72655296, "epoch": 0.505335918920317, "grad_norm": 5.8125, "learning_rate": 8.918451203885737e-06, "loss": 1.08249702, "memory(GiB)": 141.16, "step": 45180, "train_speed(iter/s)": 0.293579 }, { "acc": 0.71952748, "epoch": 0.5055596178662756, "grad_norm": 7.28125, "learning_rate": 8.91730216471899e-06, "loss": 1.13984995, "memory(GiB)": 141.16, "step": 45200, "train_speed(iter/s)": 0.293626 }, { "acc": 0.73205214, "epoch": 0.5057833168122341, "grad_norm": 7.5625, "learning_rate": 8.916152589609314e-06, "loss": 1.08945808, "memory(GiB)": 141.16, "step": 45220, "train_speed(iter/s)": 0.293667 }, { "acc": 0.72180929, "epoch": 0.5060070157581926, "grad_norm": 6.96875, "learning_rate": 8.91500247871399e-06, "loss": 1.11557312, "memory(GiB)": 141.16, "step": 45240, "train_speed(iter/s)": 0.293711 }, { "acc": 0.72970033, "epoch": 0.5062307147041512, "grad_norm": 6.9375, "learning_rate": 8.913851832190367e-06, "loss": 1.07756109, "memory(GiB)": 141.16, "step": 45260, "train_speed(iter/s)": 0.293747 }, { "acc": 0.73852282, "epoch": 0.5064544136501097, "grad_norm": 7.0, "learning_rate": 8.912700650195874e-06, "loss": 1.0388237, "memory(GiB)": 141.16, "step": 45280, "train_speed(iter/s)": 0.293784 }, { "acc": 0.7314538, "epoch": 0.5066781125960682, "grad_norm": 8.625, "learning_rate": 8.911548932888004e-06, "loss": 1.08286667, "memory(GiB)": 141.16, "step": 45300, "train_speed(iter/s)": 0.293826 }, { "acc": 0.74104204, "epoch": 0.5069018115420267, "grad_norm": 7.78125, "learning_rate": 8.910396680424334e-06, "loss": 1.02820559, "memory(GiB)": 141.16, "step": 45320, "train_speed(iter/s)": 0.293874 }, { "acc": 0.73709087, "epoch": 0.5071255104879853, "grad_norm": 7.21875, "learning_rate": 8.909243892962503e-06, "loss": 1.0472002, "memory(GiB)": 141.16, "step": 45340, "train_speed(iter/s)": 0.293922 }, { "acc": 0.72433624, "epoch": 0.5073492094339438, "grad_norm": 7.59375, "learning_rate": 8.908090570660233e-06, "loss": 1.12793541, "memory(GiB)": 141.16, "step": 45360, "train_speed(iter/s)": 0.293963 }, { "acc": 0.7307054, "epoch": 0.5075729083799023, "grad_norm": 7.375, "learning_rate": 8.906936713675314e-06, "loss": 1.08501682, "memory(GiB)": 141.16, "step": 45380, "train_speed(iter/s)": 0.294005 }, { "acc": 0.73250394, "epoch": 0.5077966073258608, "grad_norm": 9.0, "learning_rate": 8.905782322165608e-06, "loss": 1.06774597, "memory(GiB)": 141.16, "step": 45400, "train_speed(iter/s)": 0.294049 }, { "acc": 0.71567392, "epoch": 0.5080203062718194, "grad_norm": 6.4375, "learning_rate": 8.904627396289053e-06, "loss": 1.15908241, "memory(GiB)": 141.16, "step": 45420, "train_speed(iter/s)": 0.294091 }, { "acc": 0.72931337, "epoch": 0.5082440052177779, "grad_norm": 7.84375, "learning_rate": 8.903471936203663e-06, "loss": 1.10186691, "memory(GiB)": 141.16, "step": 45440, "train_speed(iter/s)": 0.294133 }, { "acc": 0.73246531, "epoch": 0.5084677041637364, "grad_norm": 5.625, "learning_rate": 8.902315942067517e-06, "loss": 1.07265739, "memory(GiB)": 141.16, "step": 45460, "train_speed(iter/s)": 0.294174 }, { "acc": 0.73246822, "epoch": 0.508691403109695, "grad_norm": 9.0, "learning_rate": 8.901159414038773e-06, "loss": 1.07352867, "memory(GiB)": 141.16, "step": 45480, "train_speed(iter/s)": 0.294217 }, { "acc": 0.71038513, "epoch": 0.5089151020556535, "grad_norm": 6.78125, "learning_rate": 8.900002352275661e-06, "loss": 1.1738205, "memory(GiB)": 141.16, "step": 45500, "train_speed(iter/s)": 0.29426 }, { "acc": 0.73258739, "epoch": 0.509138801001612, "grad_norm": 9.75, "learning_rate": 8.898844756936484e-06, "loss": 1.06688604, "memory(GiB)": 141.16, "step": 45520, "train_speed(iter/s)": 0.294301 }, { "acc": 0.72657986, "epoch": 0.5093624999475705, "grad_norm": 8.0625, "learning_rate": 8.897686628179616e-06, "loss": 1.10215549, "memory(GiB)": 141.16, "step": 45540, "train_speed(iter/s)": 0.294338 }, { "acc": 0.72591934, "epoch": 0.5095861988935291, "grad_norm": 6.125, "learning_rate": 8.896527966163509e-06, "loss": 1.09674158, "memory(GiB)": 141.16, "step": 45560, "train_speed(iter/s)": 0.294383 }, { "acc": 0.7490797, "epoch": 0.5098098978394876, "grad_norm": 7.28125, "learning_rate": 8.895368771046679e-06, "loss": 1.00167732, "memory(GiB)": 141.16, "step": 45580, "train_speed(iter/s)": 0.294428 }, { "acc": 0.71971507, "epoch": 0.5100335967854461, "grad_norm": 7.40625, "learning_rate": 8.894209042987725e-06, "loss": 1.12551479, "memory(GiB)": 141.16, "step": 45600, "train_speed(iter/s)": 0.294472 }, { "acc": 0.73866997, "epoch": 0.5102572957314047, "grad_norm": 6.9375, "learning_rate": 8.893048782145311e-06, "loss": 1.05994272, "memory(GiB)": 141.16, "step": 45620, "train_speed(iter/s)": 0.294519 }, { "acc": 0.73069668, "epoch": 0.5104809946773632, "grad_norm": 7.28125, "learning_rate": 8.89188798867818e-06, "loss": 1.08221083, "memory(GiB)": 141.16, "step": 45640, "train_speed(iter/s)": 0.294561 }, { "acc": 0.73315058, "epoch": 0.5107046936233217, "grad_norm": 7.03125, "learning_rate": 8.890726662745147e-06, "loss": 1.06325579, "memory(GiB)": 141.16, "step": 45660, "train_speed(iter/s)": 0.294605 }, { "acc": 0.72858648, "epoch": 0.5109283925692802, "grad_norm": 5.6875, "learning_rate": 8.889564804505092e-06, "loss": 1.09222393, "memory(GiB)": 141.16, "step": 45680, "train_speed(iter/s)": 0.294646 }, { "acc": 0.73096128, "epoch": 0.5111520915152388, "grad_norm": 6.65625, "learning_rate": 8.888402414116978e-06, "loss": 1.08152752, "memory(GiB)": 141.16, "step": 45700, "train_speed(iter/s)": 0.294682 }, { "acc": 0.72585449, "epoch": 0.5113757904611973, "grad_norm": 7.84375, "learning_rate": 8.887239491739835e-06, "loss": 1.08692198, "memory(GiB)": 141.16, "step": 45720, "train_speed(iter/s)": 0.294726 }, { "acc": 0.72505441, "epoch": 0.5115994894071558, "grad_norm": 6.5, "learning_rate": 8.886076037532769e-06, "loss": 1.10452375, "memory(GiB)": 141.16, "step": 45740, "train_speed(iter/s)": 0.294769 }, { "acc": 0.73045812, "epoch": 0.5118231883531144, "grad_norm": 7.53125, "learning_rate": 8.884912051654956e-06, "loss": 1.08521385, "memory(GiB)": 141.16, "step": 45760, "train_speed(iter/s)": 0.294809 }, { "acc": 0.73685031, "epoch": 0.5120468872990729, "grad_norm": 6.96875, "learning_rate": 8.883747534265645e-06, "loss": 1.05277424, "memory(GiB)": 141.16, "step": 45780, "train_speed(iter/s)": 0.29485 }, { "acc": 0.72863398, "epoch": 0.5122705862450314, "grad_norm": 5.53125, "learning_rate": 8.882582485524162e-06, "loss": 1.08142853, "memory(GiB)": 141.16, "step": 45800, "train_speed(iter/s)": 0.294891 }, { "acc": 0.71987858, "epoch": 0.5124942851909899, "grad_norm": 7.96875, "learning_rate": 8.881416905589898e-06, "loss": 1.12886219, "memory(GiB)": 141.16, "step": 45820, "train_speed(iter/s)": 0.294932 }, { "acc": 0.73399444, "epoch": 0.5127179841369485, "grad_norm": 5.15625, "learning_rate": 8.880250794622325e-06, "loss": 1.0692152, "memory(GiB)": 141.16, "step": 45840, "train_speed(iter/s)": 0.294977 }, { "acc": 0.72365227, "epoch": 0.512941683082907, "grad_norm": 8.4375, "learning_rate": 8.879084152780982e-06, "loss": 1.1069396, "memory(GiB)": 141.16, "step": 45860, "train_speed(iter/s)": 0.295021 }, { "acc": 0.73500333, "epoch": 0.5131653820288655, "grad_norm": 6.25, "learning_rate": 8.877916980225479e-06, "loss": 1.06656818, "memory(GiB)": 141.16, "step": 45880, "train_speed(iter/s)": 0.295063 }, { "acc": 0.72872295, "epoch": 0.513389080974824, "grad_norm": 7.28125, "learning_rate": 8.876749277115506e-06, "loss": 1.07683811, "memory(GiB)": 141.16, "step": 45900, "train_speed(iter/s)": 0.295105 }, { "acc": 0.73790321, "epoch": 0.5136127799207826, "grad_norm": 8.5625, "learning_rate": 8.875581043610823e-06, "loss": 1.05602198, "memory(GiB)": 141.16, "step": 45920, "train_speed(iter/s)": 0.295143 }, { "acc": 0.73212409, "epoch": 0.5138364788667411, "grad_norm": 5.59375, "learning_rate": 8.874412279871257e-06, "loss": 1.06425219, "memory(GiB)": 141.16, "step": 45940, "train_speed(iter/s)": 0.29518 }, { "acc": 0.74088335, "epoch": 0.5140601778126996, "grad_norm": 7.15625, "learning_rate": 8.873242986056712e-06, "loss": 1.04178572, "memory(GiB)": 141.16, "step": 45960, "train_speed(iter/s)": 0.295222 }, { "acc": 0.73576136, "epoch": 0.5142838767586582, "grad_norm": 7.34375, "learning_rate": 8.872073162327165e-06, "loss": 1.05057411, "memory(GiB)": 141.16, "step": 45980, "train_speed(iter/s)": 0.295259 }, { "acc": 0.73468876, "epoch": 0.5145075757046167, "grad_norm": 8.0625, "learning_rate": 8.870902808842665e-06, "loss": 1.07726164, "memory(GiB)": 141.16, "step": 46000, "train_speed(iter/s)": 0.2953 }, { "epoch": 0.5145075757046167, "eval_acc": 0.6886298546852921, "eval_loss": 1.085204005241394, "eval_runtime": 2320.4713, "eval_samples_per_second": 32.443, "eval_steps_per_second": 16.222, "step": 46000 }, { "acc": 0.741219, "epoch": 0.5147312746505752, "grad_norm": 8.75, "learning_rate": 8.869731925763332e-06, "loss": 1.02081594, "memory(GiB)": 141.16, "step": 46020, "train_speed(iter/s)": 0.290914 }, { "acc": 0.7117609, "epoch": 0.5149549735965337, "grad_norm": 7.125, "learning_rate": 8.868560513249363e-06, "loss": 1.15682001, "memory(GiB)": 141.16, "step": 46040, "train_speed(iter/s)": 0.290955 }, { "acc": 0.73248281, "epoch": 0.5151786725424923, "grad_norm": 8.4375, "learning_rate": 8.86738857146102e-06, "loss": 1.06079292, "memory(GiB)": 141.16, "step": 46060, "train_speed(iter/s)": 0.290993 }, { "acc": 0.74441729, "epoch": 0.5154023714884508, "grad_norm": 6.9375, "learning_rate": 8.866216100558642e-06, "loss": 0.99910774, "memory(GiB)": 141.16, "step": 46080, "train_speed(iter/s)": 0.291038 }, { "acc": 0.72757535, "epoch": 0.5156260704344093, "grad_norm": 7.84375, "learning_rate": 8.86504310070264e-06, "loss": 1.09444809, "memory(GiB)": 141.16, "step": 46100, "train_speed(iter/s)": 0.29108 }, { "acc": 0.72890549, "epoch": 0.5158497693803679, "grad_norm": 6.46875, "learning_rate": 8.8638695720535e-06, "loss": 1.09154987, "memory(GiB)": 141.16, "step": 46120, "train_speed(iter/s)": 0.291122 }, { "acc": 0.73777795, "epoch": 0.5160734683263264, "grad_norm": 8.9375, "learning_rate": 8.862695514771774e-06, "loss": 1.06545372, "memory(GiB)": 141.16, "step": 46140, "train_speed(iter/s)": 0.291164 }, { "acc": 0.7328526, "epoch": 0.5162971672722849, "grad_norm": 8.5, "learning_rate": 8.86152092901809e-06, "loss": 1.05525331, "memory(GiB)": 141.16, "step": 46160, "train_speed(iter/s)": 0.291206 }, { "acc": 0.71912379, "epoch": 0.5165208662182434, "grad_norm": 7.21875, "learning_rate": 8.86034581495315e-06, "loss": 1.13297577, "memory(GiB)": 141.16, "step": 46180, "train_speed(iter/s)": 0.291246 }, { "acc": 0.72991314, "epoch": 0.516744565164202, "grad_norm": 7.875, "learning_rate": 8.859170172737724e-06, "loss": 1.09240646, "memory(GiB)": 141.16, "step": 46200, "train_speed(iter/s)": 0.29129 }, { "acc": 0.7366529, "epoch": 0.5169682641101605, "grad_norm": 6.46875, "learning_rate": 8.85799400253266e-06, "loss": 1.04981728, "memory(GiB)": 141.16, "step": 46220, "train_speed(iter/s)": 0.291334 }, { "acc": 0.72951956, "epoch": 0.517191963056119, "grad_norm": 9.9375, "learning_rate": 8.856817304498872e-06, "loss": 1.08917885, "memory(GiB)": 141.16, "step": 46240, "train_speed(iter/s)": 0.291376 }, { "acc": 0.7181221, "epoch": 0.5174156620020776, "grad_norm": 7.53125, "learning_rate": 8.85564007879735e-06, "loss": 1.13843403, "memory(GiB)": 141.16, "step": 46260, "train_speed(iter/s)": 0.291418 }, { "acc": 0.7292449, "epoch": 0.5176393609480361, "grad_norm": 7.46875, "learning_rate": 8.854462325589157e-06, "loss": 1.08627911, "memory(GiB)": 141.16, "step": 46280, "train_speed(iter/s)": 0.291458 }, { "acc": 0.73821325, "epoch": 0.5178630598939946, "grad_norm": 7.125, "learning_rate": 8.853284045035424e-06, "loss": 1.05128841, "memory(GiB)": 141.16, "step": 46300, "train_speed(iter/s)": 0.2915 }, { "acc": 0.72748461, "epoch": 0.5180867588399531, "grad_norm": 7.25, "learning_rate": 8.852105237297357e-06, "loss": 1.10586014, "memory(GiB)": 141.16, "step": 46320, "train_speed(iter/s)": 0.291541 }, { "acc": 0.73655987, "epoch": 0.5183104577859117, "grad_norm": 7.21875, "learning_rate": 8.850925902536233e-06, "loss": 1.04861984, "memory(GiB)": 141.16, "step": 46340, "train_speed(iter/s)": 0.291591 }, { "acc": 0.73114338, "epoch": 0.5185341567318702, "grad_norm": 7.46875, "learning_rate": 8.849746040913404e-06, "loss": 1.07984819, "memory(GiB)": 141.16, "step": 46360, "train_speed(iter/s)": 0.291636 }, { "acc": 0.74024754, "epoch": 0.5187578556778287, "grad_norm": 6.21875, "learning_rate": 8.848565652590293e-06, "loss": 1.04545746, "memory(GiB)": 141.16, "step": 46380, "train_speed(iter/s)": 0.291678 }, { "acc": 0.72893362, "epoch": 0.5189815546237873, "grad_norm": 8.3125, "learning_rate": 8.847384737728391e-06, "loss": 1.091712, "memory(GiB)": 141.16, "step": 46400, "train_speed(iter/s)": 0.291723 }, { "acc": 0.73289385, "epoch": 0.5192052535697458, "grad_norm": 6.28125, "learning_rate": 8.846203296489265e-06, "loss": 1.0722415, "memory(GiB)": 141.16, "step": 46420, "train_speed(iter/s)": 0.291766 }, { "acc": 0.72482767, "epoch": 0.5194289525157043, "grad_norm": 9.0, "learning_rate": 8.845021329034553e-06, "loss": 1.09809246, "memory(GiB)": 141.16, "step": 46440, "train_speed(iter/s)": 0.291806 }, { "acc": 0.74412632, "epoch": 0.5196526514616628, "grad_norm": 9.25, "learning_rate": 8.843838835525965e-06, "loss": 1.01542463, "memory(GiB)": 141.16, "step": 46460, "train_speed(iter/s)": 0.291848 }, { "acc": 0.73845453, "epoch": 0.5198763504076214, "grad_norm": 7.6875, "learning_rate": 8.842655816125284e-06, "loss": 1.03943615, "memory(GiB)": 141.16, "step": 46480, "train_speed(iter/s)": 0.291888 }, { "acc": 0.72551327, "epoch": 0.52010004935358, "grad_norm": 6.03125, "learning_rate": 8.841472270994363e-06, "loss": 1.09203215, "memory(GiB)": 141.16, "step": 46500, "train_speed(iter/s)": 0.291927 }, { "acc": 0.74733524, "epoch": 0.5203237482995385, "grad_norm": 7.4375, "learning_rate": 8.840288200295126e-06, "loss": 0.99458103, "memory(GiB)": 141.16, "step": 46520, "train_speed(iter/s)": 0.291972 }, { "acc": 0.72562189, "epoch": 0.5205474472454971, "grad_norm": 5.34375, "learning_rate": 8.839103604189575e-06, "loss": 1.10471973, "memory(GiB)": 141.16, "step": 46540, "train_speed(iter/s)": 0.292006 }, { "acc": 0.73784232, "epoch": 0.5207711461914556, "grad_norm": 8.8125, "learning_rate": 8.837918482839776e-06, "loss": 1.05937405, "memory(GiB)": 141.16, "step": 46560, "train_speed(iter/s)": 0.29204 }, { "acc": 0.7367177, "epoch": 0.5209948451374141, "grad_norm": 7.40625, "learning_rate": 8.836732836407873e-06, "loss": 1.04822922, "memory(GiB)": 141.16, "step": 46580, "train_speed(iter/s)": 0.292087 }, { "acc": 0.73437343, "epoch": 0.5212185440833726, "grad_norm": 8.9375, "learning_rate": 8.835546665056078e-06, "loss": 1.07240314, "memory(GiB)": 141.16, "step": 46600, "train_speed(iter/s)": 0.292128 }, { "acc": 0.72954741, "epoch": 0.5214422430293312, "grad_norm": 7.6875, "learning_rate": 8.834359968946678e-06, "loss": 1.09808693, "memory(GiB)": 141.16, "step": 46620, "train_speed(iter/s)": 0.292173 }, { "acc": 0.71125822, "epoch": 0.5216659419752897, "grad_norm": 5.40625, "learning_rate": 8.833172748242026e-06, "loss": 1.16718407, "memory(GiB)": 141.16, "step": 46640, "train_speed(iter/s)": 0.292217 }, { "acc": 0.73505201, "epoch": 0.5218896409212482, "grad_norm": 7.6875, "learning_rate": 8.831985003104557e-06, "loss": 1.05095158, "memory(GiB)": 141.16, "step": 46660, "train_speed(iter/s)": 0.292259 }, { "acc": 0.73502426, "epoch": 0.5221133398672068, "grad_norm": 6.71875, "learning_rate": 8.830796733696765e-06, "loss": 1.07462635, "memory(GiB)": 141.16, "step": 46680, "train_speed(iter/s)": 0.292303 }, { "acc": 0.7308177, "epoch": 0.5223370388131653, "grad_norm": 7.0, "learning_rate": 8.829607940181227e-06, "loss": 1.08947906, "memory(GiB)": 141.16, "step": 46700, "train_speed(iter/s)": 0.292345 }, { "acc": 0.73025408, "epoch": 0.5225607377591238, "grad_norm": 7.0, "learning_rate": 8.828418622720582e-06, "loss": 1.08224678, "memory(GiB)": 141.16, "step": 46720, "train_speed(iter/s)": 0.292388 }, { "acc": 0.72529697, "epoch": 0.5227844367050823, "grad_norm": 6.15625, "learning_rate": 8.827228781477553e-06, "loss": 1.09196434, "memory(GiB)": 141.16, "step": 46740, "train_speed(iter/s)": 0.292431 }, { "acc": 0.71972289, "epoch": 0.5230081356510409, "grad_norm": 7.09375, "learning_rate": 8.826038416614919e-06, "loss": 1.13956814, "memory(GiB)": 141.16, "step": 46760, "train_speed(iter/s)": 0.292473 }, { "acc": 0.72352982, "epoch": 0.5232318345969994, "grad_norm": 7.71875, "learning_rate": 8.824847528295546e-06, "loss": 1.11920624, "memory(GiB)": 141.16, "step": 46780, "train_speed(iter/s)": 0.292514 }, { "acc": 0.7342936, "epoch": 0.5234555335429579, "grad_norm": 6.65625, "learning_rate": 8.823656116682359e-06, "loss": 1.06632614, "memory(GiB)": 141.16, "step": 46800, "train_speed(iter/s)": 0.292557 }, { "acc": 0.74488287, "epoch": 0.5236792324889165, "grad_norm": 6.375, "learning_rate": 8.822464181938364e-06, "loss": 1.01537418, "memory(GiB)": 141.16, "step": 46820, "train_speed(iter/s)": 0.292598 }, { "acc": 0.73321753, "epoch": 0.523902931434875, "grad_norm": 7.3125, "learning_rate": 8.821271724226633e-06, "loss": 1.0670352, "memory(GiB)": 141.16, "step": 46840, "train_speed(iter/s)": 0.292636 }, { "acc": 0.72254572, "epoch": 0.5241266303808335, "grad_norm": 6.15625, "learning_rate": 8.820078743710312e-06, "loss": 1.10823488, "memory(GiB)": 141.16, "step": 46860, "train_speed(iter/s)": 0.292675 }, { "acc": 0.73930621, "epoch": 0.524350329326792, "grad_norm": 5.875, "learning_rate": 8.818885240552617e-06, "loss": 1.05329857, "memory(GiB)": 141.16, "step": 46880, "train_speed(iter/s)": 0.292714 }, { "acc": 0.73154411, "epoch": 0.5245740282727506, "grad_norm": 8.5625, "learning_rate": 8.817691214916837e-06, "loss": 1.07839947, "memory(GiB)": 141.16, "step": 46900, "train_speed(iter/s)": 0.29276 }, { "acc": 0.73430443, "epoch": 0.5247977272187091, "grad_norm": 7.3125, "learning_rate": 8.81649666696633e-06, "loss": 1.06146526, "memory(GiB)": 141.16, "step": 46920, "train_speed(iter/s)": 0.292802 }, { "acc": 0.72152982, "epoch": 0.5250214261646676, "grad_norm": 6.5, "learning_rate": 8.815301596864529e-06, "loss": 1.12264309, "memory(GiB)": 141.16, "step": 46940, "train_speed(iter/s)": 0.292847 }, { "acc": 0.72430439, "epoch": 0.5252451251106262, "grad_norm": 6.75, "learning_rate": 8.814106004774939e-06, "loss": 1.10216436, "memory(GiB)": 141.16, "step": 46960, "train_speed(iter/s)": 0.292886 }, { "acc": 0.7301671, "epoch": 0.5254688240565847, "grad_norm": 8.4375, "learning_rate": 8.812909890861128e-06, "loss": 1.08476877, "memory(GiB)": 141.16, "step": 46980, "train_speed(iter/s)": 0.292927 }, { "acc": 0.73161159, "epoch": 0.5256925230025432, "grad_norm": 8.6875, "learning_rate": 8.811713255286746e-06, "loss": 1.07695961, "memory(GiB)": 141.16, "step": 47000, "train_speed(iter/s)": 0.292967 }, { "acc": 0.72199211, "epoch": 0.5259162219485017, "grad_norm": 7.375, "learning_rate": 8.810516098215508e-06, "loss": 1.12251587, "memory(GiB)": 141.16, "step": 47020, "train_speed(iter/s)": 0.293003 }, { "acc": 0.73172197, "epoch": 0.5261399208944603, "grad_norm": 5.15625, "learning_rate": 8.809318419811206e-06, "loss": 1.08371391, "memory(GiB)": 141.16, "step": 47040, "train_speed(iter/s)": 0.293041 }, { "acc": 0.72584748, "epoch": 0.5263636198404188, "grad_norm": 7.40625, "learning_rate": 8.808120220237693e-06, "loss": 1.1036047, "memory(GiB)": 141.16, "step": 47060, "train_speed(iter/s)": 0.293083 }, { "acc": 0.72844524, "epoch": 0.5265873187863773, "grad_norm": 5.96875, "learning_rate": 8.806921499658906e-06, "loss": 1.10038519, "memory(GiB)": 141.16, "step": 47080, "train_speed(iter/s)": 0.293128 }, { "acc": 0.7382297, "epoch": 0.5268110177323359, "grad_norm": 8.1875, "learning_rate": 8.805722258238842e-06, "loss": 1.04975319, "memory(GiB)": 141.16, "step": 47100, "train_speed(iter/s)": 0.293168 }, { "acc": 0.72722049, "epoch": 0.5270347166782944, "grad_norm": 7.5625, "learning_rate": 8.804522496141579e-06, "loss": 1.08993559, "memory(GiB)": 141.16, "step": 47120, "train_speed(iter/s)": 0.293205 }, { "acc": 0.742974, "epoch": 0.5272584156242529, "grad_norm": 6.03125, "learning_rate": 8.803322213531257e-06, "loss": 1.01903801, "memory(GiB)": 141.16, "step": 47140, "train_speed(iter/s)": 0.293247 }, { "acc": 0.73832979, "epoch": 0.5274821145702114, "grad_norm": 6.84375, "learning_rate": 8.802121410572097e-06, "loss": 1.03773422, "memory(GiB)": 141.16, "step": 47160, "train_speed(iter/s)": 0.293286 }, { "acc": 0.73207426, "epoch": 0.52770581351617, "grad_norm": 7.1875, "learning_rate": 8.800920087428381e-06, "loss": 1.06238937, "memory(GiB)": 141.16, "step": 47180, "train_speed(iter/s)": 0.293328 }, { "acc": 0.7390131, "epoch": 0.5279295124621285, "grad_norm": 9.6875, "learning_rate": 8.79971824426447e-06, "loss": 1.0478569, "memory(GiB)": 141.16, "step": 47200, "train_speed(iter/s)": 0.293364 }, { "acc": 0.71431513, "epoch": 0.528153211408087, "grad_norm": 6.0625, "learning_rate": 8.798515881244794e-06, "loss": 1.17367897, "memory(GiB)": 141.16, "step": 47220, "train_speed(iter/s)": 0.293408 }, { "acc": 0.7352972, "epoch": 0.5283769103540455, "grad_norm": 6.96875, "learning_rate": 8.79731299853385e-06, "loss": 1.05907755, "memory(GiB)": 141.16, "step": 47240, "train_speed(iter/s)": 0.293444 }, { "acc": 0.72936811, "epoch": 0.5286006093000041, "grad_norm": 6.25, "learning_rate": 8.796109596296213e-06, "loss": 1.08062353, "memory(GiB)": 141.16, "step": 47260, "train_speed(iter/s)": 0.293481 }, { "acc": 0.72414274, "epoch": 0.5288243082459626, "grad_norm": 6.8125, "learning_rate": 8.794905674696523e-06, "loss": 1.12588053, "memory(GiB)": 141.16, "step": 47280, "train_speed(iter/s)": 0.29352 }, { "acc": 0.73136206, "epoch": 0.5290480071919211, "grad_norm": 8.25, "learning_rate": 8.793701233899496e-06, "loss": 1.08328772, "memory(GiB)": 141.16, "step": 47300, "train_speed(iter/s)": 0.293558 }, { "acc": 0.729669, "epoch": 0.5292717061378797, "grad_norm": 8.875, "learning_rate": 8.792496274069916e-06, "loss": 1.08084841, "memory(GiB)": 141.16, "step": 47320, "train_speed(iter/s)": 0.293596 }, { "acc": 0.72801914, "epoch": 0.5294954050838382, "grad_norm": 7.75, "learning_rate": 8.791290795372638e-06, "loss": 1.10365906, "memory(GiB)": 141.16, "step": 47340, "train_speed(iter/s)": 0.293637 }, { "acc": 0.73638268, "epoch": 0.5297191040297967, "grad_norm": 7.53125, "learning_rate": 8.79008479797259e-06, "loss": 1.05613441, "memory(GiB)": 141.16, "step": 47360, "train_speed(iter/s)": 0.293676 }, { "acc": 0.73910484, "epoch": 0.5299428029757552, "grad_norm": 7.34375, "learning_rate": 8.788878282034768e-06, "loss": 1.02474308, "memory(GiB)": 141.16, "step": 47380, "train_speed(iter/s)": 0.293718 }, { "acc": 0.73469391, "epoch": 0.5301665019217138, "grad_norm": 7.96875, "learning_rate": 8.787671247724241e-06, "loss": 1.07178745, "memory(GiB)": 141.16, "step": 47400, "train_speed(iter/s)": 0.293757 }, { "acc": 0.7293829, "epoch": 0.5303902008676723, "grad_norm": 6.65625, "learning_rate": 8.786463695206149e-06, "loss": 1.07875261, "memory(GiB)": 141.16, "step": 47420, "train_speed(iter/s)": 0.293798 }, { "acc": 0.72856979, "epoch": 0.5306138998136308, "grad_norm": 7.875, "learning_rate": 8.785255624645703e-06, "loss": 1.08123779, "memory(GiB)": 141.16, "step": 47440, "train_speed(iter/s)": 0.293838 }, { "acc": 0.73088694, "epoch": 0.5308375987595894, "grad_norm": 8.625, "learning_rate": 8.784047036208183e-06, "loss": 1.08193741, "memory(GiB)": 141.16, "step": 47460, "train_speed(iter/s)": 0.293875 }, { "acc": 0.7398561, "epoch": 0.5310612977055479, "grad_norm": 7.0625, "learning_rate": 8.782837930058943e-06, "loss": 1.03027735, "memory(GiB)": 141.16, "step": 47480, "train_speed(iter/s)": 0.293915 }, { "acc": 0.7237493, "epoch": 0.5312849966515064, "grad_norm": 7.6875, "learning_rate": 8.781628306363405e-06, "loss": 1.10309868, "memory(GiB)": 141.16, "step": 47500, "train_speed(iter/s)": 0.293959 }, { "acc": 0.71958771, "epoch": 0.5315086955974649, "grad_norm": 7.0, "learning_rate": 8.780418165287062e-06, "loss": 1.14908886, "memory(GiB)": 141.16, "step": 47520, "train_speed(iter/s)": 0.293996 }, { "acc": 0.72652469, "epoch": 0.5317323945434235, "grad_norm": 6.8125, "learning_rate": 8.77920750699548e-06, "loss": 1.08256903, "memory(GiB)": 141.16, "step": 47540, "train_speed(iter/s)": 0.294032 }, { "acc": 0.73841419, "epoch": 0.531956093489382, "grad_norm": 8.3125, "learning_rate": 8.777996331654294e-06, "loss": 1.03873901, "memory(GiB)": 141.16, "step": 47560, "train_speed(iter/s)": 0.294062 }, { "acc": 0.72331357, "epoch": 0.5321797924353405, "grad_norm": 6.6875, "learning_rate": 8.77678463942921e-06, "loss": 1.11128502, "memory(GiB)": 141.16, "step": 47580, "train_speed(iter/s)": 0.294097 }, { "acc": 0.75257339, "epoch": 0.532403491381299, "grad_norm": 8.375, "learning_rate": 8.775572430486004e-06, "loss": 0.99514408, "memory(GiB)": 141.16, "step": 47600, "train_speed(iter/s)": 0.29413 }, { "acc": 0.73618402, "epoch": 0.5326271903272576, "grad_norm": 6.90625, "learning_rate": 8.774359704990523e-06, "loss": 1.05420666, "memory(GiB)": 141.16, "step": 47620, "train_speed(iter/s)": 0.294169 }, { "acc": 0.72136002, "epoch": 0.5328508892732161, "grad_norm": 7.3125, "learning_rate": 8.773146463108687e-06, "loss": 1.12970819, "memory(GiB)": 141.16, "step": 47640, "train_speed(iter/s)": 0.294205 }, { "acc": 0.72431474, "epoch": 0.5330745882191746, "grad_norm": 7.0625, "learning_rate": 8.771932705006485e-06, "loss": 1.09959469, "memory(GiB)": 141.16, "step": 47660, "train_speed(iter/s)": 0.294247 }, { "acc": 0.73538113, "epoch": 0.5332982871651332, "grad_norm": 7.78125, "learning_rate": 8.770718430849976e-06, "loss": 1.04465084, "memory(GiB)": 141.16, "step": 47680, "train_speed(iter/s)": 0.294292 }, { "acc": 0.7257947, "epoch": 0.5335219861110917, "grad_norm": 6.90625, "learning_rate": 8.769503640805288e-06, "loss": 1.09995289, "memory(GiB)": 141.16, "step": 47700, "train_speed(iter/s)": 0.294336 }, { "acc": 0.73191347, "epoch": 0.5337456850570502, "grad_norm": 6.5, "learning_rate": 8.768288335038625e-06, "loss": 1.05648823, "memory(GiB)": 141.16, "step": 47720, "train_speed(iter/s)": 0.294378 }, { "acc": 0.72649493, "epoch": 0.5339693840030088, "grad_norm": 6.71875, "learning_rate": 8.767072513716254e-06, "loss": 1.09026108, "memory(GiB)": 141.16, "step": 47740, "train_speed(iter/s)": 0.294416 }, { "acc": 0.73800764, "epoch": 0.5341930829489673, "grad_norm": 8.375, "learning_rate": 8.765856177004522e-06, "loss": 1.04398689, "memory(GiB)": 141.16, "step": 47760, "train_speed(iter/s)": 0.294454 }, { "acc": 0.72456245, "epoch": 0.5344167818949258, "grad_norm": 7.25, "learning_rate": 8.764639325069838e-06, "loss": 1.1124876, "memory(GiB)": 141.16, "step": 47780, "train_speed(iter/s)": 0.294493 }, { "acc": 0.72044482, "epoch": 0.5346404808408843, "grad_norm": 6.625, "learning_rate": 8.763421958078684e-06, "loss": 1.13167887, "memory(GiB)": 141.16, "step": 47800, "train_speed(iter/s)": 0.294533 }, { "acc": 0.7318923, "epoch": 0.5348641797868429, "grad_norm": 7.3125, "learning_rate": 8.762204076197615e-06, "loss": 1.08178177, "memory(GiB)": 141.16, "step": 47820, "train_speed(iter/s)": 0.29458 }, { "acc": 0.74015846, "epoch": 0.5350878787328014, "grad_norm": 6.84375, "learning_rate": 8.760985679593255e-06, "loss": 1.0278615, "memory(GiB)": 141.16, "step": 47840, "train_speed(iter/s)": 0.294623 }, { "acc": 0.72505822, "epoch": 0.5353115776787599, "grad_norm": 9.375, "learning_rate": 8.759766768432297e-06, "loss": 1.09026814, "memory(GiB)": 141.16, "step": 47860, "train_speed(iter/s)": 0.294664 }, { "acc": 0.73570361, "epoch": 0.5355352766247184, "grad_norm": 5.78125, "learning_rate": 8.758547342881505e-06, "loss": 1.0455286, "memory(GiB)": 141.16, "step": 47880, "train_speed(iter/s)": 0.294704 }, { "acc": 0.73822169, "epoch": 0.535758975570677, "grad_norm": 6.9375, "learning_rate": 8.757327403107713e-06, "loss": 1.04510136, "memory(GiB)": 141.16, "step": 47900, "train_speed(iter/s)": 0.294744 }, { "acc": 0.72227163, "epoch": 0.5359826745166355, "grad_norm": 4.90625, "learning_rate": 8.756106949277829e-06, "loss": 1.10524607, "memory(GiB)": 141.16, "step": 47920, "train_speed(iter/s)": 0.294783 }, { "acc": 0.72104387, "epoch": 0.536206373462594, "grad_norm": 7.40625, "learning_rate": 8.754885981558829e-06, "loss": 1.12552204, "memory(GiB)": 141.16, "step": 47940, "train_speed(iter/s)": 0.294822 }, { "acc": 0.72102442, "epoch": 0.5364300724085526, "grad_norm": 7.59375, "learning_rate": 8.753664500117756e-06, "loss": 1.11738777, "memory(GiB)": 141.16, "step": 47960, "train_speed(iter/s)": 0.294864 }, { "acc": 0.73181095, "epoch": 0.5366537713545111, "grad_norm": 6.09375, "learning_rate": 8.752442505121726e-06, "loss": 1.06686287, "memory(GiB)": 141.16, "step": 47980, "train_speed(iter/s)": 0.294906 }, { "acc": 0.73785486, "epoch": 0.5368774703004696, "grad_norm": 8.5, "learning_rate": 8.751219996737927e-06, "loss": 1.0466917, "memory(GiB)": 141.16, "step": 48000, "train_speed(iter/s)": 0.294948 }, { "epoch": 0.5368774703004696, "eval_acc": 0.6887846954787868, "eval_loss": 1.0845776796340942, "eval_runtime": 2323.4961, "eval_samples_per_second": 32.401, "eval_steps_per_second": 16.201, "step": 48000 }, { "acc": 0.73206158, "epoch": 0.5371011692464281, "grad_norm": 6.78125, "learning_rate": 8.749996975133614e-06, "loss": 1.06910992, "memory(GiB)": 141.16, "step": 48020, "train_speed(iter/s)": 0.290748 }, { "acc": 0.72145667, "epoch": 0.5373248681923867, "grad_norm": 7.96875, "learning_rate": 8.748773440476117e-06, "loss": 1.11935406, "memory(GiB)": 141.16, "step": 48040, "train_speed(iter/s)": 0.290788 }, { "acc": 0.73411093, "epoch": 0.5375485671383452, "grad_norm": 6.59375, "learning_rate": 8.74754939293283e-06, "loss": 1.06843452, "memory(GiB)": 141.16, "step": 48060, "train_speed(iter/s)": 0.290831 }, { "acc": 0.72445898, "epoch": 0.5377722660843037, "grad_norm": 6.8125, "learning_rate": 8.746324832671223e-06, "loss": 1.10698223, "memory(GiB)": 141.16, "step": 48080, "train_speed(iter/s)": 0.290872 }, { "acc": 0.73575277, "epoch": 0.5379959650302623, "grad_norm": 8.25, "learning_rate": 8.745099759858828e-06, "loss": 1.05530529, "memory(GiB)": 141.16, "step": 48100, "train_speed(iter/s)": 0.290916 }, { "acc": 0.73287268, "epoch": 0.5382196639762208, "grad_norm": 6.34375, "learning_rate": 8.743874174663259e-06, "loss": 1.0492075, "memory(GiB)": 141.16, "step": 48120, "train_speed(iter/s)": 0.290958 }, { "acc": 0.73512731, "epoch": 0.5384433629221793, "grad_norm": 7.28125, "learning_rate": 8.74264807725219e-06, "loss": 1.06736298, "memory(GiB)": 141.16, "step": 48140, "train_speed(iter/s)": 0.291 }, { "acc": 0.7366127, "epoch": 0.5386670618681378, "grad_norm": 7.03125, "learning_rate": 8.741421467793369e-06, "loss": 1.0569355, "memory(GiB)": 141.16, "step": 48160, "train_speed(iter/s)": 0.291044 }, { "acc": 0.72717896, "epoch": 0.5388907608140964, "grad_norm": 7.8125, "learning_rate": 8.740194346454614e-06, "loss": 1.0950449, "memory(GiB)": 141.16, "step": 48180, "train_speed(iter/s)": 0.291085 }, { "acc": 0.73241963, "epoch": 0.5391144597600549, "grad_norm": 8.3125, "learning_rate": 8.738966713403812e-06, "loss": 1.07206688, "memory(GiB)": 141.16, "step": 48200, "train_speed(iter/s)": 0.291123 }, { "acc": 0.725073, "epoch": 0.5393381587060134, "grad_norm": 6.90625, "learning_rate": 8.737738568808923e-06, "loss": 1.12272816, "memory(GiB)": 141.16, "step": 48220, "train_speed(iter/s)": 0.291168 }, { "acc": 0.72578287, "epoch": 0.539561857651972, "grad_norm": 7.625, "learning_rate": 8.736509912837971e-06, "loss": 1.10174351, "memory(GiB)": 141.16, "step": 48240, "train_speed(iter/s)": 0.291207 }, { "acc": 0.7417552, "epoch": 0.5397855565979305, "grad_norm": 5.875, "learning_rate": 8.735280745659058e-06, "loss": 1.05058231, "memory(GiB)": 141.16, "step": 48260, "train_speed(iter/s)": 0.291246 }, { "acc": 0.74748764, "epoch": 0.540009255543889, "grad_norm": 8.3125, "learning_rate": 8.734051067440349e-06, "loss": 1.00806656, "memory(GiB)": 141.16, "step": 48280, "train_speed(iter/s)": 0.291284 }, { "acc": 0.72548113, "epoch": 0.5402329544898475, "grad_norm": 7.0, "learning_rate": 8.732820878350081e-06, "loss": 1.12163353, "memory(GiB)": 141.16, "step": 48300, "train_speed(iter/s)": 0.291324 }, { "acc": 0.72092075, "epoch": 0.5404566534358061, "grad_norm": 6.8125, "learning_rate": 8.731590178556563e-06, "loss": 1.12558546, "memory(GiB)": 141.16, "step": 48320, "train_speed(iter/s)": 0.291366 }, { "acc": 0.72723956, "epoch": 0.5406803523817646, "grad_norm": 8.5, "learning_rate": 8.730358968228173e-06, "loss": 1.08771334, "memory(GiB)": 141.16, "step": 48340, "train_speed(iter/s)": 0.291407 }, { "acc": 0.73082457, "epoch": 0.5409040513277231, "grad_norm": 8.4375, "learning_rate": 8.729127247533357e-06, "loss": 1.08197422, "memory(GiB)": 141.16, "step": 48360, "train_speed(iter/s)": 0.291444 }, { "acc": 0.72030835, "epoch": 0.5411277502736817, "grad_norm": 8.8125, "learning_rate": 8.727895016640631e-06, "loss": 1.14526358, "memory(GiB)": 141.16, "step": 48380, "train_speed(iter/s)": 0.291484 }, { "acc": 0.73803406, "epoch": 0.5413514492196402, "grad_norm": 7.40625, "learning_rate": 8.726662275718582e-06, "loss": 1.04656296, "memory(GiB)": 141.16, "step": 48400, "train_speed(iter/s)": 0.29153 }, { "acc": 0.72963438, "epoch": 0.5415751481655987, "grad_norm": 9.0625, "learning_rate": 8.72542902493587e-06, "loss": 1.08252506, "memory(GiB)": 141.16, "step": 48420, "train_speed(iter/s)": 0.291574 }, { "acc": 0.72982497, "epoch": 0.5417988471115572, "grad_norm": 6.40625, "learning_rate": 8.724195264461218e-06, "loss": 1.071416, "memory(GiB)": 141.16, "step": 48440, "train_speed(iter/s)": 0.29161 }, { "acc": 0.71531663, "epoch": 0.5420225460575158, "grad_norm": 6.25, "learning_rate": 8.722960994463421e-06, "loss": 1.15393162, "memory(GiB)": 141.16, "step": 48460, "train_speed(iter/s)": 0.291645 }, { "acc": 0.73044643, "epoch": 0.5422462450034743, "grad_norm": 8.5625, "learning_rate": 8.721726215111348e-06, "loss": 1.08270273, "memory(GiB)": 141.16, "step": 48480, "train_speed(iter/s)": 0.291685 }, { "acc": 0.7198802, "epoch": 0.5424699439494328, "grad_norm": 7.28125, "learning_rate": 8.720490926573932e-06, "loss": 1.12073421, "memory(GiB)": 141.16, "step": 48500, "train_speed(iter/s)": 0.291727 }, { "acc": 0.72443886, "epoch": 0.5426936428953913, "grad_norm": 7.1875, "learning_rate": 8.71925512902018e-06, "loss": 1.11198759, "memory(GiB)": 141.16, "step": 48520, "train_speed(iter/s)": 0.291762 }, { "acc": 0.7389823, "epoch": 0.5429173418413499, "grad_norm": 6.5, "learning_rate": 8.718018822619167e-06, "loss": 1.04496746, "memory(GiB)": 141.16, "step": 48540, "train_speed(iter/s)": 0.291806 }, { "acc": 0.73762283, "epoch": 0.5431410407873084, "grad_norm": 6.96875, "learning_rate": 8.716782007540035e-06, "loss": 1.04355507, "memory(GiB)": 141.16, "step": 48560, "train_speed(iter/s)": 0.291851 }, { "acc": 0.71662359, "epoch": 0.5433647397332669, "grad_norm": 8.5, "learning_rate": 8.715544683952e-06, "loss": 1.14580708, "memory(GiB)": 141.16, "step": 48580, "train_speed(iter/s)": 0.291893 }, { "acc": 0.73996696, "epoch": 0.5435884386792255, "grad_norm": 7.90625, "learning_rate": 8.714306852024343e-06, "loss": 1.03135996, "memory(GiB)": 141.16, "step": 48600, "train_speed(iter/s)": 0.291935 }, { "acc": 0.73414326, "epoch": 0.543812137625184, "grad_norm": 6.84375, "learning_rate": 8.71306851192642e-06, "loss": 1.06490908, "memory(GiB)": 141.16, "step": 48620, "train_speed(iter/s)": 0.291977 }, { "acc": 0.73180866, "epoch": 0.5440358365711425, "grad_norm": 8.25, "learning_rate": 8.711829663827654e-06, "loss": 1.05347137, "memory(GiB)": 141.16, "step": 48640, "train_speed(iter/s)": 0.292016 }, { "acc": 0.72708611, "epoch": 0.544259535517101, "grad_norm": 5.78125, "learning_rate": 8.710590307897534e-06, "loss": 1.09300518, "memory(GiB)": 141.16, "step": 48660, "train_speed(iter/s)": 0.292056 }, { "acc": 0.72724104, "epoch": 0.5444832344630596, "grad_norm": 6.84375, "learning_rate": 8.709350444305625e-06, "loss": 1.0883604, "memory(GiB)": 141.16, "step": 48680, "train_speed(iter/s)": 0.292095 }, { "acc": 0.72838264, "epoch": 0.5447069334090181, "grad_norm": 6.65625, "learning_rate": 8.708110073221554e-06, "loss": 1.09609661, "memory(GiB)": 141.16, "step": 48700, "train_speed(iter/s)": 0.292134 }, { "acc": 0.73192534, "epoch": 0.5449306323549766, "grad_norm": 6.0, "learning_rate": 8.706869194815025e-06, "loss": 1.07132072, "memory(GiB)": 141.16, "step": 48720, "train_speed(iter/s)": 0.292172 }, { "acc": 0.7342926, "epoch": 0.5451543313009352, "grad_norm": 8.125, "learning_rate": 8.705627809255807e-06, "loss": 1.07226009, "memory(GiB)": 141.16, "step": 48740, "train_speed(iter/s)": 0.292213 }, { "acc": 0.71977892, "epoch": 0.5453780302468937, "grad_norm": 6.46875, "learning_rate": 8.70438591671374e-06, "loss": 1.11501255, "memory(GiB)": 141.16, "step": 48760, "train_speed(iter/s)": 0.292252 }, { "acc": 0.73355618, "epoch": 0.5456017291928522, "grad_norm": 7.46875, "learning_rate": 8.70314351735873e-06, "loss": 1.0791893, "memory(GiB)": 141.16, "step": 48780, "train_speed(iter/s)": 0.292289 }, { "acc": 0.73271017, "epoch": 0.5458254281388107, "grad_norm": 6.40625, "learning_rate": 8.701900611360758e-06, "loss": 1.06887875, "memory(GiB)": 141.16, "step": 48800, "train_speed(iter/s)": 0.292329 }, { "acc": 0.73176351, "epoch": 0.5460491270847693, "grad_norm": 4.59375, "learning_rate": 8.700657198889869e-06, "loss": 1.07675295, "memory(GiB)": 141.16, "step": 48820, "train_speed(iter/s)": 0.292365 }, { "acc": 0.71399593, "epoch": 0.5462728260307278, "grad_norm": 6.5, "learning_rate": 8.699413280116182e-06, "loss": 1.16109161, "memory(GiB)": 141.16, "step": 48840, "train_speed(iter/s)": 0.292406 }, { "acc": 0.7276073, "epoch": 0.5464965249766863, "grad_norm": 6.71875, "learning_rate": 8.69816885520988e-06, "loss": 1.09591866, "memory(GiB)": 141.16, "step": 48860, "train_speed(iter/s)": 0.292443 }, { "acc": 0.72553778, "epoch": 0.5467202239226449, "grad_norm": 6.875, "learning_rate": 8.69692392434122e-06, "loss": 1.10907087, "memory(GiB)": 141.16, "step": 48880, "train_speed(iter/s)": 0.292479 }, { "acc": 0.73150015, "epoch": 0.5469439228686034, "grad_norm": 7.375, "learning_rate": 8.695678487680526e-06, "loss": 1.05583916, "memory(GiB)": 141.16, "step": 48900, "train_speed(iter/s)": 0.292516 }, { "acc": 0.73554831, "epoch": 0.5471676218145619, "grad_norm": 8.25, "learning_rate": 8.694432545398193e-06, "loss": 1.05009613, "memory(GiB)": 141.16, "step": 48920, "train_speed(iter/s)": 0.292554 }, { "acc": 0.7311296, "epoch": 0.5473913207605204, "grad_norm": 5.78125, "learning_rate": 8.69318609766468e-06, "loss": 1.06294746, "memory(GiB)": 141.16, "step": 48940, "train_speed(iter/s)": 0.292592 }, { "acc": 0.72946024, "epoch": 0.547615019706479, "grad_norm": 8.75, "learning_rate": 8.69193914465052e-06, "loss": 1.08737726, "memory(GiB)": 141.16, "step": 48960, "train_speed(iter/s)": 0.292629 }, { "acc": 0.72666922, "epoch": 0.5478387186524375, "grad_norm": 7.34375, "learning_rate": 8.690691686526318e-06, "loss": 1.08888626, "memory(GiB)": 141.16, "step": 48980, "train_speed(iter/s)": 0.292669 }, { "acc": 0.72276983, "epoch": 0.548062417598396, "grad_norm": 8.5625, "learning_rate": 8.68944372346274e-06, "loss": 1.13022089, "memory(GiB)": 141.16, "step": 49000, "train_speed(iter/s)": 0.292709 }, { "acc": 0.72899361, "epoch": 0.5482861165443547, "grad_norm": 6.65625, "learning_rate": 8.688195255630527e-06, "loss": 1.09784365, "memory(GiB)": 141.16, "step": 49020, "train_speed(iter/s)": 0.292749 }, { "acc": 0.73148975, "epoch": 0.5485098154903132, "grad_norm": 6.84375, "learning_rate": 8.686946283200486e-06, "loss": 1.07689428, "memory(GiB)": 141.16, "step": 49040, "train_speed(iter/s)": 0.29279 }, { "acc": 0.72877531, "epoch": 0.5487335144362717, "grad_norm": 7.28125, "learning_rate": 8.685696806343495e-06, "loss": 1.09528675, "memory(GiB)": 141.16, "step": 49060, "train_speed(iter/s)": 0.29283 }, { "acc": 0.72345901, "epoch": 0.5489572133822302, "grad_norm": 6.375, "learning_rate": 8.684446825230499e-06, "loss": 1.11627998, "memory(GiB)": 141.16, "step": 49080, "train_speed(iter/s)": 0.29287 }, { "acc": 0.72732515, "epoch": 0.5491809123281888, "grad_norm": 6.25, "learning_rate": 8.683196340032516e-06, "loss": 1.09536657, "memory(GiB)": 141.16, "step": 49100, "train_speed(iter/s)": 0.292906 }, { "acc": 0.73525486, "epoch": 0.5494046112741473, "grad_norm": 7.90625, "learning_rate": 8.681945350920628e-06, "loss": 1.05555849, "memory(GiB)": 141.16, "step": 49120, "train_speed(iter/s)": 0.292942 }, { "acc": 0.73475537, "epoch": 0.5496283102201058, "grad_norm": 7.46875, "learning_rate": 8.680693858065989e-06, "loss": 1.06674824, "memory(GiB)": 141.16, "step": 49140, "train_speed(iter/s)": 0.292984 }, { "acc": 0.72821236, "epoch": 0.5498520091660644, "grad_norm": 7.21875, "learning_rate": 8.67944186163982e-06, "loss": 1.09382496, "memory(GiB)": 141.16, "step": 49160, "train_speed(iter/s)": 0.293024 }, { "acc": 0.717875, "epoch": 0.5500757081120229, "grad_norm": 8.5625, "learning_rate": 8.678189361813414e-06, "loss": 1.13870955, "memory(GiB)": 141.16, "step": 49180, "train_speed(iter/s)": 0.29306 }, { "acc": 0.72557964, "epoch": 0.5502994070579814, "grad_norm": 9.875, "learning_rate": 8.67693635875813e-06, "loss": 1.1173852, "memory(GiB)": 141.16, "step": 49200, "train_speed(iter/s)": 0.293099 }, { "acc": 0.74847898, "epoch": 0.5505231060039399, "grad_norm": 7.40625, "learning_rate": 8.675682852645396e-06, "loss": 0.98661127, "memory(GiB)": 141.16, "step": 49220, "train_speed(iter/s)": 0.293136 }, { "acc": 0.71182117, "epoch": 0.5507468049498985, "grad_norm": 6.78125, "learning_rate": 8.67442884364671e-06, "loss": 1.16765766, "memory(GiB)": 141.16, "step": 49240, "train_speed(iter/s)": 0.293176 }, { "acc": 0.72848988, "epoch": 0.550970503895857, "grad_norm": 7.625, "learning_rate": 8.673174331933639e-06, "loss": 1.07982502, "memory(GiB)": 141.16, "step": 49260, "train_speed(iter/s)": 0.293215 }, { "acc": 0.74169044, "epoch": 0.5511942028418155, "grad_norm": 6.0, "learning_rate": 8.671919317677819e-06, "loss": 1.02314224, "memory(GiB)": 141.16, "step": 49280, "train_speed(iter/s)": 0.293255 }, { "acc": 0.72482853, "epoch": 0.5514179017877741, "grad_norm": 6.65625, "learning_rate": 8.67066380105095e-06, "loss": 1.09856586, "memory(GiB)": 141.16, "step": 49300, "train_speed(iter/s)": 0.293297 }, { "acc": 0.71961451, "epoch": 0.5516416007337326, "grad_norm": 6.65625, "learning_rate": 8.669407782224808e-06, "loss": 1.12295971, "memory(GiB)": 141.16, "step": 49320, "train_speed(iter/s)": 0.293336 }, { "acc": 0.72500191, "epoch": 0.5518652996796911, "grad_norm": 7.53125, "learning_rate": 8.668151261371234e-06, "loss": 1.10095634, "memory(GiB)": 141.16, "step": 49340, "train_speed(iter/s)": 0.293377 }, { "acc": 0.72530847, "epoch": 0.5520889986256496, "grad_norm": 7.15625, "learning_rate": 8.666894238662136e-06, "loss": 1.11958981, "memory(GiB)": 141.16, "step": 49360, "train_speed(iter/s)": 0.293415 }, { "acc": 0.72675762, "epoch": 0.5523126975716082, "grad_norm": 7.65625, "learning_rate": 8.665636714269497e-06, "loss": 1.10526772, "memory(GiB)": 141.16, "step": 49380, "train_speed(iter/s)": 0.293456 }, { "acc": 0.73209782, "epoch": 0.5525363965175667, "grad_norm": 8.0625, "learning_rate": 8.66437868836536e-06, "loss": 1.07810574, "memory(GiB)": 141.16, "step": 49400, "train_speed(iter/s)": 0.293494 }, { "acc": 0.72190533, "epoch": 0.5527600954635252, "grad_norm": 6.875, "learning_rate": 8.663120161121841e-06, "loss": 1.1058382, "memory(GiB)": 141.16, "step": 49420, "train_speed(iter/s)": 0.293536 }, { "acc": 0.71984425, "epoch": 0.5529837944094838, "grad_norm": 7.40625, "learning_rate": 8.661861132711127e-06, "loss": 1.12001829, "memory(GiB)": 141.16, "step": 49440, "train_speed(iter/s)": 0.293575 }, { "acc": 0.73203325, "epoch": 0.5532074933554423, "grad_norm": 6.375, "learning_rate": 8.66060160330547e-06, "loss": 1.09429302, "memory(GiB)": 141.16, "step": 49460, "train_speed(iter/s)": 0.293611 }, { "acc": 0.73066287, "epoch": 0.5534311923014008, "grad_norm": 6.3125, "learning_rate": 8.659341573077192e-06, "loss": 1.08150158, "memory(GiB)": 141.16, "step": 49480, "train_speed(iter/s)": 0.293648 }, { "acc": 0.72346754, "epoch": 0.5536548912473593, "grad_norm": 6.84375, "learning_rate": 8.658081042198682e-06, "loss": 1.11097145, "memory(GiB)": 141.16, "step": 49500, "train_speed(iter/s)": 0.293688 }, { "acc": 0.7373415, "epoch": 0.5538785901933179, "grad_norm": 6.46875, "learning_rate": 8.6568200108424e-06, "loss": 1.06620884, "memory(GiB)": 141.16, "step": 49520, "train_speed(iter/s)": 0.293732 }, { "acc": 0.73125648, "epoch": 0.5541022891392764, "grad_norm": 7.625, "learning_rate": 8.655558479180874e-06, "loss": 1.07318554, "memory(GiB)": 141.16, "step": 49540, "train_speed(iter/s)": 0.29377 }, { "acc": 0.73044486, "epoch": 0.5543259880852349, "grad_norm": 5.0625, "learning_rate": 8.654296447386696e-06, "loss": 1.09705515, "memory(GiB)": 141.16, "step": 49560, "train_speed(iter/s)": 0.29381 }, { "acc": 0.71640053, "epoch": 0.5545496870311935, "grad_norm": 8.125, "learning_rate": 8.653033915632531e-06, "loss": 1.1435112, "memory(GiB)": 141.16, "step": 49580, "train_speed(iter/s)": 0.293851 }, { "acc": 0.73008113, "epoch": 0.554773385977152, "grad_norm": 7.875, "learning_rate": 8.651770884091115e-06, "loss": 1.08298206, "memory(GiB)": 141.16, "step": 49600, "train_speed(iter/s)": 0.293888 }, { "acc": 0.73613577, "epoch": 0.5549970849231105, "grad_norm": 7.65625, "learning_rate": 8.650507352935245e-06, "loss": 1.058916, "memory(GiB)": 141.16, "step": 49620, "train_speed(iter/s)": 0.293928 }, { "acc": 0.71744237, "epoch": 0.555220783869069, "grad_norm": 7.625, "learning_rate": 8.649243322337793e-06, "loss": 1.14106531, "memory(GiB)": 141.16, "step": 49640, "train_speed(iter/s)": 0.293967 }, { "acc": 0.73278308, "epoch": 0.5554444828150276, "grad_norm": 5.5, "learning_rate": 8.647978792471692e-06, "loss": 1.05740299, "memory(GiB)": 141.16, "step": 49660, "train_speed(iter/s)": 0.294007 }, { "acc": 0.73963008, "epoch": 0.5556681817609861, "grad_norm": 9.6875, "learning_rate": 8.646713763509953e-06, "loss": 1.0399519, "memory(GiB)": 141.16, "step": 49680, "train_speed(iter/s)": 0.294041 }, { "acc": 0.74358892, "epoch": 0.5558918807069446, "grad_norm": 6.0625, "learning_rate": 8.645448235625646e-06, "loss": 1.02573795, "memory(GiB)": 141.16, "step": 49700, "train_speed(iter/s)": 0.294083 }, { "acc": 0.72526588, "epoch": 0.5561155796529031, "grad_norm": 7.1875, "learning_rate": 8.644182208991915e-06, "loss": 1.10500183, "memory(GiB)": 141.16, "step": 49720, "train_speed(iter/s)": 0.294118 }, { "acc": 0.71945848, "epoch": 0.5563392785988617, "grad_norm": 8.1875, "learning_rate": 8.642915683781972e-06, "loss": 1.13142719, "memory(GiB)": 141.16, "step": 49740, "train_speed(iter/s)": 0.29416 }, { "acc": 0.73892975, "epoch": 0.5565629775448202, "grad_norm": 7.53125, "learning_rate": 8.641648660169092e-06, "loss": 1.05857201, "memory(GiB)": 141.16, "step": 49760, "train_speed(iter/s)": 0.294198 }, { "acc": 0.72759566, "epoch": 0.5567866764907787, "grad_norm": 7.34375, "learning_rate": 8.640381138326626e-06, "loss": 1.08711681, "memory(GiB)": 141.16, "step": 49780, "train_speed(iter/s)": 0.294233 }, { "acc": 0.72757864, "epoch": 0.5570103754367373, "grad_norm": 7.125, "learning_rate": 8.639113118427987e-06, "loss": 1.09361477, "memory(GiB)": 141.16, "step": 49800, "train_speed(iter/s)": 0.294275 }, { "acc": 0.72573233, "epoch": 0.5572340743826958, "grad_norm": 6.28125, "learning_rate": 8.637844600646656e-06, "loss": 1.08906803, "memory(GiB)": 141.16, "step": 49820, "train_speed(iter/s)": 0.294313 }, { "acc": 0.73577089, "epoch": 0.5574577733286543, "grad_norm": 6.1875, "learning_rate": 8.636575585156189e-06, "loss": 1.07184525, "memory(GiB)": 141.16, "step": 49840, "train_speed(iter/s)": 0.294347 }, { "acc": 0.73360901, "epoch": 0.5576814722746128, "grad_norm": 7.375, "learning_rate": 8.635306072130204e-06, "loss": 1.05263815, "memory(GiB)": 141.16, "step": 49860, "train_speed(iter/s)": 0.294386 }, { "acc": 0.73907576, "epoch": 0.5579051712205714, "grad_norm": 8.625, "learning_rate": 8.634036061742386e-06, "loss": 1.0475378, "memory(GiB)": 141.16, "step": 49880, "train_speed(iter/s)": 0.294427 }, { "acc": 0.72251167, "epoch": 0.5581288701665299, "grad_norm": 7.1875, "learning_rate": 8.632765554166494e-06, "loss": 1.11776495, "memory(GiB)": 141.16, "step": 49900, "train_speed(iter/s)": 0.29446 }, { "acc": 0.72148933, "epoch": 0.5583525691124884, "grad_norm": 7.78125, "learning_rate": 8.631494549576349e-06, "loss": 1.12855835, "memory(GiB)": 141.16, "step": 49920, "train_speed(iter/s)": 0.294498 }, { "acc": 0.72934008, "epoch": 0.558576268058447, "grad_norm": 7.0625, "learning_rate": 8.630223048145844e-06, "loss": 1.07405491, "memory(GiB)": 141.16, "step": 49940, "train_speed(iter/s)": 0.29454 }, { "acc": 0.73100481, "epoch": 0.5587999670044055, "grad_norm": 6.0, "learning_rate": 8.628951050048938e-06, "loss": 1.07012081, "memory(GiB)": 141.16, "step": 49960, "train_speed(iter/s)": 0.294585 }, { "acc": 0.7356205, "epoch": 0.559023665950364, "grad_norm": 8.25, "learning_rate": 8.627678555459658e-06, "loss": 1.06408577, "memory(GiB)": 141.16, "step": 49980, "train_speed(iter/s)": 0.294623 }, { "acc": 0.7332922, "epoch": 0.5592473648963225, "grad_norm": 5.9375, "learning_rate": 8.626405564552102e-06, "loss": 1.08356323, "memory(GiB)": 141.16, "step": 50000, "train_speed(iter/s)": 0.294664 }, { "epoch": 0.5592473648963225, "eval_acc": 0.6889506773163216, "eval_loss": 1.0839855670928955, "eval_runtime": 2322.2103, "eval_samples_per_second": 32.419, "eval_steps_per_second": 16.21, "step": 50000 }, { "acc": 0.71921825, "epoch": 0.5594710638422811, "grad_norm": 6.5, "learning_rate": 8.62513207750043e-06, "loss": 1.13004827, "memory(GiB)": 141.16, "step": 50020, "train_speed(iter/s)": 0.290642 }, { "acc": 0.72796354, "epoch": 0.5596947627882396, "grad_norm": 7.25, "learning_rate": 8.623858094478876e-06, "loss": 1.08471365, "memory(GiB)": 141.16, "step": 50040, "train_speed(iter/s)": 0.29068 }, { "acc": 0.72889061, "epoch": 0.5599184617341981, "grad_norm": 5.59375, "learning_rate": 8.622583615661737e-06, "loss": 1.0903223, "memory(GiB)": 141.16, "step": 50060, "train_speed(iter/s)": 0.290717 }, { "acc": 0.72403898, "epoch": 0.5601421606801567, "grad_norm": 8.25, "learning_rate": 8.62130864122338e-06, "loss": 1.11184196, "memory(GiB)": 141.16, "step": 50080, "train_speed(iter/s)": 0.290757 }, { "acc": 0.7196743, "epoch": 0.5603658596261152, "grad_norm": 6.40625, "learning_rate": 8.620033171338242e-06, "loss": 1.13593197, "memory(GiB)": 141.16, "step": 50100, "train_speed(iter/s)": 0.290792 }, { "acc": 0.71970692, "epoch": 0.5605895585720737, "grad_norm": 8.0625, "learning_rate": 8.618757206180822e-06, "loss": 1.12931042, "memory(GiB)": 141.16, "step": 50120, "train_speed(iter/s)": 0.290834 }, { "acc": 0.72221537, "epoch": 0.5608132575180322, "grad_norm": 7.25, "learning_rate": 8.617480745925694e-06, "loss": 1.10489483, "memory(GiB)": 141.16, "step": 50140, "train_speed(iter/s)": 0.290873 }, { "acc": 0.73360224, "epoch": 0.5610369564639908, "grad_norm": 7.21875, "learning_rate": 8.616203790747493e-06, "loss": 1.08003502, "memory(GiB)": 141.16, "step": 50160, "train_speed(iter/s)": 0.290912 }, { "acc": 0.74158883, "epoch": 0.5612606554099493, "grad_norm": 6.25, "learning_rate": 8.614926340820925e-06, "loss": 1.03406563, "memory(GiB)": 141.16, "step": 50180, "train_speed(iter/s)": 0.290949 }, { "acc": 0.72705016, "epoch": 0.5614843543559078, "grad_norm": 6.0625, "learning_rate": 8.613648396320768e-06, "loss": 1.09702263, "memory(GiB)": 141.16, "step": 50200, "train_speed(iter/s)": 0.290988 }, { "acc": 0.73275003, "epoch": 0.5617080533018664, "grad_norm": 6.9375, "learning_rate": 8.612369957421858e-06, "loss": 1.06507568, "memory(GiB)": 141.16, "step": 50220, "train_speed(iter/s)": 0.291028 }, { "acc": 0.73775291, "epoch": 0.5619317522478249, "grad_norm": 6.8125, "learning_rate": 8.611091024299103e-06, "loss": 1.05162334, "memory(GiB)": 141.16, "step": 50240, "train_speed(iter/s)": 0.291065 }, { "acc": 0.72389345, "epoch": 0.5621554511937834, "grad_norm": 7.5625, "learning_rate": 8.609811597127484e-06, "loss": 1.09896698, "memory(GiB)": 141.16, "step": 50260, "train_speed(iter/s)": 0.291108 }, { "acc": 0.73376517, "epoch": 0.5623791501397419, "grad_norm": 5.75, "learning_rate": 8.608531676082041e-06, "loss": 1.06793346, "memory(GiB)": 141.16, "step": 50280, "train_speed(iter/s)": 0.291145 }, { "acc": 0.73582582, "epoch": 0.5626028490857005, "grad_norm": 6.5625, "learning_rate": 8.607251261337888e-06, "loss": 1.06182384, "memory(GiB)": 141.16, "step": 50300, "train_speed(iter/s)": 0.291184 }, { "acc": 0.72349176, "epoch": 0.562826548031659, "grad_norm": 4.96875, "learning_rate": 8.6059703530702e-06, "loss": 1.09949265, "memory(GiB)": 141.16, "step": 50320, "train_speed(iter/s)": 0.291221 }, { "acc": 0.7432806, "epoch": 0.5630502469776175, "grad_norm": 7.34375, "learning_rate": 8.60468895145423e-06, "loss": 1.02307034, "memory(GiB)": 141.16, "step": 50340, "train_speed(iter/s)": 0.291259 }, { "acc": 0.73042831, "epoch": 0.563273945923576, "grad_norm": 7.75, "learning_rate": 8.603407056665287e-06, "loss": 1.07656536, "memory(GiB)": 141.16, "step": 50360, "train_speed(iter/s)": 0.291291 }, { "acc": 0.71953197, "epoch": 0.5634976448695346, "grad_norm": 6.4375, "learning_rate": 8.602124668878755e-06, "loss": 1.13152122, "memory(GiB)": 141.16, "step": 50380, "train_speed(iter/s)": 0.291327 }, { "acc": 0.72724886, "epoch": 0.5637213438154931, "grad_norm": 6.0625, "learning_rate": 8.600841788270082e-06, "loss": 1.08384209, "memory(GiB)": 141.16, "step": 50400, "train_speed(iter/s)": 0.291362 }, { "acc": 0.73336096, "epoch": 0.5639450427614516, "grad_norm": 7.09375, "learning_rate": 8.599558415014784e-06, "loss": 1.07061558, "memory(GiB)": 141.16, "step": 50420, "train_speed(iter/s)": 0.291399 }, { "acc": 0.73278522, "epoch": 0.5641687417074102, "grad_norm": 6.5625, "learning_rate": 8.598274549288446e-06, "loss": 1.07701569, "memory(GiB)": 141.16, "step": 50440, "train_speed(iter/s)": 0.291441 }, { "acc": 0.72504692, "epoch": 0.5643924406533687, "grad_norm": 7.0625, "learning_rate": 8.596990191266716e-06, "loss": 1.09679947, "memory(GiB)": 141.16, "step": 50460, "train_speed(iter/s)": 0.291479 }, { "acc": 0.73014407, "epoch": 0.5646161395993272, "grad_norm": 8.6875, "learning_rate": 8.595705341125318e-06, "loss": 1.06657877, "memory(GiB)": 141.16, "step": 50480, "train_speed(iter/s)": 0.291517 }, { "acc": 0.72234473, "epoch": 0.5648398385452857, "grad_norm": 7.75, "learning_rate": 8.594419999040034e-06, "loss": 1.11232052, "memory(GiB)": 141.16, "step": 50500, "train_speed(iter/s)": 0.291553 }, { "acc": 0.72425327, "epoch": 0.5650635374912443, "grad_norm": 6.53125, "learning_rate": 8.593134165186718e-06, "loss": 1.10829239, "memory(GiB)": 141.16, "step": 50520, "train_speed(iter/s)": 0.291591 }, { "acc": 0.7249876, "epoch": 0.5652872364372028, "grad_norm": 6.0625, "learning_rate": 8.59184783974129e-06, "loss": 1.10936041, "memory(GiB)": 141.16, "step": 50540, "train_speed(iter/s)": 0.291629 }, { "acc": 0.73125663, "epoch": 0.5655109353831613, "grad_norm": 7.03125, "learning_rate": 8.590561022879738e-06, "loss": 1.07019539, "memory(GiB)": 141.16, "step": 50560, "train_speed(iter/s)": 0.291664 }, { "acc": 0.72963643, "epoch": 0.5657346343291199, "grad_norm": 9.0625, "learning_rate": 8.589273714778118e-06, "loss": 1.09313774, "memory(GiB)": 141.16, "step": 50580, "train_speed(iter/s)": 0.291702 }, { "acc": 0.73547339, "epoch": 0.5659583332750784, "grad_norm": 7.03125, "learning_rate": 8.587985915612548e-06, "loss": 1.04083843, "memory(GiB)": 141.16, "step": 50600, "train_speed(iter/s)": 0.291738 }, { "acc": 0.72347393, "epoch": 0.5661820322210369, "grad_norm": 6.84375, "learning_rate": 8.586697625559224e-06, "loss": 1.09588451, "memory(GiB)": 141.16, "step": 50620, "train_speed(iter/s)": 0.291774 }, { "acc": 0.72243967, "epoch": 0.5664057311669954, "grad_norm": 7.1875, "learning_rate": 8.5854088447944e-06, "loss": 1.12337723, "memory(GiB)": 141.16, "step": 50640, "train_speed(iter/s)": 0.291812 }, { "acc": 0.72662153, "epoch": 0.566629430112954, "grad_norm": 6.09375, "learning_rate": 8.584119573494396e-06, "loss": 1.09967461, "memory(GiB)": 141.16, "step": 50660, "train_speed(iter/s)": 0.291849 }, { "acc": 0.72411466, "epoch": 0.5668531290589125, "grad_norm": 6.21875, "learning_rate": 8.582829811835607e-06, "loss": 1.10897646, "memory(GiB)": 141.16, "step": 50680, "train_speed(iter/s)": 0.291883 }, { "acc": 0.7417346, "epoch": 0.567076828004871, "grad_norm": 9.375, "learning_rate": 8.58153955999449e-06, "loss": 1.03433409, "memory(GiB)": 141.16, "step": 50700, "train_speed(iter/s)": 0.291924 }, { "acc": 0.73579807, "epoch": 0.5673005269508296, "grad_norm": 7.1875, "learning_rate": 8.580248818147568e-06, "loss": 1.04983273, "memory(GiB)": 141.16, "step": 50720, "train_speed(iter/s)": 0.291963 }, { "acc": 0.73122597, "epoch": 0.5675242258967881, "grad_norm": 8.0625, "learning_rate": 8.578957586471434e-06, "loss": 1.08141289, "memory(GiB)": 141.16, "step": 50740, "train_speed(iter/s)": 0.291999 }, { "acc": 0.73714185, "epoch": 0.5677479248427466, "grad_norm": 5.65625, "learning_rate": 8.577665865142747e-06, "loss": 1.03531208, "memory(GiB)": 141.16, "step": 50760, "train_speed(iter/s)": 0.292038 }, { "acc": 0.73947763, "epoch": 0.5679716237887051, "grad_norm": 5.90625, "learning_rate": 8.576373654338233e-06, "loss": 1.03007975, "memory(GiB)": 141.16, "step": 50780, "train_speed(iter/s)": 0.292077 }, { "acc": 0.72367902, "epoch": 0.5681953227346637, "grad_norm": 6.875, "learning_rate": 8.575080954234686e-06, "loss": 1.11124268, "memory(GiB)": 141.16, "step": 50800, "train_speed(iter/s)": 0.292115 }, { "acc": 0.73980932, "epoch": 0.5684190216806222, "grad_norm": 5.375, "learning_rate": 8.573787765008964e-06, "loss": 1.03851948, "memory(GiB)": 141.16, "step": 50820, "train_speed(iter/s)": 0.292155 }, { "acc": 0.72216496, "epoch": 0.5686427206265807, "grad_norm": 7.03125, "learning_rate": 8.572494086837994e-06, "loss": 1.10972004, "memory(GiB)": 141.16, "step": 50840, "train_speed(iter/s)": 0.292196 }, { "acc": 0.73765211, "epoch": 0.5688664195725393, "grad_norm": 5.9375, "learning_rate": 8.571199919898771e-06, "loss": 1.03536491, "memory(GiB)": 141.16, "step": 50860, "train_speed(iter/s)": 0.292235 }, { "acc": 0.73458204, "epoch": 0.5690901185184978, "grad_norm": 7.5, "learning_rate": 8.569905264368354e-06, "loss": 1.07657509, "memory(GiB)": 141.16, "step": 50880, "train_speed(iter/s)": 0.292273 }, { "acc": 0.73179846, "epoch": 0.5693138174644563, "grad_norm": 6.25, "learning_rate": 8.568610120423872e-06, "loss": 1.04880276, "memory(GiB)": 141.16, "step": 50900, "train_speed(iter/s)": 0.292311 }, { "acc": 0.73975482, "epoch": 0.5695375164104148, "grad_norm": 6.53125, "learning_rate": 8.567314488242518e-06, "loss": 1.04359617, "memory(GiB)": 141.16, "step": 50920, "train_speed(iter/s)": 0.292347 }, { "acc": 0.72446766, "epoch": 0.5697612153563734, "grad_norm": 8.4375, "learning_rate": 8.566018368001555e-06, "loss": 1.10269547, "memory(GiB)": 141.16, "step": 50940, "train_speed(iter/s)": 0.292383 }, { "acc": 0.73428483, "epoch": 0.5699849143023319, "grad_norm": 7.1875, "learning_rate": 8.564721759878306e-06, "loss": 1.06093369, "memory(GiB)": 141.16, "step": 50960, "train_speed(iter/s)": 0.292421 }, { "acc": 0.7228478, "epoch": 0.5702086132482904, "grad_norm": 7.75, "learning_rate": 8.56342466405017e-06, "loss": 1.09851265, "memory(GiB)": 141.16, "step": 50980, "train_speed(iter/s)": 0.292464 }, { "acc": 0.74107838, "epoch": 0.570432312194249, "grad_norm": 8.125, "learning_rate": 8.562127080694607e-06, "loss": 1.01875725, "memory(GiB)": 141.16, "step": 51000, "train_speed(iter/s)": 0.292501 }, { "acc": 0.74049625, "epoch": 0.5706560111402075, "grad_norm": 6.96875, "learning_rate": 8.560829009989146e-06, "loss": 1.02853584, "memory(GiB)": 141.16, "step": 51020, "train_speed(iter/s)": 0.292544 }, { "acc": 0.73710213, "epoch": 0.570879710086166, "grad_norm": 6.84375, "learning_rate": 8.55953045211138e-06, "loss": 1.04612055, "memory(GiB)": 141.16, "step": 51040, "train_speed(iter/s)": 0.292581 }, { "acc": 0.72311358, "epoch": 0.5711034090321245, "grad_norm": 6.75, "learning_rate": 8.558231407238969e-06, "loss": 1.12836761, "memory(GiB)": 141.16, "step": 51060, "train_speed(iter/s)": 0.292618 }, { "acc": 0.73179145, "epoch": 0.5713271079780831, "grad_norm": 6.25, "learning_rate": 8.556931875549644e-06, "loss": 1.07475128, "memory(GiB)": 141.16, "step": 51080, "train_speed(iter/s)": 0.292651 }, { "acc": 0.72897587, "epoch": 0.5715508069240416, "grad_norm": 7.0, "learning_rate": 8.555631857221198e-06, "loss": 1.09058781, "memory(GiB)": 141.16, "step": 51100, "train_speed(iter/s)": 0.29269 }, { "acc": 0.71946144, "epoch": 0.5717745058700001, "grad_norm": 8.4375, "learning_rate": 8.55433135243149e-06, "loss": 1.13264465, "memory(GiB)": 141.16, "step": 51120, "train_speed(iter/s)": 0.292727 }, { "acc": 0.73847041, "epoch": 0.5719982048159586, "grad_norm": 7.65625, "learning_rate": 8.553030361358455e-06, "loss": 1.04830723, "memory(GiB)": 141.16, "step": 51140, "train_speed(iter/s)": 0.292764 }, { "acc": 0.72861147, "epoch": 0.5722219037619172, "grad_norm": 7.28125, "learning_rate": 8.551728884180077e-06, "loss": 1.08793869, "memory(GiB)": 141.16, "step": 51160, "train_speed(iter/s)": 0.2928 }, { "acc": 0.73168211, "epoch": 0.5724456027078757, "grad_norm": 8.875, "learning_rate": 8.550426921074425e-06, "loss": 1.06736288, "memory(GiB)": 141.16, "step": 51180, "train_speed(iter/s)": 0.292836 }, { "acc": 0.73187304, "epoch": 0.5726693016538342, "grad_norm": 7.5625, "learning_rate": 8.549124472219621e-06, "loss": 1.0615509, "memory(GiB)": 141.16, "step": 51200, "train_speed(iter/s)": 0.292867 }, { "acc": 0.73877912, "epoch": 0.5728930005997928, "grad_norm": 7.375, "learning_rate": 8.547821537793862e-06, "loss": 1.03780804, "memory(GiB)": 141.16, "step": 51220, "train_speed(iter/s)": 0.292906 }, { "acc": 0.71788025, "epoch": 0.5731166995457513, "grad_norm": 6.65625, "learning_rate": 8.546518117975406e-06, "loss": 1.14567928, "memory(GiB)": 141.16, "step": 51240, "train_speed(iter/s)": 0.292943 }, { "acc": 0.73879156, "epoch": 0.5733403984917098, "grad_norm": 6.5625, "learning_rate": 8.54521421294258e-06, "loss": 1.0461277, "memory(GiB)": 141.16, "step": 51260, "train_speed(iter/s)": 0.292984 }, { "acc": 0.72649918, "epoch": 0.5735640974376683, "grad_norm": 5.3125, "learning_rate": 8.543909822873776e-06, "loss": 1.08418779, "memory(GiB)": 141.16, "step": 51280, "train_speed(iter/s)": 0.293024 }, { "acc": 0.72355709, "epoch": 0.5737877963836269, "grad_norm": 7.4375, "learning_rate": 8.542604947947454e-06, "loss": 1.12457581, "memory(GiB)": 141.16, "step": 51300, "train_speed(iter/s)": 0.293062 }, { "acc": 0.72972264, "epoch": 0.5740114953295854, "grad_norm": 8.125, "learning_rate": 8.54129958834214e-06, "loss": 1.07581701, "memory(GiB)": 141.16, "step": 51320, "train_speed(iter/s)": 0.293099 }, { "acc": 0.74331388, "epoch": 0.5742351942755439, "grad_norm": 8.4375, "learning_rate": 8.539993744236426e-06, "loss": 1.04109259, "memory(GiB)": 141.16, "step": 51340, "train_speed(iter/s)": 0.29314 }, { "acc": 0.71901302, "epoch": 0.5744588932215025, "grad_norm": 6.21875, "learning_rate": 8.538687415808971e-06, "loss": 1.14261742, "memory(GiB)": 141.16, "step": 51360, "train_speed(iter/s)": 0.293177 }, { "acc": 0.72420073, "epoch": 0.574682592167461, "grad_norm": 7.15625, "learning_rate": 8.537380603238497e-06, "loss": 1.1163558, "memory(GiB)": 141.16, "step": 51380, "train_speed(iter/s)": 0.293216 }, { "acc": 0.71622343, "epoch": 0.5749062911134195, "grad_norm": 5.96875, "learning_rate": 8.536073306703794e-06, "loss": 1.13145275, "memory(GiB)": 141.16, "step": 51400, "train_speed(iter/s)": 0.293256 }, { "acc": 0.73684015, "epoch": 0.575129990059378, "grad_norm": 7.28125, "learning_rate": 8.534765526383722e-06, "loss": 1.06442108, "memory(GiB)": 141.16, "step": 51420, "train_speed(iter/s)": 0.293294 }, { "acc": 0.7350636, "epoch": 0.5753536890053366, "grad_norm": 6.71875, "learning_rate": 8.533457262457202e-06, "loss": 1.04654417, "memory(GiB)": 141.16, "step": 51440, "train_speed(iter/s)": 0.29333 }, { "acc": 0.71489177, "epoch": 0.5755773879512951, "grad_norm": 6.28125, "learning_rate": 8.532148515103224e-06, "loss": 1.15251846, "memory(GiB)": 141.16, "step": 51460, "train_speed(iter/s)": 0.293365 }, { "acc": 0.72751255, "epoch": 0.5758010868972536, "grad_norm": 5.96875, "learning_rate": 8.530839284500843e-06, "loss": 1.09790764, "memory(GiB)": 141.16, "step": 51480, "train_speed(iter/s)": 0.293406 }, { "acc": 0.72369924, "epoch": 0.5760247858432122, "grad_norm": 7.6875, "learning_rate": 8.52952957082918e-06, "loss": 1.10778685, "memory(GiB)": 141.16, "step": 51500, "train_speed(iter/s)": 0.293441 }, { "acc": 0.72888699, "epoch": 0.5762484847891708, "grad_norm": 6.625, "learning_rate": 8.528219374267425e-06, "loss": 1.09928007, "memory(GiB)": 141.16, "step": 51520, "train_speed(iter/s)": 0.293481 }, { "acc": 0.73164206, "epoch": 0.5764721837351293, "grad_norm": 6.71875, "learning_rate": 8.52690869499483e-06, "loss": 1.07813148, "memory(GiB)": 141.16, "step": 51540, "train_speed(iter/s)": 0.293519 }, { "acc": 0.72551527, "epoch": 0.5766958826810878, "grad_norm": 6.25, "learning_rate": 8.52559753319071e-06, "loss": 1.08969126, "memory(GiB)": 141.16, "step": 51560, "train_speed(iter/s)": 0.293558 }, { "acc": 0.72891216, "epoch": 0.5769195816270464, "grad_norm": 9.0, "learning_rate": 8.524285889034458e-06, "loss": 1.09093418, "memory(GiB)": 141.16, "step": 51580, "train_speed(iter/s)": 0.293597 }, { "acc": 0.73101311, "epoch": 0.5771432805730049, "grad_norm": 7.9375, "learning_rate": 8.522973762705524e-06, "loss": 1.08631239, "memory(GiB)": 141.16, "step": 51600, "train_speed(iter/s)": 0.293626 }, { "acc": 0.72609978, "epoch": 0.5773669795189634, "grad_norm": 6.46875, "learning_rate": 8.521661154383423e-06, "loss": 1.10165958, "memory(GiB)": 141.16, "step": 51620, "train_speed(iter/s)": 0.293663 }, { "acc": 0.73412604, "epoch": 0.577590678464922, "grad_norm": 8.6875, "learning_rate": 8.520348064247739e-06, "loss": 1.05904694, "memory(GiB)": 141.16, "step": 51640, "train_speed(iter/s)": 0.293701 }, { "acc": 0.7296999, "epoch": 0.5778143774108805, "grad_norm": 6.53125, "learning_rate": 8.519034492478124e-06, "loss": 1.0719389, "memory(GiB)": 141.16, "step": 51660, "train_speed(iter/s)": 0.293738 }, { "acc": 0.73018112, "epoch": 0.578038076356839, "grad_norm": 6.0, "learning_rate": 8.517720439254291e-06, "loss": 1.08043451, "memory(GiB)": 141.16, "step": 51680, "train_speed(iter/s)": 0.293775 }, { "acc": 0.72833514, "epoch": 0.5782617753027975, "grad_norm": 5.9375, "learning_rate": 8.516405904756022e-06, "loss": 1.08680058, "memory(GiB)": 141.16, "step": 51700, "train_speed(iter/s)": 0.293815 }, { "acc": 0.73652229, "epoch": 0.5784854742487561, "grad_norm": 9.0, "learning_rate": 8.515090889163165e-06, "loss": 1.03946285, "memory(GiB)": 141.16, "step": 51720, "train_speed(iter/s)": 0.293849 }, { "acc": 0.72124386, "epoch": 0.5787091731947146, "grad_norm": 7.46875, "learning_rate": 8.513775392655633e-06, "loss": 1.1232954, "memory(GiB)": 141.16, "step": 51740, "train_speed(iter/s)": 0.293886 }, { "acc": 0.72932801, "epoch": 0.5789328721406731, "grad_norm": 6.34375, "learning_rate": 8.512459415413402e-06, "loss": 1.08452549, "memory(GiB)": 141.16, "step": 51760, "train_speed(iter/s)": 0.293919 }, { "acc": 0.72918015, "epoch": 0.5791565710866317, "grad_norm": 7.21875, "learning_rate": 8.511142957616518e-06, "loss": 1.09026203, "memory(GiB)": 141.16, "step": 51780, "train_speed(iter/s)": 0.293959 }, { "acc": 0.72723422, "epoch": 0.5793802700325902, "grad_norm": 9.8125, "learning_rate": 8.509826019445094e-06, "loss": 1.09461327, "memory(GiB)": 141.16, "step": 51800, "train_speed(iter/s)": 0.293996 }, { "acc": 0.73107877, "epoch": 0.5796039689785487, "grad_norm": 8.9375, "learning_rate": 8.508508601079301e-06, "loss": 1.08854523, "memory(GiB)": 141.16, "step": 51820, "train_speed(iter/s)": 0.294031 }, { "acc": 0.72677336, "epoch": 0.5798276679245072, "grad_norm": 7.9375, "learning_rate": 8.507190702699385e-06, "loss": 1.08962154, "memory(GiB)": 141.16, "step": 51840, "train_speed(iter/s)": 0.294066 }, { "acc": 0.73389435, "epoch": 0.5800513668704658, "grad_norm": 7.28125, "learning_rate": 8.505872324485652e-06, "loss": 1.06683502, "memory(GiB)": 141.16, "step": 51860, "train_speed(iter/s)": 0.294103 }, { "acc": 0.72730088, "epoch": 0.5802750658164243, "grad_norm": 6.34375, "learning_rate": 8.504553466618473e-06, "loss": 1.08979988, "memory(GiB)": 141.16, "step": 51880, "train_speed(iter/s)": 0.294141 }, { "acc": 0.74013729, "epoch": 0.5804987647623828, "grad_norm": 5.21875, "learning_rate": 8.503234129278288e-06, "loss": 1.04869232, "memory(GiB)": 141.16, "step": 51900, "train_speed(iter/s)": 0.294174 }, { "acc": 0.73753862, "epoch": 0.5807224637083414, "grad_norm": 6.78125, "learning_rate": 8.501914312645601e-06, "loss": 1.04110508, "memory(GiB)": 141.16, "step": 51920, "train_speed(iter/s)": 0.29421 }, { "acc": 0.71948786, "epoch": 0.5809461626542999, "grad_norm": 8.375, "learning_rate": 8.500594016900984e-06, "loss": 1.11088839, "memory(GiB)": 141.16, "step": 51940, "train_speed(iter/s)": 0.294249 }, { "acc": 0.73213644, "epoch": 0.5811698616002584, "grad_norm": 8.125, "learning_rate": 8.49927324222507e-06, "loss": 1.07976418, "memory(GiB)": 141.16, "step": 51960, "train_speed(iter/s)": 0.294286 }, { "acc": 0.7362009, "epoch": 0.5813935605462169, "grad_norm": 7.59375, "learning_rate": 8.49795198879856e-06, "loss": 1.05560226, "memory(GiB)": 141.16, "step": 51980, "train_speed(iter/s)": 0.294318 }, { "acc": 0.73142214, "epoch": 0.5816172594921755, "grad_norm": 5.0625, "learning_rate": 8.49663025680222e-06, "loss": 1.07511997, "memory(GiB)": 141.16, "step": 52000, "train_speed(iter/s)": 0.294353 }, { "epoch": 0.5816172594921755, "eval_acc": 0.689058587694214, "eval_loss": 1.0833675861358643, "eval_runtime": 2322.7268, "eval_samples_per_second": 32.411, "eval_steps_per_second": 16.206, "step": 52000 }, { "acc": 0.72438817, "epoch": 0.581840958438134, "grad_norm": 6.1875, "learning_rate": 8.495308046416884e-06, "loss": 1.0982851, "memory(GiB)": 141.16, "step": 52020, "train_speed(iter/s)": 0.290489 }, { "acc": 0.72281613, "epoch": 0.5820646573840925, "grad_norm": 7.65625, "learning_rate": 8.493985357823447e-06, "loss": 1.1023613, "memory(GiB)": 141.16, "step": 52040, "train_speed(iter/s)": 0.290523 }, { "acc": 0.72833891, "epoch": 0.582288356330051, "grad_norm": 7.875, "learning_rate": 8.492662191202872e-06, "loss": 1.10066395, "memory(GiB)": 141.16, "step": 52060, "train_speed(iter/s)": 0.29056 }, { "acc": 0.72609029, "epoch": 0.5825120552760096, "grad_norm": 7.84375, "learning_rate": 8.491338546736188e-06, "loss": 1.1029232, "memory(GiB)": 141.16, "step": 52080, "train_speed(iter/s)": 0.290595 }, { "acc": 0.73323393, "epoch": 0.5827357542219681, "grad_norm": 8.6875, "learning_rate": 8.490014424604487e-06, "loss": 1.06077404, "memory(GiB)": 141.16, "step": 52100, "train_speed(iter/s)": 0.290634 }, { "acc": 0.71848555, "epoch": 0.5829594531679266, "grad_norm": 8.125, "learning_rate": 8.488689824988929e-06, "loss": 1.14527693, "memory(GiB)": 141.16, "step": 52120, "train_speed(iter/s)": 0.29067 }, { "acc": 0.71749163, "epoch": 0.5831831521138852, "grad_norm": 9.0, "learning_rate": 8.48736474807074e-06, "loss": 1.13444805, "memory(GiB)": 141.16, "step": 52140, "train_speed(iter/s)": 0.290704 }, { "acc": 0.72589812, "epoch": 0.5834068510598437, "grad_norm": 4.9375, "learning_rate": 8.486039194031206e-06, "loss": 1.09486942, "memory(GiB)": 141.16, "step": 52160, "train_speed(iter/s)": 0.290742 }, { "acc": 0.72654381, "epoch": 0.5836305500058022, "grad_norm": 6.375, "learning_rate": 8.484713163051685e-06, "loss": 1.10954609, "memory(GiB)": 141.16, "step": 52180, "train_speed(iter/s)": 0.290775 }, { "acc": 0.72063961, "epoch": 0.5838542489517607, "grad_norm": 5.90625, "learning_rate": 8.483386655313593e-06, "loss": 1.11309185, "memory(GiB)": 141.16, "step": 52200, "train_speed(iter/s)": 0.29081 }, { "acc": 0.73213482, "epoch": 0.5840779478977193, "grad_norm": 6.625, "learning_rate": 8.482059670998419e-06, "loss": 1.07082005, "memory(GiB)": 141.16, "step": 52220, "train_speed(iter/s)": 0.290847 }, { "acc": 0.73128109, "epoch": 0.5843016468436778, "grad_norm": 9.1875, "learning_rate": 8.480732210287712e-06, "loss": 1.08251476, "memory(GiB)": 141.16, "step": 52240, "train_speed(iter/s)": 0.290886 }, { "acc": 0.72434168, "epoch": 0.5845253457896363, "grad_norm": 7.90625, "learning_rate": 8.479404273363087e-06, "loss": 1.10737648, "memory(GiB)": 141.16, "step": 52260, "train_speed(iter/s)": 0.290926 }, { "acc": 0.73255205, "epoch": 0.5847490447355949, "grad_norm": 5.9375, "learning_rate": 8.478075860406225e-06, "loss": 1.0738596, "memory(GiB)": 141.16, "step": 52280, "train_speed(iter/s)": 0.290963 }, { "acc": 0.72875276, "epoch": 0.5849727436815534, "grad_norm": 5.53125, "learning_rate": 8.476746971598873e-06, "loss": 1.0826376, "memory(GiB)": 141.16, "step": 52300, "train_speed(iter/s)": 0.291001 }, { "acc": 0.72380471, "epoch": 0.5851964426275119, "grad_norm": 7.5, "learning_rate": 8.47541760712284e-06, "loss": 1.1242012, "memory(GiB)": 141.16, "step": 52320, "train_speed(iter/s)": 0.291038 }, { "acc": 0.73270421, "epoch": 0.5854201415734704, "grad_norm": 7.71875, "learning_rate": 8.474087767160004e-06, "loss": 1.08168526, "memory(GiB)": 141.16, "step": 52340, "train_speed(iter/s)": 0.291074 }, { "acc": 0.72233028, "epoch": 0.585643840519429, "grad_norm": 7.6875, "learning_rate": 8.472757451892305e-06, "loss": 1.11470833, "memory(GiB)": 141.16, "step": 52360, "train_speed(iter/s)": 0.291115 }, { "acc": 0.72011175, "epoch": 0.5858675394653875, "grad_norm": 6.25, "learning_rate": 8.47142666150175e-06, "loss": 1.14746313, "memory(GiB)": 141.16, "step": 52380, "train_speed(iter/s)": 0.29115 }, { "acc": 0.73519387, "epoch": 0.586091238411346, "grad_norm": 7.125, "learning_rate": 8.470095396170408e-06, "loss": 1.06909809, "memory(GiB)": 141.16, "step": 52400, "train_speed(iter/s)": 0.291182 }, { "acc": 0.72192459, "epoch": 0.5863149373573046, "grad_norm": 7.40625, "learning_rate": 8.46876365608042e-06, "loss": 1.12109795, "memory(GiB)": 141.16, "step": 52420, "train_speed(iter/s)": 0.291221 }, { "acc": 0.73116198, "epoch": 0.5865386363032631, "grad_norm": 9.125, "learning_rate": 8.467431441413981e-06, "loss": 1.08404121, "memory(GiB)": 141.16, "step": 52440, "train_speed(iter/s)": 0.29126 }, { "acc": 0.73985176, "epoch": 0.5867623352492216, "grad_norm": 9.0, "learning_rate": 8.466098752353359e-06, "loss": 1.04628105, "memory(GiB)": 141.16, "step": 52460, "train_speed(iter/s)": 0.291299 }, { "acc": 0.72718801, "epoch": 0.5869860341951801, "grad_norm": 7.25, "learning_rate": 8.464765589080888e-06, "loss": 1.10496483, "memory(GiB)": 141.16, "step": 52480, "train_speed(iter/s)": 0.291332 }, { "acc": 0.72853174, "epoch": 0.5872097331411387, "grad_norm": 7.375, "learning_rate": 8.46343195177896e-06, "loss": 1.06359844, "memory(GiB)": 141.16, "step": 52500, "train_speed(iter/s)": 0.291369 }, { "acc": 0.71804004, "epoch": 0.5874334320870972, "grad_norm": 6.71875, "learning_rate": 8.462097840630037e-06, "loss": 1.13953609, "memory(GiB)": 141.16, "step": 52520, "train_speed(iter/s)": 0.291407 }, { "acc": 0.72847519, "epoch": 0.5876571310330557, "grad_norm": 7.125, "learning_rate": 8.460763255816645e-06, "loss": 1.09664373, "memory(GiB)": 141.16, "step": 52540, "train_speed(iter/s)": 0.291442 }, { "acc": 0.73446951, "epoch": 0.5878808299790143, "grad_norm": 7.46875, "learning_rate": 8.459428197521375e-06, "loss": 1.07279339, "memory(GiB)": 141.16, "step": 52560, "train_speed(iter/s)": 0.291477 }, { "acc": 0.73479567, "epoch": 0.5881045289249728, "grad_norm": 7.65625, "learning_rate": 8.45809266592688e-06, "loss": 1.06670837, "memory(GiB)": 141.16, "step": 52580, "train_speed(iter/s)": 0.29151 }, { "acc": 0.72899556, "epoch": 0.5883282278709313, "grad_norm": 6.5625, "learning_rate": 8.456756661215882e-06, "loss": 1.09603481, "memory(GiB)": 141.16, "step": 52600, "train_speed(iter/s)": 0.291545 }, { "acc": 0.73583679, "epoch": 0.5885519268168898, "grad_norm": 6.4375, "learning_rate": 8.45542018357116e-06, "loss": 1.05699501, "memory(GiB)": 141.16, "step": 52620, "train_speed(iter/s)": 0.29158 }, { "acc": 0.72370224, "epoch": 0.5887756257628484, "grad_norm": 8.0, "learning_rate": 8.454083233175573e-06, "loss": 1.1071312, "memory(GiB)": 141.16, "step": 52640, "train_speed(iter/s)": 0.291619 }, { "acc": 0.72796583, "epoch": 0.5889993247088069, "grad_norm": 9.875, "learning_rate": 8.452745810212028e-06, "loss": 1.10623245, "memory(GiB)": 141.16, "step": 52660, "train_speed(iter/s)": 0.291653 }, { "acc": 0.7114172, "epoch": 0.5892230236547654, "grad_norm": 6.40625, "learning_rate": 8.451407914863502e-06, "loss": 1.18141422, "memory(GiB)": 141.16, "step": 52680, "train_speed(iter/s)": 0.291686 }, { "acc": 0.72124662, "epoch": 0.589446722600724, "grad_norm": 7.15625, "learning_rate": 8.450069547313045e-06, "loss": 1.13120584, "memory(GiB)": 141.16, "step": 52700, "train_speed(iter/s)": 0.291721 }, { "acc": 0.72904043, "epoch": 0.5896704215466825, "grad_norm": 7.625, "learning_rate": 8.448730707743759e-06, "loss": 1.09217939, "memory(GiB)": 141.16, "step": 52720, "train_speed(iter/s)": 0.291761 }, { "acc": 0.7202445, "epoch": 0.589894120492641, "grad_norm": 8.5, "learning_rate": 8.44739139633882e-06, "loss": 1.13588524, "memory(GiB)": 141.16, "step": 52740, "train_speed(iter/s)": 0.291802 }, { "acc": 0.74020839, "epoch": 0.5901178194385995, "grad_norm": 7.34375, "learning_rate": 8.446051613281462e-06, "loss": 1.03185482, "memory(GiB)": 141.16, "step": 52760, "train_speed(iter/s)": 0.291839 }, { "acc": 0.74289942, "epoch": 0.5903415183845581, "grad_norm": 6.375, "learning_rate": 8.444711358754988e-06, "loss": 1.03450851, "memory(GiB)": 141.16, "step": 52780, "train_speed(iter/s)": 0.291876 }, { "acc": 0.73213415, "epoch": 0.5905652173305166, "grad_norm": 6.6875, "learning_rate": 8.443370632942765e-06, "loss": 1.06735191, "memory(GiB)": 141.16, "step": 52800, "train_speed(iter/s)": 0.291914 }, { "acc": 0.72441006, "epoch": 0.5907889162764751, "grad_norm": 6.0625, "learning_rate": 8.442029436028222e-06, "loss": 1.10916767, "memory(GiB)": 141.16, "step": 52820, "train_speed(iter/s)": 0.291948 }, { "acc": 0.7255187, "epoch": 0.5910126152224336, "grad_norm": 6.15625, "learning_rate": 8.440687768194852e-06, "loss": 1.09831352, "memory(GiB)": 141.16, "step": 52840, "train_speed(iter/s)": 0.291987 }, { "acc": 0.72584, "epoch": 0.5912363141683922, "grad_norm": 7.4375, "learning_rate": 8.439345629626219e-06, "loss": 1.09921284, "memory(GiB)": 141.16, "step": 52860, "train_speed(iter/s)": 0.292024 }, { "acc": 0.73530064, "epoch": 0.5914600131143507, "grad_norm": 7.46875, "learning_rate": 8.438003020505945e-06, "loss": 1.06386309, "memory(GiB)": 141.16, "step": 52880, "train_speed(iter/s)": 0.292055 }, { "acc": 0.7254746, "epoch": 0.5916837120603092, "grad_norm": 7.21875, "learning_rate": 8.436659941017715e-06, "loss": 1.09271955, "memory(GiB)": 141.16, "step": 52900, "train_speed(iter/s)": 0.292092 }, { "acc": 0.72899766, "epoch": 0.5919074110062678, "grad_norm": 7.46875, "learning_rate": 8.435316391345286e-06, "loss": 1.09842129, "memory(GiB)": 141.16, "step": 52920, "train_speed(iter/s)": 0.292126 }, { "acc": 0.7189126, "epoch": 0.5921311099522263, "grad_norm": 8.6875, "learning_rate": 8.433972371672471e-06, "loss": 1.13536758, "memory(GiB)": 141.16, "step": 52940, "train_speed(iter/s)": 0.292164 }, { "acc": 0.72357388, "epoch": 0.5923548088981848, "grad_norm": 7.21875, "learning_rate": 8.432627882183153e-06, "loss": 1.12088146, "memory(GiB)": 141.16, "step": 52960, "train_speed(iter/s)": 0.292199 }, { "acc": 0.74150529, "epoch": 0.5925785078441433, "grad_norm": 8.75, "learning_rate": 8.431282923061279e-06, "loss": 1.01972427, "memory(GiB)": 141.16, "step": 52980, "train_speed(iter/s)": 0.292239 }, { "acc": 0.71703844, "epoch": 0.5928022067901019, "grad_norm": 7.53125, "learning_rate": 8.429937494490853e-06, "loss": 1.12868748, "memory(GiB)": 141.16, "step": 53000, "train_speed(iter/s)": 0.292277 }, { "acc": 0.74334545, "epoch": 0.5930259057360604, "grad_norm": 6.59375, "learning_rate": 8.428591596655957e-06, "loss": 1.01853218, "memory(GiB)": 141.16, "step": 53020, "train_speed(iter/s)": 0.292317 }, { "acc": 0.73839264, "epoch": 0.5932496046820189, "grad_norm": 10.1875, "learning_rate": 8.427245229740722e-06, "loss": 1.04866428, "memory(GiB)": 141.16, "step": 53040, "train_speed(iter/s)": 0.292352 }, { "acc": 0.73750486, "epoch": 0.5934733036279775, "grad_norm": 8.25, "learning_rate": 8.425898393929353e-06, "loss": 1.04939022, "memory(GiB)": 141.16, "step": 53060, "train_speed(iter/s)": 0.292388 }, { "acc": 0.7217916, "epoch": 0.593697002573936, "grad_norm": 8.8125, "learning_rate": 8.424551089406118e-06, "loss": 1.11612434, "memory(GiB)": 141.16, "step": 53080, "train_speed(iter/s)": 0.292425 }, { "acc": 0.73900251, "epoch": 0.5939207015198945, "grad_norm": 7.625, "learning_rate": 8.423203316355345e-06, "loss": 1.04809494, "memory(GiB)": 141.16, "step": 53100, "train_speed(iter/s)": 0.292463 }, { "acc": 0.71774836, "epoch": 0.594144400465853, "grad_norm": 8.25, "learning_rate": 8.42185507496143e-06, "loss": 1.13783703, "memory(GiB)": 141.16, "step": 53120, "train_speed(iter/s)": 0.292498 }, { "acc": 0.72164946, "epoch": 0.5943680994118116, "grad_norm": 6.0625, "learning_rate": 8.420506365408829e-06, "loss": 1.12784271, "memory(GiB)": 141.16, "step": 53140, "train_speed(iter/s)": 0.292533 }, { "acc": 0.73091617, "epoch": 0.5945917983577701, "grad_norm": 7.34375, "learning_rate": 8.419157187882068e-06, "loss": 1.06335754, "memory(GiB)": 141.16, "step": 53160, "train_speed(iter/s)": 0.292571 }, { "acc": 0.72441187, "epoch": 0.5948154973037286, "grad_norm": 6.1875, "learning_rate": 8.417807542565735e-06, "loss": 1.11189651, "memory(GiB)": 141.16, "step": 53180, "train_speed(iter/s)": 0.292611 }, { "acc": 0.74297109, "epoch": 0.5950391962496872, "grad_norm": 5.875, "learning_rate": 8.416457429644476e-06, "loss": 1.03378487, "memory(GiB)": 141.16, "step": 53200, "train_speed(iter/s)": 0.292648 }, { "acc": 0.72745914, "epoch": 0.5952628951956457, "grad_norm": 5.9375, "learning_rate": 8.415106849303007e-06, "loss": 1.10356045, "memory(GiB)": 141.16, "step": 53220, "train_speed(iter/s)": 0.292685 }, { "acc": 0.72401476, "epoch": 0.5954865941416042, "grad_norm": 6.0625, "learning_rate": 8.413755801726111e-06, "loss": 1.1106576, "memory(GiB)": 141.16, "step": 53240, "train_speed(iter/s)": 0.292723 }, { "acc": 0.72548389, "epoch": 0.5957102930875627, "grad_norm": 6.3125, "learning_rate": 8.412404287098626e-06, "loss": 1.11532297, "memory(GiB)": 141.16, "step": 53260, "train_speed(iter/s)": 0.292759 }, { "acc": 0.72931476, "epoch": 0.5959339920335213, "grad_norm": 7.0, "learning_rate": 8.41105230560546e-06, "loss": 1.08808661, "memory(GiB)": 141.16, "step": 53280, "train_speed(iter/s)": 0.292799 }, { "acc": 0.74253407, "epoch": 0.5961576909794798, "grad_norm": 7.4375, "learning_rate": 8.409699857431584e-06, "loss": 1.00607281, "memory(GiB)": 141.16, "step": 53300, "train_speed(iter/s)": 0.292835 }, { "acc": 0.71839476, "epoch": 0.5963813899254383, "grad_norm": 7.21875, "learning_rate": 8.40834694276203e-06, "loss": 1.13084679, "memory(GiB)": 141.16, "step": 53320, "train_speed(iter/s)": 0.292876 }, { "acc": 0.73044968, "epoch": 0.5966050888713968, "grad_norm": 7.5625, "learning_rate": 8.4069935617819e-06, "loss": 1.08971071, "memory(GiB)": 141.16, "step": 53340, "train_speed(iter/s)": 0.292916 }, { "acc": 0.73446827, "epoch": 0.5968287878173554, "grad_norm": 5.90625, "learning_rate": 8.405639714676353e-06, "loss": 1.06161346, "memory(GiB)": 141.16, "step": 53360, "train_speed(iter/s)": 0.292957 }, { "acc": 0.71198955, "epoch": 0.5970524867633139, "grad_norm": 7.125, "learning_rate": 8.404285401630614e-06, "loss": 1.16898575, "memory(GiB)": 141.16, "step": 53380, "train_speed(iter/s)": 0.292994 }, { "acc": 0.72509789, "epoch": 0.5972761857092724, "grad_norm": 7.84375, "learning_rate": 8.402930622829975e-06, "loss": 1.10072231, "memory(GiB)": 141.16, "step": 53400, "train_speed(iter/s)": 0.293031 }, { "acc": 0.74125614, "epoch": 0.597499884655231, "grad_norm": 8.4375, "learning_rate": 8.401575378459785e-06, "loss": 1.03464622, "memory(GiB)": 141.16, "step": 53420, "train_speed(iter/s)": 0.293069 }, { "acc": 0.72460241, "epoch": 0.5977235836011895, "grad_norm": 7.125, "learning_rate": 8.400219668705468e-06, "loss": 1.11088505, "memory(GiB)": 141.16, "step": 53440, "train_speed(iter/s)": 0.293108 }, { "acc": 0.73466101, "epoch": 0.597947282547148, "grad_norm": 6.65625, "learning_rate": 8.398863493752495e-06, "loss": 1.06883488, "memory(GiB)": 141.16, "step": 53460, "train_speed(iter/s)": 0.293142 }, { "acc": 0.72669821, "epoch": 0.5981709814931065, "grad_norm": 8.0, "learning_rate": 8.397506853786419e-06, "loss": 1.10888577, "memory(GiB)": 141.16, "step": 53480, "train_speed(iter/s)": 0.293175 }, { "acc": 0.72805414, "epoch": 0.5983946804390651, "grad_norm": 6.28125, "learning_rate": 8.396149748992844e-06, "loss": 1.08301401, "memory(GiB)": 141.16, "step": 53500, "train_speed(iter/s)": 0.293213 }, { "acc": 0.73846331, "epoch": 0.5986183793850236, "grad_norm": 6.625, "learning_rate": 8.394792179557438e-06, "loss": 1.03787222, "memory(GiB)": 141.16, "step": 53520, "train_speed(iter/s)": 0.293256 }, { "acc": 0.72267828, "epoch": 0.5988420783309821, "grad_norm": 6.03125, "learning_rate": 8.393434145665941e-06, "loss": 1.11773529, "memory(GiB)": 141.16, "step": 53540, "train_speed(iter/s)": 0.293285 }, { "acc": 0.72905159, "epoch": 0.5990657772769407, "grad_norm": 8.875, "learning_rate": 8.39207564750415e-06, "loss": 1.084128, "memory(GiB)": 141.16, "step": 53560, "train_speed(iter/s)": 0.29332 }, { "acc": 0.7301712, "epoch": 0.5992894762228992, "grad_norm": 6.4375, "learning_rate": 8.390716685257924e-06, "loss": 1.08786983, "memory(GiB)": 141.16, "step": 53580, "train_speed(iter/s)": 0.293356 }, { "acc": 0.73331699, "epoch": 0.5995131751688577, "grad_norm": 8.1875, "learning_rate": 8.389357259113195e-06, "loss": 1.07812195, "memory(GiB)": 141.16, "step": 53600, "train_speed(iter/s)": 0.293393 }, { "acc": 0.72526736, "epoch": 0.5997368741148162, "grad_norm": 6.71875, "learning_rate": 8.387997369255945e-06, "loss": 1.11168175, "memory(GiB)": 141.16, "step": 53620, "train_speed(iter/s)": 0.293431 }, { "acc": 0.72377386, "epoch": 0.5999605730607748, "grad_norm": 7.65625, "learning_rate": 8.38663701587223e-06, "loss": 1.09910507, "memory(GiB)": 141.16, "step": 53640, "train_speed(iter/s)": 0.293461 }, { "acc": 0.7251678, "epoch": 0.6001842720067333, "grad_norm": 7.65625, "learning_rate": 8.385276199148164e-06, "loss": 1.10775166, "memory(GiB)": 141.16, "step": 53660, "train_speed(iter/s)": 0.2935 }, { "acc": 0.72856293, "epoch": 0.6004079709526918, "grad_norm": 7.3125, "learning_rate": 8.383914919269929e-06, "loss": 1.08438129, "memory(GiB)": 141.16, "step": 53680, "train_speed(iter/s)": 0.293537 }, { "acc": 0.72936792, "epoch": 0.6006316698986504, "grad_norm": 8.25, "learning_rate": 8.382553176423764e-06, "loss": 1.08157139, "memory(GiB)": 141.16, "step": 53700, "train_speed(iter/s)": 0.293576 }, { "acc": 0.7301259, "epoch": 0.6008553688446089, "grad_norm": 7.15625, "learning_rate": 8.381190970795978e-06, "loss": 1.07489805, "memory(GiB)": 141.16, "step": 53720, "train_speed(iter/s)": 0.293614 }, { "acc": 0.73428221, "epoch": 0.6010790677905674, "grad_norm": 7.9375, "learning_rate": 8.37982830257294e-06, "loss": 1.05328159, "memory(GiB)": 141.16, "step": 53740, "train_speed(iter/s)": 0.293651 }, { "acc": 0.74090734, "epoch": 0.6013027667365259, "grad_norm": 6.0, "learning_rate": 8.378465171941078e-06, "loss": 1.00995502, "memory(GiB)": 141.16, "step": 53760, "train_speed(iter/s)": 0.293688 }, { "acc": 0.73675451, "epoch": 0.6015264656824845, "grad_norm": 8.3125, "learning_rate": 8.377101579086893e-06, "loss": 1.06148796, "memory(GiB)": 141.16, "step": 53780, "train_speed(iter/s)": 0.293725 }, { "acc": 0.72598038, "epoch": 0.601750164628443, "grad_norm": 6.96875, "learning_rate": 8.375737524196942e-06, "loss": 1.100105, "memory(GiB)": 141.16, "step": 53800, "train_speed(iter/s)": 0.293761 }, { "acc": 0.72929392, "epoch": 0.6019738635744015, "grad_norm": 6.34375, "learning_rate": 8.374373007457847e-06, "loss": 1.08759193, "memory(GiB)": 141.16, "step": 53820, "train_speed(iter/s)": 0.293793 }, { "acc": 0.72550106, "epoch": 0.60219756252036, "grad_norm": 7.65625, "learning_rate": 8.373008029056292e-06, "loss": 1.10493336, "memory(GiB)": 141.16, "step": 53840, "train_speed(iter/s)": 0.293832 }, { "acc": 0.73042479, "epoch": 0.6024212614663186, "grad_norm": 7.21875, "learning_rate": 8.37164258917903e-06, "loss": 1.0917387, "memory(GiB)": 141.16, "step": 53860, "train_speed(iter/s)": 0.293867 }, { "acc": 0.7304872, "epoch": 0.6026449604122771, "grad_norm": 7.375, "learning_rate": 8.37027668801287e-06, "loss": 1.08825741, "memory(GiB)": 141.16, "step": 53880, "train_speed(iter/s)": 0.293899 }, { "acc": 0.71880178, "epoch": 0.6028686593582356, "grad_norm": 7.09375, "learning_rate": 8.368910325744686e-06, "loss": 1.13441601, "memory(GiB)": 141.16, "step": 53900, "train_speed(iter/s)": 0.293931 }, { "acc": 0.73448706, "epoch": 0.6030923583041942, "grad_norm": 7.34375, "learning_rate": 8.367543502561416e-06, "loss": 1.06139441, "memory(GiB)": 141.16, "step": 53920, "train_speed(iter/s)": 0.293964 }, { "acc": 0.72925172, "epoch": 0.6033160572501527, "grad_norm": 5.4375, "learning_rate": 8.36617621865006e-06, "loss": 1.09419146, "memory(GiB)": 141.16, "step": 53940, "train_speed(iter/s)": 0.293999 }, { "acc": 0.72395363, "epoch": 0.6035397561961112, "grad_norm": 7.03125, "learning_rate": 8.364808474197687e-06, "loss": 1.11554012, "memory(GiB)": 141.16, "step": 53960, "train_speed(iter/s)": 0.29403 }, { "acc": 0.74403677, "epoch": 0.6037634551420697, "grad_norm": 6.875, "learning_rate": 8.363440269391419e-06, "loss": 1.0298625, "memory(GiB)": 141.16, "step": 53980, "train_speed(iter/s)": 0.294066 }, { "acc": 0.72833395, "epoch": 0.6039871540880283, "grad_norm": 6.625, "learning_rate": 8.362071604418447e-06, "loss": 1.09458179, "memory(GiB)": 141.16, "step": 54000, "train_speed(iter/s)": 0.294101 }, { "epoch": 0.6039871540880283, "eval_acc": 0.6892291048240129, "eval_loss": 1.0828158855438232, "eval_runtime": 2320.0846, "eval_samples_per_second": 32.448, "eval_steps_per_second": 16.224, "step": 54000 }, { "acc": 0.7233407, "epoch": 0.6042108530339868, "grad_norm": 8.0, "learning_rate": 8.360702479466025e-06, "loss": 1.10813084, "memory(GiB)": 141.16, "step": 54020, "train_speed(iter/s)": 0.290393 }, { "acc": 0.7337326, "epoch": 0.6044345519799454, "grad_norm": 6.65625, "learning_rate": 8.359332894721469e-06, "loss": 1.06541519, "memory(GiB)": 141.16, "step": 54040, "train_speed(iter/s)": 0.29043 }, { "acc": 0.7353488, "epoch": 0.604658250925904, "grad_norm": 7.5, "learning_rate": 8.357962850372154e-06, "loss": 1.05226631, "memory(GiB)": 141.16, "step": 54060, "train_speed(iter/s)": 0.290466 }, { "acc": 0.72263107, "epoch": 0.6048819498718625, "grad_norm": 6.75, "learning_rate": 8.356592346605528e-06, "loss": 1.11201191, "memory(GiB)": 141.16, "step": 54080, "train_speed(iter/s)": 0.290503 }, { "acc": 0.72196927, "epoch": 0.605105648817821, "grad_norm": 4.875, "learning_rate": 8.35522138360909e-06, "loss": 1.1091795, "memory(GiB)": 141.16, "step": 54100, "train_speed(iter/s)": 0.290536 }, { "acc": 0.71841154, "epoch": 0.6053293477637796, "grad_norm": 7.15625, "learning_rate": 8.353849961570413e-06, "loss": 1.1524826, "memory(GiB)": 141.16, "step": 54120, "train_speed(iter/s)": 0.290571 }, { "acc": 0.71737566, "epoch": 0.6055530467097381, "grad_norm": 9.1875, "learning_rate": 8.352478080677122e-06, "loss": 1.1302412, "memory(GiB)": 141.16, "step": 54140, "train_speed(iter/s)": 0.290606 }, { "acc": 0.73978376, "epoch": 0.6057767456556966, "grad_norm": 6.5625, "learning_rate": 8.351105741116909e-06, "loss": 1.04661179, "memory(GiB)": 141.16, "step": 54160, "train_speed(iter/s)": 0.29064 }, { "acc": 0.73270836, "epoch": 0.6060004446016551, "grad_norm": 6.25, "learning_rate": 8.349732943077535e-06, "loss": 1.06348457, "memory(GiB)": 141.16, "step": 54180, "train_speed(iter/s)": 0.290674 }, { "acc": 0.72674861, "epoch": 0.6062241435476137, "grad_norm": 6.84375, "learning_rate": 8.348359686746815e-06, "loss": 1.11484432, "memory(GiB)": 141.16, "step": 54200, "train_speed(iter/s)": 0.290704 }, { "acc": 0.71809883, "epoch": 0.6064478424935722, "grad_norm": 6.78125, "learning_rate": 8.34698597231263e-06, "loss": 1.15720425, "memory(GiB)": 141.16, "step": 54220, "train_speed(iter/s)": 0.290737 }, { "acc": 0.73306875, "epoch": 0.6066715414395307, "grad_norm": 6.34375, "learning_rate": 8.345611799962927e-06, "loss": 1.05754433, "memory(GiB)": 141.16, "step": 54240, "train_speed(iter/s)": 0.290771 }, { "acc": 0.72648239, "epoch": 0.6068952403854893, "grad_norm": 6.96875, "learning_rate": 8.344237169885707e-06, "loss": 1.11218796, "memory(GiB)": 141.16, "step": 54260, "train_speed(iter/s)": 0.290804 }, { "acc": 0.73838696, "epoch": 0.6071189393314478, "grad_norm": 7.0625, "learning_rate": 8.342862082269043e-06, "loss": 1.04416389, "memory(GiB)": 141.16, "step": 54280, "train_speed(iter/s)": 0.290836 }, { "acc": 0.7463316, "epoch": 0.6073426382774063, "grad_norm": 7.84375, "learning_rate": 8.341486537301067e-06, "loss": 1.01804543, "memory(GiB)": 141.16, "step": 54300, "train_speed(iter/s)": 0.290872 }, { "acc": 0.72233334, "epoch": 0.6075663372233648, "grad_norm": 8.1875, "learning_rate": 8.34011053516997e-06, "loss": 1.1058342, "memory(GiB)": 141.16, "step": 54320, "train_speed(iter/s)": 0.29091 }, { "acc": 0.72768211, "epoch": 0.6077900361693234, "grad_norm": 7.15625, "learning_rate": 8.338734076064013e-06, "loss": 1.08432341, "memory(GiB)": 141.16, "step": 54340, "train_speed(iter/s)": 0.290948 }, { "acc": 0.73742228, "epoch": 0.6080137351152819, "grad_norm": 7.25, "learning_rate": 8.33735716017151e-06, "loss": 1.05019493, "memory(GiB)": 141.16, "step": 54360, "train_speed(iter/s)": 0.290982 }, { "acc": 0.72592001, "epoch": 0.6082374340612404, "grad_norm": 6.90625, "learning_rate": 8.335979787680848e-06, "loss": 1.09987707, "memory(GiB)": 141.16, "step": 54380, "train_speed(iter/s)": 0.29102 }, { "acc": 0.72060642, "epoch": 0.608461133007199, "grad_norm": 9.25, "learning_rate": 8.334601958780467e-06, "loss": 1.12906952, "memory(GiB)": 141.16, "step": 54400, "train_speed(iter/s)": 0.291056 }, { "acc": 0.74464693, "epoch": 0.6086848319531575, "grad_norm": 8.25, "learning_rate": 8.333223673658877e-06, "loss": 1.01611738, "memory(GiB)": 141.16, "step": 54420, "train_speed(iter/s)": 0.291088 }, { "acc": 0.74149694, "epoch": 0.608908530899116, "grad_norm": 6.40625, "learning_rate": 8.331844932504644e-06, "loss": 1.02788124, "memory(GiB)": 141.16, "step": 54440, "train_speed(iter/s)": 0.291117 }, { "acc": 0.71577597, "epoch": 0.6091322298450745, "grad_norm": 6.875, "learning_rate": 8.330465735506403e-06, "loss": 1.12328959, "memory(GiB)": 141.16, "step": 54460, "train_speed(iter/s)": 0.291155 }, { "acc": 0.73953085, "epoch": 0.6093559287910331, "grad_norm": 7.0625, "learning_rate": 8.329086082852844e-06, "loss": 1.05784683, "memory(GiB)": 141.16, "step": 54480, "train_speed(iter/s)": 0.291188 }, { "acc": 0.72415304, "epoch": 0.6095796277369916, "grad_norm": 6.15625, "learning_rate": 8.327705974732727e-06, "loss": 1.12005024, "memory(GiB)": 141.16, "step": 54500, "train_speed(iter/s)": 0.291219 }, { "acc": 0.72667065, "epoch": 0.6098033266829501, "grad_norm": 6.0, "learning_rate": 8.326325411334868e-06, "loss": 1.11628351, "memory(GiB)": 141.16, "step": 54520, "train_speed(iter/s)": 0.291252 }, { "acc": 0.73544102, "epoch": 0.6100270256289086, "grad_norm": 8.0625, "learning_rate": 8.324944392848149e-06, "loss": 1.05212536, "memory(GiB)": 141.16, "step": 54540, "train_speed(iter/s)": 0.291288 }, { "acc": 0.72240701, "epoch": 0.6102507245748672, "grad_norm": 7.84375, "learning_rate": 8.323562919461514e-06, "loss": 1.09573994, "memory(GiB)": 141.16, "step": 54560, "train_speed(iter/s)": 0.291322 }, { "acc": 0.75040255, "epoch": 0.6104744235208257, "grad_norm": 6.625, "learning_rate": 8.322180991363965e-06, "loss": 0.98381081, "memory(GiB)": 141.16, "step": 54580, "train_speed(iter/s)": 0.291354 }, { "acc": 0.74177303, "epoch": 0.6106981224667842, "grad_norm": 6.3125, "learning_rate": 8.32079860874457e-06, "loss": 1.02791023, "memory(GiB)": 141.16, "step": 54600, "train_speed(iter/s)": 0.291391 }, { "acc": 0.73213687, "epoch": 0.6109218214127428, "grad_norm": 7.625, "learning_rate": 8.319415771792464e-06, "loss": 1.0532238, "memory(GiB)": 141.16, "step": 54620, "train_speed(iter/s)": 0.291423 }, { "acc": 0.72644706, "epoch": 0.6111455203587013, "grad_norm": 7.125, "learning_rate": 8.318032480696834e-06, "loss": 1.10595455, "memory(GiB)": 141.16, "step": 54640, "train_speed(iter/s)": 0.291459 }, { "acc": 0.73404269, "epoch": 0.6113692193046598, "grad_norm": 8.9375, "learning_rate": 8.316648735646933e-06, "loss": 1.06074944, "memory(GiB)": 141.16, "step": 54660, "train_speed(iter/s)": 0.291488 }, { "acc": 0.72990165, "epoch": 0.6115929182506183, "grad_norm": 7.5, "learning_rate": 8.315264536832082e-06, "loss": 1.0868391, "memory(GiB)": 141.16, "step": 54680, "train_speed(iter/s)": 0.291526 }, { "acc": 0.72387729, "epoch": 0.6118166171965769, "grad_norm": 7.71875, "learning_rate": 8.313879884441655e-06, "loss": 1.12095585, "memory(GiB)": 141.16, "step": 54700, "train_speed(iter/s)": 0.291563 }, { "acc": 0.72699161, "epoch": 0.6120403161425354, "grad_norm": 6.5, "learning_rate": 8.312494778665092e-06, "loss": 1.09869289, "memory(GiB)": 141.16, "step": 54720, "train_speed(iter/s)": 0.291599 }, { "acc": 0.74686856, "epoch": 0.6122640150884939, "grad_norm": 8.75, "learning_rate": 8.311109219691898e-06, "loss": 1.00741539, "memory(GiB)": 141.16, "step": 54740, "train_speed(iter/s)": 0.291634 }, { "acc": 0.72534981, "epoch": 0.6124877140344525, "grad_norm": 6.40625, "learning_rate": 8.309723207711638e-06, "loss": 1.1033042, "memory(GiB)": 141.16, "step": 54760, "train_speed(iter/s)": 0.29167 }, { "acc": 0.72534618, "epoch": 0.612711412980411, "grad_norm": 8.125, "learning_rate": 8.308336742913934e-06, "loss": 1.08692207, "memory(GiB)": 141.16, "step": 54780, "train_speed(iter/s)": 0.291705 }, { "acc": 0.72017398, "epoch": 0.6129351119263695, "grad_norm": 6.6875, "learning_rate": 8.306949825488477e-06, "loss": 1.1356432, "memory(GiB)": 141.16, "step": 54800, "train_speed(iter/s)": 0.291743 }, { "acc": 0.73897057, "epoch": 0.613158810872328, "grad_norm": 6.71875, "learning_rate": 8.305562455625016e-06, "loss": 1.03559093, "memory(GiB)": 141.16, "step": 54820, "train_speed(iter/s)": 0.291775 }, { "acc": 0.72978401, "epoch": 0.6133825098182866, "grad_norm": 6.46875, "learning_rate": 8.304174633513364e-06, "loss": 1.09763174, "memory(GiB)": 141.16, "step": 54840, "train_speed(iter/s)": 0.291809 }, { "acc": 0.73028297, "epoch": 0.6136062087642451, "grad_norm": 6.8125, "learning_rate": 8.302786359343395e-06, "loss": 1.0866024, "memory(GiB)": 141.16, "step": 54860, "train_speed(iter/s)": 0.291845 }, { "acc": 0.73724599, "epoch": 0.6138299077102036, "grad_norm": 7.09375, "learning_rate": 8.301397633305045e-06, "loss": 1.04775076, "memory(GiB)": 141.16, "step": 54880, "train_speed(iter/s)": 0.291878 }, { "acc": 0.72698479, "epoch": 0.6140536066561622, "grad_norm": 6.53125, "learning_rate": 8.300008455588311e-06, "loss": 1.114361, "memory(GiB)": 141.16, "step": 54900, "train_speed(iter/s)": 0.291908 }, { "acc": 0.73027568, "epoch": 0.6142773056021207, "grad_norm": 8.3125, "learning_rate": 8.298618826383251e-06, "loss": 1.08612356, "memory(GiB)": 141.16, "step": 54920, "train_speed(iter/s)": 0.291943 }, { "acc": 0.72364554, "epoch": 0.6145010045480792, "grad_norm": 6.03125, "learning_rate": 8.297228745879989e-06, "loss": 1.12032986, "memory(GiB)": 141.16, "step": 54940, "train_speed(iter/s)": 0.291981 }, { "acc": 0.72225189, "epoch": 0.6147247034940377, "grad_norm": 7.71875, "learning_rate": 8.295838214268704e-06, "loss": 1.1117054, "memory(GiB)": 141.16, "step": 54960, "train_speed(iter/s)": 0.292017 }, { "acc": 0.73739777, "epoch": 0.6149484024399963, "grad_norm": 7.34375, "learning_rate": 8.294447231739644e-06, "loss": 1.04285831, "memory(GiB)": 141.16, "step": 54980, "train_speed(iter/s)": 0.292053 }, { "acc": 0.73051543, "epoch": 0.6151721013859548, "grad_norm": 7.875, "learning_rate": 8.293055798483116e-06, "loss": 1.07109613, "memory(GiB)": 141.16, "step": 55000, "train_speed(iter/s)": 0.292089 }, { "acc": 0.71783743, "epoch": 0.6153958003319133, "grad_norm": 6.65625, "learning_rate": 8.291663914689485e-06, "loss": 1.1276247, "memory(GiB)": 141.16, "step": 55020, "train_speed(iter/s)": 0.292126 }, { "acc": 0.73719168, "epoch": 0.6156194992778719, "grad_norm": 6.03125, "learning_rate": 8.29027158054918e-06, "loss": 1.06141701, "memory(GiB)": 141.16, "step": 55040, "train_speed(iter/s)": 0.292162 }, { "acc": 0.72082953, "epoch": 0.6158431982238304, "grad_norm": 8.3125, "learning_rate": 8.288878796252695e-06, "loss": 1.1385397, "memory(GiB)": 141.16, "step": 55060, "train_speed(iter/s)": 0.292192 }, { "acc": 0.72883701, "epoch": 0.6160668971697889, "grad_norm": 7.28125, "learning_rate": 8.287485561990582e-06, "loss": 1.10643578, "memory(GiB)": 141.16, "step": 55080, "train_speed(iter/s)": 0.292225 }, { "acc": 0.72498732, "epoch": 0.6162905961157474, "grad_norm": 7.625, "learning_rate": 8.286091877953455e-06, "loss": 1.11129551, "memory(GiB)": 141.16, "step": 55100, "train_speed(iter/s)": 0.292261 }, { "acc": 0.72861433, "epoch": 0.616514295061706, "grad_norm": 6.71875, "learning_rate": 8.28469774433199e-06, "loss": 1.08129635, "memory(GiB)": 141.16, "step": 55120, "train_speed(iter/s)": 0.292296 }, { "acc": 0.7520977, "epoch": 0.6167379940076645, "grad_norm": 7.34375, "learning_rate": 8.283303161316924e-06, "loss": 0.98620129, "memory(GiB)": 141.16, "step": 55140, "train_speed(iter/s)": 0.292328 }, { "acc": 0.73414044, "epoch": 0.616961692953623, "grad_norm": 7.71875, "learning_rate": 8.28190812909906e-06, "loss": 1.0803112, "memory(GiB)": 141.16, "step": 55160, "train_speed(iter/s)": 0.292363 }, { "acc": 0.73189325, "epoch": 0.6171853918995815, "grad_norm": 8.0, "learning_rate": 8.28051264786925e-06, "loss": 1.06461182, "memory(GiB)": 141.16, "step": 55180, "train_speed(iter/s)": 0.292397 }, { "acc": 0.73414774, "epoch": 0.6174090908455401, "grad_norm": 8.3125, "learning_rate": 8.279116717818422e-06, "loss": 1.05894413, "memory(GiB)": 141.16, "step": 55200, "train_speed(iter/s)": 0.292431 }, { "acc": 0.73666196, "epoch": 0.6176327897914986, "grad_norm": 8.3125, "learning_rate": 8.277720339137559e-06, "loss": 1.04790659, "memory(GiB)": 141.16, "step": 55220, "train_speed(iter/s)": 0.292464 }, { "acc": 0.73051515, "epoch": 0.6178564887374571, "grad_norm": 7.0625, "learning_rate": 8.276323512017702e-06, "loss": 1.08375034, "memory(GiB)": 141.16, "step": 55240, "train_speed(iter/s)": 0.292501 }, { "acc": 0.7230639, "epoch": 0.6180801876834157, "grad_norm": 5.46875, "learning_rate": 8.27492623664996e-06, "loss": 1.10965538, "memory(GiB)": 141.16, "step": 55260, "train_speed(iter/s)": 0.292536 }, { "acc": 0.72884388, "epoch": 0.6183038866293742, "grad_norm": 7.5625, "learning_rate": 8.273528513225499e-06, "loss": 1.09006767, "memory(GiB)": 141.16, "step": 55280, "train_speed(iter/s)": 0.292571 }, { "acc": 0.72742281, "epoch": 0.6185275855753327, "grad_norm": 8.125, "learning_rate": 8.27213034193555e-06, "loss": 1.11372681, "memory(GiB)": 141.16, "step": 55300, "train_speed(iter/s)": 0.292603 }, { "acc": 0.7350174, "epoch": 0.6187512845212912, "grad_norm": 6.25, "learning_rate": 8.270731722971398e-06, "loss": 1.06576948, "memory(GiB)": 141.16, "step": 55320, "train_speed(iter/s)": 0.292635 }, { "acc": 0.71706343, "epoch": 0.6189749834672498, "grad_norm": 8.25, "learning_rate": 8.269332656524399e-06, "loss": 1.13097029, "memory(GiB)": 141.16, "step": 55340, "train_speed(iter/s)": 0.29267 }, { "acc": 0.72989125, "epoch": 0.6191986824132083, "grad_norm": 7.0625, "learning_rate": 8.26793314278596e-06, "loss": 1.09295712, "memory(GiB)": 141.16, "step": 55360, "train_speed(iter/s)": 0.292706 }, { "acc": 0.74074173, "epoch": 0.6194223813591668, "grad_norm": 6.875, "learning_rate": 8.266533181947561e-06, "loss": 1.02554684, "memory(GiB)": 141.16, "step": 55380, "train_speed(iter/s)": 0.292741 }, { "acc": 0.72615409, "epoch": 0.6196460803051254, "grad_norm": 7.8125, "learning_rate": 8.26513277420073e-06, "loss": 1.09624157, "memory(GiB)": 141.16, "step": 55400, "train_speed(iter/s)": 0.292777 }, { "acc": 0.73720007, "epoch": 0.6198697792510839, "grad_norm": 6.0625, "learning_rate": 8.263731919737068e-06, "loss": 1.05375156, "memory(GiB)": 141.16, "step": 55420, "train_speed(iter/s)": 0.292814 }, { "acc": 0.72590904, "epoch": 0.6200934781970424, "grad_norm": 7.65625, "learning_rate": 8.26233061874823e-06, "loss": 1.0915678, "memory(GiB)": 141.16, "step": 55440, "train_speed(iter/s)": 0.292851 }, { "acc": 0.72975488, "epoch": 0.6203171771430009, "grad_norm": 8.6875, "learning_rate": 8.260928871425932e-06, "loss": 1.09058361, "memory(GiB)": 141.16, "step": 55460, "train_speed(iter/s)": 0.292885 }, { "acc": 0.74832749, "epoch": 0.6205408760889595, "grad_norm": 6.34375, "learning_rate": 8.259526677961956e-06, "loss": 0.99608841, "memory(GiB)": 141.16, "step": 55480, "train_speed(iter/s)": 0.292917 }, { "acc": 0.73502769, "epoch": 0.620764575034918, "grad_norm": 7.375, "learning_rate": 8.258124038548141e-06, "loss": 1.04885378, "memory(GiB)": 141.16, "step": 55500, "train_speed(iter/s)": 0.292952 }, { "acc": 0.72576046, "epoch": 0.6209882739808765, "grad_norm": 5.96875, "learning_rate": 8.256720953376389e-06, "loss": 1.10781326, "memory(GiB)": 141.16, "step": 55520, "train_speed(iter/s)": 0.292985 }, { "acc": 0.71541648, "epoch": 0.621211972926835, "grad_norm": 6.15625, "learning_rate": 8.25531742263866e-06, "loss": 1.14640713, "memory(GiB)": 141.16, "step": 55540, "train_speed(iter/s)": 0.293025 }, { "acc": 0.71718369, "epoch": 0.6214356718727936, "grad_norm": 7.0625, "learning_rate": 8.25391344652698e-06, "loss": 1.14116821, "memory(GiB)": 141.16, "step": 55560, "train_speed(iter/s)": 0.293062 }, { "acc": 0.7363018, "epoch": 0.6216593708187521, "grad_norm": 5.78125, "learning_rate": 8.25250902523343e-06, "loss": 1.04616184, "memory(GiB)": 141.16, "step": 55580, "train_speed(iter/s)": 0.2931 }, { "acc": 0.72470589, "epoch": 0.6218830697647106, "grad_norm": 7.4375, "learning_rate": 8.251104158950158e-06, "loss": 1.10143728, "memory(GiB)": 141.16, "step": 55600, "train_speed(iter/s)": 0.293137 }, { "acc": 0.72353477, "epoch": 0.6221067687106692, "grad_norm": 7.625, "learning_rate": 8.249698847869368e-06, "loss": 1.11426182, "memory(GiB)": 141.16, "step": 55620, "train_speed(iter/s)": 0.293177 }, { "acc": 0.73707399, "epoch": 0.6223304676566277, "grad_norm": 7.3125, "learning_rate": 8.248293092183324e-06, "loss": 1.05970745, "memory(GiB)": 141.16, "step": 55640, "train_speed(iter/s)": 0.293215 }, { "acc": 0.73243814, "epoch": 0.6225541666025862, "grad_norm": 8.25, "learning_rate": 8.246886892084359e-06, "loss": 1.06859913, "memory(GiB)": 141.16, "step": 55660, "train_speed(iter/s)": 0.293256 }, { "acc": 0.72217064, "epoch": 0.6227778655485448, "grad_norm": 7.625, "learning_rate": 8.245480247764856e-06, "loss": 1.11983881, "memory(GiB)": 141.16, "step": 55680, "train_speed(iter/s)": 0.293294 }, { "acc": 0.73896618, "epoch": 0.6230015644945033, "grad_norm": 9.0625, "learning_rate": 8.244073159417268e-06, "loss": 1.04824753, "memory(GiB)": 141.16, "step": 55700, "train_speed(iter/s)": 0.293328 }, { "acc": 0.73241701, "epoch": 0.6232252634404618, "grad_norm": 7.6875, "learning_rate": 8.242665627234104e-06, "loss": 1.08611279, "memory(GiB)": 141.16, "step": 55720, "train_speed(iter/s)": 0.293361 }, { "acc": 0.72968755, "epoch": 0.6234489623864203, "grad_norm": 6.3125, "learning_rate": 8.241257651407933e-06, "loss": 1.07856483, "memory(GiB)": 141.16, "step": 55740, "train_speed(iter/s)": 0.293396 }, { "acc": 0.72728786, "epoch": 0.6236726613323789, "grad_norm": 7.03125, "learning_rate": 8.239849232131386e-06, "loss": 1.10036469, "memory(GiB)": 141.16, "step": 55760, "train_speed(iter/s)": 0.29343 }, { "acc": 0.72637005, "epoch": 0.6238963602783374, "grad_norm": 8.1875, "learning_rate": 8.238440369597157e-06, "loss": 1.09795284, "memory(GiB)": 141.16, "step": 55780, "train_speed(iter/s)": 0.293463 }, { "acc": 0.72903886, "epoch": 0.6241200592242959, "grad_norm": 7.625, "learning_rate": 8.237031063997999e-06, "loss": 1.08896751, "memory(GiB)": 141.16, "step": 55800, "train_speed(iter/s)": 0.293498 }, { "acc": 0.72939482, "epoch": 0.6243437581702544, "grad_norm": 5.5, "learning_rate": 8.23562131552672e-06, "loss": 1.08798618, "memory(GiB)": 141.16, "step": 55820, "train_speed(iter/s)": 0.293532 }, { "acc": 0.71936312, "epoch": 0.624567457116213, "grad_norm": 6.5625, "learning_rate": 8.234211124376199e-06, "loss": 1.1428793, "memory(GiB)": 141.16, "step": 55840, "train_speed(iter/s)": 0.293568 }, { "acc": 0.73927302, "epoch": 0.6247911560621715, "grad_norm": 7.71875, "learning_rate": 8.23280049073937e-06, "loss": 1.03934383, "memory(GiB)": 141.16, "step": 55860, "train_speed(iter/s)": 0.293598 }, { "acc": 0.72211504, "epoch": 0.62501485500813, "grad_norm": 5.5625, "learning_rate": 8.231389414809226e-06, "loss": 1.1171236, "memory(GiB)": 141.16, "step": 55880, "train_speed(iter/s)": 0.293632 }, { "acc": 0.72653346, "epoch": 0.6252385539540886, "grad_norm": 6.59375, "learning_rate": 8.229977896778822e-06, "loss": 1.11110144, "memory(GiB)": 141.16, "step": 55900, "train_speed(iter/s)": 0.293669 }, { "acc": 0.73133912, "epoch": 0.6254622529000471, "grad_norm": 8.9375, "learning_rate": 8.228565936841274e-06, "loss": 1.07897282, "memory(GiB)": 141.16, "step": 55920, "train_speed(iter/s)": 0.293709 }, { "acc": 0.7454298, "epoch": 0.6256859518460056, "grad_norm": 8.625, "learning_rate": 8.22715353518976e-06, "loss": 1.01284218, "memory(GiB)": 141.16, "step": 55940, "train_speed(iter/s)": 0.293743 }, { "acc": 0.72435946, "epoch": 0.6259096507919641, "grad_norm": 6.65625, "learning_rate": 8.225740692017516e-06, "loss": 1.09535398, "memory(GiB)": 141.16, "step": 55960, "train_speed(iter/s)": 0.293783 }, { "acc": 0.7382369, "epoch": 0.6261333497379227, "grad_norm": 7.84375, "learning_rate": 8.22432740751784e-06, "loss": 1.04961777, "memory(GiB)": 141.16, "step": 55980, "train_speed(iter/s)": 0.293822 }, { "acc": 0.72196469, "epoch": 0.6263570486838812, "grad_norm": 7.40625, "learning_rate": 8.222913681884085e-06, "loss": 1.11184692, "memory(GiB)": 141.16, "step": 56000, "train_speed(iter/s)": 0.293855 }, { "epoch": 0.6263570486838812, "eval_acc": 0.689307782285464, "eval_loss": 1.082321047782898, "eval_runtime": 2321.748, "eval_samples_per_second": 32.425, "eval_steps_per_second": 16.213, "step": 56000 }, { "acc": 0.7322752, "epoch": 0.6265807476298397, "grad_norm": 7.5625, "learning_rate": 8.221499515309676e-06, "loss": 1.07566032, "memory(GiB)": 141.16, "step": 56020, "train_speed(iter/s)": 0.290277 }, { "acc": 0.72377253, "epoch": 0.6268044465757983, "grad_norm": 5.90625, "learning_rate": 8.220084907988085e-06, "loss": 1.10841808, "memory(GiB)": 141.16, "step": 56040, "train_speed(iter/s)": 0.290314 }, { "acc": 0.72286263, "epoch": 0.6270281455217568, "grad_norm": 7.25, "learning_rate": 8.218669860112854e-06, "loss": 1.09974432, "memory(GiB)": 141.16, "step": 56060, "train_speed(iter/s)": 0.290348 }, { "acc": 0.73065548, "epoch": 0.6272518444677153, "grad_norm": 8.125, "learning_rate": 8.21725437187758e-06, "loss": 1.07860928, "memory(GiB)": 141.16, "step": 56080, "train_speed(iter/s)": 0.290382 }, { "acc": 0.74048986, "epoch": 0.6274755434136738, "grad_norm": 6.375, "learning_rate": 8.215838443475925e-06, "loss": 1.03536377, "memory(GiB)": 141.16, "step": 56100, "train_speed(iter/s)": 0.290419 }, { "acc": 0.7335001, "epoch": 0.6276992423596324, "grad_norm": 6.5625, "learning_rate": 8.214422075101603e-06, "loss": 1.06733418, "memory(GiB)": 141.16, "step": 56120, "train_speed(iter/s)": 0.290455 }, { "acc": 0.73509974, "epoch": 0.6279229413055909, "grad_norm": 7.5625, "learning_rate": 8.213005266948398e-06, "loss": 1.07068462, "memory(GiB)": 141.16, "step": 56140, "train_speed(iter/s)": 0.290492 }, { "acc": 0.7291482, "epoch": 0.6281466402515494, "grad_norm": 7.90625, "learning_rate": 8.211588019210148e-06, "loss": 1.10667362, "memory(GiB)": 141.16, "step": 56160, "train_speed(iter/s)": 0.290528 }, { "acc": 0.73475628, "epoch": 0.628370339197508, "grad_norm": 7.21875, "learning_rate": 8.210170332080752e-06, "loss": 1.05819731, "memory(GiB)": 141.16, "step": 56180, "train_speed(iter/s)": 0.290564 }, { "acc": 0.7296258, "epoch": 0.6285940381434665, "grad_norm": 8.375, "learning_rate": 8.208752205754171e-06, "loss": 1.06348677, "memory(GiB)": 141.16, "step": 56200, "train_speed(iter/s)": 0.2906 }, { "acc": 0.72867165, "epoch": 0.628817737089425, "grad_norm": 8.4375, "learning_rate": 8.207333640424426e-06, "loss": 1.08688784, "memory(GiB)": 141.16, "step": 56220, "train_speed(iter/s)": 0.290633 }, { "acc": 0.73216429, "epoch": 0.6290414360353835, "grad_norm": 7.125, "learning_rate": 8.205914636285594e-06, "loss": 1.08175335, "memory(GiB)": 141.16, "step": 56240, "train_speed(iter/s)": 0.290669 }, { "acc": 0.73584232, "epoch": 0.6292651349813421, "grad_norm": 6.28125, "learning_rate": 8.204495193531816e-06, "loss": 1.06387749, "memory(GiB)": 141.16, "step": 56260, "train_speed(iter/s)": 0.290699 }, { "acc": 0.72413239, "epoch": 0.6294888339273006, "grad_norm": 6.0, "learning_rate": 8.203075312357295e-06, "loss": 1.08922443, "memory(GiB)": 141.16, "step": 56280, "train_speed(iter/s)": 0.290732 }, { "acc": 0.73039274, "epoch": 0.6297125328732591, "grad_norm": 5.53125, "learning_rate": 8.201654992956287e-06, "loss": 1.08587227, "memory(GiB)": 141.16, "step": 56300, "train_speed(iter/s)": 0.29077 }, { "acc": 0.73895683, "epoch": 0.6299362318192177, "grad_norm": 7.8125, "learning_rate": 8.200234235523114e-06, "loss": 1.03465595, "memory(GiB)": 141.16, "step": 56320, "train_speed(iter/s)": 0.290804 }, { "acc": 0.72569361, "epoch": 0.6301599307651762, "grad_norm": 6.9375, "learning_rate": 8.198813040252157e-06, "loss": 1.10070467, "memory(GiB)": 141.16, "step": 56340, "train_speed(iter/s)": 0.29084 }, { "acc": 0.73287296, "epoch": 0.6303836297111347, "grad_norm": 8.5625, "learning_rate": 8.197391407337854e-06, "loss": 1.07498665, "memory(GiB)": 141.16, "step": 56360, "train_speed(iter/s)": 0.290873 }, { "acc": 0.72430649, "epoch": 0.6306073286570932, "grad_norm": 6.96875, "learning_rate": 8.195969336974705e-06, "loss": 1.11502266, "memory(GiB)": 141.16, "step": 56380, "train_speed(iter/s)": 0.290909 }, { "acc": 0.72707801, "epoch": 0.6308310276030518, "grad_norm": 6.875, "learning_rate": 8.194546829357269e-06, "loss": 1.08704128, "memory(GiB)": 141.16, "step": 56400, "train_speed(iter/s)": 0.290944 }, { "acc": 0.73498139, "epoch": 0.6310547265490103, "grad_norm": 8.0, "learning_rate": 8.193123884680168e-06, "loss": 1.05015688, "memory(GiB)": 141.16, "step": 56420, "train_speed(iter/s)": 0.290983 }, { "acc": 0.7363709, "epoch": 0.6312784254949688, "grad_norm": 7.4375, "learning_rate": 8.191700503138077e-06, "loss": 1.03440714, "memory(GiB)": 141.16, "step": 56440, "train_speed(iter/s)": 0.291015 }, { "acc": 0.73517194, "epoch": 0.6315021244409273, "grad_norm": 6.96875, "learning_rate": 8.190276684925738e-06, "loss": 1.07524242, "memory(GiB)": 141.16, "step": 56460, "train_speed(iter/s)": 0.291047 }, { "acc": 0.72986708, "epoch": 0.6317258233868859, "grad_norm": 6.71875, "learning_rate": 8.18885243023795e-06, "loss": 1.08433208, "memory(GiB)": 141.16, "step": 56480, "train_speed(iter/s)": 0.291083 }, { "acc": 0.73515325, "epoch": 0.6319495223328444, "grad_norm": 6.125, "learning_rate": 8.18742773926957e-06, "loss": 1.04823666, "memory(GiB)": 141.16, "step": 56500, "train_speed(iter/s)": 0.291117 }, { "acc": 0.72635565, "epoch": 0.6321732212788029, "grad_norm": 5.9375, "learning_rate": 8.186002612215515e-06, "loss": 1.0963954, "memory(GiB)": 141.16, "step": 56520, "train_speed(iter/s)": 0.291152 }, { "acc": 0.72755427, "epoch": 0.6323969202247615, "grad_norm": 8.3125, "learning_rate": 8.184577049270765e-06, "loss": 1.10298872, "memory(GiB)": 141.16, "step": 56540, "train_speed(iter/s)": 0.291182 }, { "acc": 0.72169766, "epoch": 0.6326206191707201, "grad_norm": 7.4375, "learning_rate": 8.183151050630358e-06, "loss": 1.12087231, "memory(GiB)": 141.16, "step": 56560, "train_speed(iter/s)": 0.29122 }, { "acc": 0.73076229, "epoch": 0.6328443181166786, "grad_norm": 6.15625, "learning_rate": 8.181724616489389e-06, "loss": 1.08770561, "memory(GiB)": 141.16, "step": 56580, "train_speed(iter/s)": 0.291249 }, { "acc": 0.72067084, "epoch": 0.6330680170626372, "grad_norm": 6.9375, "learning_rate": 8.180297747043014e-06, "loss": 1.11935253, "memory(GiB)": 141.16, "step": 56600, "train_speed(iter/s)": 0.291282 }, { "acc": 0.73342533, "epoch": 0.6332917160085957, "grad_norm": 6.96875, "learning_rate": 8.178870442486451e-06, "loss": 1.07568035, "memory(GiB)": 141.16, "step": 56620, "train_speed(iter/s)": 0.291315 }, { "acc": 0.71289134, "epoch": 0.6335154149545542, "grad_norm": 7.03125, "learning_rate": 8.177442703014975e-06, "loss": 1.14701986, "memory(GiB)": 141.16, "step": 56640, "train_speed(iter/s)": 0.291348 }, { "acc": 0.71032739, "epoch": 0.6337391139005127, "grad_norm": 8.1875, "learning_rate": 8.17601452882392e-06, "loss": 1.16094704, "memory(GiB)": 141.16, "step": 56660, "train_speed(iter/s)": 0.291383 }, { "acc": 0.7259469, "epoch": 0.6339628128464713, "grad_norm": 5.96875, "learning_rate": 8.174585920108682e-06, "loss": 1.08164558, "memory(GiB)": 141.16, "step": 56680, "train_speed(iter/s)": 0.291415 }, { "acc": 0.72010684, "epoch": 0.6341865117924298, "grad_norm": 7.1875, "learning_rate": 8.173156877064717e-06, "loss": 1.13297319, "memory(GiB)": 141.16, "step": 56700, "train_speed(iter/s)": 0.29145 }, { "acc": 0.72212791, "epoch": 0.6344102107383883, "grad_norm": 5.5, "learning_rate": 8.171727399887535e-06, "loss": 1.11009407, "memory(GiB)": 141.16, "step": 56720, "train_speed(iter/s)": 0.291485 }, { "acc": 0.72834325, "epoch": 0.6346339096843469, "grad_norm": 5.59375, "learning_rate": 8.170297488772709e-06, "loss": 1.08886976, "memory(GiB)": 141.16, "step": 56740, "train_speed(iter/s)": 0.291517 }, { "acc": 0.7194736, "epoch": 0.6348576086303054, "grad_norm": 6.84375, "learning_rate": 8.168867143915874e-06, "loss": 1.14038715, "memory(GiB)": 141.16, "step": 56760, "train_speed(iter/s)": 0.291551 }, { "acc": 0.7287396, "epoch": 0.6350813075762639, "grad_norm": 8.125, "learning_rate": 8.16743636551272e-06, "loss": 1.08264923, "memory(GiB)": 141.16, "step": 56780, "train_speed(iter/s)": 0.291585 }, { "acc": 0.71752119, "epoch": 0.6353050065222224, "grad_norm": 7.71875, "learning_rate": 8.166005153758997e-06, "loss": 1.16663857, "memory(GiB)": 141.16, "step": 56800, "train_speed(iter/s)": 0.291621 }, { "acc": 0.73789864, "epoch": 0.635528705468181, "grad_norm": 8.125, "learning_rate": 8.164573508850517e-06, "loss": 1.04853802, "memory(GiB)": 141.16, "step": 56820, "train_speed(iter/s)": 0.291655 }, { "acc": 0.72906651, "epoch": 0.6357524044141395, "grad_norm": 7.4375, "learning_rate": 8.16314143098315e-06, "loss": 1.07433462, "memory(GiB)": 141.16, "step": 56840, "train_speed(iter/s)": 0.291692 }, { "acc": 0.73402276, "epoch": 0.635976103360098, "grad_norm": 9.4375, "learning_rate": 8.161708920352823e-06, "loss": 1.07330704, "memory(GiB)": 141.16, "step": 56860, "train_speed(iter/s)": 0.291725 }, { "acc": 0.73008947, "epoch": 0.6361998023060565, "grad_norm": 7.25, "learning_rate": 8.160275977155523e-06, "loss": 1.09438133, "memory(GiB)": 141.16, "step": 56880, "train_speed(iter/s)": 0.291758 }, { "acc": 0.73313141, "epoch": 0.6364235012520151, "grad_norm": 5.96875, "learning_rate": 8.158842601587301e-06, "loss": 1.08733969, "memory(GiB)": 141.16, "step": 56900, "train_speed(iter/s)": 0.291789 }, { "acc": 0.73441038, "epoch": 0.6366472001979736, "grad_norm": 7.1875, "learning_rate": 8.157408793844258e-06, "loss": 1.05742588, "memory(GiB)": 141.16, "step": 56920, "train_speed(iter/s)": 0.291827 }, { "acc": 0.72990761, "epoch": 0.6368708991439321, "grad_norm": 5.8125, "learning_rate": 8.155974554122562e-06, "loss": 1.0805975, "memory(GiB)": 141.16, "step": 56940, "train_speed(iter/s)": 0.291862 }, { "acc": 0.7267015, "epoch": 0.6370945980898907, "grad_norm": 4.8125, "learning_rate": 8.15453988261844e-06, "loss": 1.1084815, "memory(GiB)": 141.16, "step": 56960, "train_speed(iter/s)": 0.291898 }, { "acc": 0.73310194, "epoch": 0.6373182970358492, "grad_norm": 7.03125, "learning_rate": 8.153104779528173e-06, "loss": 1.06519985, "memory(GiB)": 141.16, "step": 56980, "train_speed(iter/s)": 0.291934 }, { "acc": 0.72732363, "epoch": 0.6375419959818077, "grad_norm": 7.90625, "learning_rate": 8.151669245048104e-06, "loss": 1.09270535, "memory(GiB)": 141.16, "step": 57000, "train_speed(iter/s)": 0.29197 }, { "acc": 0.72659388, "epoch": 0.6377656949277662, "grad_norm": 8.875, "learning_rate": 8.150233279374635e-06, "loss": 1.11249714, "memory(GiB)": 141.16, "step": 57020, "train_speed(iter/s)": 0.292008 }, { "acc": 0.72569857, "epoch": 0.6379893938737248, "grad_norm": 7.375, "learning_rate": 8.148796882704223e-06, "loss": 1.11658707, "memory(GiB)": 141.16, "step": 57040, "train_speed(iter/s)": 0.292039 }, { "acc": 0.73129454, "epoch": 0.6382130928196833, "grad_norm": 7.78125, "learning_rate": 8.147360055233395e-06, "loss": 1.06734152, "memory(GiB)": 141.16, "step": 57060, "train_speed(iter/s)": 0.292075 }, { "acc": 0.72863083, "epoch": 0.6384367917656418, "grad_norm": 5.9375, "learning_rate": 8.145922797158724e-06, "loss": 1.08314972, "memory(GiB)": 141.16, "step": 57080, "train_speed(iter/s)": 0.292107 }, { "acc": 0.72463522, "epoch": 0.6386604907116004, "grad_norm": 6.5625, "learning_rate": 8.144485108676847e-06, "loss": 1.12084618, "memory(GiB)": 141.16, "step": 57100, "train_speed(iter/s)": 0.292134 }, { "acc": 0.73527455, "epoch": 0.6388841896575589, "grad_norm": 6.78125, "learning_rate": 8.143046989984464e-06, "loss": 1.04839382, "memory(GiB)": 141.16, "step": 57120, "train_speed(iter/s)": 0.292167 }, { "acc": 0.72783766, "epoch": 0.6391078886035174, "grad_norm": 6.0625, "learning_rate": 8.141608441278328e-06, "loss": 1.08505707, "memory(GiB)": 141.16, "step": 57140, "train_speed(iter/s)": 0.292201 }, { "acc": 0.73509893, "epoch": 0.6393315875494759, "grad_norm": 7.21875, "learning_rate": 8.140169462755252e-06, "loss": 1.06709576, "memory(GiB)": 141.16, "step": 57160, "train_speed(iter/s)": 0.292234 }, { "acc": 0.71229391, "epoch": 0.6395552864954345, "grad_norm": 7.21875, "learning_rate": 8.138730054612111e-06, "loss": 1.15949039, "memory(GiB)": 141.16, "step": 57180, "train_speed(iter/s)": 0.292268 }, { "acc": 0.73771806, "epoch": 0.639778985441393, "grad_norm": 7.90625, "learning_rate": 8.137290217045837e-06, "loss": 1.0322504, "memory(GiB)": 141.16, "step": 57200, "train_speed(iter/s)": 0.292305 }, { "acc": 0.72589531, "epoch": 0.6400026843873515, "grad_norm": 6.9375, "learning_rate": 8.135849950253416e-06, "loss": 1.0935154, "memory(GiB)": 141.16, "step": 57220, "train_speed(iter/s)": 0.292339 }, { "acc": 0.7139575, "epoch": 0.6402263833333101, "grad_norm": 6.40625, "learning_rate": 8.134409254431903e-06, "loss": 1.15616465, "memory(GiB)": 141.16, "step": 57240, "train_speed(iter/s)": 0.292373 }, { "acc": 0.73074641, "epoch": 0.6404500822792686, "grad_norm": 7.25, "learning_rate": 8.132968129778401e-06, "loss": 1.07466602, "memory(GiB)": 141.16, "step": 57260, "train_speed(iter/s)": 0.292401 }, { "acc": 0.73173695, "epoch": 0.6406737812252271, "grad_norm": 8.0625, "learning_rate": 8.13152657649008e-06, "loss": 1.0669796, "memory(GiB)": 141.16, "step": 57280, "train_speed(iter/s)": 0.292436 }, { "acc": 0.72465467, "epoch": 0.6408974801711856, "grad_norm": 6.90625, "learning_rate": 8.130084594764162e-06, "loss": 1.11045876, "memory(GiB)": 141.16, "step": 57300, "train_speed(iter/s)": 0.29247 }, { "acc": 0.74021864, "epoch": 0.6411211791171442, "grad_norm": 6.96875, "learning_rate": 8.128642184797934e-06, "loss": 1.03921261, "memory(GiB)": 141.16, "step": 57320, "train_speed(iter/s)": 0.292505 }, { "acc": 0.72185802, "epoch": 0.6413448780631027, "grad_norm": 10.0625, "learning_rate": 8.127199346788734e-06, "loss": 1.12535801, "memory(GiB)": 141.16, "step": 57340, "train_speed(iter/s)": 0.292538 }, { "acc": 0.74180961, "epoch": 0.6415685770090612, "grad_norm": 8.375, "learning_rate": 8.125756080933968e-06, "loss": 1.02275658, "memory(GiB)": 141.16, "step": 57360, "train_speed(iter/s)": 0.292571 }, { "acc": 0.72141685, "epoch": 0.6417922759550198, "grad_norm": 6.71875, "learning_rate": 8.124312387431092e-06, "loss": 1.11062946, "memory(GiB)": 141.16, "step": 57380, "train_speed(iter/s)": 0.2926 }, { "acc": 0.74043341, "epoch": 0.6420159749009783, "grad_norm": 7.59375, "learning_rate": 8.122868266477623e-06, "loss": 1.04213619, "memory(GiB)": 141.16, "step": 57400, "train_speed(iter/s)": 0.292632 }, { "acc": 0.73624849, "epoch": 0.6422396738469368, "grad_norm": 6.78125, "learning_rate": 8.121423718271142e-06, "loss": 1.0495595, "memory(GiB)": 141.16, "step": 57420, "train_speed(iter/s)": 0.292664 }, { "acc": 0.71206269, "epoch": 0.6424633727928953, "grad_norm": 6.65625, "learning_rate": 8.119978743009278e-06, "loss": 1.15579014, "memory(GiB)": 141.16, "step": 57440, "train_speed(iter/s)": 0.292691 }, { "acc": 0.73254781, "epoch": 0.6426870717388539, "grad_norm": 7.1875, "learning_rate": 8.11853334088973e-06, "loss": 1.06791344, "memory(GiB)": 141.16, "step": 57460, "train_speed(iter/s)": 0.292726 }, { "acc": 0.72494879, "epoch": 0.6429107706848124, "grad_norm": 8.0625, "learning_rate": 8.117087512110245e-06, "loss": 1.090205, "memory(GiB)": 141.16, "step": 57480, "train_speed(iter/s)": 0.292757 }, { "acc": 0.72859769, "epoch": 0.6431344696307709, "grad_norm": 7.78125, "learning_rate": 8.115641256868636e-06, "loss": 1.09081736, "memory(GiB)": 141.16, "step": 57500, "train_speed(iter/s)": 0.292793 }, { "acc": 0.7336833, "epoch": 0.6433581685767295, "grad_norm": 6.125, "learning_rate": 8.114194575362769e-06, "loss": 1.06251431, "memory(GiB)": 141.16, "step": 57520, "train_speed(iter/s)": 0.292826 }, { "acc": 0.72519755, "epoch": 0.643581867522688, "grad_norm": 8.0, "learning_rate": 8.112747467790572e-06, "loss": 1.10712128, "memory(GiB)": 141.16, "step": 57540, "train_speed(iter/s)": 0.292859 }, { "acc": 0.72144299, "epoch": 0.6438055664686465, "grad_norm": 7.53125, "learning_rate": 8.11129993435003e-06, "loss": 1.11417189, "memory(GiB)": 141.16, "step": 57560, "train_speed(iter/s)": 0.292892 }, { "acc": 0.73525333, "epoch": 0.644029265414605, "grad_norm": 7.625, "learning_rate": 8.109851975239188e-06, "loss": 1.06316004, "memory(GiB)": 141.16, "step": 57580, "train_speed(iter/s)": 0.292924 }, { "acc": 0.72496262, "epoch": 0.6442529643605636, "grad_norm": 5.9375, "learning_rate": 8.108403590656144e-06, "loss": 1.09957466, "memory(GiB)": 141.16, "step": 57600, "train_speed(iter/s)": 0.292954 }, { "acc": 0.72171373, "epoch": 0.6444766633065221, "grad_norm": 5.90625, "learning_rate": 8.106954780799062e-06, "loss": 1.12260246, "memory(GiB)": 141.16, "step": 57620, "train_speed(iter/s)": 0.292988 }, { "acc": 0.72324247, "epoch": 0.6447003622524806, "grad_norm": 6.4375, "learning_rate": 8.105505545866155e-06, "loss": 1.11707773, "memory(GiB)": 141.16, "step": 57640, "train_speed(iter/s)": 0.293019 }, { "acc": 0.73106394, "epoch": 0.6449240611984391, "grad_norm": 6.78125, "learning_rate": 8.104055886055702e-06, "loss": 1.06387444, "memory(GiB)": 141.16, "step": 57660, "train_speed(iter/s)": 0.29305 }, { "acc": 0.73714552, "epoch": 0.6451477601443977, "grad_norm": 6.5625, "learning_rate": 8.102605801566038e-06, "loss": 1.05037622, "memory(GiB)": 141.16, "step": 57680, "train_speed(iter/s)": 0.293079 }, { "acc": 0.72363749, "epoch": 0.6453714590903562, "grad_norm": 8.5, "learning_rate": 8.101155292595551e-06, "loss": 1.10779171, "memory(GiB)": 141.16, "step": 57700, "train_speed(iter/s)": 0.293113 }, { "acc": 0.72218733, "epoch": 0.6455951580363147, "grad_norm": 7.25, "learning_rate": 8.099704359342695e-06, "loss": 1.12049198, "memory(GiB)": 141.16, "step": 57720, "train_speed(iter/s)": 0.29314 }, { "acc": 0.72428579, "epoch": 0.6458188569822733, "grad_norm": 7.5625, "learning_rate": 8.098253002005979e-06, "loss": 1.11067886, "memory(GiB)": 141.16, "step": 57740, "train_speed(iter/s)": 0.293177 }, { "acc": 0.71739149, "epoch": 0.6460425559282318, "grad_norm": 7.3125, "learning_rate": 8.096801220783967e-06, "loss": 1.16311932, "memory(GiB)": 141.16, "step": 57760, "train_speed(iter/s)": 0.293212 }, { "acc": 0.73793864, "epoch": 0.6462662548741903, "grad_norm": 6.65625, "learning_rate": 8.095349015875284e-06, "loss": 1.04600439, "memory(GiB)": 141.16, "step": 57780, "train_speed(iter/s)": 0.293242 }, { "acc": 0.72438097, "epoch": 0.6464899538201488, "grad_norm": 7.40625, "learning_rate": 8.093896387478615e-06, "loss": 1.10683537, "memory(GiB)": 141.16, "step": 57800, "train_speed(iter/s)": 0.293275 }, { "acc": 0.74054036, "epoch": 0.6467136527661074, "grad_norm": 6.78125, "learning_rate": 8.092443335792697e-06, "loss": 1.0361784, "memory(GiB)": 141.16, "step": 57820, "train_speed(iter/s)": 0.293309 }, { "acc": 0.72925835, "epoch": 0.6469373517120659, "grad_norm": 6.5625, "learning_rate": 8.090989861016329e-06, "loss": 1.10325947, "memory(GiB)": 141.16, "step": 57840, "train_speed(iter/s)": 0.293342 }, { "acc": 0.74186678, "epoch": 0.6471610506580244, "grad_norm": 7.15625, "learning_rate": 8.089535963348367e-06, "loss": 1.02111149, "memory(GiB)": 141.16, "step": 57860, "train_speed(iter/s)": 0.293375 }, { "acc": 0.73328276, "epoch": 0.647384749603983, "grad_norm": 8.6875, "learning_rate": 8.08808164298773e-06, "loss": 1.07494755, "memory(GiB)": 141.16, "step": 57880, "train_speed(iter/s)": 0.293408 }, { "acc": 0.71400318, "epoch": 0.6476084485499415, "grad_norm": 5.6875, "learning_rate": 8.08662690013338e-06, "loss": 1.16994972, "memory(GiB)": 141.16, "step": 57900, "train_speed(iter/s)": 0.293438 }, { "acc": 0.73318577, "epoch": 0.6478321474959, "grad_norm": 7.0625, "learning_rate": 8.085171734984353e-06, "loss": 1.06994991, "memory(GiB)": 141.16, "step": 57920, "train_speed(iter/s)": 0.293472 }, { "acc": 0.73846359, "epoch": 0.6480558464418585, "grad_norm": 7.8125, "learning_rate": 8.083716147739738e-06, "loss": 1.04745369, "memory(GiB)": 141.16, "step": 57940, "train_speed(iter/s)": 0.293506 }, { "acc": 0.73923922, "epoch": 0.6482795453878171, "grad_norm": 8.125, "learning_rate": 8.082260138598674e-06, "loss": 1.0400279, "memory(GiB)": 141.16, "step": 57960, "train_speed(iter/s)": 0.293534 }, { "acc": 0.7118485, "epoch": 0.6485032443337756, "grad_norm": 9.625, "learning_rate": 8.08080370776037e-06, "loss": 1.15179176, "memory(GiB)": 141.16, "step": 57980, "train_speed(iter/s)": 0.29357 }, { "acc": 0.73722849, "epoch": 0.6487269432797341, "grad_norm": 9.1875, "learning_rate": 8.079346855424084e-06, "loss": 1.04761381, "memory(GiB)": 141.16, "step": 58000, "train_speed(iter/s)": 0.293605 }, { "epoch": 0.6487269432797341, "eval_acc": 0.6894327493060017, "eval_loss": 1.0819883346557617, "eval_runtime": 2318.9689, "eval_samples_per_second": 32.464, "eval_steps_per_second": 16.232, "step": 58000 }, { "acc": 0.72598457, "epoch": 0.6489506422256927, "grad_norm": 6.65625, "learning_rate": 8.077889581789133e-06, "loss": 1.09372435, "memory(GiB)": 141.16, "step": 58020, "train_speed(iter/s)": 0.290157 }, { "acc": 0.73173881, "epoch": 0.6491743411716512, "grad_norm": 7.03125, "learning_rate": 8.076431887054894e-06, "loss": 1.05965385, "memory(GiB)": 141.16, "step": 58040, "train_speed(iter/s)": 0.29019 }, { "acc": 0.73715467, "epoch": 0.6493980401176097, "grad_norm": 7.84375, "learning_rate": 8.0749737714208e-06, "loss": 1.05526085, "memory(GiB)": 141.16, "step": 58060, "train_speed(iter/s)": 0.290225 }, { "acc": 0.74016151, "epoch": 0.6496217390635682, "grad_norm": 9.0, "learning_rate": 8.073515235086345e-06, "loss": 1.0368886, "memory(GiB)": 141.16, "step": 58080, "train_speed(iter/s)": 0.290257 }, { "acc": 0.72561569, "epoch": 0.6498454380095268, "grad_norm": 7.40625, "learning_rate": 8.072056278251073e-06, "loss": 1.11168623, "memory(GiB)": 141.16, "step": 58100, "train_speed(iter/s)": 0.290289 }, { "acc": 0.72525806, "epoch": 0.6500691369554853, "grad_norm": 5.9375, "learning_rate": 8.070596901114594e-06, "loss": 1.09724903, "memory(GiB)": 141.16, "step": 58120, "train_speed(iter/s)": 0.290326 }, { "acc": 0.7242053, "epoch": 0.6502928359014438, "grad_norm": 7.0625, "learning_rate": 8.069137103876568e-06, "loss": 1.11244173, "memory(GiB)": 141.16, "step": 58140, "train_speed(iter/s)": 0.290364 }, { "acc": 0.7238224, "epoch": 0.6505165348474024, "grad_norm": 8.875, "learning_rate": 8.067676886736719e-06, "loss": 1.10893555, "memory(GiB)": 141.16, "step": 58160, "train_speed(iter/s)": 0.290401 }, { "acc": 0.73693972, "epoch": 0.6507402337933609, "grad_norm": 6.9375, "learning_rate": 8.066216249894824e-06, "loss": 1.06965446, "memory(GiB)": 141.16, "step": 58180, "train_speed(iter/s)": 0.290437 }, { "acc": 0.73920002, "epoch": 0.6509639327393194, "grad_norm": 7.78125, "learning_rate": 8.064755193550721e-06, "loss": 1.04369335, "memory(GiB)": 141.16, "step": 58200, "train_speed(iter/s)": 0.290472 }, { "acc": 0.73020363, "epoch": 0.6511876316852779, "grad_norm": 8.5625, "learning_rate": 8.063293717904303e-06, "loss": 1.07683659, "memory(GiB)": 141.16, "step": 58220, "train_speed(iter/s)": 0.290498 }, { "acc": 0.72751751, "epoch": 0.6514113306312365, "grad_norm": 6.90625, "learning_rate": 8.06183182315552e-06, "loss": 1.09084625, "memory(GiB)": 141.16, "step": 58240, "train_speed(iter/s)": 0.290528 }, { "acc": 0.72963076, "epoch": 0.651635029577195, "grad_norm": 6.5625, "learning_rate": 8.060369509504377e-06, "loss": 1.10446434, "memory(GiB)": 141.16, "step": 58260, "train_speed(iter/s)": 0.290559 }, { "acc": 0.7333734, "epoch": 0.6518587285231535, "grad_norm": 8.6875, "learning_rate": 8.058906777150943e-06, "loss": 1.08083496, "memory(GiB)": 141.16, "step": 58280, "train_speed(iter/s)": 0.29059 }, { "acc": 0.73004994, "epoch": 0.652082427469112, "grad_norm": 7.84375, "learning_rate": 8.057443626295342e-06, "loss": 1.08680935, "memory(GiB)": 141.16, "step": 58300, "train_speed(iter/s)": 0.290627 }, { "acc": 0.72891955, "epoch": 0.6523061264150706, "grad_norm": 5.09375, "learning_rate": 8.055980057137752e-06, "loss": 1.09887991, "memory(GiB)": 141.16, "step": 58320, "train_speed(iter/s)": 0.290662 }, { "acc": 0.74380083, "epoch": 0.6525298253610291, "grad_norm": 8.25, "learning_rate": 8.054516069878408e-06, "loss": 1.02636414, "memory(GiB)": 141.16, "step": 58340, "train_speed(iter/s)": 0.290696 }, { "acc": 0.72419386, "epoch": 0.6527535243069876, "grad_norm": 8.3125, "learning_rate": 8.053051664717606e-06, "loss": 1.10084171, "memory(GiB)": 141.16, "step": 58360, "train_speed(iter/s)": 0.290726 }, { "acc": 0.73029199, "epoch": 0.6529772232529462, "grad_norm": 6.25, "learning_rate": 8.051586841855702e-06, "loss": 1.09967813, "memory(GiB)": 141.16, "step": 58380, "train_speed(iter/s)": 0.290759 }, { "acc": 0.725385, "epoch": 0.6532009221989047, "grad_norm": 5.90625, "learning_rate": 8.050121601493097e-06, "loss": 1.10154514, "memory(GiB)": 141.16, "step": 58400, "train_speed(iter/s)": 0.290795 }, { "acc": 0.73219872, "epoch": 0.6534246211448632, "grad_norm": 8.5, "learning_rate": 8.048655943830261e-06, "loss": 1.08088465, "memory(GiB)": 141.16, "step": 58420, "train_speed(iter/s)": 0.290825 }, { "acc": 0.71354198, "epoch": 0.6536483200908217, "grad_norm": 7.46875, "learning_rate": 8.047189869067718e-06, "loss": 1.15905628, "memory(GiB)": 141.16, "step": 58440, "train_speed(iter/s)": 0.290854 }, { "acc": 0.73601203, "epoch": 0.6538720190367803, "grad_norm": 8.125, "learning_rate": 8.045723377406046e-06, "loss": 1.06062546, "memory(GiB)": 141.16, "step": 58460, "train_speed(iter/s)": 0.290885 }, { "acc": 0.74260931, "epoch": 0.6540957179827388, "grad_norm": 8.6875, "learning_rate": 8.044256469045882e-06, "loss": 1.02915936, "memory(GiB)": 141.16, "step": 58480, "train_speed(iter/s)": 0.290919 }, { "acc": 0.73376622, "epoch": 0.6543194169286973, "grad_norm": 6.4375, "learning_rate": 8.042789144187922e-06, "loss": 1.0724369, "memory(GiB)": 141.16, "step": 58500, "train_speed(iter/s)": 0.290953 }, { "acc": 0.74095516, "epoch": 0.6545431158746559, "grad_norm": 5.75, "learning_rate": 8.041321403032914e-06, "loss": 1.03036957, "memory(GiB)": 141.16, "step": 58520, "train_speed(iter/s)": 0.290987 }, { "acc": 0.73241081, "epoch": 0.6547668148206144, "grad_norm": 9.0625, "learning_rate": 8.039853245781669e-06, "loss": 1.08523083, "memory(GiB)": 141.16, "step": 58540, "train_speed(iter/s)": 0.291026 }, { "acc": 0.72301474, "epoch": 0.6549905137665729, "grad_norm": 6.375, "learning_rate": 8.03838467263505e-06, "loss": 1.12289724, "memory(GiB)": 141.16, "step": 58560, "train_speed(iter/s)": 0.291061 }, { "acc": 0.7326376, "epoch": 0.6552142127125314, "grad_norm": 6.65625, "learning_rate": 8.03691568379398e-06, "loss": 1.0718791, "memory(GiB)": 141.16, "step": 58580, "train_speed(iter/s)": 0.291093 }, { "acc": 0.72573891, "epoch": 0.65543791165849, "grad_norm": 7.5625, "learning_rate": 8.035446279459436e-06, "loss": 1.10754509, "memory(GiB)": 141.16, "step": 58600, "train_speed(iter/s)": 0.291125 }, { "acc": 0.73316555, "epoch": 0.6556616106044485, "grad_norm": 6.8125, "learning_rate": 8.033976459832453e-06, "loss": 1.07180376, "memory(GiB)": 141.16, "step": 58620, "train_speed(iter/s)": 0.291153 }, { "acc": 0.72898607, "epoch": 0.655885309550407, "grad_norm": 7.1875, "learning_rate": 8.032506225114126e-06, "loss": 1.07775841, "memory(GiB)": 141.16, "step": 58640, "train_speed(iter/s)": 0.291187 }, { "acc": 0.73364563, "epoch": 0.6561090084963656, "grad_norm": 5.78125, "learning_rate": 8.031035575505603e-06, "loss": 1.07882156, "memory(GiB)": 141.16, "step": 58660, "train_speed(iter/s)": 0.291218 }, { "acc": 0.73483572, "epoch": 0.6563327074423241, "grad_norm": 5.28125, "learning_rate": 8.02956451120809e-06, "loss": 1.06796818, "memory(GiB)": 141.16, "step": 58680, "train_speed(iter/s)": 0.29125 }, { "acc": 0.72060146, "epoch": 0.6565564063882826, "grad_norm": 7.3125, "learning_rate": 8.02809303242285e-06, "loss": 1.14898911, "memory(GiB)": 141.16, "step": 58700, "train_speed(iter/s)": 0.291279 }, { "acc": 0.74011092, "epoch": 0.6567801053342411, "grad_norm": 8.25, "learning_rate": 8.0266211393512e-06, "loss": 1.03094511, "memory(GiB)": 141.16, "step": 58720, "train_speed(iter/s)": 0.291314 }, { "acc": 0.72945404, "epoch": 0.6570038042801997, "grad_norm": 7.875, "learning_rate": 8.02514883219452e-06, "loss": 1.10167542, "memory(GiB)": 141.16, "step": 58740, "train_speed(iter/s)": 0.291349 }, { "acc": 0.71998844, "epoch": 0.6572275032261582, "grad_norm": 6.59375, "learning_rate": 8.02367611115424e-06, "loss": 1.13739109, "memory(GiB)": 141.16, "step": 58760, "train_speed(iter/s)": 0.291383 }, { "acc": 0.72568822, "epoch": 0.6574512021721167, "grad_norm": 7.03125, "learning_rate": 8.022202976431848e-06, "loss": 1.10118065, "memory(GiB)": 141.16, "step": 58780, "train_speed(iter/s)": 0.29142 }, { "acc": 0.73453722, "epoch": 0.6576749011180753, "grad_norm": 5.0, "learning_rate": 8.020729428228893e-06, "loss": 1.07960072, "memory(GiB)": 141.16, "step": 58800, "train_speed(iter/s)": 0.291452 }, { "acc": 0.71971526, "epoch": 0.6578986000640338, "grad_norm": 6.28125, "learning_rate": 8.019255466746975e-06, "loss": 1.12990417, "memory(GiB)": 141.16, "step": 58820, "train_speed(iter/s)": 0.291487 }, { "acc": 0.72989187, "epoch": 0.6581222990099923, "grad_norm": 6.25, "learning_rate": 8.017781092187755e-06, "loss": 1.06963959, "memory(GiB)": 141.16, "step": 58840, "train_speed(iter/s)": 0.291517 }, { "acc": 0.73506594, "epoch": 0.6583459979559508, "grad_norm": 6.9375, "learning_rate": 8.016306304752947e-06, "loss": 1.0476553, "memory(GiB)": 141.16, "step": 58860, "train_speed(iter/s)": 0.291552 }, { "acc": 0.73336411, "epoch": 0.6585696969019094, "grad_norm": 9.1875, "learning_rate": 8.014831104644325e-06, "loss": 1.06384687, "memory(GiB)": 141.16, "step": 58880, "train_speed(iter/s)": 0.291585 }, { "acc": 0.7315774, "epoch": 0.6587933958478679, "grad_norm": 7.40625, "learning_rate": 8.013355492063715e-06, "loss": 1.0698328, "memory(GiB)": 141.16, "step": 58900, "train_speed(iter/s)": 0.291618 }, { "acc": 0.73447504, "epoch": 0.6590170947938264, "grad_norm": 7.53125, "learning_rate": 8.011879467213002e-06, "loss": 1.06737986, "memory(GiB)": 141.16, "step": 58920, "train_speed(iter/s)": 0.291648 }, { "acc": 0.73600998, "epoch": 0.659240793739785, "grad_norm": 7.71875, "learning_rate": 8.010403030294129e-06, "loss": 1.06850414, "memory(GiB)": 141.16, "step": 58940, "train_speed(iter/s)": 0.291679 }, { "acc": 0.72057786, "epoch": 0.6594644926857435, "grad_norm": 7.71875, "learning_rate": 8.008926181509093e-06, "loss": 1.12551785, "memory(GiB)": 141.16, "step": 58960, "train_speed(iter/s)": 0.291712 }, { "acc": 0.72944264, "epoch": 0.659688191631702, "grad_norm": 6.28125, "learning_rate": 8.007448921059948e-06, "loss": 1.07392273, "memory(GiB)": 141.16, "step": 58980, "train_speed(iter/s)": 0.291746 }, { "acc": 0.72650228, "epoch": 0.6599118905776605, "grad_norm": 6.75, "learning_rate": 8.005971249148804e-06, "loss": 1.09463005, "memory(GiB)": 141.16, "step": 59000, "train_speed(iter/s)": 0.29178 }, { "acc": 0.7223701, "epoch": 0.6601355895236191, "grad_norm": 8.125, "learning_rate": 8.004493165977827e-06, "loss": 1.12137451, "memory(GiB)": 141.16, "step": 59020, "train_speed(iter/s)": 0.291812 }, { "acc": 0.72594137, "epoch": 0.6603592884695776, "grad_norm": 6.71875, "learning_rate": 8.003014671749241e-06, "loss": 1.0935854, "memory(GiB)": 141.16, "step": 59040, "train_speed(iter/s)": 0.291846 }, { "acc": 0.73489904, "epoch": 0.6605829874155362, "grad_norm": 9.0, "learning_rate": 8.001535766665326e-06, "loss": 1.06678772, "memory(GiB)": 141.16, "step": 59060, "train_speed(iter/s)": 0.291877 }, { "acc": 0.7165513, "epoch": 0.6608066863614948, "grad_norm": 8.4375, "learning_rate": 8.000056450928418e-06, "loss": 1.14898844, "memory(GiB)": 141.16, "step": 59080, "train_speed(iter/s)": 0.291912 }, { "acc": 0.72285247, "epoch": 0.6610303853074533, "grad_norm": 8.5625, "learning_rate": 7.998576724740903e-06, "loss": 1.12481489, "memory(GiB)": 141.16, "step": 59100, "train_speed(iter/s)": 0.291945 }, { "acc": 0.7374835, "epoch": 0.6612540842534118, "grad_norm": 6.125, "learning_rate": 7.997096588305235e-06, "loss": 1.04597321, "memory(GiB)": 141.16, "step": 59120, "train_speed(iter/s)": 0.291978 }, { "acc": 0.72535677, "epoch": 0.6614777831993703, "grad_norm": 8.75, "learning_rate": 7.995616041823914e-06, "loss": 1.10844917, "memory(GiB)": 141.16, "step": 59140, "train_speed(iter/s)": 0.292009 }, { "acc": 0.73630848, "epoch": 0.6617014821453289, "grad_norm": 6.96875, "learning_rate": 7.994135085499502e-06, "loss": 1.05295792, "memory(GiB)": 141.16, "step": 59160, "train_speed(iter/s)": 0.29204 }, { "acc": 0.74201622, "epoch": 0.6619251810912874, "grad_norm": 9.625, "learning_rate": 7.992653719534613e-06, "loss": 1.04430819, "memory(GiB)": 141.16, "step": 59180, "train_speed(iter/s)": 0.292071 }, { "acc": 0.72512674, "epoch": 0.6621488800372459, "grad_norm": 6.25, "learning_rate": 7.991171944131922e-06, "loss": 1.083251, "memory(GiB)": 141.16, "step": 59200, "train_speed(iter/s)": 0.292104 }, { "acc": 0.72241888, "epoch": 0.6623725789832045, "grad_norm": 6.0, "learning_rate": 7.989689759494155e-06, "loss": 1.11876431, "memory(GiB)": 141.16, "step": 59220, "train_speed(iter/s)": 0.29214 }, { "acc": 0.7348721, "epoch": 0.662596277929163, "grad_norm": 7.9375, "learning_rate": 7.988207165824096e-06, "loss": 1.04374638, "memory(GiB)": 141.16, "step": 59240, "train_speed(iter/s)": 0.292177 }, { "acc": 0.73244452, "epoch": 0.6628199768751215, "grad_norm": 6.75, "learning_rate": 7.986724163324585e-06, "loss": 1.08236427, "memory(GiB)": 141.16, "step": 59260, "train_speed(iter/s)": 0.292206 }, { "acc": 0.72314968, "epoch": 0.66304367582108, "grad_norm": 5.34375, "learning_rate": 7.98524075219852e-06, "loss": 1.10350752, "memory(GiB)": 141.16, "step": 59280, "train_speed(iter/s)": 0.292237 }, { "acc": 0.73301954, "epoch": 0.6632673747670386, "grad_norm": 7.3125, "learning_rate": 7.98375693264885e-06, "loss": 1.06906719, "memory(GiB)": 141.16, "step": 59300, "train_speed(iter/s)": 0.292269 }, { "acc": 0.72118878, "epoch": 0.6634910737129971, "grad_norm": 6.09375, "learning_rate": 7.982272704878582e-06, "loss": 1.13150711, "memory(GiB)": 141.16, "step": 59320, "train_speed(iter/s)": 0.292298 }, { "acc": 0.73131065, "epoch": 0.6637147726589556, "grad_norm": 7.40625, "learning_rate": 7.980788069090784e-06, "loss": 1.09418507, "memory(GiB)": 141.16, "step": 59340, "train_speed(iter/s)": 0.292332 }, { "acc": 0.74086919, "epoch": 0.6639384716049141, "grad_norm": 7.125, "learning_rate": 7.979303025488571e-06, "loss": 1.04079361, "memory(GiB)": 141.16, "step": 59360, "train_speed(iter/s)": 0.292363 }, { "acc": 0.72837934, "epoch": 0.6641621705508727, "grad_norm": 6.0625, "learning_rate": 7.977817574275123e-06, "loss": 1.09192047, "memory(GiB)": 141.16, "step": 59380, "train_speed(iter/s)": 0.292394 }, { "acc": 0.72546854, "epoch": 0.6643858694968312, "grad_norm": 5.6875, "learning_rate": 7.976331715653666e-06, "loss": 1.10009212, "memory(GiB)": 141.16, "step": 59400, "train_speed(iter/s)": 0.292422 }, { "acc": 0.72517962, "epoch": 0.6646095684427897, "grad_norm": 7.90625, "learning_rate": 7.974845449827489e-06, "loss": 1.09858341, "memory(GiB)": 141.16, "step": 59420, "train_speed(iter/s)": 0.292454 }, { "acc": 0.72885504, "epoch": 0.6648332673887483, "grad_norm": 7.5, "learning_rate": 7.973358776999935e-06, "loss": 1.08769684, "memory(GiB)": 141.16, "step": 59440, "train_speed(iter/s)": 0.292483 }, { "acc": 0.73094554, "epoch": 0.6650569663347068, "grad_norm": 7.34375, "learning_rate": 7.9718716973744e-06, "loss": 1.0778862, "memory(GiB)": 141.16, "step": 59460, "train_speed(iter/s)": 0.292515 }, { "acc": 0.73541903, "epoch": 0.6652806652806653, "grad_norm": 5.90625, "learning_rate": 7.97038421115434e-06, "loss": 1.06755905, "memory(GiB)": 141.16, "step": 59480, "train_speed(iter/s)": 0.292547 }, { "acc": 0.72704458, "epoch": 0.6655043642266238, "grad_norm": 7.1875, "learning_rate": 7.968896318543262e-06, "loss": 1.09715748, "memory(GiB)": 141.16, "step": 59500, "train_speed(iter/s)": 0.292581 }, { "acc": 0.73186474, "epoch": 0.6657280631725824, "grad_norm": 10.1875, "learning_rate": 7.967408019744734e-06, "loss": 1.07705994, "memory(GiB)": 141.16, "step": 59520, "train_speed(iter/s)": 0.292611 }, { "acc": 0.72408037, "epoch": 0.6659517621185409, "grad_norm": 6.71875, "learning_rate": 7.965919314962374e-06, "loss": 1.08691902, "memory(GiB)": 141.16, "step": 59540, "train_speed(iter/s)": 0.292636 }, { "acc": 0.73306074, "epoch": 0.6661754610644994, "grad_norm": 7.75, "learning_rate": 7.964430204399858e-06, "loss": 1.06140633, "memory(GiB)": 141.16, "step": 59560, "train_speed(iter/s)": 0.292669 }, { "acc": 0.72545424, "epoch": 0.666399160010458, "grad_norm": 6.15625, "learning_rate": 7.962940688260918e-06, "loss": 1.09268513, "memory(GiB)": 141.16, "step": 59580, "train_speed(iter/s)": 0.2927 }, { "acc": 0.73045158, "epoch": 0.6666228589564165, "grad_norm": 6.78125, "learning_rate": 7.961450766749343e-06, "loss": 1.08882713, "memory(GiB)": 141.16, "step": 59600, "train_speed(iter/s)": 0.292732 }, { "acc": 0.73331585, "epoch": 0.666846557902375, "grad_norm": 5.71875, "learning_rate": 7.959960440068975e-06, "loss": 1.08870869, "memory(GiB)": 141.16, "step": 59620, "train_speed(iter/s)": 0.292764 }, { "acc": 0.74068813, "epoch": 0.6670702568483335, "grad_norm": 8.375, "learning_rate": 7.95846970842371e-06, "loss": 1.03863983, "memory(GiB)": 141.16, "step": 59640, "train_speed(iter/s)": 0.292799 }, { "acc": 0.72608299, "epoch": 0.6672939557942921, "grad_norm": 6.25, "learning_rate": 7.956978572017504e-06, "loss": 1.10006943, "memory(GiB)": 141.16, "step": 59660, "train_speed(iter/s)": 0.292831 }, { "acc": 0.71865635, "epoch": 0.6675176547402506, "grad_norm": 8.75, "learning_rate": 7.955487031054364e-06, "loss": 1.13032703, "memory(GiB)": 141.16, "step": 59680, "train_speed(iter/s)": 0.292861 }, { "acc": 0.74735394, "epoch": 0.6677413536862091, "grad_norm": 8.5625, "learning_rate": 7.953995085738354e-06, "loss": 1.00843935, "memory(GiB)": 141.16, "step": 59700, "train_speed(iter/s)": 0.292894 }, { "acc": 0.71736712, "epoch": 0.6679650526321677, "grad_norm": 6.34375, "learning_rate": 7.952502736273594e-06, "loss": 1.14423761, "memory(GiB)": 141.16, "step": 59720, "train_speed(iter/s)": 0.292928 }, { "acc": 0.73064666, "epoch": 0.6681887515781262, "grad_norm": 7.65625, "learning_rate": 7.951009982864257e-06, "loss": 1.0769289, "memory(GiB)": 141.16, "step": 59740, "train_speed(iter/s)": 0.29296 }, { "acc": 0.73376217, "epoch": 0.6684124505240847, "grad_norm": 6.4375, "learning_rate": 7.949516825714578e-06, "loss": 1.05280151, "memory(GiB)": 141.16, "step": 59760, "train_speed(iter/s)": 0.29299 }, { "acc": 0.729, "epoch": 0.6686361494700432, "grad_norm": 7.03125, "learning_rate": 7.948023265028837e-06, "loss": 1.09876537, "memory(GiB)": 141.16, "step": 59780, "train_speed(iter/s)": 0.293027 }, { "acc": 0.73438311, "epoch": 0.6688598484160018, "grad_norm": 7.53125, "learning_rate": 7.946529301011376e-06, "loss": 1.07799158, "memory(GiB)": 141.16, "step": 59800, "train_speed(iter/s)": 0.293061 }, { "acc": 0.71786685, "epoch": 0.6690835473619603, "grad_norm": 6.28125, "learning_rate": 7.945034933866592e-06, "loss": 1.13078775, "memory(GiB)": 141.16, "step": 59820, "train_speed(iter/s)": 0.293091 }, { "acc": 0.70957932, "epoch": 0.6693072463079188, "grad_norm": 7.9375, "learning_rate": 7.943540163798934e-06, "loss": 1.15646305, "memory(GiB)": 141.16, "step": 59840, "train_speed(iter/s)": 0.293122 }, { "acc": 0.73822327, "epoch": 0.6695309452538774, "grad_norm": 7.1875, "learning_rate": 7.942044991012909e-06, "loss": 1.05221043, "memory(GiB)": 141.16, "step": 59860, "train_speed(iter/s)": 0.293154 }, { "acc": 0.74490924, "epoch": 0.6697546441998359, "grad_norm": 8.25, "learning_rate": 7.940549415713078e-06, "loss": 1.00116062, "memory(GiB)": 141.16, "step": 59880, "train_speed(iter/s)": 0.293184 }, { "acc": 0.73405418, "epoch": 0.6699783431457944, "grad_norm": 5.71875, "learning_rate": 7.939053438104056e-06, "loss": 1.06416168, "memory(GiB)": 141.16, "step": 59900, "train_speed(iter/s)": 0.293217 }, { "acc": 0.72516394, "epoch": 0.6702020420917529, "grad_norm": 6.59375, "learning_rate": 7.937557058390515e-06, "loss": 1.10729027, "memory(GiB)": 141.16, "step": 59920, "train_speed(iter/s)": 0.293247 }, { "acc": 0.74150419, "epoch": 0.6704257410377115, "grad_norm": 6.125, "learning_rate": 7.936060276777183e-06, "loss": 1.04767151, "memory(GiB)": 141.16, "step": 59940, "train_speed(iter/s)": 0.293279 }, { "acc": 0.73510323, "epoch": 0.67064943998367, "grad_norm": 7.40625, "learning_rate": 7.934563093468838e-06, "loss": 1.05694408, "memory(GiB)": 141.16, "step": 59960, "train_speed(iter/s)": 0.293309 }, { "acc": 0.72497859, "epoch": 0.6708731389296285, "grad_norm": 8.875, "learning_rate": 7.933065508670317e-06, "loss": 1.10472469, "memory(GiB)": 141.16, "step": 59980, "train_speed(iter/s)": 0.293341 }, { "acc": 0.72677255, "epoch": 0.671096837875587, "grad_norm": 6.40625, "learning_rate": 7.931567522586511e-06, "loss": 1.1038332, "memory(GiB)": 141.16, "step": 60000, "train_speed(iter/s)": 0.293369 }, { "epoch": 0.671096837875587, "eval_acc": 0.6894945673114274, "eval_loss": 1.0817344188690186, "eval_runtime": 2323.6949, "eval_samples_per_second": 32.398, "eval_steps_per_second": 16.199, "step": 60000 }, { "acc": 0.72740784, "epoch": 0.6713205368215456, "grad_norm": 6.0, "learning_rate": 7.930069135422366e-06, "loss": 1.09641352, "memory(GiB)": 141.16, "step": 60020, "train_speed(iter/s)": 0.290034 }, { "acc": 0.72970562, "epoch": 0.6715442357675041, "grad_norm": 7.5625, "learning_rate": 7.928570347382884e-06, "loss": 1.09003906, "memory(GiB)": 141.16, "step": 60040, "train_speed(iter/s)": 0.290067 }, { "acc": 0.72916718, "epoch": 0.6717679347134626, "grad_norm": 7.625, "learning_rate": 7.927071158673118e-06, "loss": 1.08022556, "memory(GiB)": 141.16, "step": 60060, "train_speed(iter/s)": 0.290098 }, { "acc": 0.72142754, "epoch": 0.6719916336594212, "grad_norm": 7.46875, "learning_rate": 7.925571569498182e-06, "loss": 1.1266695, "memory(GiB)": 141.16, "step": 60080, "train_speed(iter/s)": 0.290126 }, { "acc": 0.7397933, "epoch": 0.6722153326053797, "grad_norm": 6.40625, "learning_rate": 7.924071580063238e-06, "loss": 1.0369936, "memory(GiB)": 141.16, "step": 60100, "train_speed(iter/s)": 0.29016 }, { "acc": 0.72830057, "epoch": 0.6724390315513382, "grad_norm": 7.1875, "learning_rate": 7.922571190573507e-06, "loss": 1.08463936, "memory(GiB)": 141.16, "step": 60120, "train_speed(iter/s)": 0.290191 }, { "acc": 0.73858356, "epoch": 0.6726627304972967, "grad_norm": 6.28125, "learning_rate": 7.921070401234265e-06, "loss": 1.03417587, "memory(GiB)": 141.16, "step": 60140, "train_speed(iter/s)": 0.290226 }, { "acc": 0.72662401, "epoch": 0.6728864294432553, "grad_norm": 6.78125, "learning_rate": 7.919569212250839e-06, "loss": 1.09781733, "memory(GiB)": 141.16, "step": 60160, "train_speed(iter/s)": 0.290261 }, { "acc": 0.7283947, "epoch": 0.6731101283892138, "grad_norm": 7.625, "learning_rate": 7.918067623828616e-06, "loss": 1.09657631, "memory(GiB)": 141.16, "step": 60180, "train_speed(iter/s)": 0.290291 }, { "acc": 0.74282961, "epoch": 0.6733338273351723, "grad_norm": 8.9375, "learning_rate": 7.916565636173032e-06, "loss": 1.01796217, "memory(GiB)": 141.16, "step": 60200, "train_speed(iter/s)": 0.290324 }, { "acc": 0.71887941, "epoch": 0.6735575262811309, "grad_norm": 7.6875, "learning_rate": 7.915063249489582e-06, "loss": 1.13662891, "memory(GiB)": 141.16, "step": 60220, "train_speed(iter/s)": 0.290358 }, { "acc": 0.7449461, "epoch": 0.6737812252270894, "grad_norm": 9.375, "learning_rate": 7.913560463983815e-06, "loss": 1.02932196, "memory(GiB)": 141.16, "step": 60240, "train_speed(iter/s)": 0.29039 }, { "acc": 0.7208147, "epoch": 0.6740049241730479, "grad_norm": 8.3125, "learning_rate": 7.91205727986133e-06, "loss": 1.1336216, "memory(GiB)": 141.16, "step": 60260, "train_speed(iter/s)": 0.290419 }, { "acc": 0.73375444, "epoch": 0.6742286231190064, "grad_norm": 5.625, "learning_rate": 7.910553697327787e-06, "loss": 1.06361618, "memory(GiB)": 141.16, "step": 60280, "train_speed(iter/s)": 0.290451 }, { "acc": 0.72336421, "epoch": 0.674452322064965, "grad_norm": 6.09375, "learning_rate": 7.909049716588898e-06, "loss": 1.11673927, "memory(GiB)": 141.16, "step": 60300, "train_speed(iter/s)": 0.290481 }, { "acc": 0.7372745, "epoch": 0.6746760210109235, "grad_norm": 6.4375, "learning_rate": 7.907545337850426e-06, "loss": 1.06237106, "memory(GiB)": 141.16, "step": 60320, "train_speed(iter/s)": 0.290505 }, { "acc": 0.70893497, "epoch": 0.674899719956882, "grad_norm": 6.96875, "learning_rate": 7.906040561318195e-06, "loss": 1.17699318, "memory(GiB)": 141.16, "step": 60340, "train_speed(iter/s)": 0.290537 }, { "acc": 0.72599154, "epoch": 0.6751234189028406, "grad_norm": 7.09375, "learning_rate": 7.904535387198079e-06, "loss": 1.10489502, "memory(GiB)": 141.16, "step": 60360, "train_speed(iter/s)": 0.290566 }, { "acc": 0.72221079, "epoch": 0.6753471178487991, "grad_norm": 6.875, "learning_rate": 7.903029815696004e-06, "loss": 1.13666821, "memory(GiB)": 141.16, "step": 60380, "train_speed(iter/s)": 0.290597 }, { "acc": 0.74497004, "epoch": 0.6755708167947576, "grad_norm": 5.8125, "learning_rate": 7.901523847017958e-06, "loss": 1.00330429, "memory(GiB)": 141.16, "step": 60400, "train_speed(iter/s)": 0.290628 }, { "acc": 0.72934246, "epoch": 0.6757945157407161, "grad_norm": 7.875, "learning_rate": 7.900017481369976e-06, "loss": 1.09650469, "memory(GiB)": 141.16, "step": 60420, "train_speed(iter/s)": 0.29066 }, { "acc": 0.73709173, "epoch": 0.6760182146866747, "grad_norm": 7.5, "learning_rate": 7.898510718958152e-06, "loss": 1.03784237, "memory(GiB)": 141.16, "step": 60440, "train_speed(iter/s)": 0.290689 }, { "acc": 0.73792849, "epoch": 0.6762419136326332, "grad_norm": 6.9375, "learning_rate": 7.897003559988634e-06, "loss": 1.04519081, "memory(GiB)": 141.16, "step": 60460, "train_speed(iter/s)": 0.290722 }, { "acc": 0.72774725, "epoch": 0.6764656125785917, "grad_norm": 7.25, "learning_rate": 7.89549600466762e-06, "loss": 1.08650284, "memory(GiB)": 141.16, "step": 60480, "train_speed(iter/s)": 0.290752 }, { "acc": 0.73865223, "epoch": 0.6766893115245503, "grad_norm": 7.9375, "learning_rate": 7.893988053201367e-06, "loss": 1.05059395, "memory(GiB)": 141.16, "step": 60500, "train_speed(iter/s)": 0.290783 }, { "acc": 0.72786903, "epoch": 0.6769130104705088, "grad_norm": 8.4375, "learning_rate": 7.892479705796184e-06, "loss": 1.09190998, "memory(GiB)": 141.16, "step": 60520, "train_speed(iter/s)": 0.290814 }, { "acc": 0.73687906, "epoch": 0.6771367094164673, "grad_norm": 7.25, "learning_rate": 7.890970962658432e-06, "loss": 1.053895, "memory(GiB)": 141.16, "step": 60540, "train_speed(iter/s)": 0.290845 }, { "acc": 0.71457033, "epoch": 0.6773604083624258, "grad_norm": 6.84375, "learning_rate": 7.889461823994533e-06, "loss": 1.16624508, "memory(GiB)": 141.16, "step": 60560, "train_speed(iter/s)": 0.290879 }, { "acc": 0.71840711, "epoch": 0.6775841073083844, "grad_norm": 5.0, "learning_rate": 7.887952290010956e-06, "loss": 1.12688923, "memory(GiB)": 141.16, "step": 60580, "train_speed(iter/s)": 0.290909 }, { "acc": 0.73684921, "epoch": 0.6778078062543429, "grad_norm": 5.71875, "learning_rate": 7.886442360914228e-06, "loss": 1.05448694, "memory(GiB)": 141.16, "step": 60600, "train_speed(iter/s)": 0.290943 }, { "acc": 0.734583, "epoch": 0.6780315052003014, "grad_norm": 6.21875, "learning_rate": 7.884932036910928e-06, "loss": 1.06875029, "memory(GiB)": 141.16, "step": 60620, "train_speed(iter/s)": 0.290974 }, { "acc": 0.73324842, "epoch": 0.67825520414626, "grad_norm": 6.59375, "learning_rate": 7.88342131820769e-06, "loss": 1.06627913, "memory(GiB)": 141.16, "step": 60640, "train_speed(iter/s)": 0.291008 }, { "acc": 0.71949282, "epoch": 0.6784789030922185, "grad_norm": 7.25, "learning_rate": 7.881910205011203e-06, "loss": 1.13160191, "memory(GiB)": 141.16, "step": 60660, "train_speed(iter/s)": 0.291043 }, { "acc": 0.73749261, "epoch": 0.678702602038177, "grad_norm": 8.4375, "learning_rate": 7.880398697528206e-06, "loss": 1.06122894, "memory(GiB)": 141.16, "step": 60680, "train_speed(iter/s)": 0.291074 }, { "acc": 0.73688536, "epoch": 0.6789263009841355, "grad_norm": 5.40625, "learning_rate": 7.878886795965497e-06, "loss": 1.05550175, "memory(GiB)": 141.16, "step": 60700, "train_speed(iter/s)": 0.291103 }, { "acc": 0.74529467, "epoch": 0.6791499999300941, "grad_norm": 6.90625, "learning_rate": 7.877374500529926e-06, "loss": 1.01724663, "memory(GiB)": 141.16, "step": 60720, "train_speed(iter/s)": 0.291135 }, { "acc": 0.73113832, "epoch": 0.6793736988760526, "grad_norm": 7.21875, "learning_rate": 7.875861811428399e-06, "loss": 1.07227793, "memory(GiB)": 141.16, "step": 60740, "train_speed(iter/s)": 0.291167 }, { "acc": 0.71638079, "epoch": 0.6795973978220111, "grad_norm": 7.375, "learning_rate": 7.874348728867866e-06, "loss": 1.14211502, "memory(GiB)": 141.16, "step": 60760, "train_speed(iter/s)": 0.291198 }, { "acc": 0.7364203, "epoch": 0.6798210967679696, "grad_norm": 8.9375, "learning_rate": 7.872835253055344e-06, "loss": 1.05367212, "memory(GiB)": 141.16, "step": 60780, "train_speed(iter/s)": 0.291229 }, { "acc": 0.73411956, "epoch": 0.6800447957139282, "grad_norm": 7.9375, "learning_rate": 7.871321384197898e-06, "loss": 1.05938702, "memory(GiB)": 141.16, "step": 60800, "train_speed(iter/s)": 0.291257 }, { "acc": 0.74200492, "epoch": 0.6802684946598867, "grad_norm": 6.125, "learning_rate": 7.869807122502648e-06, "loss": 1.04443016, "memory(GiB)": 141.16, "step": 60820, "train_speed(iter/s)": 0.291286 }, { "acc": 0.75331092, "epoch": 0.6804921936058452, "grad_norm": 7.5625, "learning_rate": 7.868292468176762e-06, "loss": 0.97390366, "memory(GiB)": 141.16, "step": 60840, "train_speed(iter/s)": 0.291318 }, { "acc": 0.74260044, "epoch": 0.6807158925518038, "grad_norm": 6.6875, "learning_rate": 7.86677742142747e-06, "loss": 1.01547871, "memory(GiB)": 141.16, "step": 60860, "train_speed(iter/s)": 0.291349 }, { "acc": 0.73974867, "epoch": 0.6809395914977623, "grad_norm": 9.375, "learning_rate": 7.86526198246205e-06, "loss": 1.04166679, "memory(GiB)": 141.16, "step": 60880, "train_speed(iter/s)": 0.291381 }, { "acc": 0.7265871, "epoch": 0.6811632904437208, "grad_norm": 7.65625, "learning_rate": 7.86374615148784e-06, "loss": 1.09525175, "memory(GiB)": 141.16, "step": 60900, "train_speed(iter/s)": 0.291411 }, { "acc": 0.72225575, "epoch": 0.6813869893896793, "grad_norm": 6.28125, "learning_rate": 7.86222992871222e-06, "loss": 1.13982716, "memory(GiB)": 141.16, "step": 60920, "train_speed(iter/s)": 0.291441 }, { "acc": 0.7262208, "epoch": 0.6816106883356379, "grad_norm": 9.375, "learning_rate": 7.860713314342636e-06, "loss": 1.11971073, "memory(GiB)": 141.16, "step": 60940, "train_speed(iter/s)": 0.291472 }, { "acc": 0.73125887, "epoch": 0.6818343872815964, "grad_norm": 5.1875, "learning_rate": 7.859196308586583e-06, "loss": 1.07154446, "memory(GiB)": 141.16, "step": 60960, "train_speed(iter/s)": 0.291508 }, { "acc": 0.71920204, "epoch": 0.6820580862275549, "grad_norm": 8.4375, "learning_rate": 7.857678911651608e-06, "loss": 1.12383633, "memory(GiB)": 141.16, "step": 60980, "train_speed(iter/s)": 0.29154 }, { "acc": 0.72728682, "epoch": 0.6822817851735135, "grad_norm": 5.5, "learning_rate": 7.856161123745311e-06, "loss": 1.08736162, "memory(GiB)": 141.16, "step": 61000, "train_speed(iter/s)": 0.291575 }, { "acc": 0.72337475, "epoch": 0.682505484119472, "grad_norm": 8.4375, "learning_rate": 7.854642945075348e-06, "loss": 1.11093655, "memory(GiB)": 141.16, "step": 61020, "train_speed(iter/s)": 0.291608 }, { "acc": 0.74207916, "epoch": 0.6827291830654305, "grad_norm": 6.4375, "learning_rate": 7.853124375849429e-06, "loss": 1.03326979, "memory(GiB)": 141.16, "step": 61040, "train_speed(iter/s)": 0.291636 }, { "acc": 0.72552657, "epoch": 0.682952882011389, "grad_norm": 6.5, "learning_rate": 7.851605416275314e-06, "loss": 1.10386982, "memory(GiB)": 141.16, "step": 61060, "train_speed(iter/s)": 0.291669 }, { "acc": 0.74304533, "epoch": 0.6831765809573476, "grad_norm": 6.90625, "learning_rate": 7.85008606656082e-06, "loss": 1.0298069, "memory(GiB)": 141.16, "step": 61080, "train_speed(iter/s)": 0.291702 }, { "acc": 0.74028349, "epoch": 0.6834002799033061, "grad_norm": 8.0, "learning_rate": 7.848566326913813e-06, "loss": 1.03043938, "memory(GiB)": 141.16, "step": 61100, "train_speed(iter/s)": 0.29172 }, { "acc": 0.74216118, "epoch": 0.6836239788492646, "grad_norm": 6.9375, "learning_rate": 7.847046197542219e-06, "loss": 1.01177549, "memory(GiB)": 141.16, "step": 61120, "train_speed(iter/s)": 0.291753 }, { "acc": 0.73415198, "epoch": 0.6838476777952232, "grad_norm": 8.0, "learning_rate": 7.845525678654012e-06, "loss": 1.0638114, "memory(GiB)": 141.16, "step": 61140, "train_speed(iter/s)": 0.291782 }, { "acc": 0.735361, "epoch": 0.6840713767411817, "grad_norm": 6.4375, "learning_rate": 7.844004770457219e-06, "loss": 1.06616831, "memory(GiB)": 141.16, "step": 61160, "train_speed(iter/s)": 0.29181 }, { "acc": 0.73423386, "epoch": 0.6842950756871402, "grad_norm": 7.09375, "learning_rate": 7.842483473159923e-06, "loss": 1.06015491, "memory(GiB)": 141.16, "step": 61180, "train_speed(iter/s)": 0.29184 }, { "acc": 0.741152, "epoch": 0.6845187746330987, "grad_norm": 6.96875, "learning_rate": 7.840961786970261e-06, "loss": 1.04408035, "memory(GiB)": 141.16, "step": 61200, "train_speed(iter/s)": 0.291873 }, { "acc": 0.74782372, "epoch": 0.6847424735790573, "grad_norm": 6.0625, "learning_rate": 7.839439712096418e-06, "loss": 1.0048502, "memory(GiB)": 141.16, "step": 61220, "train_speed(iter/s)": 0.291905 }, { "acc": 0.72899647, "epoch": 0.6849661725250158, "grad_norm": 7.65625, "learning_rate": 7.837917248746637e-06, "loss": 1.1020546, "memory(GiB)": 141.16, "step": 61240, "train_speed(iter/s)": 0.291938 }, { "acc": 0.72416921, "epoch": 0.6851898714709743, "grad_norm": 7.875, "learning_rate": 7.836394397129216e-06, "loss": 1.10398216, "memory(GiB)": 141.16, "step": 61260, "train_speed(iter/s)": 0.291969 }, { "acc": 0.72662797, "epoch": 0.6854135704169328, "grad_norm": 6.53125, "learning_rate": 7.834871157452499e-06, "loss": 1.09738417, "memory(GiB)": 141.16, "step": 61280, "train_speed(iter/s)": 0.292005 }, { "acc": 0.74200239, "epoch": 0.6856372693628914, "grad_norm": 8.5, "learning_rate": 7.833347529924886e-06, "loss": 1.03237991, "memory(GiB)": 141.16, "step": 61300, "train_speed(iter/s)": 0.292036 }, { "acc": 0.72261448, "epoch": 0.6858609683088499, "grad_norm": 6.21875, "learning_rate": 7.831823514754836e-06, "loss": 1.12117081, "memory(GiB)": 141.16, "step": 61320, "train_speed(iter/s)": 0.292066 }, { "acc": 0.74084778, "epoch": 0.6860846672548084, "grad_norm": 7.875, "learning_rate": 7.830299112150851e-06, "loss": 1.04415913, "memory(GiB)": 141.16, "step": 61340, "train_speed(iter/s)": 0.292094 }, { "acc": 0.73588362, "epoch": 0.686308366200767, "grad_norm": 6.15625, "learning_rate": 7.828774322321492e-06, "loss": 1.03770542, "memory(GiB)": 141.16, "step": 61360, "train_speed(iter/s)": 0.292123 }, { "acc": 0.74455919, "epoch": 0.6865320651467255, "grad_norm": 8.25, "learning_rate": 7.827249145475377e-06, "loss": 1.00611553, "memory(GiB)": 141.16, "step": 61380, "train_speed(iter/s)": 0.292157 }, { "acc": 0.72801895, "epoch": 0.686755764092684, "grad_norm": 6.90625, "learning_rate": 7.825723581821165e-06, "loss": 1.08298016, "memory(GiB)": 141.16, "step": 61400, "train_speed(iter/s)": 0.292192 }, { "acc": 0.73085346, "epoch": 0.6869794630386425, "grad_norm": 9.75, "learning_rate": 7.82419763156758e-06, "loss": 1.06886406, "memory(GiB)": 141.16, "step": 61420, "train_speed(iter/s)": 0.292223 }, { "acc": 0.73811336, "epoch": 0.6872031619846011, "grad_norm": 6.09375, "learning_rate": 7.822671294923392e-06, "loss": 1.05097828, "memory(GiB)": 141.16, "step": 61440, "train_speed(iter/s)": 0.292252 }, { "acc": 0.72477565, "epoch": 0.6874268609305596, "grad_norm": 8.8125, "learning_rate": 7.821144572097424e-06, "loss": 1.1193696, "memory(GiB)": 141.16, "step": 61460, "train_speed(iter/s)": 0.292285 }, { "acc": 0.72958837, "epoch": 0.6876505598765181, "grad_norm": 6.53125, "learning_rate": 7.819617463298557e-06, "loss": 1.08281021, "memory(GiB)": 141.16, "step": 61480, "train_speed(iter/s)": 0.292317 }, { "acc": 0.73256721, "epoch": 0.6878742588224767, "grad_norm": 6.34375, "learning_rate": 7.818089968735717e-06, "loss": 1.06779976, "memory(GiB)": 141.16, "step": 61500, "train_speed(iter/s)": 0.29235 }, { "acc": 0.73684473, "epoch": 0.6880979577684352, "grad_norm": 7.1875, "learning_rate": 7.816562088617891e-06, "loss": 1.05565605, "memory(GiB)": 141.16, "step": 61520, "train_speed(iter/s)": 0.292383 }, { "acc": 0.74453707, "epoch": 0.6883216567143937, "grad_norm": 6.53125, "learning_rate": 7.815033823154112e-06, "loss": 1.00851879, "memory(GiB)": 141.16, "step": 61540, "train_speed(iter/s)": 0.292412 }, { "acc": 0.73830667, "epoch": 0.6885453556603522, "grad_norm": 7.09375, "learning_rate": 7.813505172553472e-06, "loss": 1.06448536, "memory(GiB)": 141.16, "step": 61560, "train_speed(iter/s)": 0.292443 }, { "acc": 0.72855473, "epoch": 0.6887690546063109, "grad_norm": 6.625, "learning_rate": 7.81197613702511e-06, "loss": 1.0890276, "memory(GiB)": 141.16, "step": 61580, "train_speed(iter/s)": 0.292475 }, { "acc": 0.72217026, "epoch": 0.6889927535522694, "grad_norm": 5.4375, "learning_rate": 7.810446716778218e-06, "loss": 1.11892471, "memory(GiB)": 141.16, "step": 61600, "train_speed(iter/s)": 0.292504 }, { "acc": 0.73117552, "epoch": 0.6892164524982279, "grad_norm": 7.6875, "learning_rate": 7.808916912022046e-06, "loss": 1.0785697, "memory(GiB)": 141.16, "step": 61620, "train_speed(iter/s)": 0.292535 }, { "acc": 0.72216539, "epoch": 0.6894401514441865, "grad_norm": 7.15625, "learning_rate": 7.807386722965891e-06, "loss": 1.1248333, "memory(GiB)": 141.16, "step": 61640, "train_speed(iter/s)": 0.292563 }, { "acc": 0.73474503, "epoch": 0.689663850390145, "grad_norm": 7.03125, "learning_rate": 7.805856149819107e-06, "loss": 1.0738142, "memory(GiB)": 141.16, "step": 61660, "train_speed(iter/s)": 0.29259 }, { "acc": 0.73201704, "epoch": 0.6898875493361035, "grad_norm": 9.75, "learning_rate": 7.804325192791096e-06, "loss": 1.07350655, "memory(GiB)": 141.16, "step": 61680, "train_speed(iter/s)": 0.292624 }, { "acc": 0.74226933, "epoch": 0.690111248282062, "grad_norm": 5.78125, "learning_rate": 7.802793852091315e-06, "loss": 1.03628368, "memory(GiB)": 141.16, "step": 61700, "train_speed(iter/s)": 0.292654 }, { "acc": 0.73307533, "epoch": 0.6903349472280206, "grad_norm": 6.46875, "learning_rate": 7.801262127929274e-06, "loss": 1.06168175, "memory(GiB)": 141.16, "step": 61720, "train_speed(iter/s)": 0.292686 }, { "acc": 0.72162604, "epoch": 0.6905586461739791, "grad_norm": 7.46875, "learning_rate": 7.799730020514536e-06, "loss": 1.11715965, "memory(GiB)": 141.16, "step": 61740, "train_speed(iter/s)": 0.292717 }, { "acc": 0.7319797, "epoch": 0.6907823451199376, "grad_norm": 7.75, "learning_rate": 7.79819753005671e-06, "loss": 1.07353401, "memory(GiB)": 141.16, "step": 61760, "train_speed(iter/s)": 0.292745 }, { "acc": 0.74455395, "epoch": 0.6910060440658962, "grad_norm": 7.125, "learning_rate": 7.796664656765472e-06, "loss": 1.00440731, "memory(GiB)": 141.16, "step": 61780, "train_speed(iter/s)": 0.292782 }, { "acc": 0.72712913, "epoch": 0.6912297430118547, "grad_norm": 7.75, "learning_rate": 7.795131400850533e-06, "loss": 1.09590149, "memory(GiB)": 141.16, "step": 61800, "train_speed(iter/s)": 0.292814 }, { "acc": 0.73258848, "epoch": 0.6914534419578132, "grad_norm": 5.84375, "learning_rate": 7.793597762521666e-06, "loss": 1.07647266, "memory(GiB)": 141.16, "step": 61820, "train_speed(iter/s)": 0.292844 }, { "acc": 0.73281145, "epoch": 0.6916771409037717, "grad_norm": 7.78125, "learning_rate": 7.792063741988695e-06, "loss": 1.07723188, "memory(GiB)": 141.16, "step": 61840, "train_speed(iter/s)": 0.292878 }, { "acc": 0.74651279, "epoch": 0.6919008398497303, "grad_norm": 6.59375, "learning_rate": 7.790529339461497e-06, "loss": 1.01257019, "memory(GiB)": 141.16, "step": 61860, "train_speed(iter/s)": 0.292911 }, { "acc": 0.71706161, "epoch": 0.6921245387956888, "grad_norm": 5.9375, "learning_rate": 7.78899455515e-06, "loss": 1.13828239, "memory(GiB)": 141.16, "step": 61880, "train_speed(iter/s)": 0.292942 }, { "acc": 0.7192131, "epoch": 0.6923482377416473, "grad_norm": 7.75, "learning_rate": 7.787459389264183e-06, "loss": 1.14775639, "memory(GiB)": 141.16, "step": 61900, "train_speed(iter/s)": 0.292975 }, { "acc": 0.72465887, "epoch": 0.6925719366876059, "grad_norm": 6.5625, "learning_rate": 7.78592384201408e-06, "loss": 1.10484858, "memory(GiB)": 141.16, "step": 61920, "train_speed(iter/s)": 0.293003 }, { "acc": 0.73889918, "epoch": 0.6927956356335644, "grad_norm": 6.46875, "learning_rate": 7.784387913609775e-06, "loss": 1.03197184, "memory(GiB)": 141.16, "step": 61940, "train_speed(iter/s)": 0.293036 }, { "acc": 0.74247799, "epoch": 0.6930193345795229, "grad_norm": 7.8125, "learning_rate": 7.782851604261406e-06, "loss": 1.03661623, "memory(GiB)": 141.16, "step": 61960, "train_speed(iter/s)": 0.293066 }, { "acc": 0.7156743, "epoch": 0.6932430335254814, "grad_norm": 6.34375, "learning_rate": 7.781314914179161e-06, "loss": 1.14145966, "memory(GiB)": 141.16, "step": 61980, "train_speed(iter/s)": 0.293098 }, { "acc": 0.72218032, "epoch": 0.69346673247144, "grad_norm": 6.75, "learning_rate": 7.779777843573282e-06, "loss": 1.12809944, "memory(GiB)": 141.16, "step": 62000, "train_speed(iter/s)": 0.293133 }, { "epoch": 0.69346673247144, "eval_acc": 0.6895879844727367, "eval_loss": 1.081417202949524, "eval_runtime": 2324.6645, "eval_samples_per_second": 32.384, "eval_steps_per_second": 16.192, "step": 62000 }, { "acc": 0.74567146, "epoch": 0.6936904314173985, "grad_norm": 9.375, "learning_rate": 7.778240392654061e-06, "loss": 1.00990267, "memory(GiB)": 141.16, "step": 62020, "train_speed(iter/s)": 0.289905 }, { "acc": 0.7293149, "epoch": 0.693914130363357, "grad_norm": 7.28125, "learning_rate": 7.776702561631847e-06, "loss": 1.08241577, "memory(GiB)": 141.16, "step": 62040, "train_speed(iter/s)": 0.289935 }, { "acc": 0.73158913, "epoch": 0.6941378293093156, "grad_norm": 8.875, "learning_rate": 7.77516435071703e-06, "loss": 1.0693347, "memory(GiB)": 141.16, "step": 62060, "train_speed(iter/s)": 0.289968 }, { "acc": 0.72841368, "epoch": 0.6943615282552741, "grad_norm": 6.625, "learning_rate": 7.773625760120067e-06, "loss": 1.0855114, "memory(GiB)": 141.16, "step": 62080, "train_speed(iter/s)": 0.29 }, { "acc": 0.73871293, "epoch": 0.6945852272012326, "grad_norm": 7.65625, "learning_rate": 7.772086790051453e-06, "loss": 1.04872055, "memory(GiB)": 141.16, "step": 62100, "train_speed(iter/s)": 0.290019 }, { "acc": 0.72073174, "epoch": 0.6948089261471911, "grad_norm": 8.1875, "learning_rate": 7.770547440721745e-06, "loss": 1.11889439, "memory(GiB)": 141.16, "step": 62120, "train_speed(iter/s)": 0.290053 }, { "acc": 0.73479872, "epoch": 0.6950326250931497, "grad_norm": 9.3125, "learning_rate": 7.769007712341548e-06, "loss": 1.05241127, "memory(GiB)": 141.16, "step": 62140, "train_speed(iter/s)": 0.290088 }, { "acc": 0.71715546, "epoch": 0.6952563240391082, "grad_norm": 6.65625, "learning_rate": 7.767467605121518e-06, "loss": 1.1472271, "memory(GiB)": 141.16, "step": 62160, "train_speed(iter/s)": 0.29012 }, { "acc": 0.71731825, "epoch": 0.6954800229850667, "grad_norm": 6.46875, "learning_rate": 7.765927119272361e-06, "loss": 1.1556385, "memory(GiB)": 141.16, "step": 62180, "train_speed(iter/s)": 0.290148 }, { "acc": 0.73539181, "epoch": 0.6957037219310253, "grad_norm": 6.5625, "learning_rate": 7.76438625500484e-06, "loss": 1.05257034, "memory(GiB)": 141.16, "step": 62200, "train_speed(iter/s)": 0.290177 }, { "acc": 0.72906947, "epoch": 0.6959274208769838, "grad_norm": 7.40625, "learning_rate": 7.76284501252977e-06, "loss": 1.09118385, "memory(GiB)": 141.16, "step": 62220, "train_speed(iter/s)": 0.290212 }, { "acc": 0.73109989, "epoch": 0.6961511198229423, "grad_norm": 6.21875, "learning_rate": 7.76130339205801e-06, "loss": 1.09049482, "memory(GiB)": 141.16, "step": 62240, "train_speed(iter/s)": 0.290244 }, { "acc": 0.73420286, "epoch": 0.6963748187689008, "grad_norm": 6.75, "learning_rate": 7.759761393800477e-06, "loss": 1.06293812, "memory(GiB)": 141.16, "step": 62260, "train_speed(iter/s)": 0.290276 }, { "acc": 0.71717539, "epoch": 0.6965985177148594, "grad_norm": 7.25, "learning_rate": 7.75821901796814e-06, "loss": 1.13468094, "memory(GiB)": 141.16, "step": 62280, "train_speed(iter/s)": 0.290309 }, { "acc": 0.72263098, "epoch": 0.6968222166608179, "grad_norm": 7.15625, "learning_rate": 7.756676264772019e-06, "loss": 1.12687855, "memory(GiB)": 141.16, "step": 62300, "train_speed(iter/s)": 0.290341 }, { "acc": 0.7121603, "epoch": 0.6970459156067764, "grad_norm": 6.84375, "learning_rate": 7.75513313442318e-06, "loss": 1.17920752, "memory(GiB)": 141.16, "step": 62320, "train_speed(iter/s)": 0.290365 }, { "acc": 0.73078651, "epoch": 0.697269614552735, "grad_norm": 6.71875, "learning_rate": 7.753589627132752e-06, "loss": 1.07724419, "memory(GiB)": 141.16, "step": 62340, "train_speed(iter/s)": 0.290396 }, { "acc": 0.72293987, "epoch": 0.6974933134986935, "grad_norm": 7.90625, "learning_rate": 7.752045743111902e-06, "loss": 1.12025166, "memory(GiB)": 141.16, "step": 62360, "train_speed(iter/s)": 0.290425 }, { "acc": 0.73328509, "epoch": 0.697717012444652, "grad_norm": 6.15625, "learning_rate": 7.750501482571859e-06, "loss": 1.05480804, "memory(GiB)": 141.16, "step": 62380, "train_speed(iter/s)": 0.290454 }, { "acc": 0.73419104, "epoch": 0.6979407113906105, "grad_norm": 8.125, "learning_rate": 7.748956845723901e-06, "loss": 1.06238861, "memory(GiB)": 141.16, "step": 62400, "train_speed(iter/s)": 0.290485 }, { "acc": 0.73289614, "epoch": 0.6981644103365691, "grad_norm": 7.6875, "learning_rate": 7.747411832779354e-06, "loss": 1.06757755, "memory(GiB)": 141.16, "step": 62420, "train_speed(iter/s)": 0.290519 }, { "acc": 0.7335125, "epoch": 0.6983881092825276, "grad_norm": 6.65625, "learning_rate": 7.745866443949599e-06, "loss": 1.06621037, "memory(GiB)": 141.16, "step": 62440, "train_speed(iter/s)": 0.290553 }, { "acc": 0.72950315, "epoch": 0.6986118082284861, "grad_norm": 7.90625, "learning_rate": 7.744320679446067e-06, "loss": 1.08576355, "memory(GiB)": 141.16, "step": 62460, "train_speed(iter/s)": 0.290579 }, { "acc": 0.73406572, "epoch": 0.6988355071744446, "grad_norm": 6.0, "learning_rate": 7.74277453948024e-06, "loss": 1.07630863, "memory(GiB)": 141.16, "step": 62480, "train_speed(iter/s)": 0.290614 }, { "acc": 0.72924013, "epoch": 0.6990592061204032, "grad_norm": 9.3125, "learning_rate": 7.741228024263653e-06, "loss": 1.08021984, "memory(GiB)": 141.16, "step": 62500, "train_speed(iter/s)": 0.290647 }, { "acc": 0.73605919, "epoch": 0.6992829050663617, "grad_norm": 6.0625, "learning_rate": 7.739681134007893e-06, "loss": 1.06233082, "memory(GiB)": 141.16, "step": 62520, "train_speed(iter/s)": 0.290679 }, { "acc": 0.7219317, "epoch": 0.6995066040123202, "grad_norm": 7.09375, "learning_rate": 7.738133868924592e-06, "loss": 1.12318764, "memory(GiB)": 141.16, "step": 62540, "train_speed(iter/s)": 0.290709 }, { "acc": 0.73757391, "epoch": 0.6997303029582788, "grad_norm": 7.625, "learning_rate": 7.736586229225442e-06, "loss": 1.05372047, "memory(GiB)": 141.16, "step": 62560, "train_speed(iter/s)": 0.290741 }, { "acc": 0.73995504, "epoch": 0.6999540019042373, "grad_norm": 5.96875, "learning_rate": 7.735038215122181e-06, "loss": 1.03864937, "memory(GiB)": 141.16, "step": 62580, "train_speed(iter/s)": 0.290772 }, { "acc": 0.72532539, "epoch": 0.7001777008501958, "grad_norm": 8.6875, "learning_rate": 7.733489826826598e-06, "loss": 1.10846558, "memory(GiB)": 141.16, "step": 62600, "train_speed(iter/s)": 0.2908 }, { "acc": 0.7316503, "epoch": 0.7004013997961543, "grad_norm": 6.53125, "learning_rate": 7.73194106455054e-06, "loss": 1.07700462, "memory(GiB)": 141.16, "step": 62620, "train_speed(iter/s)": 0.290832 }, { "acc": 0.71664457, "epoch": 0.7006250987421129, "grad_norm": 6.4375, "learning_rate": 7.730391928505892e-06, "loss": 1.13470259, "memory(GiB)": 141.16, "step": 62640, "train_speed(iter/s)": 0.290866 }, { "acc": 0.72737989, "epoch": 0.7008487976880714, "grad_norm": 6.96875, "learning_rate": 7.728842418904602e-06, "loss": 1.09841614, "memory(GiB)": 141.16, "step": 62660, "train_speed(iter/s)": 0.290896 }, { "acc": 0.73340001, "epoch": 0.7010724966340299, "grad_norm": 6.71875, "learning_rate": 7.727292535958667e-06, "loss": 1.05110111, "memory(GiB)": 141.16, "step": 62680, "train_speed(iter/s)": 0.290927 }, { "acc": 0.73307209, "epoch": 0.7012961955799885, "grad_norm": 6.6875, "learning_rate": 7.725742279880131e-06, "loss": 1.07104263, "memory(GiB)": 141.16, "step": 62700, "train_speed(iter/s)": 0.290957 }, { "acc": 0.73933482, "epoch": 0.701519894525947, "grad_norm": 7.1875, "learning_rate": 7.72419165088109e-06, "loss": 1.0344985, "memory(GiB)": 141.16, "step": 62720, "train_speed(iter/s)": 0.290986 }, { "acc": 0.73395329, "epoch": 0.7017435934719055, "grad_norm": 7.25, "learning_rate": 7.722640649173693e-06, "loss": 1.06899529, "memory(GiB)": 141.16, "step": 62740, "train_speed(iter/s)": 0.291014 }, { "acc": 0.72962189, "epoch": 0.701967292417864, "grad_norm": 8.8125, "learning_rate": 7.721089274970142e-06, "loss": 1.09490395, "memory(GiB)": 141.16, "step": 62760, "train_speed(iter/s)": 0.291045 }, { "acc": 0.73671412, "epoch": 0.7021909913638226, "grad_norm": 6.75, "learning_rate": 7.719537528482683e-06, "loss": 1.05651808, "memory(GiB)": 141.16, "step": 62780, "train_speed(iter/s)": 0.291077 }, { "acc": 0.72879772, "epoch": 0.7024146903097811, "grad_norm": 5.90625, "learning_rate": 7.71798540992362e-06, "loss": 1.08484125, "memory(GiB)": 141.16, "step": 62800, "train_speed(iter/s)": 0.291108 }, { "acc": 0.7314537, "epoch": 0.7026383892557396, "grad_norm": 9.0, "learning_rate": 7.716432919505303e-06, "loss": 1.07403774, "memory(GiB)": 141.16, "step": 62820, "train_speed(iter/s)": 0.291142 }, { "acc": 0.725912, "epoch": 0.7028620882016982, "grad_norm": 9.875, "learning_rate": 7.714880057440137e-06, "loss": 1.10789394, "memory(GiB)": 141.16, "step": 62840, "train_speed(iter/s)": 0.291173 }, { "acc": 0.71404533, "epoch": 0.7030857871476567, "grad_norm": 7.21875, "learning_rate": 7.713326823940573e-06, "loss": 1.16278973, "memory(GiB)": 141.16, "step": 62860, "train_speed(iter/s)": 0.291206 }, { "acc": 0.72875195, "epoch": 0.7033094860936152, "grad_norm": 6.1875, "learning_rate": 7.711773219219119e-06, "loss": 1.10608292, "memory(GiB)": 141.16, "step": 62880, "train_speed(iter/s)": 0.291239 }, { "acc": 0.73629832, "epoch": 0.7035331850395737, "grad_norm": 7.34375, "learning_rate": 7.710219243488326e-06, "loss": 1.06208153, "memory(GiB)": 141.16, "step": 62900, "train_speed(iter/s)": 0.291271 }, { "acc": 0.7316463, "epoch": 0.7037568839855323, "grad_norm": 8.3125, "learning_rate": 7.708664896960804e-06, "loss": 1.08532763, "memory(GiB)": 141.16, "step": 62920, "train_speed(iter/s)": 0.291302 }, { "acc": 0.72188826, "epoch": 0.7039805829314908, "grad_norm": 6.15625, "learning_rate": 7.707110179849208e-06, "loss": 1.12866821, "memory(GiB)": 141.16, "step": 62940, "train_speed(iter/s)": 0.291335 }, { "acc": 0.74686003, "epoch": 0.7042042818774493, "grad_norm": 8.6875, "learning_rate": 7.705555092366247e-06, "loss": 1.00597115, "memory(GiB)": 141.16, "step": 62960, "train_speed(iter/s)": 0.291368 }, { "acc": 0.74112434, "epoch": 0.7044279808234079, "grad_norm": 7.3125, "learning_rate": 7.703999634724678e-06, "loss": 1.02652435, "memory(GiB)": 141.16, "step": 62980, "train_speed(iter/s)": 0.291399 }, { "acc": 0.72757835, "epoch": 0.7046516797693664, "grad_norm": 7.84375, "learning_rate": 7.70244380713731e-06, "loss": 1.09118671, "memory(GiB)": 141.16, "step": 63000, "train_speed(iter/s)": 0.29143 }, { "acc": 0.71006184, "epoch": 0.7048753787153249, "grad_norm": 6.3125, "learning_rate": 7.700887609817e-06, "loss": 1.18113689, "memory(GiB)": 141.16, "step": 63020, "train_speed(iter/s)": 0.29146 }, { "acc": 0.72894778, "epoch": 0.7050990776612834, "grad_norm": 7.0, "learning_rate": 7.699331042976664e-06, "loss": 1.07744818, "memory(GiB)": 141.16, "step": 63040, "train_speed(iter/s)": 0.29149 }, { "acc": 0.72452116, "epoch": 0.705322776607242, "grad_norm": 7.03125, "learning_rate": 7.697774106829257e-06, "loss": 1.11862154, "memory(GiB)": 141.16, "step": 63060, "train_speed(iter/s)": 0.29152 }, { "acc": 0.72298384, "epoch": 0.7055464755532005, "grad_norm": 5.6875, "learning_rate": 7.696216801587791e-06, "loss": 1.11877346, "memory(GiB)": 141.16, "step": 63080, "train_speed(iter/s)": 0.291548 }, { "acc": 0.73086348, "epoch": 0.705770174499159, "grad_norm": 6.125, "learning_rate": 7.69465912746533e-06, "loss": 1.08342972, "memory(GiB)": 141.16, "step": 63100, "train_speed(iter/s)": 0.291581 }, { "acc": 0.73052874, "epoch": 0.7059938734451175, "grad_norm": 6.15625, "learning_rate": 7.693101084674984e-06, "loss": 1.08473682, "memory(GiB)": 141.16, "step": 63120, "train_speed(iter/s)": 0.291616 }, { "acc": 0.72013483, "epoch": 0.7062175723910761, "grad_norm": 7.375, "learning_rate": 7.691542673429917e-06, "loss": 1.11634111, "memory(GiB)": 141.16, "step": 63140, "train_speed(iter/s)": 0.291649 }, { "acc": 0.73083138, "epoch": 0.7064412713370346, "grad_norm": 7.0625, "learning_rate": 7.689983893943342e-06, "loss": 1.08585548, "memory(GiB)": 141.16, "step": 63160, "train_speed(iter/s)": 0.291681 }, { "acc": 0.72249489, "epoch": 0.7066649702829931, "grad_norm": 8.25, "learning_rate": 7.68842474642852e-06, "loss": 1.11052475, "memory(GiB)": 141.16, "step": 63180, "train_speed(iter/s)": 0.291709 }, { "acc": 0.7315074, "epoch": 0.7068886692289517, "grad_norm": 8.3125, "learning_rate": 7.686865231098767e-06, "loss": 1.06705303, "memory(GiB)": 141.16, "step": 63200, "train_speed(iter/s)": 0.291738 }, { "acc": 0.72078538, "epoch": 0.7071123681749102, "grad_norm": 8.6875, "learning_rate": 7.685305348167446e-06, "loss": 1.12419147, "memory(GiB)": 141.16, "step": 63220, "train_speed(iter/s)": 0.29177 }, { "acc": 0.72566791, "epoch": 0.7073360671208687, "grad_norm": 6.84375, "learning_rate": 7.683745097847973e-06, "loss": 1.09203033, "memory(GiB)": 141.16, "step": 63240, "train_speed(iter/s)": 0.291804 }, { "acc": 0.73133512, "epoch": 0.7075597660668272, "grad_norm": 9.0, "learning_rate": 7.68218448035381e-06, "loss": 1.07416821, "memory(GiB)": 141.16, "step": 63260, "train_speed(iter/s)": 0.291835 }, { "acc": 0.72904577, "epoch": 0.7077834650127858, "grad_norm": 6.03125, "learning_rate": 7.680623495898472e-06, "loss": 1.09152384, "memory(GiB)": 141.16, "step": 63280, "train_speed(iter/s)": 0.291863 }, { "acc": 0.73982072, "epoch": 0.7080071639587443, "grad_norm": 8.1875, "learning_rate": 7.679062144695525e-06, "loss": 1.05718327, "memory(GiB)": 141.16, "step": 63300, "train_speed(iter/s)": 0.291888 }, { "acc": 0.73124695, "epoch": 0.7082308629047028, "grad_norm": 8.0625, "learning_rate": 7.677500426958584e-06, "loss": 1.06814251, "memory(GiB)": 141.16, "step": 63320, "train_speed(iter/s)": 0.291919 }, { "acc": 0.72952042, "epoch": 0.7084545618506614, "grad_norm": 6.21875, "learning_rate": 7.675938342901315e-06, "loss": 1.08763332, "memory(GiB)": 141.16, "step": 63340, "train_speed(iter/s)": 0.291954 }, { "acc": 0.7298512, "epoch": 0.7086782607966199, "grad_norm": 6.40625, "learning_rate": 7.674375892737433e-06, "loss": 1.08116484, "memory(GiB)": 141.16, "step": 63360, "train_speed(iter/s)": 0.291988 }, { "acc": 0.73009009, "epoch": 0.7089019597425784, "grad_norm": 5.09375, "learning_rate": 7.672813076680703e-06, "loss": 1.07839375, "memory(GiB)": 141.16, "step": 63380, "train_speed(iter/s)": 0.29202 }, { "acc": 0.73224869, "epoch": 0.7091256586885369, "grad_norm": 6.9375, "learning_rate": 7.67124989494494e-06, "loss": 1.06180496, "memory(GiB)": 141.16, "step": 63400, "train_speed(iter/s)": 0.292052 }, { "acc": 0.72100182, "epoch": 0.7093493576344955, "grad_norm": 7.78125, "learning_rate": 7.66968634774401e-06, "loss": 1.12291784, "memory(GiB)": 141.16, "step": 63420, "train_speed(iter/s)": 0.292083 }, { "acc": 0.71731639, "epoch": 0.709573056580454, "grad_norm": 5.9375, "learning_rate": 7.66812243529183e-06, "loss": 1.13706093, "memory(GiB)": 141.16, "step": 63440, "train_speed(iter/s)": 0.292112 }, { "acc": 0.729777, "epoch": 0.7097967555264125, "grad_norm": 6.3125, "learning_rate": 7.666558157802364e-06, "loss": 1.08320332, "memory(GiB)": 141.16, "step": 63460, "train_speed(iter/s)": 0.292143 }, { "acc": 0.72709808, "epoch": 0.710020454472371, "grad_norm": 6.59375, "learning_rate": 7.66499351548963e-06, "loss": 1.10241909, "memory(GiB)": 141.16, "step": 63480, "train_speed(iter/s)": 0.292174 }, { "acc": 0.7357132, "epoch": 0.7102441534183296, "grad_norm": 8.3125, "learning_rate": 7.663428508567689e-06, "loss": 1.05785503, "memory(GiB)": 141.16, "step": 63500, "train_speed(iter/s)": 0.292207 }, { "acc": 0.71872797, "epoch": 0.7104678523642881, "grad_norm": 8.4375, "learning_rate": 7.66186313725066e-06, "loss": 1.15705986, "memory(GiB)": 141.16, "step": 63520, "train_speed(iter/s)": 0.292233 }, { "acc": 0.74164915, "epoch": 0.7106915513102466, "grad_norm": 7.8125, "learning_rate": 7.660297401752708e-06, "loss": 1.02677755, "memory(GiB)": 141.16, "step": 63540, "train_speed(iter/s)": 0.292263 }, { "acc": 0.72852526, "epoch": 0.7109152502562052, "grad_norm": 6.90625, "learning_rate": 7.658731302288046e-06, "loss": 1.0900074, "memory(GiB)": 141.16, "step": 63560, "train_speed(iter/s)": 0.292296 }, { "acc": 0.72413168, "epoch": 0.7111389492021637, "grad_norm": 7.0, "learning_rate": 7.657164839070941e-06, "loss": 1.12872753, "memory(GiB)": 141.16, "step": 63580, "train_speed(iter/s)": 0.292326 }, { "acc": 0.73686175, "epoch": 0.7113626481481222, "grad_norm": 6.875, "learning_rate": 7.655598012315706e-06, "loss": 1.06746998, "memory(GiB)": 141.16, "step": 63600, "train_speed(iter/s)": 0.292352 }, { "acc": 0.73002453, "epoch": 0.7115863470940808, "grad_norm": 6.1875, "learning_rate": 7.654030822236705e-06, "loss": 1.09803953, "memory(GiB)": 141.16, "step": 63620, "train_speed(iter/s)": 0.29238 }, { "acc": 0.73065133, "epoch": 0.7118100460400393, "grad_norm": 5.4375, "learning_rate": 7.65246326904835e-06, "loss": 1.06605721, "memory(GiB)": 141.16, "step": 63640, "train_speed(iter/s)": 0.292408 }, { "acc": 0.72538476, "epoch": 0.7120337449859978, "grad_norm": 8.0625, "learning_rate": 7.65089535296511e-06, "loss": 1.11301079, "memory(GiB)": 141.16, "step": 63660, "train_speed(iter/s)": 0.292437 }, { "acc": 0.72982621, "epoch": 0.7122574439319563, "grad_norm": 7.84375, "learning_rate": 7.649327074201498e-06, "loss": 1.0872385, "memory(GiB)": 141.16, "step": 63680, "train_speed(iter/s)": 0.292468 }, { "acc": 0.7326592, "epoch": 0.7124811428779149, "grad_norm": 6.21875, "learning_rate": 7.647758432972072e-06, "loss": 1.08148594, "memory(GiB)": 141.16, "step": 63700, "train_speed(iter/s)": 0.292499 }, { "acc": 0.73441529, "epoch": 0.7127048418238734, "grad_norm": 8.3125, "learning_rate": 7.646189429491449e-06, "loss": 1.06009312, "memory(GiB)": 141.16, "step": 63720, "train_speed(iter/s)": 0.292532 }, { "acc": 0.72381773, "epoch": 0.7129285407698319, "grad_norm": 7.46875, "learning_rate": 7.644620063974287e-06, "loss": 1.11109676, "memory(GiB)": 141.16, "step": 63740, "train_speed(iter/s)": 0.292562 }, { "acc": 0.72852259, "epoch": 0.7131522397157904, "grad_norm": 6.78125, "learning_rate": 7.643050336635301e-06, "loss": 1.09126015, "memory(GiB)": 141.16, "step": 63760, "train_speed(iter/s)": 0.292591 }, { "acc": 0.73502674, "epoch": 0.713375938661749, "grad_norm": 7.53125, "learning_rate": 7.64148024768925e-06, "loss": 1.05618076, "memory(GiB)": 141.16, "step": 63780, "train_speed(iter/s)": 0.292617 }, { "acc": 0.73767195, "epoch": 0.7135996376077075, "grad_norm": 5.34375, "learning_rate": 7.639909797350945e-06, "loss": 1.05036182, "memory(GiB)": 141.16, "step": 63800, "train_speed(iter/s)": 0.292649 }, { "acc": 0.74241056, "epoch": 0.713823336553666, "grad_norm": 8.1875, "learning_rate": 7.63833898583525e-06, "loss": 1.0209609, "memory(GiB)": 141.16, "step": 63820, "train_speed(iter/s)": 0.292677 }, { "acc": 0.74622374, "epoch": 0.7140470354996246, "grad_norm": 7.34375, "learning_rate": 7.63676781335707e-06, "loss": 1.00696259, "memory(GiB)": 141.16, "step": 63840, "train_speed(iter/s)": 0.292703 }, { "acc": 0.73235083, "epoch": 0.7142707344455831, "grad_norm": 5.78125, "learning_rate": 7.635196280131363e-06, "loss": 1.0940753, "memory(GiB)": 141.16, "step": 63860, "train_speed(iter/s)": 0.292732 }, { "acc": 0.73761826, "epoch": 0.7144944333915416, "grad_norm": 8.0, "learning_rate": 7.63362438637314e-06, "loss": 1.05170135, "memory(GiB)": 141.16, "step": 63880, "train_speed(iter/s)": 0.292763 }, { "acc": 0.72659569, "epoch": 0.7147181323375001, "grad_norm": 7.8125, "learning_rate": 7.632052132297459e-06, "loss": 1.09550085, "memory(GiB)": 141.16, "step": 63900, "train_speed(iter/s)": 0.292794 }, { "acc": 0.73759546, "epoch": 0.7149418312834587, "grad_norm": 7.875, "learning_rate": 7.630479518119425e-06, "loss": 1.05206852, "memory(GiB)": 141.16, "step": 63920, "train_speed(iter/s)": 0.292825 }, { "acc": 0.72484417, "epoch": 0.7151655302294172, "grad_norm": 8.0625, "learning_rate": 7.628906544054196e-06, "loss": 1.110987, "memory(GiB)": 141.16, "step": 63940, "train_speed(iter/s)": 0.292857 }, { "acc": 0.72352514, "epoch": 0.7153892291753757, "grad_norm": 8.125, "learning_rate": 7.627333210316974e-06, "loss": 1.11676807, "memory(GiB)": 141.16, "step": 63960, "train_speed(iter/s)": 0.292888 }, { "acc": 0.72550912, "epoch": 0.7156129281213343, "grad_norm": 6.78125, "learning_rate": 7.625759517123016e-06, "loss": 1.11976528, "memory(GiB)": 141.16, "step": 63980, "train_speed(iter/s)": 0.292919 }, { "acc": 0.73771777, "epoch": 0.7158366270672928, "grad_norm": 6.78125, "learning_rate": 7.624185464687626e-06, "loss": 1.05200958, "memory(GiB)": 141.16, "step": 64000, "train_speed(iter/s)": 0.292948 }, { "epoch": 0.7158366270672928, "eval_acc": 0.6896308725626255, "eval_loss": 1.081082820892334, "eval_runtime": 2320.7355, "eval_samples_per_second": 32.439, "eval_steps_per_second": 16.22, "step": 64000 }, { "acc": 0.74638195, "epoch": 0.7160603260132513, "grad_norm": 6.375, "learning_rate": 7.622611053226157e-06, "loss": 1.00033398, "memory(GiB)": 141.16, "step": 64020, "train_speed(iter/s)": 0.289836 }, { "acc": 0.72532349, "epoch": 0.7162840249592098, "grad_norm": 7.25, "learning_rate": 7.621036282954008e-06, "loss": 1.1134697, "memory(GiB)": 141.16, "step": 64040, "train_speed(iter/s)": 0.289869 }, { "acc": 0.72216706, "epoch": 0.7165077239051684, "grad_norm": 5.75, "learning_rate": 7.619461154086633e-06, "loss": 1.13300743, "memory(GiB)": 141.16, "step": 64060, "train_speed(iter/s)": 0.289896 }, { "acc": 0.73640199, "epoch": 0.7167314228511269, "grad_norm": 8.25, "learning_rate": 7.617885666839531e-06, "loss": 1.0449626, "memory(GiB)": 141.16, "step": 64080, "train_speed(iter/s)": 0.289926 }, { "acc": 0.73924513, "epoch": 0.7169551217970855, "grad_norm": 7.03125, "learning_rate": 7.616309821428254e-06, "loss": 1.04320383, "memory(GiB)": 141.16, "step": 64100, "train_speed(iter/s)": 0.28996 }, { "acc": 0.71778936, "epoch": 0.7171788207430441, "grad_norm": 7.625, "learning_rate": 7.614733618068395e-06, "loss": 1.1485178, "memory(GiB)": 141.16, "step": 64120, "train_speed(iter/s)": 0.289993 }, { "acc": 0.72380085, "epoch": 0.7174025196890026, "grad_norm": 7.84375, "learning_rate": 7.613157056975604e-06, "loss": 1.09707756, "memory(GiB)": 141.16, "step": 64140, "train_speed(iter/s)": 0.290019 }, { "acc": 0.74060001, "epoch": 0.7176262186349611, "grad_norm": 8.25, "learning_rate": 7.6115801383655776e-06, "loss": 1.01729965, "memory(GiB)": 141.16, "step": 64160, "train_speed(iter/s)": 0.290046 }, { "acc": 0.73217621, "epoch": 0.7178499175809196, "grad_norm": 7.5, "learning_rate": 7.610002862454063e-06, "loss": 1.07998772, "memory(GiB)": 141.16, "step": 64180, "train_speed(iter/s)": 0.290077 }, { "acc": 0.73804164, "epoch": 0.7180736165268782, "grad_norm": 10.0, "learning_rate": 7.608425229456847e-06, "loss": 1.04941998, "memory(GiB)": 141.16, "step": 64200, "train_speed(iter/s)": 0.290108 }, { "acc": 0.73139811, "epoch": 0.7182973154728367, "grad_norm": 6.9375, "learning_rate": 7.606847239589779e-06, "loss": 1.06592655, "memory(GiB)": 141.16, "step": 64220, "train_speed(iter/s)": 0.290137 }, { "acc": 0.72736244, "epoch": 0.7185210144187952, "grad_norm": 5.96875, "learning_rate": 7.605268893068748e-06, "loss": 1.09281025, "memory(GiB)": 141.16, "step": 64240, "train_speed(iter/s)": 0.290168 }, { "acc": 0.74590521, "epoch": 0.7187447133647538, "grad_norm": 7.875, "learning_rate": 7.603690190109694e-06, "loss": 1.01603165, "memory(GiB)": 141.16, "step": 64260, "train_speed(iter/s)": 0.2902 }, { "acc": 0.73738618, "epoch": 0.7189684123107123, "grad_norm": 7.59375, "learning_rate": 7.602111130928606e-06, "loss": 1.05209856, "memory(GiB)": 141.16, "step": 64280, "train_speed(iter/s)": 0.290231 }, { "acc": 0.73110423, "epoch": 0.7191921112566708, "grad_norm": 9.3125, "learning_rate": 7.600531715741523e-06, "loss": 1.09704714, "memory(GiB)": 141.16, "step": 64300, "train_speed(iter/s)": 0.290259 }, { "acc": 0.72749462, "epoch": 0.7194158102026293, "grad_norm": 7.03125, "learning_rate": 7.5989519447645325e-06, "loss": 1.10162315, "memory(GiB)": 141.16, "step": 64320, "train_speed(iter/s)": 0.290288 }, { "acc": 0.7358407, "epoch": 0.7196395091485879, "grad_norm": 8.25, "learning_rate": 7.597371818213768e-06, "loss": 1.05243425, "memory(GiB)": 141.16, "step": 64340, "train_speed(iter/s)": 0.290319 }, { "acc": 0.72968121, "epoch": 0.7198632080945464, "grad_norm": 7.15625, "learning_rate": 7.595791336305411e-06, "loss": 1.08436012, "memory(GiB)": 141.16, "step": 64360, "train_speed(iter/s)": 0.290347 }, { "acc": 0.71522217, "epoch": 0.7200869070405049, "grad_norm": 10.875, "learning_rate": 7.5942104992557e-06, "loss": 1.14200592, "memory(GiB)": 141.16, "step": 64380, "train_speed(iter/s)": 0.290376 }, { "acc": 0.73426495, "epoch": 0.7203106059864635, "grad_norm": 8.0625, "learning_rate": 7.592629307280912e-06, "loss": 1.05565643, "memory(GiB)": 141.16, "step": 64400, "train_speed(iter/s)": 0.290407 }, { "acc": 0.71577597, "epoch": 0.720534304932422, "grad_norm": 7.21875, "learning_rate": 7.591047760597378e-06, "loss": 1.13465843, "memory(GiB)": 141.16, "step": 64420, "train_speed(iter/s)": 0.290435 }, { "acc": 0.72570395, "epoch": 0.7207580038783805, "grad_norm": 7.71875, "learning_rate": 7.589465859421474e-06, "loss": 1.10431061, "memory(GiB)": 141.16, "step": 64440, "train_speed(iter/s)": 0.290465 }, { "acc": 0.73747759, "epoch": 0.720981702824339, "grad_norm": 8.625, "learning_rate": 7.5878836039696305e-06, "loss": 1.04918594, "memory(GiB)": 141.16, "step": 64460, "train_speed(iter/s)": 0.290496 }, { "acc": 0.72493401, "epoch": 0.7212054017702976, "grad_norm": 7.5625, "learning_rate": 7.586300994458319e-06, "loss": 1.09342165, "memory(GiB)": 141.16, "step": 64480, "train_speed(iter/s)": 0.290527 }, { "acc": 0.72199402, "epoch": 0.7214291007162561, "grad_norm": 5.5, "learning_rate": 7.584718031104065e-06, "loss": 1.11216373, "memory(GiB)": 141.16, "step": 64500, "train_speed(iter/s)": 0.290558 }, { "acc": 0.72086349, "epoch": 0.7216527996622146, "grad_norm": 7.6875, "learning_rate": 7.583134714123441e-06, "loss": 1.12038364, "memory(GiB)": 141.16, "step": 64520, "train_speed(iter/s)": 0.290586 }, { "acc": 0.73555212, "epoch": 0.7218764986081732, "grad_norm": 6.03125, "learning_rate": 7.581551043733066e-06, "loss": 1.04573088, "memory(GiB)": 141.16, "step": 64540, "train_speed(iter/s)": 0.29061 }, { "acc": 0.73695917, "epoch": 0.7221001975541317, "grad_norm": 5.53125, "learning_rate": 7.5799670201496085e-06, "loss": 1.04993439, "memory(GiB)": 141.16, "step": 64560, "train_speed(iter/s)": 0.29064 }, { "acc": 0.727211, "epoch": 0.7223238965000902, "grad_norm": 7.3125, "learning_rate": 7.578382643589788e-06, "loss": 1.08992062, "memory(GiB)": 141.16, "step": 64580, "train_speed(iter/s)": 0.29067 }, { "acc": 0.72328157, "epoch": 0.7225475954460487, "grad_norm": 5.40625, "learning_rate": 7.576797914270368e-06, "loss": 1.10793695, "memory(GiB)": 141.16, "step": 64600, "train_speed(iter/s)": 0.290701 }, { "acc": 0.73745127, "epoch": 0.7227712943920073, "grad_norm": 8.5625, "learning_rate": 7.575212832408162e-06, "loss": 1.03733063, "memory(GiB)": 141.16, "step": 64620, "train_speed(iter/s)": 0.290733 }, { "acc": 0.73048177, "epoch": 0.7229949933379658, "grad_norm": 7.34375, "learning_rate": 7.5736273982200315e-06, "loss": 1.08868752, "memory(GiB)": 141.16, "step": 64640, "train_speed(iter/s)": 0.290764 }, { "acc": 0.73029475, "epoch": 0.7232186922839243, "grad_norm": 7.6875, "learning_rate": 7.572041611922889e-06, "loss": 1.08040714, "memory(GiB)": 141.16, "step": 64660, "train_speed(iter/s)": 0.290795 }, { "acc": 0.73527794, "epoch": 0.7234423912298829, "grad_norm": 9.375, "learning_rate": 7.57045547373369e-06, "loss": 1.05114574, "memory(GiB)": 141.16, "step": 64680, "train_speed(iter/s)": 0.290825 }, { "acc": 0.73762012, "epoch": 0.7236660901758414, "grad_norm": 7.125, "learning_rate": 7.5688689838694415e-06, "loss": 1.0367033, "memory(GiB)": 141.16, "step": 64700, "train_speed(iter/s)": 0.290858 }, { "acc": 0.72718801, "epoch": 0.7238897891217999, "grad_norm": 7.1875, "learning_rate": 7.5672821425471996e-06, "loss": 1.09566832, "memory(GiB)": 141.16, "step": 64720, "train_speed(iter/s)": 0.290889 }, { "acc": 0.72370501, "epoch": 0.7241134880677584, "grad_norm": 5.5625, "learning_rate": 7.5656949499840656e-06, "loss": 1.09782286, "memory(GiB)": 141.16, "step": 64740, "train_speed(iter/s)": 0.290919 }, { "acc": 0.73871908, "epoch": 0.724337187013717, "grad_norm": 6.75, "learning_rate": 7.56410740639719e-06, "loss": 1.05259762, "memory(GiB)": 141.16, "step": 64760, "train_speed(iter/s)": 0.290951 }, { "acc": 0.73478498, "epoch": 0.7245608859596755, "grad_norm": 5.5, "learning_rate": 7.562519512003771e-06, "loss": 1.05063267, "memory(GiB)": 141.16, "step": 64780, "train_speed(iter/s)": 0.290982 }, { "acc": 0.74046154, "epoch": 0.724784584905634, "grad_norm": 6.4375, "learning_rate": 7.560931267021056e-06, "loss": 1.03331909, "memory(GiB)": 141.16, "step": 64800, "train_speed(iter/s)": 0.291007 }, { "acc": 0.72160702, "epoch": 0.7250082838515925, "grad_norm": 8.4375, "learning_rate": 7.55934267166634e-06, "loss": 1.10999231, "memory(GiB)": 141.16, "step": 64820, "train_speed(iter/s)": 0.291035 }, { "acc": 0.74105663, "epoch": 0.7252319827975511, "grad_norm": 6.75, "learning_rate": 7.557753726156965e-06, "loss": 1.04463158, "memory(GiB)": 141.16, "step": 64840, "train_speed(iter/s)": 0.291066 }, { "acc": 0.71997318, "epoch": 0.7254556817435096, "grad_norm": 7.84375, "learning_rate": 7.556164430710322e-06, "loss": 1.1112833, "memory(GiB)": 141.16, "step": 64860, "train_speed(iter/s)": 0.291094 }, { "acc": 0.7390605, "epoch": 0.7256793806894681, "grad_norm": 7.125, "learning_rate": 7.554574785543848e-06, "loss": 1.04842262, "memory(GiB)": 141.16, "step": 64880, "train_speed(iter/s)": 0.291124 }, { "acc": 0.74341917, "epoch": 0.7259030796354267, "grad_norm": 6.5, "learning_rate": 7.5529847908750295e-06, "loss": 1.02576504, "memory(GiB)": 141.16, "step": 64900, "train_speed(iter/s)": 0.291156 }, { "acc": 0.71737866, "epoch": 0.7261267785813852, "grad_norm": 7.5, "learning_rate": 7.551394446921403e-06, "loss": 1.1503005, "memory(GiB)": 141.16, "step": 64920, "train_speed(iter/s)": 0.291183 }, { "acc": 0.73616171, "epoch": 0.7263504775273437, "grad_norm": 7.84375, "learning_rate": 7.5498037539005464e-06, "loss": 1.05751019, "memory(GiB)": 141.16, "step": 64940, "train_speed(iter/s)": 0.291212 }, { "acc": 0.72452412, "epoch": 0.7265741764733022, "grad_norm": 9.3125, "learning_rate": 7.548212712030092e-06, "loss": 1.10844927, "memory(GiB)": 141.16, "step": 64960, "train_speed(iter/s)": 0.291243 }, { "acc": 0.72051229, "epoch": 0.7267978754192608, "grad_norm": 7.84375, "learning_rate": 7.546621321527716e-06, "loss": 1.1309886, "memory(GiB)": 141.16, "step": 64980, "train_speed(iter/s)": 0.291271 }, { "acc": 0.72298441, "epoch": 0.7270215743652193, "grad_norm": 8.5625, "learning_rate": 7.545029582611144e-06, "loss": 1.10915756, "memory(GiB)": 141.16, "step": 65000, "train_speed(iter/s)": 0.291301 }, { "acc": 0.73678098, "epoch": 0.7272452733111778, "grad_norm": 8.1875, "learning_rate": 7.543437495498148e-06, "loss": 1.06983328, "memory(GiB)": 141.16, "step": 65020, "train_speed(iter/s)": 0.291329 }, { "acc": 0.72207832, "epoch": 0.7274689722571364, "grad_norm": 5.4375, "learning_rate": 7.5418450604065495e-06, "loss": 1.11253357, "memory(GiB)": 141.16, "step": 65040, "train_speed(iter/s)": 0.291354 }, { "acc": 0.72832503, "epoch": 0.7276926712030949, "grad_norm": 7.34375, "learning_rate": 7.5402522775542145e-06, "loss": 1.09486141, "memory(GiB)": 141.16, "step": 65060, "train_speed(iter/s)": 0.291387 }, { "acc": 0.73170199, "epoch": 0.7279163701490534, "grad_norm": 8.3125, "learning_rate": 7.53865914715906e-06, "loss": 1.08474331, "memory(GiB)": 141.16, "step": 65080, "train_speed(iter/s)": 0.291417 }, { "acc": 0.72091513, "epoch": 0.7281400690950119, "grad_norm": 5.65625, "learning_rate": 7.537065669439046e-06, "loss": 1.12764578, "memory(GiB)": 141.16, "step": 65100, "train_speed(iter/s)": 0.291448 }, { "acc": 0.7188426, "epoch": 0.7283637680409705, "grad_norm": 7.78125, "learning_rate": 7.535471844612188e-06, "loss": 1.1354353, "memory(GiB)": 141.16, "step": 65120, "train_speed(iter/s)": 0.291476 }, { "acc": 0.73433785, "epoch": 0.728587466986929, "grad_norm": 7.53125, "learning_rate": 7.5338776728965415e-06, "loss": 1.06715517, "memory(GiB)": 141.16, "step": 65140, "train_speed(iter/s)": 0.291507 }, { "acc": 0.72195826, "epoch": 0.7288111659328875, "grad_norm": 5.5625, "learning_rate": 7.532283154510209e-06, "loss": 1.10972118, "memory(GiB)": 141.16, "step": 65160, "train_speed(iter/s)": 0.291542 }, { "acc": 0.72426529, "epoch": 0.7290348648788461, "grad_norm": 7.625, "learning_rate": 7.530688289671348e-06, "loss": 1.11130466, "memory(GiB)": 141.16, "step": 65180, "train_speed(iter/s)": 0.291571 }, { "acc": 0.73219347, "epoch": 0.7292585638248046, "grad_norm": 5.9375, "learning_rate": 7.529093078598158e-06, "loss": 1.08439093, "memory(GiB)": 141.16, "step": 65200, "train_speed(iter/s)": 0.291601 }, { "acc": 0.7249136, "epoch": 0.7294822627707631, "grad_norm": 8.25, "learning_rate": 7.527497521508885e-06, "loss": 1.10062904, "memory(GiB)": 141.16, "step": 65220, "train_speed(iter/s)": 0.291627 }, { "acc": 0.73853416, "epoch": 0.7297059617167216, "grad_norm": 6.875, "learning_rate": 7.5259016186218255e-06, "loss": 1.0341939, "memory(GiB)": 141.16, "step": 65240, "train_speed(iter/s)": 0.291659 }, { "acc": 0.72451344, "epoch": 0.7299296606626802, "grad_norm": 5.40625, "learning_rate": 7.52430537015532e-06, "loss": 1.10792217, "memory(GiB)": 141.16, "step": 65260, "train_speed(iter/s)": 0.291691 }, { "acc": 0.71650591, "epoch": 0.7301533596086387, "grad_norm": 8.1875, "learning_rate": 7.522708776327761e-06, "loss": 1.14575768, "memory(GiB)": 141.16, "step": 65280, "train_speed(iter/s)": 0.291718 }, { "acc": 0.74297762, "epoch": 0.7303770585545972, "grad_norm": 5.90625, "learning_rate": 7.521111837357582e-06, "loss": 1.02680607, "memory(GiB)": 141.16, "step": 65300, "train_speed(iter/s)": 0.291746 }, { "acc": 0.72847915, "epoch": 0.7306007575005558, "grad_norm": 6.375, "learning_rate": 7.519514553463267e-06, "loss": 1.0932601, "memory(GiB)": 141.16, "step": 65320, "train_speed(iter/s)": 0.291777 }, { "acc": 0.73669658, "epoch": 0.7308244564465143, "grad_norm": 8.4375, "learning_rate": 7.517916924863353e-06, "loss": 1.0651495, "memory(GiB)": 141.16, "step": 65340, "train_speed(iter/s)": 0.291809 }, { "acc": 0.74592848, "epoch": 0.7310481553924728, "grad_norm": 8.625, "learning_rate": 7.5163189517764134e-06, "loss": 1.01211796, "memory(GiB)": 141.16, "step": 65360, "train_speed(iter/s)": 0.291838 }, { "acc": 0.71529684, "epoch": 0.7312718543384313, "grad_norm": 8.125, "learning_rate": 7.514720634421073e-06, "loss": 1.14313049, "memory(GiB)": 141.16, "step": 65380, "train_speed(iter/s)": 0.291866 }, { "acc": 0.72111115, "epoch": 0.7314955532843899, "grad_norm": 6.15625, "learning_rate": 7.5131219730160065e-06, "loss": 1.12676077, "memory(GiB)": 141.16, "step": 65400, "train_speed(iter/s)": 0.291895 }, { "acc": 0.73648415, "epoch": 0.7317192522303484, "grad_norm": 7.0625, "learning_rate": 7.511522967779934e-06, "loss": 1.06987972, "memory(GiB)": 141.16, "step": 65420, "train_speed(iter/s)": 0.291922 }, { "acc": 0.74218569, "epoch": 0.7319429511763069, "grad_norm": 7.21875, "learning_rate": 7.509923618931621e-06, "loss": 1.02756557, "memory(GiB)": 141.16, "step": 65440, "train_speed(iter/s)": 0.291949 }, { "acc": 0.73981385, "epoch": 0.7321666501222654, "grad_norm": 6.84375, "learning_rate": 7.5083239266898824e-06, "loss": 1.04771194, "memory(GiB)": 141.16, "step": 65460, "train_speed(iter/s)": 0.291982 }, { "acc": 0.73301759, "epoch": 0.732390349068224, "grad_norm": 7.0625, "learning_rate": 7.506723891273577e-06, "loss": 1.06243801, "memory(GiB)": 141.16, "step": 65480, "train_speed(iter/s)": 0.292012 }, { "acc": 0.73550625, "epoch": 0.7326140480141825, "grad_norm": 7.53125, "learning_rate": 7.505123512901615e-06, "loss": 1.05161018, "memory(GiB)": 141.16, "step": 65500, "train_speed(iter/s)": 0.292041 }, { "acc": 0.74012337, "epoch": 0.732837746960141, "grad_norm": 8.0625, "learning_rate": 7.5035227917929495e-06, "loss": 1.04513206, "memory(GiB)": 141.16, "step": 65520, "train_speed(iter/s)": 0.29207 }, { "acc": 0.7303977, "epoch": 0.7330614459060996, "grad_norm": 5.625, "learning_rate": 7.501921728166584e-06, "loss": 1.09243336, "memory(GiB)": 141.16, "step": 65540, "train_speed(iter/s)": 0.292101 }, { "acc": 0.73035111, "epoch": 0.7332851448520581, "grad_norm": 7.4375, "learning_rate": 7.500320322241564e-06, "loss": 1.06277981, "memory(GiB)": 141.16, "step": 65560, "train_speed(iter/s)": 0.292128 }, { "acc": 0.7440403, "epoch": 0.7335088437980166, "grad_norm": 5.09375, "learning_rate": 7.498718574236986e-06, "loss": 1.028444, "memory(GiB)": 141.16, "step": 65580, "train_speed(iter/s)": 0.292157 }, { "acc": 0.72703924, "epoch": 0.7337325427439751, "grad_norm": 8.375, "learning_rate": 7.497116484371992e-06, "loss": 1.10989723, "memory(GiB)": 141.16, "step": 65600, "train_speed(iter/s)": 0.292185 }, { "acc": 0.72645364, "epoch": 0.7339562416899337, "grad_norm": 5.71875, "learning_rate": 7.495514052865772e-06, "loss": 1.09810982, "memory(GiB)": 141.16, "step": 65620, "train_speed(iter/s)": 0.292216 }, { "acc": 0.73765812, "epoch": 0.7341799406358922, "grad_norm": 7.90625, "learning_rate": 7.49391127993756e-06, "loss": 1.05098801, "memory(GiB)": 141.16, "step": 65640, "train_speed(iter/s)": 0.292246 }, { "acc": 0.7205843, "epoch": 0.7344036395818507, "grad_norm": 6.90625, "learning_rate": 7.492308165806639e-06, "loss": 1.13732748, "memory(GiB)": 141.16, "step": 65660, "train_speed(iter/s)": 0.292278 }, { "acc": 0.71440506, "epoch": 0.7346273385278093, "grad_norm": 5.53125, "learning_rate": 7.490704710692337e-06, "loss": 1.15280418, "memory(GiB)": 141.16, "step": 65680, "train_speed(iter/s)": 0.292301 }, { "acc": 0.73715391, "epoch": 0.7348510374737678, "grad_norm": 6.0, "learning_rate": 7.4891009148140306e-06, "loss": 1.04697084, "memory(GiB)": 141.16, "step": 65700, "train_speed(iter/s)": 0.29233 }, { "acc": 0.73986149, "epoch": 0.7350747364197263, "grad_norm": 5.6875, "learning_rate": 7.487496778391141e-06, "loss": 1.02846203, "memory(GiB)": 141.16, "step": 65720, "train_speed(iter/s)": 0.29236 }, { "acc": 0.72904453, "epoch": 0.7352984353656848, "grad_norm": 9.75, "learning_rate": 7.485892301643137e-06, "loss": 1.09270706, "memory(GiB)": 141.16, "step": 65740, "train_speed(iter/s)": 0.29239 }, { "acc": 0.72883558, "epoch": 0.7355221343116434, "grad_norm": 7.09375, "learning_rate": 7.484287484789537e-06, "loss": 1.07830772, "memory(GiB)": 141.16, "step": 65760, "train_speed(iter/s)": 0.292419 }, { "acc": 0.71748548, "epoch": 0.7357458332576019, "grad_norm": 6.21875, "learning_rate": 7.482682328049899e-06, "loss": 1.13842297, "memory(GiB)": 141.16, "step": 65780, "train_speed(iter/s)": 0.292448 }, { "acc": 0.73651772, "epoch": 0.7359695322035604, "grad_norm": 7.0625, "learning_rate": 7.481076831643832e-06, "loss": 1.05367584, "memory(GiB)": 141.16, "step": 65800, "train_speed(iter/s)": 0.292477 }, { "acc": 0.7230649, "epoch": 0.736193231149519, "grad_norm": 7.5, "learning_rate": 7.4794709957909925e-06, "loss": 1.1077837, "memory(GiB)": 141.16, "step": 65820, "train_speed(iter/s)": 0.292506 }, { "acc": 0.74647903, "epoch": 0.7364169300954775, "grad_norm": 8.0625, "learning_rate": 7.477864820711081e-06, "loss": 1.01336575, "memory(GiB)": 141.16, "step": 65840, "train_speed(iter/s)": 0.292535 }, { "acc": 0.72575254, "epoch": 0.736640629041436, "grad_norm": 7.46875, "learning_rate": 7.476258306623846e-06, "loss": 1.11371708, "memory(GiB)": 141.16, "step": 65860, "train_speed(iter/s)": 0.292565 }, { "acc": 0.71785011, "epoch": 0.7368643279873945, "grad_norm": 7.8125, "learning_rate": 7.47465145374908e-06, "loss": 1.14211292, "memory(GiB)": 141.16, "step": 65880, "train_speed(iter/s)": 0.292595 }, { "acc": 0.72336669, "epoch": 0.7370880269333531, "grad_norm": 6.96875, "learning_rate": 7.4730442623066235e-06, "loss": 1.12248755, "memory(GiB)": 141.16, "step": 65900, "train_speed(iter/s)": 0.292627 }, { "acc": 0.73277988, "epoch": 0.7373117258793116, "grad_norm": 9.1875, "learning_rate": 7.471436732516364e-06, "loss": 1.06394796, "memory(GiB)": 141.16, "step": 65920, "train_speed(iter/s)": 0.29265 }, { "acc": 0.73300304, "epoch": 0.7375354248252701, "grad_norm": 9.1875, "learning_rate": 7.469828864598236e-06, "loss": 1.06952591, "memory(GiB)": 141.16, "step": 65940, "train_speed(iter/s)": 0.292681 }, { "acc": 0.72310324, "epoch": 0.7377591237712287, "grad_norm": 7.90625, "learning_rate": 7.468220658772216e-06, "loss": 1.12325706, "memory(GiB)": 141.16, "step": 65960, "train_speed(iter/s)": 0.29271 }, { "acc": 0.73155775, "epoch": 0.7379828227171872, "grad_norm": 7.6875, "learning_rate": 7.466612115258331e-06, "loss": 1.08323479, "memory(GiB)": 141.16, "step": 65980, "train_speed(iter/s)": 0.292739 }, { "acc": 0.73363581, "epoch": 0.7382065216631457, "grad_norm": 7.875, "learning_rate": 7.465003234276655e-06, "loss": 1.07048378, "memory(GiB)": 141.16, "step": 66000, "train_speed(iter/s)": 0.292769 }, { "epoch": 0.7382065216631457, "eval_acc": 0.6896798734377397, "eval_loss": 1.0808119773864746, "eval_runtime": 2318.1711, "eval_samples_per_second": 32.475, "eval_steps_per_second": 16.238, "step": 66000 }, { "acc": 0.73242664, "epoch": 0.7384302206091042, "grad_norm": 6.625, "learning_rate": 7.463394016047301e-06, "loss": 1.0543623, "memory(GiB)": 141.16, "step": 66020, "train_speed(iter/s)": 0.289756 }, { "acc": 0.71933765, "epoch": 0.7386539195550628, "grad_norm": 7.71875, "learning_rate": 7.461784460790435e-06, "loss": 1.11966209, "memory(GiB)": 141.16, "step": 66040, "train_speed(iter/s)": 0.289785 }, { "acc": 0.7104414, "epoch": 0.7388776185010213, "grad_norm": 7.59375, "learning_rate": 7.460174568726269e-06, "loss": 1.18445053, "memory(GiB)": 141.16, "step": 66060, "train_speed(iter/s)": 0.289815 }, { "acc": 0.7359024, "epoch": 0.7391013174469798, "grad_norm": 5.4375, "learning_rate": 7.458564340075057e-06, "loss": 1.04039936, "memory(GiB)": 141.16, "step": 66080, "train_speed(iter/s)": 0.289846 }, { "acc": 0.74243178, "epoch": 0.7393250163929384, "grad_norm": 9.25, "learning_rate": 7.456953775057105e-06, "loss": 1.03778734, "memory(GiB)": 141.16, "step": 66100, "train_speed(iter/s)": 0.289875 }, { "acc": 0.73237438, "epoch": 0.7395487153388969, "grad_norm": 5.5, "learning_rate": 7.455342873892756e-06, "loss": 1.07622948, "memory(GiB)": 141.16, "step": 66120, "train_speed(iter/s)": 0.289906 }, { "acc": 0.74227905, "epoch": 0.7397724142848554, "grad_norm": 7.78125, "learning_rate": 7.453731636802408e-06, "loss": 1.04137363, "memory(GiB)": 141.16, "step": 66140, "train_speed(iter/s)": 0.289934 }, { "acc": 0.72243471, "epoch": 0.7399961132308139, "grad_norm": 10.1875, "learning_rate": 7.452120064006499e-06, "loss": 1.12903004, "memory(GiB)": 141.16, "step": 66160, "train_speed(iter/s)": 0.289961 }, { "acc": 0.73142686, "epoch": 0.7402198121767725, "grad_norm": 6.09375, "learning_rate": 7.450508155725518e-06, "loss": 1.0694561, "memory(GiB)": 141.16, "step": 66180, "train_speed(iter/s)": 0.28999 }, { "acc": 0.73307323, "epoch": 0.740443511122731, "grad_norm": 6.625, "learning_rate": 7.448895912179994e-06, "loss": 1.05797482, "memory(GiB)": 141.16, "step": 66200, "train_speed(iter/s)": 0.29002 }, { "acc": 0.73338261, "epoch": 0.7406672100686895, "grad_norm": 7.5625, "learning_rate": 7.447283333590507e-06, "loss": 1.05989943, "memory(GiB)": 141.16, "step": 66220, "train_speed(iter/s)": 0.290049 }, { "acc": 0.73758812, "epoch": 0.740890909014648, "grad_norm": 6.78125, "learning_rate": 7.445670420177681e-06, "loss": 1.03732586, "memory(GiB)": 141.16, "step": 66240, "train_speed(iter/s)": 0.29008 }, { "acc": 0.73321934, "epoch": 0.7411146079606066, "grad_norm": 7.1875, "learning_rate": 7.444057172162184e-06, "loss": 1.05541267, "memory(GiB)": 141.16, "step": 66260, "train_speed(iter/s)": 0.290109 }, { "acc": 0.72823458, "epoch": 0.7413383069065651, "grad_norm": 6.34375, "learning_rate": 7.4424435897647316e-06, "loss": 1.08959589, "memory(GiB)": 141.16, "step": 66280, "train_speed(iter/s)": 0.290137 }, { "acc": 0.71949568, "epoch": 0.7415620058525236, "grad_norm": 6.875, "learning_rate": 7.440829673206087e-06, "loss": 1.12038193, "memory(GiB)": 141.16, "step": 66300, "train_speed(iter/s)": 0.290168 }, { "acc": 0.74847665, "epoch": 0.7417857047984822, "grad_norm": 6.59375, "learning_rate": 7.439215422707056e-06, "loss": 0.99147797, "memory(GiB)": 141.16, "step": 66320, "train_speed(iter/s)": 0.290199 }, { "acc": 0.72873759, "epoch": 0.7420094037444407, "grad_norm": 7.625, "learning_rate": 7.437600838488488e-06, "loss": 1.08586445, "memory(GiB)": 141.16, "step": 66340, "train_speed(iter/s)": 0.290229 }, { "acc": 0.72271528, "epoch": 0.7422331026903992, "grad_norm": 5.15625, "learning_rate": 7.4359859207712855e-06, "loss": 1.11841564, "memory(GiB)": 141.16, "step": 66360, "train_speed(iter/s)": 0.290258 }, { "acc": 0.73122058, "epoch": 0.7424568016363577, "grad_norm": 7.53125, "learning_rate": 7.434370669776392e-06, "loss": 1.08432646, "memory(GiB)": 141.16, "step": 66380, "train_speed(iter/s)": 0.290289 }, { "acc": 0.7324645, "epoch": 0.7426805005823163, "grad_norm": 7.34375, "learning_rate": 7.432755085724794e-06, "loss": 1.06964607, "memory(GiB)": 141.16, "step": 66400, "train_speed(iter/s)": 0.290319 }, { "acc": 0.74037657, "epoch": 0.7429041995282748, "grad_norm": 6.53125, "learning_rate": 7.431139168837529e-06, "loss": 1.0521718, "memory(GiB)": 141.16, "step": 66420, "train_speed(iter/s)": 0.290346 }, { "acc": 0.73484011, "epoch": 0.7431278984742333, "grad_norm": 8.25, "learning_rate": 7.429522919335676e-06, "loss": 1.05222511, "memory(GiB)": 141.16, "step": 66440, "train_speed(iter/s)": 0.290377 }, { "acc": 0.72295904, "epoch": 0.7433515974201919, "grad_norm": 6.34375, "learning_rate": 7.427906337440362e-06, "loss": 1.10419979, "memory(GiB)": 141.16, "step": 66460, "train_speed(iter/s)": 0.290406 }, { "acc": 0.72054825, "epoch": 0.7435752963661504, "grad_norm": 7.0625, "learning_rate": 7.426289423372759e-06, "loss": 1.12783146, "memory(GiB)": 141.16, "step": 66480, "train_speed(iter/s)": 0.290435 }, { "acc": 0.73801289, "epoch": 0.7437989953121089, "grad_norm": 5.90625, "learning_rate": 7.424672177354084e-06, "loss": 1.04222622, "memory(GiB)": 141.16, "step": 66500, "train_speed(iter/s)": 0.290465 }, { "acc": 0.71507215, "epoch": 0.7440226942580674, "grad_norm": 8.0625, "learning_rate": 7.423054599605597e-06, "loss": 1.16769981, "memory(GiB)": 141.16, "step": 66520, "train_speed(iter/s)": 0.290497 }, { "acc": 0.72768669, "epoch": 0.744246393204026, "grad_norm": 6.15625, "learning_rate": 7.421436690348608e-06, "loss": 1.09585791, "memory(GiB)": 141.16, "step": 66540, "train_speed(iter/s)": 0.290528 }, { "acc": 0.72889395, "epoch": 0.7444700921499845, "grad_norm": 7.1875, "learning_rate": 7.419818449804469e-06, "loss": 1.08924532, "memory(GiB)": 141.16, "step": 66560, "train_speed(iter/s)": 0.29056 }, { "acc": 0.73084059, "epoch": 0.744693791095943, "grad_norm": 6.8125, "learning_rate": 7.418199878194579e-06, "loss": 1.08861809, "memory(GiB)": 141.16, "step": 66580, "train_speed(iter/s)": 0.290582 }, { "acc": 0.73415461, "epoch": 0.7449174900419017, "grad_norm": 6.78125, "learning_rate": 7.416580975740382e-06, "loss": 1.04990807, "memory(GiB)": 141.16, "step": 66600, "train_speed(iter/s)": 0.290605 }, { "acc": 0.74641662, "epoch": 0.7451411889878602, "grad_norm": 7.75, "learning_rate": 7.414961742663367e-06, "loss": 1.01759977, "memory(GiB)": 141.16, "step": 66620, "train_speed(iter/s)": 0.290632 }, { "acc": 0.73819113, "epoch": 0.7453648879338187, "grad_norm": 7.5, "learning_rate": 7.413342179185065e-06, "loss": 1.04738483, "memory(GiB)": 141.16, "step": 66640, "train_speed(iter/s)": 0.290659 }, { "acc": 0.73229904, "epoch": 0.7455885868797772, "grad_norm": 6.75, "learning_rate": 7.411722285527061e-06, "loss": 1.06471863, "memory(GiB)": 141.16, "step": 66660, "train_speed(iter/s)": 0.290689 }, { "acc": 0.71945801, "epoch": 0.7458122858257358, "grad_norm": 5.65625, "learning_rate": 7.4101020619109765e-06, "loss": 1.13466606, "memory(GiB)": 141.16, "step": 66680, "train_speed(iter/s)": 0.290717 }, { "acc": 0.73273563, "epoch": 0.7460359847716943, "grad_norm": 6.46875, "learning_rate": 7.4084815085584816e-06, "loss": 1.07292099, "memory(GiB)": 141.16, "step": 66700, "train_speed(iter/s)": 0.290746 }, { "acc": 0.72021747, "epoch": 0.7462596837176528, "grad_norm": 8.0, "learning_rate": 7.406860625691292e-06, "loss": 1.11820269, "memory(GiB)": 141.16, "step": 66720, "train_speed(iter/s)": 0.290776 }, { "acc": 0.74632902, "epoch": 0.7464833826636114, "grad_norm": 8.375, "learning_rate": 7.4052394135311655e-06, "loss": 1.00900707, "memory(GiB)": 141.16, "step": 66740, "train_speed(iter/s)": 0.290803 }, { "acc": 0.73010464, "epoch": 0.7467070816095699, "grad_norm": 6.34375, "learning_rate": 7.403617872299908e-06, "loss": 1.09937248, "memory(GiB)": 141.16, "step": 66760, "train_speed(iter/s)": 0.290829 }, { "acc": 0.72379589, "epoch": 0.7469307805555284, "grad_norm": 6.40625, "learning_rate": 7.4019960022193715e-06, "loss": 1.09975414, "memory(GiB)": 141.16, "step": 66780, "train_speed(iter/s)": 0.290858 }, { "acc": 0.70976872, "epoch": 0.747154479501487, "grad_norm": 9.6875, "learning_rate": 7.400373803511448e-06, "loss": 1.16467304, "memory(GiB)": 141.16, "step": 66800, "train_speed(iter/s)": 0.290886 }, { "acc": 0.73677073, "epoch": 0.7473781784474455, "grad_norm": 6.46875, "learning_rate": 7.398751276398081e-06, "loss": 1.06279364, "memory(GiB)": 141.16, "step": 66820, "train_speed(iter/s)": 0.290914 }, { "acc": 0.72907333, "epoch": 0.747601877393404, "grad_norm": 6.03125, "learning_rate": 7.397128421101252e-06, "loss": 1.08374586, "memory(GiB)": 141.16, "step": 66840, "train_speed(iter/s)": 0.290944 }, { "acc": 0.73286295, "epoch": 0.7478255763393625, "grad_norm": 6.1875, "learning_rate": 7.39550523784299e-06, "loss": 1.08191576, "memory(GiB)": 141.16, "step": 66860, "train_speed(iter/s)": 0.29097 }, { "acc": 0.7204896, "epoch": 0.7480492752853211, "grad_norm": 8.75, "learning_rate": 7.393881726845374e-06, "loss": 1.12593431, "memory(GiB)": 141.16, "step": 66880, "train_speed(iter/s)": 0.290998 }, { "acc": 0.73626227, "epoch": 0.7482729742312796, "grad_norm": 6.96875, "learning_rate": 7.392257888330522e-06, "loss": 1.04886131, "memory(GiB)": 141.16, "step": 66900, "train_speed(iter/s)": 0.291027 }, { "acc": 0.73258133, "epoch": 0.7484966731772381, "grad_norm": 6.53125, "learning_rate": 7.390633722520597e-06, "loss": 1.06655331, "memory(GiB)": 141.16, "step": 66920, "train_speed(iter/s)": 0.291056 }, { "acc": 0.73011265, "epoch": 0.7487203721231966, "grad_norm": 6.03125, "learning_rate": 7.389009229637809e-06, "loss": 1.09922886, "memory(GiB)": 141.16, "step": 66940, "train_speed(iter/s)": 0.291087 }, { "acc": 0.73045979, "epoch": 0.7489440710691552, "grad_norm": 7.0625, "learning_rate": 7.387384409904411e-06, "loss": 1.0840661, "memory(GiB)": 141.16, "step": 66960, "train_speed(iter/s)": 0.291117 }, { "acc": 0.73671913, "epoch": 0.7491677700151137, "grad_norm": 8.125, "learning_rate": 7.385759263542702e-06, "loss": 1.04188309, "memory(GiB)": 141.16, "step": 66980, "train_speed(iter/s)": 0.291149 }, { "acc": 0.73028955, "epoch": 0.7493914689610722, "grad_norm": 8.5, "learning_rate": 7.384133790775025e-06, "loss": 1.07224836, "memory(GiB)": 141.16, "step": 67000, "train_speed(iter/s)": 0.291177 }, { "acc": 0.73386717, "epoch": 0.7496151679070308, "grad_norm": 6.84375, "learning_rate": 7.382507991823771e-06, "loss": 1.05974636, "memory(GiB)": 141.16, "step": 67020, "train_speed(iter/s)": 0.291205 }, { "acc": 0.74070773, "epoch": 0.7498388668529893, "grad_norm": 8.0625, "learning_rate": 7.380881866911367e-06, "loss": 1.02216988, "memory(GiB)": 141.16, "step": 67040, "train_speed(iter/s)": 0.291234 }, { "acc": 0.73567314, "epoch": 0.7500625657989478, "grad_norm": 7.5625, "learning_rate": 7.379255416260294e-06, "loss": 1.05909567, "memory(GiB)": 141.16, "step": 67060, "train_speed(iter/s)": 0.291267 }, { "acc": 0.7303628, "epoch": 0.7502862647449063, "grad_norm": 8.125, "learning_rate": 7.377628640093072e-06, "loss": 1.07711926, "memory(GiB)": 141.16, "step": 67080, "train_speed(iter/s)": 0.291297 }, { "acc": 0.73188639, "epoch": 0.7505099636908649, "grad_norm": 4.84375, "learning_rate": 7.376001538632268e-06, "loss": 1.07160044, "memory(GiB)": 141.16, "step": 67100, "train_speed(iter/s)": 0.291327 }, { "acc": 0.71432714, "epoch": 0.7507336626368234, "grad_norm": 9.0625, "learning_rate": 7.374374112100493e-06, "loss": 1.16058865, "memory(GiB)": 141.16, "step": 67120, "train_speed(iter/s)": 0.291356 }, { "acc": 0.72528815, "epoch": 0.7509573615827819, "grad_norm": 6.53125, "learning_rate": 7.372746360720403e-06, "loss": 1.10821476, "memory(GiB)": 141.16, "step": 67140, "train_speed(iter/s)": 0.291385 }, { "acc": 0.73254337, "epoch": 0.7511810605287405, "grad_norm": 7.0625, "learning_rate": 7.371118284714695e-06, "loss": 1.06801968, "memory(GiB)": 141.16, "step": 67160, "train_speed(iter/s)": 0.291415 }, { "acc": 0.72461743, "epoch": 0.751404759474699, "grad_norm": 4.75, "learning_rate": 7.369489884306115e-06, "loss": 1.1030777, "memory(GiB)": 141.16, "step": 67180, "train_speed(iter/s)": 0.291445 }, { "acc": 0.72650461, "epoch": 0.7516284584206575, "grad_norm": 7.21875, "learning_rate": 7.367861159717451e-06, "loss": 1.0984724, "memory(GiB)": 141.16, "step": 67200, "train_speed(iter/s)": 0.291472 }, { "acc": 0.72657986, "epoch": 0.751852157366616, "grad_norm": 8.0625, "learning_rate": 7.366232111171535e-06, "loss": 1.10322037, "memory(GiB)": 141.16, "step": 67220, "train_speed(iter/s)": 0.291498 }, { "acc": 0.72666702, "epoch": 0.7520758563125746, "grad_norm": 7.09375, "learning_rate": 7.3646027388912465e-06, "loss": 1.10444546, "memory(GiB)": 141.16, "step": 67240, "train_speed(iter/s)": 0.291526 }, { "acc": 0.72493458, "epoch": 0.7522995552585331, "grad_norm": 6.6875, "learning_rate": 7.362973043099504e-06, "loss": 1.09889393, "memory(GiB)": 141.16, "step": 67260, "train_speed(iter/s)": 0.291553 }, { "acc": 0.73875227, "epoch": 0.7525232542044916, "grad_norm": 6.3125, "learning_rate": 7.3613430240192754e-06, "loss": 1.04235668, "memory(GiB)": 141.16, "step": 67280, "train_speed(iter/s)": 0.291583 }, { "acc": 0.73051805, "epoch": 0.7527469531504501, "grad_norm": 7.84375, "learning_rate": 7.3597126818735686e-06, "loss": 1.08353748, "memory(GiB)": 141.16, "step": 67300, "train_speed(iter/s)": 0.291614 }, { "acc": 0.72956295, "epoch": 0.7529706520964087, "grad_norm": 5.71875, "learning_rate": 7.35808201688544e-06, "loss": 1.10002966, "memory(GiB)": 141.16, "step": 67320, "train_speed(iter/s)": 0.291638 }, { "acc": 0.72449093, "epoch": 0.7531943510423672, "grad_norm": 5.96875, "learning_rate": 7.356451029277987e-06, "loss": 1.12555847, "memory(GiB)": 141.16, "step": 67340, "train_speed(iter/s)": 0.291667 }, { "acc": 0.7400219, "epoch": 0.7534180499883257, "grad_norm": 7.53125, "learning_rate": 7.354819719274351e-06, "loss": 1.02651758, "memory(GiB)": 141.16, "step": 67360, "train_speed(iter/s)": 0.291692 }, { "acc": 0.73515611, "epoch": 0.7536417489342843, "grad_norm": 8.4375, "learning_rate": 7.353188087097719e-06, "loss": 1.06256123, "memory(GiB)": 141.16, "step": 67380, "train_speed(iter/s)": 0.29172 }, { "acc": 0.74471121, "epoch": 0.7538654478802428, "grad_norm": 8.9375, "learning_rate": 7.351556132971323e-06, "loss": 1.00929642, "memory(GiB)": 141.16, "step": 67400, "train_speed(iter/s)": 0.291751 }, { "acc": 0.73625216, "epoch": 0.7540891468262013, "grad_norm": 7.6875, "learning_rate": 7.349923857118435e-06, "loss": 1.0590313, "memory(GiB)": 141.16, "step": 67420, "train_speed(iter/s)": 0.291775 }, { "acc": 0.72023854, "epoch": 0.7543128457721598, "grad_norm": 5.96875, "learning_rate": 7.348291259762376e-06, "loss": 1.13570518, "memory(GiB)": 141.16, "step": 67440, "train_speed(iter/s)": 0.291805 }, { "acc": 0.73462076, "epoch": 0.7545365447181184, "grad_norm": 6.0, "learning_rate": 7.346658341126508e-06, "loss": 1.06653776, "memory(GiB)": 141.16, "step": 67460, "train_speed(iter/s)": 0.291833 }, { "acc": 0.72461181, "epoch": 0.7547602436640769, "grad_norm": 7.75, "learning_rate": 7.345025101434238e-06, "loss": 1.1074111, "memory(GiB)": 141.16, "step": 67480, "train_speed(iter/s)": 0.291859 }, { "acc": 0.71616745, "epoch": 0.7549839426100354, "grad_norm": 6.78125, "learning_rate": 7.343391540909014e-06, "loss": 1.14190426, "memory(GiB)": 141.16, "step": 67500, "train_speed(iter/s)": 0.291889 }, { "acc": 0.72072191, "epoch": 0.755207641555994, "grad_norm": 6.8125, "learning_rate": 7.341757659774333e-06, "loss": 1.11876469, "memory(GiB)": 141.16, "step": 67520, "train_speed(iter/s)": 0.291918 }, { "acc": 0.72406926, "epoch": 0.7554313405019525, "grad_norm": 6.78125, "learning_rate": 7.340123458253735e-06, "loss": 1.1085825, "memory(GiB)": 141.16, "step": 67540, "train_speed(iter/s)": 0.291944 }, { "acc": 0.72525196, "epoch": 0.755655039447911, "grad_norm": 6.03125, "learning_rate": 7.3384889365707975e-06, "loss": 1.10465355, "memory(GiB)": 141.16, "step": 67560, "train_speed(iter/s)": 0.291973 }, { "acc": 0.72701349, "epoch": 0.7558787383938695, "grad_norm": 5.9375, "learning_rate": 7.336854094949149e-06, "loss": 1.09034462, "memory(GiB)": 141.16, "step": 67580, "train_speed(iter/s)": 0.292005 }, { "acc": 0.74145374, "epoch": 0.7561024373398281, "grad_norm": 5.9375, "learning_rate": 7.33521893361246e-06, "loss": 1.02815552, "memory(GiB)": 141.16, "step": 67600, "train_speed(iter/s)": 0.292036 }, { "acc": 0.72918997, "epoch": 0.7563261362857866, "grad_norm": 6.90625, "learning_rate": 7.333583452784443e-06, "loss": 1.0961381, "memory(GiB)": 141.16, "step": 67620, "train_speed(iter/s)": 0.292066 }, { "acc": 0.73691673, "epoch": 0.7565498352317451, "grad_norm": 7.25, "learning_rate": 7.331947652688854e-06, "loss": 1.05704317, "memory(GiB)": 141.16, "step": 67640, "train_speed(iter/s)": 0.292097 }, { "acc": 0.72619834, "epoch": 0.7567735341777037, "grad_norm": 4.65625, "learning_rate": 7.330311533549496e-06, "loss": 1.10328922, "memory(GiB)": 141.16, "step": 67660, "train_speed(iter/s)": 0.292126 }, { "acc": 0.72672901, "epoch": 0.7569972331236622, "grad_norm": 6.84375, "learning_rate": 7.328675095590212e-06, "loss": 1.09217005, "memory(GiB)": 141.16, "step": 67680, "train_speed(iter/s)": 0.292157 }, { "acc": 0.74053497, "epoch": 0.7572209320696207, "grad_norm": 5.65625, "learning_rate": 7.327038339034889e-06, "loss": 1.03240166, "memory(GiB)": 141.16, "step": 67700, "train_speed(iter/s)": 0.292186 }, { "acc": 0.718715, "epoch": 0.7574446310155792, "grad_norm": 6.78125, "learning_rate": 7.325401264107462e-06, "loss": 1.13840303, "memory(GiB)": 141.16, "step": 67720, "train_speed(iter/s)": 0.292219 }, { "acc": 0.73940706, "epoch": 0.7576683299615378, "grad_norm": 6.46875, "learning_rate": 7.3237638710319035e-06, "loss": 1.0369606, "memory(GiB)": 141.16, "step": 67740, "train_speed(iter/s)": 0.29225 }, { "acc": 0.73605423, "epoch": 0.7578920289074963, "grad_norm": 4.84375, "learning_rate": 7.3221261600322345e-06, "loss": 1.06534843, "memory(GiB)": 141.16, "step": 67760, "train_speed(iter/s)": 0.292282 }, { "acc": 0.73007479, "epoch": 0.7581157278534548, "grad_norm": 7.4375, "learning_rate": 7.3204881313325145e-06, "loss": 1.0653265, "memory(GiB)": 141.16, "step": 67780, "train_speed(iter/s)": 0.292315 }, { "acc": 0.73266726, "epoch": 0.7583394267994134, "grad_norm": 6.8125, "learning_rate": 7.318849785156852e-06, "loss": 1.07362976, "memory(GiB)": 141.16, "step": 67800, "train_speed(iter/s)": 0.29234 }, { "acc": 0.73381491, "epoch": 0.7585631257453719, "grad_norm": 6.1875, "learning_rate": 7.317211121729394e-06, "loss": 1.05832014, "memory(GiB)": 141.16, "step": 67820, "train_speed(iter/s)": 0.292369 }, { "acc": 0.71790533, "epoch": 0.7587868246913304, "grad_norm": 7.125, "learning_rate": 7.315572141274334e-06, "loss": 1.13988771, "memory(GiB)": 141.16, "step": 67840, "train_speed(iter/s)": 0.2924 }, { "acc": 0.72914352, "epoch": 0.7590105236372889, "grad_norm": 7.8125, "learning_rate": 7.313932844015909e-06, "loss": 1.0899271, "memory(GiB)": 141.16, "step": 67860, "train_speed(iter/s)": 0.292429 }, { "acc": 0.7240159, "epoch": 0.7592342225832475, "grad_norm": 7.65625, "learning_rate": 7.312293230178396e-06, "loss": 1.11492538, "memory(GiB)": 141.16, "step": 67880, "train_speed(iter/s)": 0.292456 }, { "acc": 0.73755112, "epoch": 0.759457921529206, "grad_norm": 7.4375, "learning_rate": 7.310653299986119e-06, "loss": 1.04689827, "memory(GiB)": 141.16, "step": 67900, "train_speed(iter/s)": 0.292485 }, { "acc": 0.72773609, "epoch": 0.7596816204751645, "grad_norm": 6.625, "learning_rate": 7.309013053663443e-06, "loss": 1.09377575, "memory(GiB)": 141.16, "step": 67920, "train_speed(iter/s)": 0.292513 }, { "acc": 0.74613314, "epoch": 0.759905319421123, "grad_norm": 8.125, "learning_rate": 7.307372491434779e-06, "loss": 1.01528654, "memory(GiB)": 141.16, "step": 67940, "train_speed(iter/s)": 0.29254 }, { "acc": 0.72269125, "epoch": 0.7601290183670816, "grad_norm": 6.40625, "learning_rate": 7.305731613524578e-06, "loss": 1.11966829, "memory(GiB)": 141.16, "step": 67960, "train_speed(iter/s)": 0.292568 }, { "acc": 0.72582254, "epoch": 0.7603527173130401, "grad_norm": 6.75, "learning_rate": 7.304090420157336e-06, "loss": 1.10466747, "memory(GiB)": 141.16, "step": 67980, "train_speed(iter/s)": 0.292597 }, { "acc": 0.73637371, "epoch": 0.7605764162589986, "grad_norm": 7.8125, "learning_rate": 7.302448911557591e-06, "loss": 1.04270182, "memory(GiB)": 141.16, "step": 68000, "train_speed(iter/s)": 0.292626 }, { "epoch": 0.7605764162589986, "eval_acc": 0.6897573677794696, "eval_loss": 1.080535888671875, "eval_runtime": 2319.7837, "eval_samples_per_second": 32.453, "eval_steps_per_second": 16.227, "step": 68000 }, { "acc": 0.73077145, "epoch": 0.7608001152049572, "grad_norm": 9.125, "learning_rate": 7.300807087949925e-06, "loss": 1.08060226, "memory(GiB)": 141.16, "step": 68020, "train_speed(iter/s)": 0.289698 }, { "acc": 0.72789297, "epoch": 0.7610238141509157, "grad_norm": 5.78125, "learning_rate": 7.299164949558963e-06, "loss": 1.08851357, "memory(GiB)": 141.16, "step": 68040, "train_speed(iter/s)": 0.289726 }, { "acc": 0.72939873, "epoch": 0.7612475130968742, "grad_norm": 7.1875, "learning_rate": 7.297522496609375e-06, "loss": 1.08660011, "memory(GiB)": 141.16, "step": 68060, "train_speed(iter/s)": 0.289756 }, { "acc": 0.7430934, "epoch": 0.7614712120428327, "grad_norm": 7.3125, "learning_rate": 7.295879729325868e-06, "loss": 1.02401562, "memory(GiB)": 141.16, "step": 68080, "train_speed(iter/s)": 0.289782 }, { "acc": 0.73440161, "epoch": 0.7616949109887913, "grad_norm": 7.75, "learning_rate": 7.294236647933201e-06, "loss": 1.06811485, "memory(GiB)": 141.16, "step": 68100, "train_speed(iter/s)": 0.289809 }, { "acc": 0.72824254, "epoch": 0.7619186099347498, "grad_norm": 7.90625, "learning_rate": 7.292593252656166e-06, "loss": 1.07180309, "memory(GiB)": 141.16, "step": 68120, "train_speed(iter/s)": 0.289839 }, { "acc": 0.73911133, "epoch": 0.7621423088807083, "grad_norm": 7.34375, "learning_rate": 7.290949543719607e-06, "loss": 1.02678738, "memory(GiB)": 141.16, "step": 68140, "train_speed(iter/s)": 0.289871 }, { "acc": 0.72148581, "epoch": 0.7623660078266669, "grad_norm": 11.1875, "learning_rate": 7.289305521348404e-06, "loss": 1.13651419, "memory(GiB)": 141.16, "step": 68160, "train_speed(iter/s)": 0.289901 }, { "acc": 0.72734008, "epoch": 0.7625897067726254, "grad_norm": 8.9375, "learning_rate": 7.287661185767485e-06, "loss": 1.10128956, "memory(GiB)": 141.16, "step": 68180, "train_speed(iter/s)": 0.289929 }, { "acc": 0.73150573, "epoch": 0.7628134057185839, "grad_norm": 6.875, "learning_rate": 7.286016537201817e-06, "loss": 1.08244667, "memory(GiB)": 141.16, "step": 68200, "train_speed(iter/s)": 0.289958 }, { "acc": 0.72157726, "epoch": 0.7630371046645424, "grad_norm": 7.0625, "learning_rate": 7.284371575876412e-06, "loss": 1.10529003, "memory(GiB)": 141.16, "step": 68220, "train_speed(iter/s)": 0.289989 }, { "acc": 0.72989178, "epoch": 0.763260803610501, "grad_norm": 7.09375, "learning_rate": 7.2827263020163245e-06, "loss": 1.09236526, "memory(GiB)": 141.16, "step": 68240, "train_speed(iter/s)": 0.290018 }, { "acc": 0.73522463, "epoch": 0.7634845025564595, "grad_norm": 7.65625, "learning_rate": 7.281080715846651e-06, "loss": 1.04698696, "memory(GiB)": 141.16, "step": 68260, "train_speed(iter/s)": 0.290048 }, { "acc": 0.7417459, "epoch": 0.763708201502418, "grad_norm": 6.9375, "learning_rate": 7.2794348175925314e-06, "loss": 1.02816811, "memory(GiB)": 141.16, "step": 68280, "train_speed(iter/s)": 0.290076 }, { "acc": 0.72659764, "epoch": 0.7639319004483766, "grad_norm": 9.125, "learning_rate": 7.277788607479148e-06, "loss": 1.11157742, "memory(GiB)": 141.16, "step": 68300, "train_speed(iter/s)": 0.2901 }, { "acc": 0.73511658, "epoch": 0.7641555993943351, "grad_norm": 7.15625, "learning_rate": 7.276142085731727e-06, "loss": 1.05943298, "memory(GiB)": 141.16, "step": 68320, "train_speed(iter/s)": 0.290128 }, { "acc": 0.72648382, "epoch": 0.7643792983402936, "grad_norm": 6.53125, "learning_rate": 7.274495252575533e-06, "loss": 1.08537369, "memory(GiB)": 141.16, "step": 68340, "train_speed(iter/s)": 0.290154 }, { "acc": 0.7254652, "epoch": 0.7646029972862521, "grad_norm": 6.84375, "learning_rate": 7.2728481082358805e-06, "loss": 1.09116087, "memory(GiB)": 141.16, "step": 68360, "train_speed(iter/s)": 0.290183 }, { "acc": 0.71767416, "epoch": 0.7648266962322107, "grad_norm": 6.78125, "learning_rate": 7.27120065293812e-06, "loss": 1.15195522, "memory(GiB)": 141.16, "step": 68380, "train_speed(iter/s)": 0.290209 }, { "acc": 0.72521248, "epoch": 0.7650503951781692, "grad_norm": 6.96875, "learning_rate": 7.269552886907647e-06, "loss": 1.11190834, "memory(GiB)": 141.16, "step": 68400, "train_speed(iter/s)": 0.290236 }, { "acc": 0.71674423, "epoch": 0.7652740941241277, "grad_norm": 6.5625, "learning_rate": 7.267904810369899e-06, "loss": 1.14315052, "memory(GiB)": 141.16, "step": 68420, "train_speed(iter/s)": 0.290266 }, { "acc": 0.727036, "epoch": 0.7654977930700863, "grad_norm": 6.28125, "learning_rate": 7.266256423550357e-06, "loss": 1.07931957, "memory(GiB)": 141.16, "step": 68440, "train_speed(iter/s)": 0.290294 }, { "acc": 0.73095284, "epoch": 0.7657214920160448, "grad_norm": 6.65625, "learning_rate": 7.264607726674544e-06, "loss": 1.09350109, "memory(GiB)": 141.16, "step": 68460, "train_speed(iter/s)": 0.29032 }, { "acc": 0.73433285, "epoch": 0.7659451909620033, "grad_norm": 6.375, "learning_rate": 7.262958719968026e-06, "loss": 1.06189899, "memory(GiB)": 141.16, "step": 68480, "train_speed(iter/s)": 0.290351 }, { "acc": 0.73651609, "epoch": 0.7661688899079618, "grad_norm": 11.0, "learning_rate": 7.2613094036564105e-06, "loss": 1.06108828, "memory(GiB)": 141.16, "step": 68500, "train_speed(iter/s)": 0.290373 }, { "acc": 0.73083944, "epoch": 0.7663925888539204, "grad_norm": 7.75, "learning_rate": 7.259659777965346e-06, "loss": 1.08683701, "memory(GiB)": 141.16, "step": 68520, "train_speed(iter/s)": 0.2904 }, { "acc": 0.72979264, "epoch": 0.7666162877998789, "grad_norm": 7.0, "learning_rate": 7.258009843120526e-06, "loss": 1.07410984, "memory(GiB)": 141.16, "step": 68540, "train_speed(iter/s)": 0.290428 }, { "acc": 0.73297911, "epoch": 0.7668399867458374, "grad_norm": 8.6875, "learning_rate": 7.256359599347684e-06, "loss": 1.06162186, "memory(GiB)": 141.16, "step": 68560, "train_speed(iter/s)": 0.290454 }, { "acc": 0.72596769, "epoch": 0.767063685691796, "grad_norm": 6.46875, "learning_rate": 7.254709046872601e-06, "loss": 1.09924889, "memory(GiB)": 141.16, "step": 68580, "train_speed(iter/s)": 0.290482 }, { "acc": 0.72166843, "epoch": 0.7672873846377545, "grad_norm": 7.78125, "learning_rate": 7.253058185921091e-06, "loss": 1.11232452, "memory(GiB)": 141.16, "step": 68600, "train_speed(iter/s)": 0.290508 }, { "acc": 0.7272294, "epoch": 0.767511083583713, "grad_norm": 8.6875, "learning_rate": 7.251407016719017e-06, "loss": 1.09253616, "memory(GiB)": 141.16, "step": 68620, "train_speed(iter/s)": 0.290536 }, { "acc": 0.7325191, "epoch": 0.7677347825296715, "grad_norm": 9.6875, "learning_rate": 7.249755539492285e-06, "loss": 1.04344254, "memory(GiB)": 141.16, "step": 68640, "train_speed(iter/s)": 0.290564 }, { "acc": 0.74419689, "epoch": 0.7679584814756301, "grad_norm": 7.6875, "learning_rate": 7.248103754466838e-06, "loss": 1.02131996, "memory(GiB)": 141.16, "step": 68660, "train_speed(iter/s)": 0.290592 }, { "acc": 0.72021632, "epoch": 0.7681821804215886, "grad_norm": 6.15625, "learning_rate": 7.246451661868664e-06, "loss": 1.11994476, "memory(GiB)": 141.16, "step": 68680, "train_speed(iter/s)": 0.290618 }, { "acc": 0.73737011, "epoch": 0.7684058793675471, "grad_norm": 6.15625, "learning_rate": 7.244799261923794e-06, "loss": 1.06133232, "memory(GiB)": 141.16, "step": 68700, "train_speed(iter/s)": 0.290646 }, { "acc": 0.73488283, "epoch": 0.7686295783135056, "grad_norm": 6.4375, "learning_rate": 7.243146554858299e-06, "loss": 1.04713936, "memory(GiB)": 141.16, "step": 68720, "train_speed(iter/s)": 0.290674 }, { "acc": 0.7331069, "epoch": 0.7688532772594642, "grad_norm": 7.5625, "learning_rate": 7.241493540898294e-06, "loss": 1.08842564, "memory(GiB)": 141.16, "step": 68740, "train_speed(iter/s)": 0.290703 }, { "acc": 0.73240881, "epoch": 0.7690769762054227, "grad_norm": 7.625, "learning_rate": 7.239840220269934e-06, "loss": 1.06609726, "memory(GiB)": 141.16, "step": 68760, "train_speed(iter/s)": 0.29073 }, { "acc": 0.72941389, "epoch": 0.7693006751513812, "grad_norm": 7.21875, "learning_rate": 7.2381865931994165e-06, "loss": 1.08374672, "memory(GiB)": 141.16, "step": 68780, "train_speed(iter/s)": 0.29076 }, { "acc": 0.74452524, "epoch": 0.7695243740973398, "grad_norm": 7.125, "learning_rate": 7.236532659912983e-06, "loss": 1.01219015, "memory(GiB)": 141.16, "step": 68800, "train_speed(iter/s)": 0.29079 }, { "acc": 0.73531809, "epoch": 0.7697480730432983, "grad_norm": 6.875, "learning_rate": 7.234878420636913e-06, "loss": 1.06007185, "memory(GiB)": 141.16, "step": 68820, "train_speed(iter/s)": 0.290821 }, { "acc": 0.72649374, "epoch": 0.7699717719892568, "grad_norm": 6.9375, "learning_rate": 7.2332238755975326e-06, "loss": 1.1061058, "memory(GiB)": 141.16, "step": 68840, "train_speed(iter/s)": 0.290852 }, { "acc": 0.72640252, "epoch": 0.7701954709352153, "grad_norm": 6.09375, "learning_rate": 7.231569025021205e-06, "loss": 1.10082207, "memory(GiB)": 141.16, "step": 68860, "train_speed(iter/s)": 0.29088 }, { "acc": 0.73651743, "epoch": 0.7704191698811739, "grad_norm": 7.71875, "learning_rate": 7.229913869134339e-06, "loss": 1.04828997, "memory(GiB)": 141.16, "step": 68880, "train_speed(iter/s)": 0.290907 }, { "acc": 0.7318367, "epoch": 0.7706428688271324, "grad_norm": 6.625, "learning_rate": 7.228258408163382e-06, "loss": 1.08076239, "memory(GiB)": 141.16, "step": 68900, "train_speed(iter/s)": 0.290936 }, { "acc": 0.73475022, "epoch": 0.7708665677730909, "grad_norm": 9.0, "learning_rate": 7.2266026423348275e-06, "loss": 1.09009857, "memory(GiB)": 141.16, "step": 68920, "train_speed(iter/s)": 0.290963 }, { "acc": 0.7243988, "epoch": 0.7710902667190495, "grad_norm": 4.84375, "learning_rate": 7.224946571875204e-06, "loss": 1.09614401, "memory(GiB)": 141.16, "step": 68940, "train_speed(iter/s)": 0.290989 }, { "acc": 0.73813524, "epoch": 0.771313965665008, "grad_norm": 8.5625, "learning_rate": 7.223290197011088e-06, "loss": 1.05335541, "memory(GiB)": 141.16, "step": 68960, "train_speed(iter/s)": 0.291021 }, { "acc": 0.73600559, "epoch": 0.7715376646109665, "grad_norm": 5.21875, "learning_rate": 7.2216335179690954e-06, "loss": 1.04889669, "memory(GiB)": 141.16, "step": 68980, "train_speed(iter/s)": 0.291045 }, { "acc": 0.73346872, "epoch": 0.771761363556925, "grad_norm": 6.5625, "learning_rate": 7.219976534975883e-06, "loss": 1.05044413, "memory(GiB)": 141.16, "step": 69000, "train_speed(iter/s)": 0.29107 }, { "acc": 0.72586823, "epoch": 0.7719850625028836, "grad_norm": 9.5625, "learning_rate": 7.21831924825815e-06, "loss": 1.08871956, "memory(GiB)": 141.16, "step": 69020, "train_speed(iter/s)": 0.291098 }, { "acc": 0.72854409, "epoch": 0.7722087614488421, "grad_norm": 6.96875, "learning_rate": 7.216661658042637e-06, "loss": 1.0877037, "memory(GiB)": 141.16, "step": 69040, "train_speed(iter/s)": 0.291129 }, { "acc": 0.71636477, "epoch": 0.7724324603948006, "grad_norm": 5.84375, "learning_rate": 7.2150037645561255e-06, "loss": 1.14594479, "memory(GiB)": 141.16, "step": 69060, "train_speed(iter/s)": 0.291156 }, { "acc": 0.72151756, "epoch": 0.7726561593407592, "grad_norm": 5.75, "learning_rate": 7.213345568025438e-06, "loss": 1.11918373, "memory(GiB)": 141.16, "step": 69080, "train_speed(iter/s)": 0.291186 }, { "acc": 0.72267947, "epoch": 0.7728798582867177, "grad_norm": 7.875, "learning_rate": 7.211687068677442e-06, "loss": 1.10801916, "memory(GiB)": 141.16, "step": 69100, "train_speed(iter/s)": 0.291212 }, { "acc": 0.72276449, "epoch": 0.7731035572326763, "grad_norm": 6.6875, "learning_rate": 7.210028266739043e-06, "loss": 1.11848812, "memory(GiB)": 141.16, "step": 69120, "train_speed(iter/s)": 0.291239 }, { "acc": 0.7287384, "epoch": 0.7733272561786348, "grad_norm": 7.28125, "learning_rate": 7.2083691624371885e-06, "loss": 1.10357914, "memory(GiB)": 141.16, "step": 69140, "train_speed(iter/s)": 0.291268 }, { "acc": 0.72741642, "epoch": 0.7735509551245934, "grad_norm": 5.25, "learning_rate": 7.206709755998866e-06, "loss": 1.10224342, "memory(GiB)": 141.16, "step": 69160, "train_speed(iter/s)": 0.291294 }, { "acc": 0.73629389, "epoch": 0.7737746540705519, "grad_norm": 6.40625, "learning_rate": 7.20505004765111e-06, "loss": 1.04822235, "memory(GiB)": 141.16, "step": 69180, "train_speed(iter/s)": 0.291326 }, { "acc": 0.73322048, "epoch": 0.7739983530165104, "grad_norm": 8.0625, "learning_rate": 7.203390037620988e-06, "loss": 1.06879959, "memory(GiB)": 141.16, "step": 69200, "train_speed(iter/s)": 0.29136 }, { "acc": 0.73955956, "epoch": 0.774222051962469, "grad_norm": 5.875, "learning_rate": 7.201729726135618e-06, "loss": 1.0357645, "memory(GiB)": 141.16, "step": 69220, "train_speed(iter/s)": 0.291392 }, { "acc": 0.72191887, "epoch": 0.7744457509084275, "grad_norm": 7.6875, "learning_rate": 7.20006911342215e-06, "loss": 1.09911728, "memory(GiB)": 141.16, "step": 69240, "train_speed(iter/s)": 0.291419 }, { "acc": 0.7374599, "epoch": 0.774669449854386, "grad_norm": 6.40625, "learning_rate": 7.19840819970778e-06, "loss": 1.03212328, "memory(GiB)": 141.16, "step": 69260, "train_speed(iter/s)": 0.291445 }, { "acc": 0.73752136, "epoch": 0.7748931488003445, "grad_norm": 6.1875, "learning_rate": 7.196746985219747e-06, "loss": 1.06140804, "memory(GiB)": 141.16, "step": 69280, "train_speed(iter/s)": 0.291472 }, { "acc": 0.73066258, "epoch": 0.7751168477463031, "grad_norm": 6.1875, "learning_rate": 7.1950854701853265e-06, "loss": 1.08334789, "memory(GiB)": 141.16, "step": 69300, "train_speed(iter/s)": 0.291496 }, { "acc": 0.73907886, "epoch": 0.7753405466922616, "grad_norm": 7.375, "learning_rate": 7.193423654831841e-06, "loss": 1.04581547, "memory(GiB)": 141.16, "step": 69320, "train_speed(iter/s)": 0.291525 }, { "acc": 0.7258791, "epoch": 0.7755642456382201, "grad_norm": 6.34375, "learning_rate": 7.191761539386646e-06, "loss": 1.09491005, "memory(GiB)": 141.16, "step": 69340, "train_speed(iter/s)": 0.291556 }, { "acc": 0.71929283, "epoch": 0.7757879445841787, "grad_norm": 7.09375, "learning_rate": 7.190099124077146e-06, "loss": 1.14333706, "memory(GiB)": 141.16, "step": 69360, "train_speed(iter/s)": 0.291581 }, { "acc": 0.74140806, "epoch": 0.7760116435301372, "grad_norm": 8.375, "learning_rate": 7.188436409130781e-06, "loss": 1.03142834, "memory(GiB)": 141.16, "step": 69380, "train_speed(iter/s)": 0.291607 }, { "acc": 0.72362518, "epoch": 0.7762353424760957, "grad_norm": 6.46875, "learning_rate": 7.186773394775036e-06, "loss": 1.11877422, "memory(GiB)": 141.16, "step": 69400, "train_speed(iter/s)": 0.291634 }, { "acc": 0.73122907, "epoch": 0.7764590414220542, "grad_norm": 7.78125, "learning_rate": 7.185110081237435e-06, "loss": 1.08917713, "memory(GiB)": 141.16, "step": 69420, "train_speed(iter/s)": 0.291662 }, { "acc": 0.72305694, "epoch": 0.7766827403680128, "grad_norm": 7.21875, "learning_rate": 7.183446468745542e-06, "loss": 1.11979866, "memory(GiB)": 141.16, "step": 69440, "train_speed(iter/s)": 0.291691 }, { "acc": 0.72896042, "epoch": 0.7769064393139713, "grad_norm": 9.6875, "learning_rate": 7.181782557526963e-06, "loss": 1.08209, "memory(GiB)": 141.16, "step": 69460, "train_speed(iter/s)": 0.291721 }, { "acc": 0.70916491, "epoch": 0.7771301382599298, "grad_norm": 6.625, "learning_rate": 7.180118347809345e-06, "loss": 1.18085556, "memory(GiB)": 141.16, "step": 69480, "train_speed(iter/s)": 0.291747 }, { "acc": 0.74363146, "epoch": 0.7773538372058884, "grad_norm": 8.3125, "learning_rate": 7.178453839820378e-06, "loss": 1.03474369, "memory(GiB)": 141.16, "step": 69500, "train_speed(iter/s)": 0.291777 }, { "acc": 0.72521858, "epoch": 0.7775775361518469, "grad_norm": 6.625, "learning_rate": 7.176789033787786e-06, "loss": 1.10643253, "memory(GiB)": 141.16, "step": 69520, "train_speed(iter/s)": 0.291804 }, { "acc": 0.73161583, "epoch": 0.7778012350978054, "grad_norm": 8.1875, "learning_rate": 7.175123929939343e-06, "loss": 1.07183323, "memory(GiB)": 141.16, "step": 69540, "train_speed(iter/s)": 0.29183 }, { "acc": 0.72522583, "epoch": 0.7780249340437639, "grad_norm": 8.4375, "learning_rate": 7.173458528502855e-06, "loss": 1.09706087, "memory(GiB)": 141.16, "step": 69560, "train_speed(iter/s)": 0.291857 }, { "acc": 0.73123517, "epoch": 0.7782486329897225, "grad_norm": 6.5, "learning_rate": 7.1717928297061746e-06, "loss": 1.08283348, "memory(GiB)": 141.16, "step": 69580, "train_speed(iter/s)": 0.291885 }, { "acc": 0.73278313, "epoch": 0.778472331935681, "grad_norm": 7.4375, "learning_rate": 7.170126833777194e-06, "loss": 1.06768579, "memory(GiB)": 141.16, "step": 69600, "train_speed(iter/s)": 0.291911 }, { "acc": 0.73067217, "epoch": 0.7786960308816395, "grad_norm": 7.75, "learning_rate": 7.1684605409438425e-06, "loss": 1.09094677, "memory(GiB)": 141.16, "step": 69620, "train_speed(iter/s)": 0.291939 }, { "acc": 0.73613582, "epoch": 0.778919729827598, "grad_norm": 6.4375, "learning_rate": 7.166793951434097e-06, "loss": 1.04508991, "memory(GiB)": 141.16, "step": 69640, "train_speed(iter/s)": 0.291964 }, { "acc": 0.73528481, "epoch": 0.7791434287735566, "grad_norm": 6.90625, "learning_rate": 7.165127065475966e-06, "loss": 1.05795994, "memory(GiB)": 141.16, "step": 69660, "train_speed(iter/s)": 0.291993 }, { "acc": 0.73500128, "epoch": 0.7793671277195151, "grad_norm": 7.90625, "learning_rate": 7.163459883297506e-06, "loss": 1.06573057, "memory(GiB)": 141.16, "step": 69680, "train_speed(iter/s)": 0.292022 }, { "acc": 0.72063303, "epoch": 0.7795908266654736, "grad_norm": 12.1875, "learning_rate": 7.161792405126812e-06, "loss": 1.13256493, "memory(GiB)": 141.16, "step": 69700, "train_speed(iter/s)": 0.292051 }, { "acc": 0.73911309, "epoch": 0.7798145256114322, "grad_norm": 8.4375, "learning_rate": 7.160124631192017e-06, "loss": 1.03876638, "memory(GiB)": 141.16, "step": 69720, "train_speed(iter/s)": 0.292081 }, { "acc": 0.72033935, "epoch": 0.7800382245573907, "grad_norm": 8.1875, "learning_rate": 7.158456561721299e-06, "loss": 1.13169069, "memory(GiB)": 141.16, "step": 69740, "train_speed(iter/s)": 0.29211 }, { "acc": 0.7353488, "epoch": 0.7802619235033492, "grad_norm": 6.21875, "learning_rate": 7.15678819694287e-06, "loss": 1.05943279, "memory(GiB)": 141.16, "step": 69760, "train_speed(iter/s)": 0.292139 }, { "acc": 0.73454046, "epoch": 0.7804856224493077, "grad_norm": 5.65625, "learning_rate": 7.155119537084988e-06, "loss": 1.05755148, "memory(GiB)": 141.16, "step": 69780, "train_speed(iter/s)": 0.292165 }, { "acc": 0.72363162, "epoch": 0.7807093213952663, "grad_norm": 6.90625, "learning_rate": 7.1534505823759495e-06, "loss": 1.12586479, "memory(GiB)": 141.16, "step": 69800, "train_speed(iter/s)": 0.292192 }, { "acc": 0.73326674, "epoch": 0.7809330203412248, "grad_norm": 6.96875, "learning_rate": 7.151781333044092e-06, "loss": 1.06496248, "memory(GiB)": 141.16, "step": 69820, "train_speed(iter/s)": 0.292222 }, { "acc": 0.73966303, "epoch": 0.7811567192871833, "grad_norm": 7.9375, "learning_rate": 7.150111789317793e-06, "loss": 1.04792233, "memory(GiB)": 141.16, "step": 69840, "train_speed(iter/s)": 0.292245 }, { "acc": 0.72835064, "epoch": 0.7813804182331419, "grad_norm": 5.875, "learning_rate": 7.1484419514254675e-06, "loss": 1.09073515, "memory(GiB)": 141.16, "step": 69860, "train_speed(iter/s)": 0.292272 }, { "acc": 0.7343504, "epoch": 0.7816041171791004, "grad_norm": 6.71875, "learning_rate": 7.1467718195955746e-06, "loss": 1.0711565, "memory(GiB)": 141.16, "step": 69880, "train_speed(iter/s)": 0.292299 }, { "acc": 0.73549652, "epoch": 0.7818278161250589, "grad_norm": 7.25, "learning_rate": 7.145101394056614e-06, "loss": 1.06251545, "memory(GiB)": 141.16, "step": 69900, "train_speed(iter/s)": 0.292325 }, { "acc": 0.73579988, "epoch": 0.7820515150710174, "grad_norm": 6.28125, "learning_rate": 7.143430675037121e-06, "loss": 1.0429431, "memory(GiB)": 141.16, "step": 69920, "train_speed(iter/s)": 0.29235 }, { "acc": 0.72981911, "epoch": 0.782275214016976, "grad_norm": 8.75, "learning_rate": 7.141759662765676e-06, "loss": 1.10436192, "memory(GiB)": 141.16, "step": 69940, "train_speed(iter/s)": 0.292378 }, { "acc": 0.71808605, "epoch": 0.7824989129629345, "grad_norm": 7.9375, "learning_rate": 7.140088357470895e-06, "loss": 1.1331811, "memory(GiB)": 141.16, "step": 69960, "train_speed(iter/s)": 0.292404 }, { "acc": 0.72803736, "epoch": 0.782722611908893, "grad_norm": 7.53125, "learning_rate": 7.138416759381438e-06, "loss": 1.10131779, "memory(GiB)": 141.16, "step": 69980, "train_speed(iter/s)": 0.292435 }, { "acc": 0.73806658, "epoch": 0.7829463108548516, "grad_norm": 5.71875, "learning_rate": 7.136744868726003e-06, "loss": 1.03236179, "memory(GiB)": 141.16, "step": 70000, "train_speed(iter/s)": 0.292463 }, { "epoch": 0.7829463108548516, "eval_acc": 0.6898239675604347, "eval_loss": 1.0802960395812988, "eval_runtime": 2321.783, "eval_samples_per_second": 32.425, "eval_steps_per_second": 16.213, "step": 70000 }, { "acc": 0.72897706, "epoch": 0.7831700098008101, "grad_norm": 7.46875, "learning_rate": 7.135072685733329e-06, "loss": 1.0785058, "memory(GiB)": 141.16, "step": 70020, "train_speed(iter/s)": 0.289624 }, { "acc": 0.73696232, "epoch": 0.7833937087467686, "grad_norm": 6.375, "learning_rate": 7.1334002106321965e-06, "loss": 1.04387598, "memory(GiB)": 141.16, "step": 70040, "train_speed(iter/s)": 0.289651 }, { "acc": 0.73238502, "epoch": 0.7836174076927271, "grad_norm": 5.90625, "learning_rate": 7.1317274436514195e-06, "loss": 1.06588697, "memory(GiB)": 141.16, "step": 70060, "train_speed(iter/s)": 0.289674 }, { "acc": 0.74531546, "epoch": 0.7838411066386857, "grad_norm": 7.8125, "learning_rate": 7.13005438501986e-06, "loss": 1.02715816, "memory(GiB)": 141.16, "step": 70080, "train_speed(iter/s)": 0.289703 }, { "acc": 0.73658009, "epoch": 0.7840648055846442, "grad_norm": 8.0, "learning_rate": 7.128381034966415e-06, "loss": 1.0782341, "memory(GiB)": 141.16, "step": 70100, "train_speed(iter/s)": 0.289732 }, { "acc": 0.72208195, "epoch": 0.7842885045306027, "grad_norm": 8.375, "learning_rate": 7.126707393720023e-06, "loss": 1.1330245, "memory(GiB)": 141.16, "step": 70120, "train_speed(iter/s)": 0.289756 }, { "acc": 0.73357615, "epoch": 0.7845122034765613, "grad_norm": 8.0625, "learning_rate": 7.12503346150966e-06, "loss": 1.06479225, "memory(GiB)": 141.16, "step": 70140, "train_speed(iter/s)": 0.289785 }, { "acc": 0.72436447, "epoch": 0.7847359024225198, "grad_norm": 6.625, "learning_rate": 7.123359238564349e-06, "loss": 1.12871981, "memory(GiB)": 141.16, "step": 70160, "train_speed(iter/s)": 0.289814 }, { "acc": 0.72689109, "epoch": 0.7849596013684783, "grad_norm": 6.0625, "learning_rate": 7.121684725113142e-06, "loss": 1.10417442, "memory(GiB)": 141.16, "step": 70180, "train_speed(iter/s)": 0.289841 }, { "acc": 0.73134103, "epoch": 0.7851833003144368, "grad_norm": 8.0625, "learning_rate": 7.120009921385138e-06, "loss": 1.06411915, "memory(GiB)": 141.16, "step": 70200, "train_speed(iter/s)": 0.289868 }, { "acc": 0.73748655, "epoch": 0.7854069992603954, "grad_norm": 6.5, "learning_rate": 7.118334827609477e-06, "loss": 1.05664444, "memory(GiB)": 141.16, "step": 70220, "train_speed(iter/s)": 0.289894 }, { "acc": 0.72806253, "epoch": 0.7856306982063539, "grad_norm": 5.15625, "learning_rate": 7.116659444015333e-06, "loss": 1.09623518, "memory(GiB)": 141.16, "step": 70240, "train_speed(iter/s)": 0.289917 }, { "acc": 0.72834539, "epoch": 0.7858543971523124, "grad_norm": 7.46875, "learning_rate": 7.1149837708319226e-06, "loss": 1.07613144, "memory(GiB)": 141.16, "step": 70260, "train_speed(iter/s)": 0.289944 }, { "acc": 0.74786434, "epoch": 0.786078096098271, "grad_norm": 6.375, "learning_rate": 7.1133078082885025e-06, "loss": 1.01242332, "memory(GiB)": 141.16, "step": 70280, "train_speed(iter/s)": 0.28997 }, { "acc": 0.72682304, "epoch": 0.7863017950442295, "grad_norm": 5.96875, "learning_rate": 7.111631556614367e-06, "loss": 1.11049242, "memory(GiB)": 141.16, "step": 70300, "train_speed(iter/s)": 0.289999 }, { "acc": 0.73451824, "epoch": 0.786525493990188, "grad_norm": 9.0, "learning_rate": 7.109955016038854e-06, "loss": 1.0674078, "memory(GiB)": 141.16, "step": 70320, "train_speed(iter/s)": 0.290025 }, { "acc": 0.72632895, "epoch": 0.7867491929361465, "grad_norm": 7.0625, "learning_rate": 7.108278186791335e-06, "loss": 1.09969358, "memory(GiB)": 141.16, "step": 70340, "train_speed(iter/s)": 0.290053 }, { "acc": 0.72895536, "epoch": 0.7869728918821051, "grad_norm": 7.375, "learning_rate": 7.1066010691012275e-06, "loss": 1.08755159, "memory(GiB)": 141.16, "step": 70360, "train_speed(iter/s)": 0.290082 }, { "acc": 0.73776703, "epoch": 0.7871965908280636, "grad_norm": 5.71875, "learning_rate": 7.1049236631979824e-06, "loss": 1.0514616, "memory(GiB)": 141.16, "step": 70380, "train_speed(iter/s)": 0.29011 }, { "acc": 0.72192221, "epoch": 0.7874202897740221, "grad_norm": 6.78125, "learning_rate": 7.103245969311094e-06, "loss": 1.13288603, "memory(GiB)": 141.16, "step": 70400, "train_speed(iter/s)": 0.290139 }, { "acc": 0.73528657, "epoch": 0.7876439887199806, "grad_norm": 6.09375, "learning_rate": 7.101567987670095e-06, "loss": 1.06588249, "memory(GiB)": 141.16, "step": 70420, "train_speed(iter/s)": 0.290169 }, { "acc": 0.72998486, "epoch": 0.7878676876659392, "grad_norm": 7.40625, "learning_rate": 7.099889718504557e-06, "loss": 1.07911806, "memory(GiB)": 141.16, "step": 70440, "train_speed(iter/s)": 0.290198 }, { "acc": 0.73596945, "epoch": 0.7880913866118977, "grad_norm": 8.9375, "learning_rate": 7.098211162044092e-06, "loss": 1.05208492, "memory(GiB)": 141.16, "step": 70460, "train_speed(iter/s)": 0.290225 }, { "acc": 0.73206129, "epoch": 0.7883150855578562, "grad_norm": 8.125, "learning_rate": 7.096532318518348e-06, "loss": 1.07006931, "memory(GiB)": 141.16, "step": 70480, "train_speed(iter/s)": 0.290251 }, { "acc": 0.73449135, "epoch": 0.7885387845038148, "grad_norm": 6.1875, "learning_rate": 7.094853188157017e-06, "loss": 1.06318026, "memory(GiB)": 141.16, "step": 70500, "train_speed(iter/s)": 0.290279 }, { "acc": 0.72164168, "epoch": 0.7887624834497733, "grad_norm": 7.28125, "learning_rate": 7.093173771189828e-06, "loss": 1.11755791, "memory(GiB)": 141.16, "step": 70520, "train_speed(iter/s)": 0.290309 }, { "acc": 0.73884392, "epoch": 0.7889861823957318, "grad_norm": 9.125, "learning_rate": 7.091494067846547e-06, "loss": 1.03985214, "memory(GiB)": 141.16, "step": 70540, "train_speed(iter/s)": 0.290336 }, { "acc": 0.73175159, "epoch": 0.7892098813416903, "grad_norm": 8.0, "learning_rate": 7.089814078356986e-06, "loss": 1.07249346, "memory(GiB)": 141.16, "step": 70560, "train_speed(iter/s)": 0.290363 }, { "acc": 0.73835196, "epoch": 0.7894335802876489, "grad_norm": 7.46875, "learning_rate": 7.088133802950987e-06, "loss": 1.04990263, "memory(GiB)": 141.16, "step": 70580, "train_speed(iter/s)": 0.29039 }, { "acc": 0.72527919, "epoch": 0.7896572792336074, "grad_norm": 6.46875, "learning_rate": 7.086453241858437e-06, "loss": 1.10528717, "memory(GiB)": 141.16, "step": 70600, "train_speed(iter/s)": 0.290414 }, { "acc": 0.74023552, "epoch": 0.7898809781795659, "grad_norm": 8.8125, "learning_rate": 7.084772395309263e-06, "loss": 1.03918648, "memory(GiB)": 141.16, "step": 70620, "train_speed(iter/s)": 0.290444 }, { "acc": 0.7250596, "epoch": 0.7901046771255245, "grad_norm": 7.5625, "learning_rate": 7.083091263533426e-06, "loss": 1.11108236, "memory(GiB)": 141.16, "step": 70640, "train_speed(iter/s)": 0.290472 }, { "acc": 0.74584918, "epoch": 0.790328376071483, "grad_norm": 8.125, "learning_rate": 7.08140984676093e-06, "loss": 1.00248299, "memory(GiB)": 141.16, "step": 70660, "train_speed(iter/s)": 0.2905 }, { "acc": 0.72087984, "epoch": 0.7905520750174415, "grad_norm": 4.75, "learning_rate": 7.079728145221818e-06, "loss": 1.12665358, "memory(GiB)": 141.16, "step": 70680, "train_speed(iter/s)": 0.290526 }, { "acc": 0.72927957, "epoch": 0.7907757739634, "grad_norm": 5.46875, "learning_rate": 7.078046159146168e-06, "loss": 1.09740162, "memory(GiB)": 141.16, "step": 70700, "train_speed(iter/s)": 0.290554 }, { "acc": 0.7285367, "epoch": 0.7909994729093586, "grad_norm": 8.0, "learning_rate": 7.076363888764102e-06, "loss": 1.0849721, "memory(GiB)": 141.16, "step": 70720, "train_speed(iter/s)": 0.290583 }, { "acc": 0.73513327, "epoch": 0.7912231718553171, "grad_norm": 6.96875, "learning_rate": 7.074681334305778e-06, "loss": 1.05121784, "memory(GiB)": 141.16, "step": 70740, "train_speed(iter/s)": 0.290609 }, { "acc": 0.7257556, "epoch": 0.7914468708012756, "grad_norm": 7.25, "learning_rate": 7.072998496001392e-06, "loss": 1.1123785, "memory(GiB)": 141.16, "step": 70760, "train_speed(iter/s)": 0.290636 }, { "acc": 0.73755064, "epoch": 0.7916705697472342, "grad_norm": 7.84375, "learning_rate": 7.0713153740811835e-06, "loss": 1.05184689, "memory(GiB)": 141.16, "step": 70780, "train_speed(iter/s)": 0.290662 }, { "acc": 0.73431892, "epoch": 0.7918942686931927, "grad_norm": 8.1875, "learning_rate": 7.069631968775426e-06, "loss": 1.05476913, "memory(GiB)": 141.16, "step": 70800, "train_speed(iter/s)": 0.290687 }, { "acc": 0.73288155, "epoch": 0.7921179676391512, "grad_norm": 6.96875, "learning_rate": 7.067948280314432e-06, "loss": 1.08323326, "memory(GiB)": 141.16, "step": 70820, "train_speed(iter/s)": 0.290713 }, { "acc": 0.72502923, "epoch": 0.7923416665851097, "grad_norm": 8.25, "learning_rate": 7.066264308928556e-06, "loss": 1.11285324, "memory(GiB)": 141.16, "step": 70840, "train_speed(iter/s)": 0.290741 }, { "acc": 0.72208805, "epoch": 0.7925653655310683, "grad_norm": 7.25, "learning_rate": 7.064580054848188e-06, "loss": 1.1318861, "memory(GiB)": 141.16, "step": 70860, "train_speed(iter/s)": 0.290768 }, { "acc": 0.71883507, "epoch": 0.7927890644770268, "grad_norm": 10.25, "learning_rate": 7.06289551830376e-06, "loss": 1.14694481, "memory(GiB)": 141.16, "step": 70880, "train_speed(iter/s)": 0.290795 }, { "acc": 0.71978254, "epoch": 0.7930127634229853, "grad_norm": 8.5625, "learning_rate": 7.061210699525739e-06, "loss": 1.14183931, "memory(GiB)": 141.16, "step": 70900, "train_speed(iter/s)": 0.290824 }, { "acc": 0.73064418, "epoch": 0.7932364623689439, "grad_norm": 8.125, "learning_rate": 7.059525598744633e-06, "loss": 1.06775208, "memory(GiB)": 141.16, "step": 70920, "train_speed(iter/s)": 0.290852 }, { "acc": 0.73661327, "epoch": 0.7934601613149024, "grad_norm": 5.90625, "learning_rate": 7.057840216190988e-06, "loss": 1.05330276, "memory(GiB)": 141.16, "step": 70940, "train_speed(iter/s)": 0.290882 }, { "acc": 0.73197556, "epoch": 0.7936838602608609, "grad_norm": 8.0, "learning_rate": 7.056154552095387e-06, "loss": 1.07371902, "memory(GiB)": 141.16, "step": 70960, "train_speed(iter/s)": 0.290913 }, { "acc": 0.73487868, "epoch": 0.7939075592068194, "grad_norm": 6.78125, "learning_rate": 7.054468606688456e-06, "loss": 1.06285849, "memory(GiB)": 141.16, "step": 70980, "train_speed(iter/s)": 0.290941 }, { "acc": 0.72842517, "epoch": 0.794131258152778, "grad_norm": 7.8125, "learning_rate": 7.052782380200853e-06, "loss": 1.06371975, "memory(GiB)": 141.16, "step": 71000, "train_speed(iter/s)": 0.290966 }, { "acc": 0.72507505, "epoch": 0.7943549570987365, "grad_norm": 7.15625, "learning_rate": 7.0510958728632794e-06, "loss": 1.11269188, "memory(GiB)": 141.16, "step": 71020, "train_speed(iter/s)": 0.290994 }, { "acc": 0.71838646, "epoch": 0.794578656044695, "grad_norm": 6.8125, "learning_rate": 7.049409084906474e-06, "loss": 1.13737316, "memory(GiB)": 141.16, "step": 71040, "train_speed(iter/s)": 0.291019 }, { "acc": 0.72625899, "epoch": 0.7948023549906535, "grad_norm": 6.8125, "learning_rate": 7.0477220165612115e-06, "loss": 1.08405724, "memory(GiB)": 141.16, "step": 71060, "train_speed(iter/s)": 0.291046 }, { "acc": 0.73909636, "epoch": 0.7950260539366121, "grad_norm": 8.4375, "learning_rate": 7.0460346680583105e-06, "loss": 1.050319, "memory(GiB)": 141.16, "step": 71080, "train_speed(iter/s)": 0.291072 }, { "acc": 0.7310647, "epoch": 0.7952497528825706, "grad_norm": 8.3125, "learning_rate": 7.044347039628622e-06, "loss": 1.07777824, "memory(GiB)": 141.16, "step": 71100, "train_speed(iter/s)": 0.291099 }, { "acc": 0.73825407, "epoch": 0.7954734518285291, "grad_norm": 7.0, "learning_rate": 7.042659131503037e-06, "loss": 1.03778124, "memory(GiB)": 141.16, "step": 71120, "train_speed(iter/s)": 0.291125 }, { "acc": 0.73580103, "epoch": 0.7956971507744877, "grad_norm": 9.375, "learning_rate": 7.040970943912486e-06, "loss": 1.05247717, "memory(GiB)": 141.16, "step": 71140, "train_speed(iter/s)": 0.291152 }, { "acc": 0.7118464, "epoch": 0.7959208497204462, "grad_norm": 8.4375, "learning_rate": 7.03928247708794e-06, "loss": 1.16738834, "memory(GiB)": 141.16, "step": 71160, "train_speed(iter/s)": 0.29118 }, { "acc": 0.73150167, "epoch": 0.7961445486664047, "grad_norm": 6.46875, "learning_rate": 7.037593731260401e-06, "loss": 1.0852025, "memory(GiB)": 141.16, "step": 71180, "train_speed(iter/s)": 0.291207 }, { "acc": 0.74410477, "epoch": 0.7963682476123632, "grad_norm": 6.9375, "learning_rate": 7.035904706660917e-06, "loss": 1.01616907, "memory(GiB)": 141.16, "step": 71200, "train_speed(iter/s)": 0.291235 }, { "acc": 0.73511057, "epoch": 0.7965919465583218, "grad_norm": 6.0625, "learning_rate": 7.034215403520569e-06, "loss": 1.0516036, "memory(GiB)": 141.16, "step": 71220, "train_speed(iter/s)": 0.291263 }, { "acc": 0.72926607, "epoch": 0.7968156455042803, "grad_norm": 7.125, "learning_rate": 7.032525822070477e-06, "loss": 1.08932581, "memory(GiB)": 141.16, "step": 71240, "train_speed(iter/s)": 0.291292 }, { "acc": 0.72678204, "epoch": 0.7970393444502388, "grad_norm": 6.59375, "learning_rate": 7.030835962541802e-06, "loss": 1.07160149, "memory(GiB)": 141.16, "step": 71260, "train_speed(iter/s)": 0.291318 }, { "acc": 0.732935, "epoch": 0.7972630433961974, "grad_norm": 5.96875, "learning_rate": 7.0291458251657405e-06, "loss": 1.05976143, "memory(GiB)": 141.16, "step": 71280, "train_speed(iter/s)": 0.291341 }, { "acc": 0.72332306, "epoch": 0.7974867423421559, "grad_norm": 7.125, "learning_rate": 7.027455410173528e-06, "loss": 1.09851332, "memory(GiB)": 141.16, "step": 71300, "train_speed(iter/s)": 0.29137 }, { "acc": 0.72979412, "epoch": 0.7977104412881144, "grad_norm": 5.6875, "learning_rate": 7.025764717796435e-06, "loss": 1.08787785, "memory(GiB)": 141.16, "step": 71320, "train_speed(iter/s)": 0.291398 }, { "acc": 0.72357335, "epoch": 0.7979341402340729, "grad_norm": 6.96875, "learning_rate": 7.024073748265773e-06, "loss": 1.11114721, "memory(GiB)": 141.16, "step": 71340, "train_speed(iter/s)": 0.291425 }, { "acc": 0.73497272, "epoch": 0.7981578391800315, "grad_norm": 7.125, "learning_rate": 7.022382501812892e-06, "loss": 1.06350956, "memory(GiB)": 141.16, "step": 71360, "train_speed(iter/s)": 0.29145 }, { "acc": 0.72897182, "epoch": 0.79838153812599, "grad_norm": 6.78125, "learning_rate": 7.020690978669178e-06, "loss": 1.09999447, "memory(GiB)": 141.16, "step": 71380, "train_speed(iter/s)": 0.291475 }, { "acc": 0.71756015, "epoch": 0.7986052370719485, "grad_norm": 7.5, "learning_rate": 7.018999179066055e-06, "loss": 1.14318867, "memory(GiB)": 141.16, "step": 71400, "train_speed(iter/s)": 0.291502 }, { "acc": 0.73668022, "epoch": 0.798828936017907, "grad_norm": 7.84375, "learning_rate": 7.0173071032349896e-06, "loss": 1.04102859, "memory(GiB)": 141.16, "step": 71420, "train_speed(iter/s)": 0.291528 }, { "acc": 0.71366692, "epoch": 0.7990526349638656, "grad_norm": 9.0, "learning_rate": 7.015614751407475e-06, "loss": 1.15530167, "memory(GiB)": 141.16, "step": 71440, "train_speed(iter/s)": 0.291554 }, { "acc": 0.7323637, "epoch": 0.7992763339098241, "grad_norm": 6.15625, "learning_rate": 7.013922123815054e-06, "loss": 1.07342243, "memory(GiB)": 141.16, "step": 71460, "train_speed(iter/s)": 0.291578 }, { "acc": 0.73667936, "epoch": 0.7995000328557826, "grad_norm": 6.71875, "learning_rate": 7.0122292206893e-06, "loss": 1.05015278, "memory(GiB)": 141.16, "step": 71480, "train_speed(iter/s)": 0.291607 }, { "acc": 0.73780146, "epoch": 0.7997237318017412, "grad_norm": 7.25, "learning_rate": 7.010536042261828e-06, "loss": 1.04276485, "memory(GiB)": 141.16, "step": 71500, "train_speed(iter/s)": 0.291631 }, { "acc": 0.73377471, "epoch": 0.7999474307476997, "grad_norm": 8.6875, "learning_rate": 7.0088425887642885e-06, "loss": 1.05782356, "memory(GiB)": 141.16, "step": 71520, "train_speed(iter/s)": 0.291659 }, { "acc": 0.73746872, "epoch": 0.8001711296936582, "grad_norm": 5.46875, "learning_rate": 7.00714886042837e-06, "loss": 1.03730345, "memory(GiB)": 141.16, "step": 71540, "train_speed(iter/s)": 0.291686 }, { "acc": 0.73749666, "epoch": 0.8003948286396168, "grad_norm": 9.0, "learning_rate": 7.005454857485798e-06, "loss": 1.03064146, "memory(GiB)": 141.16, "step": 71560, "train_speed(iter/s)": 0.291712 }, { "acc": 0.73122659, "epoch": 0.8006185275855753, "grad_norm": 8.0, "learning_rate": 7.003760580168337e-06, "loss": 1.07623558, "memory(GiB)": 141.16, "step": 71580, "train_speed(iter/s)": 0.291738 }, { "acc": 0.72630477, "epoch": 0.8008422265315338, "grad_norm": 6.875, "learning_rate": 7.002066028707788e-06, "loss": 1.09788256, "memory(GiB)": 141.16, "step": 71600, "train_speed(iter/s)": 0.291763 }, { "acc": 0.72501278, "epoch": 0.8010659254774923, "grad_norm": 6.34375, "learning_rate": 7.0003712033359915e-06, "loss": 1.09892483, "memory(GiB)": 141.16, "step": 71620, "train_speed(iter/s)": 0.29179 }, { "acc": 0.71888528, "epoch": 0.801289624423451, "grad_norm": 4.96875, "learning_rate": 6.998676104284822e-06, "loss": 1.14366617, "memory(GiB)": 141.16, "step": 71640, "train_speed(iter/s)": 0.291819 }, { "acc": 0.72123723, "epoch": 0.8015133233694095, "grad_norm": 7.5, "learning_rate": 6.996980731786193e-06, "loss": 1.13392773, "memory(GiB)": 141.16, "step": 71660, "train_speed(iter/s)": 0.291843 }, { "acc": 0.72734041, "epoch": 0.801737022315368, "grad_norm": 6.65625, "learning_rate": 6.995285086072056e-06, "loss": 1.08047867, "memory(GiB)": 141.16, "step": 71680, "train_speed(iter/s)": 0.291871 }, { "acc": 0.72908335, "epoch": 0.8019607212613266, "grad_norm": 8.0, "learning_rate": 6.993589167374401e-06, "loss": 1.08155556, "memory(GiB)": 141.16, "step": 71700, "train_speed(iter/s)": 0.291899 }, { "acc": 0.7338923, "epoch": 0.8021844202072851, "grad_norm": 6.78125, "learning_rate": 6.991892975925253e-06, "loss": 1.06975994, "memory(GiB)": 141.16, "step": 71720, "train_speed(iter/s)": 0.291927 }, { "acc": 0.73790941, "epoch": 0.8024081191532436, "grad_norm": 6.59375, "learning_rate": 6.990196511956675e-06, "loss": 1.04950762, "memory(GiB)": 141.16, "step": 71740, "train_speed(iter/s)": 0.291955 }, { "acc": 0.7327981, "epoch": 0.8026318180992021, "grad_norm": 6.71875, "learning_rate": 6.988499775700768e-06, "loss": 1.07237883, "memory(GiB)": 141.16, "step": 71760, "train_speed(iter/s)": 0.291983 }, { "acc": 0.73264017, "epoch": 0.8028555170451607, "grad_norm": 7.125, "learning_rate": 6.986802767389669e-06, "loss": 1.07545147, "memory(GiB)": 141.16, "step": 71780, "train_speed(iter/s)": 0.29201 }, { "acc": 0.72835989, "epoch": 0.8030792159911192, "grad_norm": 5.875, "learning_rate": 6.985105487255553e-06, "loss": 1.09554691, "memory(GiB)": 141.16, "step": 71800, "train_speed(iter/s)": 0.292038 }, { "acc": 0.72427483, "epoch": 0.8033029149370777, "grad_norm": 7.09375, "learning_rate": 6.9834079355306335e-06, "loss": 1.10673351, "memory(GiB)": 141.16, "step": 71820, "train_speed(iter/s)": 0.292068 }, { "acc": 0.74525738, "epoch": 0.8035266138830363, "grad_norm": 6.21875, "learning_rate": 6.981710112447159e-06, "loss": 1.01965075, "memory(GiB)": 141.16, "step": 71840, "train_speed(iter/s)": 0.292093 }, { "acc": 0.73025856, "epoch": 0.8037503128289948, "grad_norm": 6.78125, "learning_rate": 6.980012018237415e-06, "loss": 1.0797533, "memory(GiB)": 141.16, "step": 71860, "train_speed(iter/s)": 0.29212 }, { "acc": 0.72794657, "epoch": 0.8039740117749533, "grad_norm": 7.5, "learning_rate": 6.978313653133728e-06, "loss": 1.09372673, "memory(GiB)": 141.16, "step": 71880, "train_speed(iter/s)": 0.292143 }, { "acc": 0.727526, "epoch": 0.8041977107209118, "grad_norm": 6.5, "learning_rate": 6.976615017368455e-06, "loss": 1.06901455, "memory(GiB)": 141.16, "step": 71900, "train_speed(iter/s)": 0.29217 }, { "acc": 0.72754469, "epoch": 0.8044214096668704, "grad_norm": 6.0, "learning_rate": 6.9749161111739946e-06, "loss": 1.09009991, "memory(GiB)": 141.16, "step": 71920, "train_speed(iter/s)": 0.292194 }, { "acc": 0.73026562, "epoch": 0.8046451086128289, "grad_norm": 6.90625, "learning_rate": 6.973216934782785e-06, "loss": 1.06793232, "memory(GiB)": 141.16, "step": 71940, "train_speed(iter/s)": 0.292219 }, { "acc": 0.72962627, "epoch": 0.8048688075587874, "grad_norm": 7.25, "learning_rate": 6.9715174884272925e-06, "loss": 1.08316565, "memory(GiB)": 141.16, "step": 71960, "train_speed(iter/s)": 0.292246 }, { "acc": 0.73059335, "epoch": 0.805092506504746, "grad_norm": 5.03125, "learning_rate": 6.969817772340028e-06, "loss": 1.07513046, "memory(GiB)": 141.16, "step": 71980, "train_speed(iter/s)": 0.292274 }, { "acc": 0.72501245, "epoch": 0.8053162054507045, "grad_norm": 8.0625, "learning_rate": 6.9681177867535385e-06, "loss": 1.11372442, "memory(GiB)": 141.16, "step": 72000, "train_speed(iter/s)": 0.292298 }, { "epoch": 0.8053162054507045, "eval_acc": 0.6898991942560326, "eval_loss": 1.0800745487213135, "eval_runtime": 2322.8206, "eval_samples_per_second": 32.41, "eval_steps_per_second": 16.205, "step": 72000 }, { "acc": 0.72909923, "epoch": 0.805539904396663, "grad_norm": 7.0625, "learning_rate": 6.966417531900405e-06, "loss": 1.08669796, "memory(GiB)": 141.16, "step": 72020, "train_speed(iter/s)": 0.289536 }, { "acc": 0.74012432, "epoch": 0.8057636033426215, "grad_norm": 8.9375, "learning_rate": 6.964717008013245e-06, "loss": 1.04648609, "memory(GiB)": 141.16, "step": 72040, "train_speed(iter/s)": 0.289564 }, { "acc": 0.73123674, "epoch": 0.8059873022885801, "grad_norm": 7.0625, "learning_rate": 6.963016215324717e-06, "loss": 1.08142338, "memory(GiB)": 141.16, "step": 72060, "train_speed(iter/s)": 0.289591 }, { "acc": 0.73640985, "epoch": 0.8062110012345386, "grad_norm": 7.4375, "learning_rate": 6.961315154067513e-06, "loss": 1.05890865, "memory(GiB)": 141.16, "step": 72080, "train_speed(iter/s)": 0.289617 }, { "acc": 0.73785467, "epoch": 0.8064347001804971, "grad_norm": 6.09375, "learning_rate": 6.959613824474361e-06, "loss": 1.05890446, "memory(GiB)": 141.16, "step": 72100, "train_speed(iter/s)": 0.289637 }, { "acc": 0.72786732, "epoch": 0.8066583991264556, "grad_norm": 6.15625, "learning_rate": 6.957912226778029e-06, "loss": 1.09431381, "memory(GiB)": 141.16, "step": 72120, "train_speed(iter/s)": 0.289665 }, { "acc": 0.72995772, "epoch": 0.8068820980724142, "grad_norm": 8.25, "learning_rate": 6.9562103612113205e-06, "loss": 1.09538765, "memory(GiB)": 141.16, "step": 72140, "train_speed(iter/s)": 0.289691 }, { "acc": 0.72968636, "epoch": 0.8071057970183727, "grad_norm": 6.6875, "learning_rate": 6.9545082280070734e-06, "loss": 1.1020256, "memory(GiB)": 141.16, "step": 72160, "train_speed(iter/s)": 0.289715 }, { "acc": 0.71695938, "epoch": 0.8073294959643312, "grad_norm": 8.6875, "learning_rate": 6.952805827398164e-06, "loss": 1.15287094, "memory(GiB)": 141.16, "step": 72180, "train_speed(iter/s)": 0.289742 }, { "acc": 0.74507928, "epoch": 0.8075531949102898, "grad_norm": 7.125, "learning_rate": 6.951103159617505e-06, "loss": 1.00772057, "memory(GiB)": 141.16, "step": 72200, "train_speed(iter/s)": 0.28977 }, { "acc": 0.73298779, "epoch": 0.8077768938562483, "grad_norm": 7.5625, "learning_rate": 6.949400224898045e-06, "loss": 1.07161999, "memory(GiB)": 141.16, "step": 72220, "train_speed(iter/s)": 0.289798 }, { "acc": 0.73235731, "epoch": 0.8080005928022068, "grad_norm": 6.03125, "learning_rate": 6.9476970234727734e-06, "loss": 1.057057, "memory(GiB)": 141.16, "step": 72240, "train_speed(iter/s)": 0.289827 }, { "acc": 0.73447332, "epoch": 0.8082242917481653, "grad_norm": 8.0625, "learning_rate": 6.945993555574709e-06, "loss": 1.07364445, "memory(GiB)": 141.16, "step": 72260, "train_speed(iter/s)": 0.289849 }, { "acc": 0.72872276, "epoch": 0.8084479906941239, "grad_norm": 6.6875, "learning_rate": 6.9442898214369114e-06, "loss": 1.08548203, "memory(GiB)": 141.16, "step": 72280, "train_speed(iter/s)": 0.289875 }, { "acc": 0.73152409, "epoch": 0.8086716896400824, "grad_norm": 10.5625, "learning_rate": 6.942585821292476e-06, "loss": 1.08047695, "memory(GiB)": 141.16, "step": 72300, "train_speed(iter/s)": 0.289902 }, { "acc": 0.72849326, "epoch": 0.8088953885860409, "grad_norm": 7.59375, "learning_rate": 6.940881555374533e-06, "loss": 1.09530296, "memory(GiB)": 141.16, "step": 72320, "train_speed(iter/s)": 0.289929 }, { "acc": 0.73626461, "epoch": 0.8091190875319995, "grad_norm": 7.59375, "learning_rate": 6.939177023916255e-06, "loss": 1.0601778, "memory(GiB)": 141.16, "step": 72340, "train_speed(iter/s)": 0.289958 }, { "acc": 0.7240973, "epoch": 0.809342786477958, "grad_norm": 6.84375, "learning_rate": 6.93747222715084e-06, "loss": 1.11002569, "memory(GiB)": 141.16, "step": 72360, "train_speed(iter/s)": 0.289989 }, { "acc": 0.73188801, "epoch": 0.8095664854239165, "grad_norm": 6.25, "learning_rate": 6.935767165311532e-06, "loss": 1.08440886, "memory(GiB)": 141.16, "step": 72380, "train_speed(iter/s)": 0.290015 }, { "acc": 0.71934533, "epoch": 0.809790184369875, "grad_norm": 7.46875, "learning_rate": 6.934061838631607e-06, "loss": 1.13085718, "memory(GiB)": 141.16, "step": 72400, "train_speed(iter/s)": 0.290045 }, { "acc": 0.73343091, "epoch": 0.8100138833158336, "grad_norm": 9.0625, "learning_rate": 6.932356247344379e-06, "loss": 1.07413483, "memory(GiB)": 141.16, "step": 72420, "train_speed(iter/s)": 0.290073 }, { "acc": 0.73482866, "epoch": 0.8102375822617921, "grad_norm": 7.0625, "learning_rate": 6.930650391683198e-06, "loss": 1.06476297, "memory(GiB)": 141.16, "step": 72440, "train_speed(iter/s)": 0.290104 }, { "acc": 0.74089298, "epoch": 0.8104612812077506, "grad_norm": 8.3125, "learning_rate": 6.928944271881447e-06, "loss": 1.02715416, "memory(GiB)": 141.16, "step": 72460, "train_speed(iter/s)": 0.290135 }, { "acc": 0.73284588, "epoch": 0.8106849801537092, "grad_norm": 7.21875, "learning_rate": 6.927237888172549e-06, "loss": 1.0603941, "memory(GiB)": 141.16, "step": 72480, "train_speed(iter/s)": 0.290161 }, { "acc": 0.72448149, "epoch": 0.8109086790996677, "grad_norm": 5.78125, "learning_rate": 6.92553124078996e-06, "loss": 1.11987791, "memory(GiB)": 141.16, "step": 72500, "train_speed(iter/s)": 0.290189 }, { "acc": 0.73433266, "epoch": 0.8111323780456262, "grad_norm": 8.25, "learning_rate": 6.9238243299671746e-06, "loss": 1.05562382, "memory(GiB)": 141.16, "step": 72520, "train_speed(iter/s)": 0.290212 }, { "acc": 0.72681761, "epoch": 0.8113560769915847, "grad_norm": 6.34375, "learning_rate": 6.922117155937725e-06, "loss": 1.11741619, "memory(GiB)": 141.16, "step": 72540, "train_speed(iter/s)": 0.290238 }, { "acc": 0.72731228, "epoch": 0.8115797759375433, "grad_norm": 6.25, "learning_rate": 6.920409718935175e-06, "loss": 1.10426388, "memory(GiB)": 141.16, "step": 72560, "train_speed(iter/s)": 0.290266 }, { "acc": 0.72255964, "epoch": 0.8118034748835018, "grad_norm": 7.5, "learning_rate": 6.918702019193125e-06, "loss": 1.13158836, "memory(GiB)": 141.16, "step": 72580, "train_speed(iter/s)": 0.290293 }, { "acc": 0.72888827, "epoch": 0.8120271738294603, "grad_norm": 6.46875, "learning_rate": 6.916994056945215e-06, "loss": 1.07165709, "memory(GiB)": 141.16, "step": 72600, "train_speed(iter/s)": 0.290316 }, { "acc": 0.73644314, "epoch": 0.8122508727754189, "grad_norm": 5.5625, "learning_rate": 6.915285832425117e-06, "loss": 1.03732729, "memory(GiB)": 141.16, "step": 72620, "train_speed(iter/s)": 0.290344 }, { "acc": 0.73565354, "epoch": 0.8124745717213774, "grad_norm": 6.8125, "learning_rate": 6.913577345866542e-06, "loss": 1.0589201, "memory(GiB)": 141.16, "step": 72640, "train_speed(iter/s)": 0.290367 }, { "acc": 0.73485708, "epoch": 0.8126982706673359, "grad_norm": 7.21875, "learning_rate": 6.911868597503236e-06, "loss": 1.06203651, "memory(GiB)": 141.16, "step": 72660, "train_speed(iter/s)": 0.290395 }, { "acc": 0.72522168, "epoch": 0.8129219696132944, "grad_norm": 5.53125, "learning_rate": 6.910159587568978e-06, "loss": 1.09866657, "memory(GiB)": 141.16, "step": 72680, "train_speed(iter/s)": 0.290422 }, { "acc": 0.72346287, "epoch": 0.813145668559253, "grad_norm": 6.8125, "learning_rate": 6.908450316297586e-06, "loss": 1.10421066, "memory(GiB)": 141.16, "step": 72700, "train_speed(iter/s)": 0.290449 }, { "acc": 0.73122411, "epoch": 0.8133693675052115, "grad_norm": 5.15625, "learning_rate": 6.9067407839229115e-06, "loss": 1.06413956, "memory(GiB)": 141.16, "step": 72720, "train_speed(iter/s)": 0.290477 }, { "acc": 0.73295369, "epoch": 0.81359306645117, "grad_norm": 9.25, "learning_rate": 6.905030990678845e-06, "loss": 1.06978178, "memory(GiB)": 141.16, "step": 72740, "train_speed(iter/s)": 0.290503 }, { "acc": 0.7404263, "epoch": 0.8138167653971285, "grad_norm": 7.90625, "learning_rate": 6.9033209367993104e-06, "loss": 1.03858728, "memory(GiB)": 141.16, "step": 72760, "train_speed(iter/s)": 0.29053 }, { "acc": 0.72126894, "epoch": 0.8140404643430871, "grad_norm": 7.0625, "learning_rate": 6.901610622518266e-06, "loss": 1.11804066, "memory(GiB)": 141.16, "step": 72780, "train_speed(iter/s)": 0.290556 }, { "acc": 0.7424087, "epoch": 0.8142641632890456, "grad_norm": 9.75, "learning_rate": 6.899900048069709e-06, "loss": 1.02508621, "memory(GiB)": 141.16, "step": 72800, "train_speed(iter/s)": 0.290581 }, { "acc": 0.72926979, "epoch": 0.8144878622350041, "grad_norm": 6.46875, "learning_rate": 6.89818921368767e-06, "loss": 1.084408, "memory(GiB)": 141.16, "step": 72820, "train_speed(iter/s)": 0.290605 }, { "acc": 0.74732842, "epoch": 0.8147115611809627, "grad_norm": 6.59375, "learning_rate": 6.896478119606214e-06, "loss": 1.00092926, "memory(GiB)": 141.16, "step": 72840, "train_speed(iter/s)": 0.290631 }, { "acc": 0.72401657, "epoch": 0.8149352601269212, "grad_norm": 6.5, "learning_rate": 6.894766766059444e-06, "loss": 1.10620823, "memory(GiB)": 141.16, "step": 72860, "train_speed(iter/s)": 0.290655 }, { "acc": 0.73118582, "epoch": 0.8151589590728797, "grad_norm": 7.59375, "learning_rate": 6.893055153281499e-06, "loss": 1.08863773, "memory(GiB)": 141.16, "step": 72880, "train_speed(iter/s)": 0.290679 }, { "acc": 0.73052483, "epoch": 0.8153826580188382, "grad_norm": 8.5625, "learning_rate": 6.8913432815065504e-06, "loss": 1.07559185, "memory(GiB)": 141.16, "step": 72900, "train_speed(iter/s)": 0.290706 }, { "acc": 0.73363228, "epoch": 0.8156063569647968, "grad_norm": 5.46875, "learning_rate": 6.889631150968807e-06, "loss": 1.06764269, "memory(GiB)": 141.16, "step": 72920, "train_speed(iter/s)": 0.290731 }, { "acc": 0.7338028, "epoch": 0.8158300559107553, "grad_norm": 7.4375, "learning_rate": 6.887918761902515e-06, "loss": 1.05942554, "memory(GiB)": 141.16, "step": 72940, "train_speed(iter/s)": 0.290755 }, { "acc": 0.73611331, "epoch": 0.8160537548567138, "grad_norm": 5.78125, "learning_rate": 6.886206114541951e-06, "loss": 1.05758486, "memory(GiB)": 141.16, "step": 72960, "train_speed(iter/s)": 0.290781 }, { "acc": 0.72396331, "epoch": 0.8162774538026724, "grad_norm": 6.40625, "learning_rate": 6.88449320912143e-06, "loss": 1.10216732, "memory(GiB)": 141.16, "step": 72980, "train_speed(iter/s)": 0.290805 }, { "acc": 0.71149879, "epoch": 0.8165011527486309, "grad_norm": 7.75, "learning_rate": 6.882780045875302e-06, "loss": 1.17916718, "memory(GiB)": 141.16, "step": 73000, "train_speed(iter/s)": 0.290829 }, { "acc": 0.72698884, "epoch": 0.8167248516945894, "grad_norm": 5.6875, "learning_rate": 6.8810666250379534e-06, "loss": 1.10840826, "memory(GiB)": 141.16, "step": 73020, "train_speed(iter/s)": 0.290856 }, { "acc": 0.73290567, "epoch": 0.8169485506405479, "grad_norm": 7.4375, "learning_rate": 6.879352946843802e-06, "loss": 1.07614784, "memory(GiB)": 141.16, "step": 73040, "train_speed(iter/s)": 0.290885 }, { "acc": 0.72409258, "epoch": 0.8171722495865065, "grad_norm": 7.6875, "learning_rate": 6.877639011527309e-06, "loss": 1.1037466, "memory(GiB)": 141.16, "step": 73060, "train_speed(iter/s)": 0.290911 }, { "acc": 0.73611507, "epoch": 0.817395948532465, "grad_norm": 6.6875, "learning_rate": 6.8759248193229584e-06, "loss": 1.05347786, "memory(GiB)": 141.16, "step": 73080, "train_speed(iter/s)": 0.290937 }, { "acc": 0.7199976, "epoch": 0.8176196474784235, "grad_norm": 8.875, "learning_rate": 6.874210370465281e-06, "loss": 1.15080585, "memory(GiB)": 141.16, "step": 73100, "train_speed(iter/s)": 0.290963 }, { "acc": 0.7308742, "epoch": 0.8178433464243821, "grad_norm": 6.46875, "learning_rate": 6.8724956651888355e-06, "loss": 1.07449455, "memory(GiB)": 141.16, "step": 73120, "train_speed(iter/s)": 0.290988 }, { "acc": 0.73778558, "epoch": 0.8180670453703406, "grad_norm": 5.96875, "learning_rate": 6.870780703728219e-06, "loss": 1.05824633, "memory(GiB)": 141.16, "step": 73140, "train_speed(iter/s)": 0.291015 }, { "acc": 0.72898989, "epoch": 0.8182907443162991, "grad_norm": 6.875, "learning_rate": 6.869065486318063e-06, "loss": 1.08154087, "memory(GiB)": 141.16, "step": 73160, "train_speed(iter/s)": 0.291041 }, { "acc": 0.72532992, "epoch": 0.8185144432622576, "grad_norm": 5.65625, "learning_rate": 6.867350013193032e-06, "loss": 1.11355152, "memory(GiB)": 141.16, "step": 73180, "train_speed(iter/s)": 0.291068 }, { "acc": 0.72563977, "epoch": 0.8187381422082162, "grad_norm": 7.9375, "learning_rate": 6.86563428458783e-06, "loss": 1.08840876, "memory(GiB)": 141.16, "step": 73200, "train_speed(iter/s)": 0.291094 }, { "acc": 0.73012924, "epoch": 0.8189618411541747, "grad_norm": 7.34375, "learning_rate": 6.863918300737191e-06, "loss": 1.10343475, "memory(GiB)": 141.16, "step": 73220, "train_speed(iter/s)": 0.291123 }, { "acc": 0.74052582, "epoch": 0.8191855401001332, "grad_norm": 10.375, "learning_rate": 6.862202061875888e-06, "loss": 1.04650612, "memory(GiB)": 141.16, "step": 73240, "train_speed(iter/s)": 0.29115 }, { "acc": 0.73477516, "epoch": 0.8194092390460918, "grad_norm": 9.125, "learning_rate": 6.860485568238725e-06, "loss": 1.06176395, "memory(GiB)": 141.16, "step": 73260, "train_speed(iter/s)": 0.291181 }, { "acc": 0.73355513, "epoch": 0.8196329379920503, "grad_norm": 7.25, "learning_rate": 6.858768820060544e-06, "loss": 1.06848011, "memory(GiB)": 141.16, "step": 73280, "train_speed(iter/s)": 0.291209 }, { "acc": 0.72157221, "epoch": 0.8198566369380088, "grad_norm": 7.5625, "learning_rate": 6.857051817576221e-06, "loss": 1.12897329, "memory(GiB)": 141.16, "step": 73300, "train_speed(iter/s)": 0.291235 }, { "acc": 0.74004188, "epoch": 0.8200803358839673, "grad_norm": 8.8125, "learning_rate": 6.855334561020666e-06, "loss": 1.02735634, "memory(GiB)": 141.16, "step": 73320, "train_speed(iter/s)": 0.291266 }, { "acc": 0.72683988, "epoch": 0.8203040348299259, "grad_norm": 6.78125, "learning_rate": 6.8536170506288226e-06, "loss": 1.08155804, "memory(GiB)": 141.16, "step": 73340, "train_speed(iter/s)": 0.29129 }, { "acc": 0.73716774, "epoch": 0.8205277337758844, "grad_norm": 9.4375, "learning_rate": 6.851899286635673e-06, "loss": 1.04201641, "memory(GiB)": 141.16, "step": 73360, "train_speed(iter/s)": 0.291318 }, { "acc": 0.7480505, "epoch": 0.8207514327218429, "grad_norm": 8.625, "learning_rate": 6.8501812692762325e-06, "loss": 0.99648323, "memory(GiB)": 141.16, "step": 73380, "train_speed(iter/s)": 0.291343 }, { "acc": 0.74283161, "epoch": 0.8209751316678014, "grad_norm": 6.03125, "learning_rate": 6.848462998785549e-06, "loss": 1.02248993, "memory(GiB)": 141.16, "step": 73400, "train_speed(iter/s)": 0.291368 }, { "acc": 0.72736521, "epoch": 0.82119883061376, "grad_norm": 6.5625, "learning_rate": 6.846744475398706e-06, "loss": 1.08944855, "memory(GiB)": 141.16, "step": 73420, "train_speed(iter/s)": 0.291393 }, { "acc": 0.73493805, "epoch": 0.8214225295597185, "grad_norm": 8.0625, "learning_rate": 6.845025699350822e-06, "loss": 1.06482754, "memory(GiB)": 141.16, "step": 73440, "train_speed(iter/s)": 0.291419 }, { "acc": 0.72977028, "epoch": 0.821646228505677, "grad_norm": 6.96875, "learning_rate": 6.843306670877053e-06, "loss": 1.07894297, "memory(GiB)": 141.16, "step": 73460, "train_speed(iter/s)": 0.291446 }, { "acc": 0.73643603, "epoch": 0.8218699274516356, "grad_norm": 6.6875, "learning_rate": 6.841587390212583e-06, "loss": 1.03931942, "memory(GiB)": 141.16, "step": 73480, "train_speed(iter/s)": 0.291473 }, { "acc": 0.72682953, "epoch": 0.8220936263975941, "grad_norm": 8.6875, "learning_rate": 6.839867857592634e-06, "loss": 1.09478359, "memory(GiB)": 141.16, "step": 73500, "train_speed(iter/s)": 0.291498 }, { "acc": 0.72903881, "epoch": 0.8223173253435526, "grad_norm": 7.21875, "learning_rate": 6.8381480732524675e-06, "loss": 1.09192829, "memory(GiB)": 141.16, "step": 73520, "train_speed(iter/s)": 0.291521 }, { "acc": 0.72853909, "epoch": 0.8225410242895111, "grad_norm": 6.84375, "learning_rate": 6.83642803742737e-06, "loss": 1.10241261, "memory(GiB)": 141.16, "step": 73540, "train_speed(iter/s)": 0.291547 }, { "acc": 0.71989212, "epoch": 0.8227647232354697, "grad_norm": 7.3125, "learning_rate": 6.834707750352667e-06, "loss": 1.14493446, "memory(GiB)": 141.16, "step": 73560, "train_speed(iter/s)": 0.291572 }, { "acc": 0.72611585, "epoch": 0.8229884221814282, "grad_norm": 9.4375, "learning_rate": 6.832987212263722e-06, "loss": 1.08943195, "memory(GiB)": 141.16, "step": 73580, "train_speed(iter/s)": 0.291599 }, { "acc": 0.73444724, "epoch": 0.8232121211273867, "grad_norm": 7.875, "learning_rate": 6.831266423395926e-06, "loss": 1.05318041, "memory(GiB)": 141.16, "step": 73600, "train_speed(iter/s)": 0.291627 }, { "acc": 0.73394299, "epoch": 0.8234358200733453, "grad_norm": 8.6875, "learning_rate": 6.829545383984708e-06, "loss": 1.06349106, "memory(GiB)": 141.16, "step": 73620, "train_speed(iter/s)": 0.291654 }, { "acc": 0.73693757, "epoch": 0.8236595190193038, "grad_norm": 6.3125, "learning_rate": 6.827824094265532e-06, "loss": 1.0465806, "memory(GiB)": 141.16, "step": 73640, "train_speed(iter/s)": 0.291684 }, { "acc": 0.73203411, "epoch": 0.8238832179652623, "grad_norm": 7.875, "learning_rate": 6.826102554473895e-06, "loss": 1.06709099, "memory(GiB)": 141.16, "step": 73660, "train_speed(iter/s)": 0.29171 }, { "acc": 0.71730857, "epoch": 0.8241069169112208, "grad_norm": 6.59375, "learning_rate": 6.8243807648453265e-06, "loss": 1.15099154, "memory(GiB)": 141.16, "step": 73680, "train_speed(iter/s)": 0.291734 }, { "acc": 0.72866549, "epoch": 0.8243306158571794, "grad_norm": 8.3125, "learning_rate": 6.822658725615394e-06, "loss": 1.08398838, "memory(GiB)": 141.16, "step": 73700, "train_speed(iter/s)": 0.291761 }, { "acc": 0.72513876, "epoch": 0.8245543148031379, "grad_norm": 6.28125, "learning_rate": 6.820936437019694e-06, "loss": 1.09692745, "memory(GiB)": 141.16, "step": 73720, "train_speed(iter/s)": 0.291788 }, { "acc": 0.72981806, "epoch": 0.8247780137490964, "grad_norm": 8.125, "learning_rate": 6.819213899293864e-06, "loss": 1.09151745, "memory(GiB)": 141.16, "step": 73740, "train_speed(iter/s)": 0.291814 }, { "acc": 0.72633734, "epoch": 0.825001712695055, "grad_norm": 8.125, "learning_rate": 6.8174911126735685e-06, "loss": 1.10942383, "memory(GiB)": 141.16, "step": 73760, "train_speed(iter/s)": 0.291837 }, { "acc": 0.73138952, "epoch": 0.8252254116410135, "grad_norm": 6.71875, "learning_rate": 6.815768077394511e-06, "loss": 1.08685141, "memory(GiB)": 141.16, "step": 73780, "train_speed(iter/s)": 0.291864 }, { "acc": 0.72114344, "epoch": 0.825449110586972, "grad_norm": 8.0625, "learning_rate": 6.81404479369243e-06, "loss": 1.12842398, "memory(GiB)": 141.16, "step": 73800, "train_speed(iter/s)": 0.291891 }, { "acc": 0.73156567, "epoch": 0.8256728095329305, "grad_norm": 7.125, "learning_rate": 6.81232126180309e-06, "loss": 1.08405933, "memory(GiB)": 141.16, "step": 73820, "train_speed(iter/s)": 0.291916 }, { "acc": 0.72879362, "epoch": 0.8258965084788891, "grad_norm": 6.46875, "learning_rate": 6.8105974819622965e-06, "loss": 1.10198879, "memory(GiB)": 141.16, "step": 73840, "train_speed(iter/s)": 0.291943 }, { "acc": 0.73424101, "epoch": 0.8261202074248476, "grad_norm": 9.875, "learning_rate": 6.8088734544058895e-06, "loss": 1.06703472, "memory(GiB)": 141.16, "step": 73860, "train_speed(iter/s)": 0.291966 }, { "acc": 0.72759619, "epoch": 0.8263439063708061, "grad_norm": 6.8125, "learning_rate": 6.8071491793697386e-06, "loss": 1.0971406, "memory(GiB)": 141.16, "step": 73880, "train_speed(iter/s)": 0.291993 }, { "acc": 0.72834821, "epoch": 0.8265676053167647, "grad_norm": 7.8125, "learning_rate": 6.805424657089752e-06, "loss": 1.08820238, "memory(GiB)": 141.16, "step": 73900, "train_speed(iter/s)": 0.292019 }, { "acc": 0.73265505, "epoch": 0.8267913042627232, "grad_norm": 8.0625, "learning_rate": 6.803699887801865e-06, "loss": 1.07348146, "memory(GiB)": 141.16, "step": 73920, "train_speed(iter/s)": 0.292041 }, { "acc": 0.74053831, "epoch": 0.8270150032086817, "grad_norm": 8.375, "learning_rate": 6.801974871742052e-06, "loss": 1.04196968, "memory(GiB)": 141.16, "step": 73940, "train_speed(iter/s)": 0.292067 }, { "acc": 0.72585144, "epoch": 0.8272387021546402, "grad_norm": 7.6875, "learning_rate": 6.800249609146321e-06, "loss": 1.09771347, "memory(GiB)": 141.16, "step": 73960, "train_speed(iter/s)": 0.292094 }, { "acc": 0.74463673, "epoch": 0.8274624011005988, "grad_norm": 8.3125, "learning_rate": 6.7985241002507116e-06, "loss": 1.0102478, "memory(GiB)": 141.16, "step": 73980, "train_speed(iter/s)": 0.292117 }, { "acc": 0.72092476, "epoch": 0.8276861000465573, "grad_norm": 6.375, "learning_rate": 6.7967983452913e-06, "loss": 1.12131462, "memory(GiB)": 141.16, "step": 74000, "train_speed(iter/s)": 0.29214 }, { "epoch": 0.8276861000465573, "eval_acc": 0.6899121592763093, "eval_loss": 1.079933524131775, "eval_runtime": 2319.6209, "eval_samples_per_second": 32.455, "eval_steps_per_second": 16.228, "step": 74000 }, { "acc": 0.72070818, "epoch": 0.8279097989925158, "grad_norm": 5.25, "learning_rate": 6.79507234450419e-06, "loss": 1.13357744, "memory(GiB)": 141.16, "step": 74020, "train_speed(iter/s)": 0.289455 }, { "acc": 0.72794251, "epoch": 0.8281334979384743, "grad_norm": 6.0, "learning_rate": 6.793346098125527e-06, "loss": 1.08876438, "memory(GiB)": 141.16, "step": 74040, "train_speed(iter/s)": 0.289482 }, { "acc": 0.72398543, "epoch": 0.8283571968844329, "grad_norm": 8.5625, "learning_rate": 6.791619606391486e-06, "loss": 1.12571926, "memory(GiB)": 141.16, "step": 74060, "train_speed(iter/s)": 0.289502 }, { "acc": 0.72593184, "epoch": 0.8285808958303914, "grad_norm": 5.875, "learning_rate": 6.789892869538273e-06, "loss": 1.11038761, "memory(GiB)": 141.16, "step": 74080, "train_speed(iter/s)": 0.289529 }, { "acc": 0.72809248, "epoch": 0.8288045947763499, "grad_norm": 8.1875, "learning_rate": 6.7881658878021335e-06, "loss": 1.09160805, "memory(GiB)": 141.16, "step": 74100, "train_speed(iter/s)": 0.289555 }, { "acc": 0.73478947, "epoch": 0.8290282937223085, "grad_norm": 9.5, "learning_rate": 6.786438661419341e-06, "loss": 1.07656803, "memory(GiB)": 141.16, "step": 74120, "train_speed(iter/s)": 0.289584 }, { "acc": 0.73681431, "epoch": 0.8292519926682671, "grad_norm": 7.4375, "learning_rate": 6.784711190626205e-06, "loss": 1.06873322, "memory(GiB)": 141.16, "step": 74140, "train_speed(iter/s)": 0.289612 }, { "acc": 0.74629202, "epoch": 0.8294756916142256, "grad_norm": 6.6875, "learning_rate": 6.78298347565907e-06, "loss": 1.00335073, "memory(GiB)": 141.16, "step": 74160, "train_speed(iter/s)": 0.289638 }, { "acc": 0.72767382, "epoch": 0.8296993905601842, "grad_norm": 6.15625, "learning_rate": 6.7812555167543106e-06, "loss": 1.10665522, "memory(GiB)": 141.16, "step": 74180, "train_speed(iter/s)": 0.289661 }, { "acc": 0.71086974, "epoch": 0.8299230895061427, "grad_norm": 5.21875, "learning_rate": 6.7795273141483365e-06, "loss": 1.17356176, "memory(GiB)": 141.16, "step": 74200, "train_speed(iter/s)": 0.289686 }, { "acc": 0.72878699, "epoch": 0.8301467884521012, "grad_norm": 8.125, "learning_rate": 6.777798868077589e-06, "loss": 1.09389639, "memory(GiB)": 141.16, "step": 74220, "train_speed(iter/s)": 0.289711 }, { "acc": 0.73028202, "epoch": 0.8303704873980597, "grad_norm": 8.0625, "learning_rate": 6.776070178778549e-06, "loss": 1.08403845, "memory(GiB)": 141.16, "step": 74240, "train_speed(iter/s)": 0.289734 }, { "acc": 0.72816596, "epoch": 0.8305941863440183, "grad_norm": 6.03125, "learning_rate": 6.774341246487719e-06, "loss": 1.09402943, "memory(GiB)": 141.16, "step": 74260, "train_speed(iter/s)": 0.289759 }, { "acc": 0.73896041, "epoch": 0.8308178852899768, "grad_norm": 7.0, "learning_rate": 6.772612071441647e-06, "loss": 1.04739943, "memory(GiB)": 141.16, "step": 74280, "train_speed(iter/s)": 0.289785 }, { "acc": 0.73864279, "epoch": 0.8310415842359353, "grad_norm": 8.625, "learning_rate": 6.7708826538769064e-06, "loss": 1.05197086, "memory(GiB)": 141.16, "step": 74300, "train_speed(iter/s)": 0.289812 }, { "acc": 0.72449317, "epoch": 0.8312652831818939, "grad_norm": 7.15625, "learning_rate": 6.7691529940301085e-06, "loss": 1.10533457, "memory(GiB)": 141.16, "step": 74320, "train_speed(iter/s)": 0.289838 }, { "acc": 0.71675129, "epoch": 0.8314889821278524, "grad_norm": 5.875, "learning_rate": 6.767423092137894e-06, "loss": 1.16027508, "memory(GiB)": 141.16, "step": 74340, "train_speed(iter/s)": 0.289863 }, { "acc": 0.71804457, "epoch": 0.8317126810738109, "grad_norm": 7.78125, "learning_rate": 6.765692948436936e-06, "loss": 1.13966532, "memory(GiB)": 141.16, "step": 74360, "train_speed(iter/s)": 0.28989 }, { "acc": 0.72287593, "epoch": 0.8319363800197694, "grad_norm": 7.71875, "learning_rate": 6.763962563163946e-06, "loss": 1.11679077, "memory(GiB)": 141.16, "step": 74380, "train_speed(iter/s)": 0.289915 }, { "acc": 0.72597537, "epoch": 0.832160078965728, "grad_norm": 6.5625, "learning_rate": 6.7622319365556655e-06, "loss": 1.09481449, "memory(GiB)": 141.16, "step": 74400, "train_speed(iter/s)": 0.289941 }, { "acc": 0.73513207, "epoch": 0.8323837779116865, "grad_norm": 6.21875, "learning_rate": 6.760501068848867e-06, "loss": 1.05567226, "memory(GiB)": 141.16, "step": 74420, "train_speed(iter/s)": 0.289966 }, { "acc": 0.7374754, "epoch": 0.832607476857645, "grad_norm": 6.8125, "learning_rate": 6.75876996028036e-06, "loss": 1.0455409, "memory(GiB)": 141.16, "step": 74440, "train_speed(iter/s)": 0.289991 }, { "acc": 0.72747602, "epoch": 0.8328311758036036, "grad_norm": 6.875, "learning_rate": 6.757038611086984e-06, "loss": 1.08505898, "memory(GiB)": 141.16, "step": 74460, "train_speed(iter/s)": 0.290015 }, { "acc": 0.72268047, "epoch": 0.8330548747495621, "grad_norm": 6.6875, "learning_rate": 6.75530702150561e-06, "loss": 1.11429253, "memory(GiB)": 141.16, "step": 74480, "train_speed(iter/s)": 0.290042 }, { "acc": 0.73660913, "epoch": 0.8332785736955206, "grad_norm": 7.46875, "learning_rate": 6.7535751917731474e-06, "loss": 1.04515457, "memory(GiB)": 141.16, "step": 74500, "train_speed(iter/s)": 0.290069 }, { "acc": 0.72503929, "epoch": 0.8335022726414791, "grad_norm": 8.0625, "learning_rate": 6.751843122126534e-06, "loss": 1.11795731, "memory(GiB)": 141.16, "step": 74520, "train_speed(iter/s)": 0.290095 }, { "acc": 0.72204757, "epoch": 0.8337259715874377, "grad_norm": 7.5, "learning_rate": 6.750110812802744e-06, "loss": 1.11957474, "memory(GiB)": 141.16, "step": 74540, "train_speed(iter/s)": 0.290121 }, { "acc": 0.74068499, "epoch": 0.8339496705333962, "grad_norm": 6.71875, "learning_rate": 6.7483782640387776e-06, "loss": 1.02849064, "memory(GiB)": 141.16, "step": 74560, "train_speed(iter/s)": 0.290145 }, { "acc": 0.73023548, "epoch": 0.8341733694793547, "grad_norm": 8.4375, "learning_rate": 6.746645476071675e-06, "loss": 1.08672428, "memory(GiB)": 141.16, "step": 74580, "train_speed(iter/s)": 0.290173 }, { "acc": 0.74461432, "epoch": 0.8343970684253132, "grad_norm": 5.5625, "learning_rate": 6.744912449138505e-06, "loss": 1.03496914, "memory(GiB)": 141.16, "step": 74600, "train_speed(iter/s)": 0.290201 }, { "acc": 0.72000246, "epoch": 0.8346207673712718, "grad_norm": 7.625, "learning_rate": 6.743179183476373e-06, "loss": 1.13022766, "memory(GiB)": 141.16, "step": 74620, "train_speed(iter/s)": 0.290229 }, { "acc": 0.74658899, "epoch": 0.8348444663172303, "grad_norm": 8.75, "learning_rate": 6.7414456793224135e-06, "loss": 1.01536617, "memory(GiB)": 141.16, "step": 74640, "train_speed(iter/s)": 0.290254 }, { "acc": 0.73276386, "epoch": 0.8350681652631888, "grad_norm": 6.875, "learning_rate": 6.739711936913793e-06, "loss": 1.07514877, "memory(GiB)": 141.16, "step": 74660, "train_speed(iter/s)": 0.290282 }, { "acc": 0.73424482, "epoch": 0.8352918642091474, "grad_norm": 7.21875, "learning_rate": 6.737977956487714e-06, "loss": 1.05066776, "memory(GiB)": 141.16, "step": 74680, "train_speed(iter/s)": 0.290302 }, { "acc": 0.73611031, "epoch": 0.8355155631551059, "grad_norm": 5.6875, "learning_rate": 6.736243738281407e-06, "loss": 1.05891151, "memory(GiB)": 141.16, "step": 74700, "train_speed(iter/s)": 0.290328 }, { "acc": 0.73106275, "epoch": 0.8357392621010644, "grad_norm": 9.5625, "learning_rate": 6.734509282532141e-06, "loss": 1.0812705, "memory(GiB)": 141.16, "step": 74720, "train_speed(iter/s)": 0.290356 }, { "acc": 0.74711981, "epoch": 0.835962961047023, "grad_norm": 7.625, "learning_rate": 6.732774589477216e-06, "loss": 1.01155281, "memory(GiB)": 141.16, "step": 74740, "train_speed(iter/s)": 0.290383 }, { "acc": 0.73361635, "epoch": 0.8361866599929815, "grad_norm": 8.0, "learning_rate": 6.731039659353958e-06, "loss": 1.05596371, "memory(GiB)": 141.16, "step": 74760, "train_speed(iter/s)": 0.290406 }, { "acc": 0.72856798, "epoch": 0.83641035893894, "grad_norm": 7.09375, "learning_rate": 6.729304492399731e-06, "loss": 1.08723516, "memory(GiB)": 141.16, "step": 74780, "train_speed(iter/s)": 0.290433 }, { "acc": 0.72763491, "epoch": 0.8366340578848985, "grad_norm": 6.90625, "learning_rate": 6.727569088851933e-06, "loss": 1.08819027, "memory(GiB)": 141.16, "step": 74800, "train_speed(iter/s)": 0.290459 }, { "acc": 0.73199911, "epoch": 0.8368577568308571, "grad_norm": 6.09375, "learning_rate": 6.725833448947992e-06, "loss": 1.08810959, "memory(GiB)": 141.16, "step": 74820, "train_speed(iter/s)": 0.290484 }, { "acc": 0.71275692, "epoch": 0.8370814557768156, "grad_norm": 7.5, "learning_rate": 6.724097572925366e-06, "loss": 1.16362839, "memory(GiB)": 141.16, "step": 74840, "train_speed(iter/s)": 0.290512 }, { "acc": 0.73528543, "epoch": 0.8373051547227741, "grad_norm": 6.46875, "learning_rate": 6.72236146102155e-06, "loss": 1.06164494, "memory(GiB)": 141.16, "step": 74860, "train_speed(iter/s)": 0.290537 }, { "acc": 0.73873682, "epoch": 0.8375288536687326, "grad_norm": 8.125, "learning_rate": 6.720625113474069e-06, "loss": 1.0383419, "memory(GiB)": 141.16, "step": 74880, "train_speed(iter/s)": 0.290564 }, { "acc": 0.72623415, "epoch": 0.8377525526146912, "grad_norm": 6.65625, "learning_rate": 6.718888530520476e-06, "loss": 1.11045055, "memory(GiB)": 141.16, "step": 74900, "train_speed(iter/s)": 0.290586 }, { "acc": 0.73153305, "epoch": 0.8379762515606497, "grad_norm": 6.03125, "learning_rate": 6.7171517123983655e-06, "loss": 1.08532181, "memory(GiB)": 141.16, "step": 74920, "train_speed(iter/s)": 0.29061 }, { "acc": 0.73099051, "epoch": 0.8381999505066082, "grad_norm": 5.5, "learning_rate": 6.7154146593453565e-06, "loss": 1.07909803, "memory(GiB)": 141.16, "step": 74940, "train_speed(iter/s)": 0.290637 }, { "acc": 0.73714619, "epoch": 0.8384236494525668, "grad_norm": 7.3125, "learning_rate": 6.713677371599103e-06, "loss": 1.05917263, "memory(GiB)": 141.16, "step": 74960, "train_speed(iter/s)": 0.290663 }, { "acc": 0.73386068, "epoch": 0.8386473483985253, "grad_norm": 7.59375, "learning_rate": 6.711939849397291e-06, "loss": 1.04722595, "memory(GiB)": 141.16, "step": 74980, "train_speed(iter/s)": 0.29069 }, { "acc": 0.73743534, "epoch": 0.8388710473444838, "grad_norm": 8.375, "learning_rate": 6.710202092977638e-06, "loss": 1.05151501, "memory(GiB)": 141.16, "step": 75000, "train_speed(iter/s)": 0.290715 }, { "acc": 0.72439752, "epoch": 0.8390947462904423, "grad_norm": 9.1875, "learning_rate": 6.708464102577895e-06, "loss": 1.11263599, "memory(GiB)": 141.16, "step": 75020, "train_speed(iter/s)": 0.290739 }, { "acc": 0.73313513, "epoch": 0.8393184452364009, "grad_norm": 7.28125, "learning_rate": 6.706725878435842e-06, "loss": 1.08180122, "memory(GiB)": 141.16, "step": 75040, "train_speed(iter/s)": 0.290767 }, { "acc": 0.75303411, "epoch": 0.8395421441823594, "grad_norm": 8.1875, "learning_rate": 6.7049874207892965e-06, "loss": 0.98851624, "memory(GiB)": 141.16, "step": 75060, "train_speed(iter/s)": 0.290793 }, { "acc": 0.72479219, "epoch": 0.8397658431283179, "grad_norm": 8.125, "learning_rate": 6.7032487298761e-06, "loss": 1.11198578, "memory(GiB)": 141.16, "step": 75080, "train_speed(iter/s)": 0.29082 }, { "acc": 0.72724743, "epoch": 0.8399895420742765, "grad_norm": 7.75, "learning_rate": 6.7015098059341325e-06, "loss": 1.08947372, "memory(GiB)": 141.16, "step": 75100, "train_speed(iter/s)": 0.290848 }, { "acc": 0.73651972, "epoch": 0.840213241020235, "grad_norm": 8.25, "learning_rate": 6.699770649201304e-06, "loss": 1.05131636, "memory(GiB)": 141.16, "step": 75120, "train_speed(iter/s)": 0.290874 }, { "acc": 0.73085027, "epoch": 0.8404369399661935, "grad_norm": 6.75, "learning_rate": 6.698031259915554e-06, "loss": 1.07862387, "memory(GiB)": 141.16, "step": 75140, "train_speed(iter/s)": 0.290901 }, { "acc": 0.73560514, "epoch": 0.840660638912152, "grad_norm": 8.25, "learning_rate": 6.696291638314859e-06, "loss": 1.05088806, "memory(GiB)": 141.16, "step": 75160, "train_speed(iter/s)": 0.29093 }, { "acc": 0.72871799, "epoch": 0.8408843378581106, "grad_norm": 7.21875, "learning_rate": 6.694551784637222e-06, "loss": 1.08567343, "memory(GiB)": 141.16, "step": 75180, "train_speed(iter/s)": 0.290958 }, { "acc": 0.73455944, "epoch": 0.8411080368040691, "grad_norm": 6.4375, "learning_rate": 6.692811699120678e-06, "loss": 1.06397409, "memory(GiB)": 141.16, "step": 75200, "train_speed(iter/s)": 0.29098 }, { "acc": 0.72600603, "epoch": 0.8413317357500276, "grad_norm": 7.375, "learning_rate": 6.6910713820033e-06, "loss": 1.08604813, "memory(GiB)": 141.16, "step": 75220, "train_speed(iter/s)": 0.291008 }, { "acc": 0.71780939, "epoch": 0.8415554346959861, "grad_norm": 6.90625, "learning_rate": 6.689330833523184e-06, "loss": 1.14866276, "memory(GiB)": 141.16, "step": 75240, "train_speed(iter/s)": 0.291031 }, { "acc": 0.73627005, "epoch": 0.8417791336419447, "grad_norm": 7.65625, "learning_rate": 6.687590053918467e-06, "loss": 1.0463562, "memory(GiB)": 141.16, "step": 75260, "train_speed(iter/s)": 0.291056 }, { "acc": 0.71213064, "epoch": 0.8420028325879032, "grad_norm": 7.375, "learning_rate": 6.6858490434273075e-06, "loss": 1.16993675, "memory(GiB)": 141.16, "step": 75280, "train_speed(iter/s)": 0.291079 }, { "acc": 0.73672171, "epoch": 0.8422265315338617, "grad_norm": 6.0, "learning_rate": 6.6841078022879025e-06, "loss": 1.05592041, "memory(GiB)": 141.16, "step": 75300, "train_speed(iter/s)": 0.291103 }, { "acc": 0.72912216, "epoch": 0.8424502304798203, "grad_norm": 7.90625, "learning_rate": 6.6823663307384774e-06, "loss": 1.0944315, "memory(GiB)": 141.16, "step": 75320, "train_speed(iter/s)": 0.291123 }, { "acc": 0.7444561, "epoch": 0.8426739294257788, "grad_norm": 6.21875, "learning_rate": 6.680624629017294e-06, "loss": 1.0139473, "memory(GiB)": 141.16, "step": 75340, "train_speed(iter/s)": 0.291146 }, { "acc": 0.7311264, "epoch": 0.8428976283717373, "grad_norm": 6.34375, "learning_rate": 6.6788826973626385e-06, "loss": 1.07505188, "memory(GiB)": 141.16, "step": 75360, "train_speed(iter/s)": 0.291172 }, { "acc": 0.73394184, "epoch": 0.8431213273176958, "grad_norm": 8.9375, "learning_rate": 6.677140536012834e-06, "loss": 1.05881023, "memory(GiB)": 141.16, "step": 75380, "train_speed(iter/s)": 0.291196 }, { "acc": 0.71441507, "epoch": 0.8433450262636544, "grad_norm": 7.9375, "learning_rate": 6.675398145206231e-06, "loss": 1.1547554, "memory(GiB)": 141.16, "step": 75400, "train_speed(iter/s)": 0.291221 }, { "acc": 0.73957357, "epoch": 0.8435687252096129, "grad_norm": 8.1875, "learning_rate": 6.6736555251812164e-06, "loss": 1.0696907, "memory(GiB)": 141.16, "step": 75420, "train_speed(iter/s)": 0.291241 }, { "acc": 0.73162136, "epoch": 0.8437924241555714, "grad_norm": 9.0625, "learning_rate": 6.671912676176202e-06, "loss": 1.07749424, "memory(GiB)": 141.16, "step": 75440, "train_speed(iter/s)": 0.291269 }, { "acc": 0.71865873, "epoch": 0.84401612310153, "grad_norm": 7.4375, "learning_rate": 6.670169598429638e-06, "loss": 1.13415232, "memory(GiB)": 141.16, "step": 75460, "train_speed(iter/s)": 0.291295 }, { "acc": 0.7363833, "epoch": 0.8442398220474885, "grad_norm": 8.0, "learning_rate": 6.668426292180002e-06, "loss": 1.04075384, "memory(GiB)": 141.16, "step": 75480, "train_speed(iter/s)": 0.291323 }, { "acc": 0.72802572, "epoch": 0.844463520993447, "grad_norm": 9.1875, "learning_rate": 6.6666827576657985e-06, "loss": 1.08740501, "memory(GiB)": 141.16, "step": 75500, "train_speed(iter/s)": 0.291351 }, { "acc": 0.72365074, "epoch": 0.8446872199394055, "grad_norm": 6.53125, "learning_rate": 6.664938995125573e-06, "loss": 1.1070097, "memory(GiB)": 141.16, "step": 75520, "train_speed(iter/s)": 0.291377 }, { "acc": 0.73894587, "epoch": 0.8449109188853641, "grad_norm": 8.1875, "learning_rate": 6.663195004797896e-06, "loss": 1.03404865, "memory(GiB)": 141.16, "step": 75540, "train_speed(iter/s)": 0.291402 }, { "acc": 0.72734847, "epoch": 0.8451346178313226, "grad_norm": 8.9375, "learning_rate": 6.661450786921368e-06, "loss": 1.07723427, "memory(GiB)": 141.16, "step": 75560, "train_speed(iter/s)": 0.291424 }, { "acc": 0.7263998, "epoch": 0.8453583167772811, "grad_norm": 9.125, "learning_rate": 6.6597063417346266e-06, "loss": 1.08605747, "memory(GiB)": 141.16, "step": 75580, "train_speed(iter/s)": 0.291452 }, { "acc": 0.7151741, "epoch": 0.8455820157232397, "grad_norm": 6.75, "learning_rate": 6.6579616694763334e-06, "loss": 1.15819855, "memory(GiB)": 141.16, "step": 75600, "train_speed(iter/s)": 0.291477 }, { "acc": 0.72644911, "epoch": 0.8458057146691982, "grad_norm": 9.125, "learning_rate": 6.656216770385188e-06, "loss": 1.10509691, "memory(GiB)": 141.16, "step": 75620, "train_speed(iter/s)": 0.291501 }, { "acc": 0.72804928, "epoch": 0.8460294136151567, "grad_norm": 5.78125, "learning_rate": 6.654471644699914e-06, "loss": 1.08449535, "memory(GiB)": 141.16, "step": 75640, "train_speed(iter/s)": 0.291528 }, { "acc": 0.73495021, "epoch": 0.8462531125611152, "grad_norm": 7.9375, "learning_rate": 6.652726292659272e-06, "loss": 1.06100378, "memory(GiB)": 141.16, "step": 75660, "train_speed(iter/s)": 0.291554 }, { "acc": 0.71586409, "epoch": 0.8464768115070738, "grad_norm": 6.4375, "learning_rate": 6.650980714502051e-06, "loss": 1.14413395, "memory(GiB)": 141.16, "step": 75680, "train_speed(iter/s)": 0.291578 }, { "acc": 0.73922176, "epoch": 0.8467005104530323, "grad_norm": 8.1875, "learning_rate": 6.649234910467068e-06, "loss": 1.0412262, "memory(GiB)": 141.16, "step": 75700, "train_speed(iter/s)": 0.291602 }, { "acc": 0.74303317, "epoch": 0.8469242093989908, "grad_norm": 7.0, "learning_rate": 6.647488880793178e-06, "loss": 1.03870316, "memory(GiB)": 141.16, "step": 75720, "train_speed(iter/s)": 0.291627 }, { "acc": 0.72298732, "epoch": 0.8471479083449494, "grad_norm": 7.625, "learning_rate": 6.64574262571926e-06, "loss": 1.11700706, "memory(GiB)": 141.16, "step": 75740, "train_speed(iter/s)": 0.291651 }, { "acc": 0.73918066, "epoch": 0.8473716072909079, "grad_norm": 6.15625, "learning_rate": 6.6439961454842285e-06, "loss": 1.03618202, "memory(GiB)": 141.16, "step": 75760, "train_speed(iter/s)": 0.291677 }, { "acc": 0.72999253, "epoch": 0.8475953062368664, "grad_norm": 7.46875, "learning_rate": 6.642249440327026e-06, "loss": 1.09003716, "memory(GiB)": 141.16, "step": 75780, "train_speed(iter/s)": 0.291705 }, { "acc": 0.71559105, "epoch": 0.8478190051828249, "grad_norm": 6.875, "learning_rate": 6.640502510486628e-06, "loss": 1.14843197, "memory(GiB)": 141.16, "step": 75800, "train_speed(iter/s)": 0.291732 }, { "acc": 0.72811794, "epoch": 0.8480427041287835, "grad_norm": 8.625, "learning_rate": 6.638755356202037e-06, "loss": 1.09907646, "memory(GiB)": 141.16, "step": 75820, "train_speed(iter/s)": 0.291758 }, { "acc": 0.73114872, "epoch": 0.848266403074742, "grad_norm": 5.875, "learning_rate": 6.637007977712291e-06, "loss": 1.0781476, "memory(GiB)": 141.16, "step": 75840, "train_speed(iter/s)": 0.291783 }, { "acc": 0.73680992, "epoch": 0.8484901020207005, "grad_norm": 8.25, "learning_rate": 6.635260375256453e-06, "loss": 1.03351946, "memory(GiB)": 141.16, "step": 75860, "train_speed(iter/s)": 0.291808 }, { "acc": 0.72882276, "epoch": 0.848713800966659, "grad_norm": 7.84375, "learning_rate": 6.633512549073626e-06, "loss": 1.09952469, "memory(GiB)": 141.16, "step": 75880, "train_speed(iter/s)": 0.291835 }, { "acc": 0.72494888, "epoch": 0.8489374999126176, "grad_norm": 9.0625, "learning_rate": 6.631764499402932e-06, "loss": 1.11504612, "memory(GiB)": 141.16, "step": 75900, "train_speed(iter/s)": 0.291856 }, { "acc": 0.7300725, "epoch": 0.8491611988585761, "grad_norm": 7.84375, "learning_rate": 6.630016226483531e-06, "loss": 1.06514359, "memory(GiB)": 141.16, "step": 75920, "train_speed(iter/s)": 0.291882 }, { "acc": 0.72176638, "epoch": 0.8493848978045346, "grad_norm": 9.0, "learning_rate": 6.628267730554613e-06, "loss": 1.12939281, "memory(GiB)": 141.16, "step": 75940, "train_speed(iter/s)": 0.291906 }, { "acc": 0.73403444, "epoch": 0.8496085967504932, "grad_norm": 6.75, "learning_rate": 6.6265190118553945e-06, "loss": 1.06697807, "memory(GiB)": 141.16, "step": 75960, "train_speed(iter/s)": 0.29193 }, { "acc": 0.73006859, "epoch": 0.8498322956964517, "grad_norm": 7.625, "learning_rate": 6.624770070625129e-06, "loss": 1.0903204, "memory(GiB)": 141.16, "step": 75980, "train_speed(iter/s)": 0.291957 }, { "acc": 0.73475237, "epoch": 0.8500559946424102, "grad_norm": 5.375, "learning_rate": 6.623020907103093e-06, "loss": 1.06549644, "memory(GiB)": 141.16, "step": 76000, "train_speed(iter/s)": 0.291981 }, { "epoch": 0.8500559946424102, "eval_acc": 0.6899364132305912, "eval_loss": 1.079738974571228, "eval_runtime": 2322.0445, "eval_samples_per_second": 32.421, "eval_steps_per_second": 16.211, "step": 76000 }, { "acc": 0.74847269, "epoch": 0.8502796935883687, "grad_norm": 8.375, "learning_rate": 6.6212715215286e-06, "loss": 0.98972397, "memory(GiB)": 141.16, "step": 76020, "train_speed(iter/s)": 0.28937 }, { "acc": 0.73776665, "epoch": 0.8505033925343273, "grad_norm": 7.71875, "learning_rate": 6.619521914140988e-06, "loss": 1.02347889, "memory(GiB)": 141.16, "step": 76040, "train_speed(iter/s)": 0.289397 }, { "acc": 0.72764654, "epoch": 0.8507270914802858, "grad_norm": 8.0625, "learning_rate": 6.61777208517963e-06, "loss": 1.09115582, "memory(GiB)": 141.16, "step": 76060, "train_speed(iter/s)": 0.289423 }, { "acc": 0.7199801, "epoch": 0.8509507904262443, "grad_norm": 5.375, "learning_rate": 6.616022034883928e-06, "loss": 1.12517691, "memory(GiB)": 141.16, "step": 76080, "train_speed(iter/s)": 0.289448 }, { "acc": 0.7340519, "epoch": 0.8511744893722029, "grad_norm": 6.59375, "learning_rate": 6.614271763493314e-06, "loss": 1.0726016, "memory(GiB)": 141.16, "step": 76100, "train_speed(iter/s)": 0.289471 }, { "acc": 0.73303881, "epoch": 0.8513981883181614, "grad_norm": 9.0625, "learning_rate": 6.6125212712472485e-06, "loss": 1.07196503, "memory(GiB)": 141.16, "step": 76120, "train_speed(iter/s)": 0.289498 }, { "acc": 0.73804779, "epoch": 0.8516218872641199, "grad_norm": 6.1875, "learning_rate": 6.610770558385224e-06, "loss": 1.05555973, "memory(GiB)": 141.16, "step": 76140, "train_speed(iter/s)": 0.289522 }, { "acc": 0.72503381, "epoch": 0.8518455862100784, "grad_norm": 7.375, "learning_rate": 6.6090196251467655e-06, "loss": 1.09481449, "memory(GiB)": 141.16, "step": 76160, "train_speed(iter/s)": 0.289548 }, { "acc": 0.74215622, "epoch": 0.852069285156037, "grad_norm": 6.8125, "learning_rate": 6.607268471771424e-06, "loss": 1.01985312, "memory(GiB)": 141.16, "step": 76180, "train_speed(iter/s)": 0.289576 }, { "acc": 0.71584764, "epoch": 0.8522929841019955, "grad_norm": 8.1875, "learning_rate": 6.605517098498783e-06, "loss": 1.17046824, "memory(GiB)": 141.16, "step": 76200, "train_speed(iter/s)": 0.289601 }, { "acc": 0.733988, "epoch": 0.852516683047954, "grad_norm": 6.65625, "learning_rate": 6.603765505568452e-06, "loss": 1.05916557, "memory(GiB)": 141.16, "step": 76220, "train_speed(iter/s)": 0.289625 }, { "acc": 0.7342927, "epoch": 0.8527403819939126, "grad_norm": 6.9375, "learning_rate": 6.6020136932200796e-06, "loss": 1.05429382, "memory(GiB)": 141.16, "step": 76240, "train_speed(iter/s)": 0.289648 }, { "acc": 0.73148685, "epoch": 0.8529640809398711, "grad_norm": 5.84375, "learning_rate": 6.6002616616933345e-06, "loss": 1.06880188, "memory(GiB)": 141.16, "step": 76260, "train_speed(iter/s)": 0.289673 }, { "acc": 0.7351716, "epoch": 0.8531877798858296, "grad_norm": 7.125, "learning_rate": 6.5985094112279204e-06, "loss": 1.06722069, "memory(GiB)": 141.16, "step": 76280, "train_speed(iter/s)": 0.2897 }, { "acc": 0.72465391, "epoch": 0.8534114788317881, "grad_norm": 6.46875, "learning_rate": 6.596756942063573e-06, "loss": 1.10329132, "memory(GiB)": 141.16, "step": 76300, "train_speed(iter/s)": 0.289726 }, { "acc": 0.72249112, "epoch": 0.8536351777777467, "grad_norm": 8.625, "learning_rate": 6.595004254440051e-06, "loss": 1.13139715, "memory(GiB)": 141.16, "step": 76320, "train_speed(iter/s)": 0.28975 }, { "acc": 0.73838568, "epoch": 0.8538588767237052, "grad_norm": 6.96875, "learning_rate": 6.593251348597151e-06, "loss": 1.0444109, "memory(GiB)": 141.16, "step": 76340, "train_speed(iter/s)": 0.289776 }, { "acc": 0.72432179, "epoch": 0.8540825756696637, "grad_norm": 6.84375, "learning_rate": 6.591498224774692e-06, "loss": 1.11137753, "memory(GiB)": 141.16, "step": 76360, "train_speed(iter/s)": 0.289803 }, { "acc": 0.72152176, "epoch": 0.8543062746156223, "grad_norm": 7.53125, "learning_rate": 6.589744883212529e-06, "loss": 1.12419415, "memory(GiB)": 141.16, "step": 76380, "train_speed(iter/s)": 0.28983 }, { "acc": 0.7285078, "epoch": 0.8545299735615808, "grad_norm": 7.1875, "learning_rate": 6.587991324150544e-06, "loss": 1.08632221, "memory(GiB)": 141.16, "step": 76400, "train_speed(iter/s)": 0.289857 }, { "acc": 0.73438673, "epoch": 0.8547536725075393, "grad_norm": 7.625, "learning_rate": 6.586237547828647e-06, "loss": 1.07621756, "memory(GiB)": 141.16, "step": 76420, "train_speed(iter/s)": 0.289884 }, { "acc": 0.72450371, "epoch": 0.8549773714534978, "grad_norm": 5.875, "learning_rate": 6.58448355448678e-06, "loss": 1.12183208, "memory(GiB)": 141.16, "step": 76440, "train_speed(iter/s)": 0.289909 }, { "acc": 0.71702814, "epoch": 0.8552010703994564, "grad_norm": 6.875, "learning_rate": 6.5827293443649164e-06, "loss": 1.13558521, "memory(GiB)": 141.16, "step": 76460, "train_speed(iter/s)": 0.289934 }, { "acc": 0.73030434, "epoch": 0.8554247693454149, "grad_norm": 5.28125, "learning_rate": 6.580974917703056e-06, "loss": 1.08185673, "memory(GiB)": 141.16, "step": 76480, "train_speed(iter/s)": 0.289956 }, { "acc": 0.72520838, "epoch": 0.8556484682913734, "grad_norm": 8.1875, "learning_rate": 6.57922027474123e-06, "loss": 1.1072238, "memory(GiB)": 141.16, "step": 76500, "train_speed(iter/s)": 0.289979 }, { "acc": 0.73442764, "epoch": 0.855872167237332, "grad_norm": 6.4375, "learning_rate": 6.577465415719498e-06, "loss": 1.06930676, "memory(GiB)": 141.16, "step": 76520, "train_speed(iter/s)": 0.290006 }, { "acc": 0.72328272, "epoch": 0.8560958661832905, "grad_norm": 6.15625, "learning_rate": 6.57571034087795e-06, "loss": 1.11277761, "memory(GiB)": 141.16, "step": 76540, "train_speed(iter/s)": 0.290031 }, { "acc": 0.72963042, "epoch": 0.856319565129249, "grad_norm": 6.0625, "learning_rate": 6.573955050456704e-06, "loss": 1.08780003, "memory(GiB)": 141.16, "step": 76560, "train_speed(iter/s)": 0.290054 }, { "acc": 0.72897301, "epoch": 0.8565432640752075, "grad_norm": 6.65625, "learning_rate": 6.572199544695912e-06, "loss": 1.09801083, "memory(GiB)": 141.16, "step": 76580, "train_speed(iter/s)": 0.29008 }, { "acc": 0.72950807, "epoch": 0.8567669630211661, "grad_norm": 7.71875, "learning_rate": 6.5704438238357505e-06, "loss": 1.09374323, "memory(GiB)": 141.16, "step": 76600, "train_speed(iter/s)": 0.290105 }, { "acc": 0.73961606, "epoch": 0.8569906619671246, "grad_norm": 6.78125, "learning_rate": 6.568687888116426e-06, "loss": 1.03636494, "memory(GiB)": 141.16, "step": 76620, "train_speed(iter/s)": 0.29013 }, { "acc": 0.73544807, "epoch": 0.8572143609130831, "grad_norm": 7.375, "learning_rate": 6.566931737778177e-06, "loss": 1.06128387, "memory(GiB)": 141.16, "step": 76640, "train_speed(iter/s)": 0.290153 }, { "acc": 0.73945408, "epoch": 0.8574380598590418, "grad_norm": 6.34375, "learning_rate": 6.565175373061269e-06, "loss": 1.04450474, "memory(GiB)": 141.16, "step": 76660, "train_speed(iter/s)": 0.290177 }, { "acc": 0.73222265, "epoch": 0.8576617588050003, "grad_norm": 6.8125, "learning_rate": 6.563418794205999e-06, "loss": 1.0788002, "memory(GiB)": 141.16, "step": 76680, "train_speed(iter/s)": 0.290204 }, { "acc": 0.72572803, "epoch": 0.8578854577509588, "grad_norm": 6.9375, "learning_rate": 6.561662001452691e-06, "loss": 1.1139925, "memory(GiB)": 141.16, "step": 76700, "train_speed(iter/s)": 0.290228 }, { "acc": 0.72769842, "epoch": 0.8581091566969173, "grad_norm": 8.4375, "learning_rate": 6.559904995041701e-06, "loss": 1.09316835, "memory(GiB)": 141.16, "step": 76720, "train_speed(iter/s)": 0.290251 }, { "acc": 0.74289341, "epoch": 0.8583328556428759, "grad_norm": 6.84375, "learning_rate": 6.55814777521341e-06, "loss": 1.0461441, "memory(GiB)": 141.16, "step": 76740, "train_speed(iter/s)": 0.290277 }, { "acc": 0.72916851, "epoch": 0.8585565545888344, "grad_norm": 7.90625, "learning_rate": 6.556390342208234e-06, "loss": 1.0877739, "memory(GiB)": 141.16, "step": 76760, "train_speed(iter/s)": 0.290303 }, { "acc": 0.7186717, "epoch": 0.8587802535347929, "grad_norm": 6.4375, "learning_rate": 6.554632696266612e-06, "loss": 1.14867325, "memory(GiB)": 141.16, "step": 76780, "train_speed(iter/s)": 0.290328 }, { "acc": 0.74605155, "epoch": 0.8590039524807515, "grad_norm": 6.40625, "learning_rate": 6.5528748376290165e-06, "loss": 1.01343117, "memory(GiB)": 141.16, "step": 76800, "train_speed(iter/s)": 0.290355 }, { "acc": 0.71952558, "epoch": 0.85922765142671, "grad_norm": 6.9375, "learning_rate": 6.551116766535949e-06, "loss": 1.12584171, "memory(GiB)": 141.16, "step": 76820, "train_speed(iter/s)": 0.290378 }, { "acc": 0.73759699, "epoch": 0.8594513503726685, "grad_norm": 6.96875, "learning_rate": 6.5493584832279355e-06, "loss": 1.0594758, "memory(GiB)": 141.16, "step": 76840, "train_speed(iter/s)": 0.290405 }, { "acc": 0.72921171, "epoch": 0.859675049318627, "grad_norm": 6.25, "learning_rate": 6.547599987945537e-06, "loss": 1.08721237, "memory(GiB)": 141.16, "step": 76860, "train_speed(iter/s)": 0.290429 }, { "acc": 0.7352294, "epoch": 0.8598987482645856, "grad_norm": 6.4375, "learning_rate": 6.545841280929338e-06, "loss": 1.07070065, "memory(GiB)": 141.16, "step": 76880, "train_speed(iter/s)": 0.290456 }, { "acc": 0.73041148, "epoch": 0.8601224472105441, "grad_norm": 7.25, "learning_rate": 6.544082362419958e-06, "loss": 1.06939888, "memory(GiB)": 141.16, "step": 76900, "train_speed(iter/s)": 0.29048 }, { "acc": 0.72385449, "epoch": 0.8603461461565026, "grad_norm": 7.625, "learning_rate": 6.542323232658041e-06, "loss": 1.11082649, "memory(GiB)": 141.16, "step": 76920, "train_speed(iter/s)": 0.290506 }, { "acc": 0.72769318, "epoch": 0.8605698451024612, "grad_norm": 6.78125, "learning_rate": 6.540563891884262e-06, "loss": 1.08148727, "memory(GiB)": 141.16, "step": 76940, "train_speed(iter/s)": 0.290531 }, { "acc": 0.737886, "epoch": 0.8607935440484197, "grad_norm": 7.03125, "learning_rate": 6.538804340339321e-06, "loss": 1.04674864, "memory(GiB)": 141.16, "step": 76960, "train_speed(iter/s)": 0.290556 }, { "acc": 0.72720852, "epoch": 0.8610172429943782, "grad_norm": 7.75, "learning_rate": 6.5370445782639515e-06, "loss": 1.0963131, "memory(GiB)": 141.16, "step": 76980, "train_speed(iter/s)": 0.290579 }, { "acc": 0.73104639, "epoch": 0.8612409419403367, "grad_norm": 6.21875, "learning_rate": 6.535284605898915e-06, "loss": 1.07239809, "memory(GiB)": 141.16, "step": 77000, "train_speed(iter/s)": 0.290602 }, { "acc": 0.74070663, "epoch": 0.8614646408862953, "grad_norm": 6.5, "learning_rate": 6.5335244234850005e-06, "loss": 1.02191315, "memory(GiB)": 141.16, "step": 77020, "train_speed(iter/s)": 0.290627 }, { "acc": 0.72847157, "epoch": 0.8616883398322538, "grad_norm": 7.5625, "learning_rate": 6.531764031263026e-06, "loss": 1.08650808, "memory(GiB)": 141.16, "step": 77040, "train_speed(iter/s)": 0.290653 }, { "acc": 0.74330101, "epoch": 0.8619120387782123, "grad_norm": 7.78125, "learning_rate": 6.530003429473837e-06, "loss": 1.02102737, "memory(GiB)": 141.16, "step": 77060, "train_speed(iter/s)": 0.290677 }, { "acc": 0.73555098, "epoch": 0.8621357377241708, "grad_norm": 7.0625, "learning_rate": 6.52824261835831e-06, "loss": 1.06957798, "memory(GiB)": 141.16, "step": 77080, "train_speed(iter/s)": 0.2907 }, { "acc": 0.73660727, "epoch": 0.8623594366701294, "grad_norm": 6.75, "learning_rate": 6.52648159815735e-06, "loss": 1.05442238, "memory(GiB)": 141.16, "step": 77100, "train_speed(iter/s)": 0.290729 }, { "acc": 0.72427287, "epoch": 0.8625831356160879, "grad_norm": 5.6875, "learning_rate": 6.524720369111888e-06, "loss": 1.08791132, "memory(GiB)": 141.16, "step": 77120, "train_speed(iter/s)": 0.290753 }, { "acc": 0.71635904, "epoch": 0.8628068345620464, "grad_norm": 6.53125, "learning_rate": 6.5229589314628885e-06, "loss": 1.14196243, "memory(GiB)": 141.16, "step": 77140, "train_speed(iter/s)": 0.290779 }, { "acc": 0.7144927, "epoch": 0.863030533508005, "grad_norm": 6.125, "learning_rate": 6.521197285451337e-06, "loss": 1.17701435, "memory(GiB)": 141.16, "step": 77160, "train_speed(iter/s)": 0.290803 }, { "acc": 0.74040756, "epoch": 0.8632542324539635, "grad_norm": 7.4375, "learning_rate": 6.519435431318254e-06, "loss": 1.04712639, "memory(GiB)": 141.16, "step": 77180, "train_speed(iter/s)": 0.290831 }, { "acc": 0.73399725, "epoch": 0.863477931399922, "grad_norm": 6.3125, "learning_rate": 6.517673369304687e-06, "loss": 1.06227989, "memory(GiB)": 141.16, "step": 77200, "train_speed(iter/s)": 0.290855 }, { "acc": 0.72837887, "epoch": 0.8637016303458805, "grad_norm": 8.0625, "learning_rate": 6.515911099651711e-06, "loss": 1.08467226, "memory(GiB)": 141.16, "step": 77220, "train_speed(iter/s)": 0.290881 }, { "acc": 0.73196402, "epoch": 0.8639253292918391, "grad_norm": 7.15625, "learning_rate": 6.5141486226004265e-06, "loss": 1.07494469, "memory(GiB)": 141.16, "step": 77240, "train_speed(iter/s)": 0.29091 }, { "acc": 0.72410417, "epoch": 0.8641490282377976, "grad_norm": 7.75, "learning_rate": 6.512385938391972e-06, "loss": 1.11040516, "memory(GiB)": 141.16, "step": 77260, "train_speed(iter/s)": 0.290933 }, { "acc": 0.72259817, "epoch": 0.8643727271837561, "grad_norm": 6.96875, "learning_rate": 6.510623047267502e-06, "loss": 1.11416225, "memory(GiB)": 141.16, "step": 77280, "train_speed(iter/s)": 0.290959 }, { "acc": 0.7375927, "epoch": 0.8645964261297147, "grad_norm": 7.78125, "learning_rate": 6.508859949468207e-06, "loss": 1.04407578, "memory(GiB)": 141.16, "step": 77300, "train_speed(iter/s)": 0.290984 }, { "acc": 0.72865076, "epoch": 0.8648201250756732, "grad_norm": 6.96875, "learning_rate": 6.507096645235304e-06, "loss": 1.08698215, "memory(GiB)": 141.16, "step": 77320, "train_speed(iter/s)": 0.291011 }, { "acc": 0.71955676, "epoch": 0.8650438240216317, "grad_norm": 7.625, "learning_rate": 6.50533313481004e-06, "loss": 1.13263464, "memory(GiB)": 141.16, "step": 77340, "train_speed(iter/s)": 0.291037 }, { "acc": 0.7288672, "epoch": 0.8652675229675902, "grad_norm": 7.03125, "learning_rate": 6.503569418433687e-06, "loss": 1.08618069, "memory(GiB)": 141.16, "step": 77360, "train_speed(iter/s)": 0.291064 }, { "acc": 0.73983331, "epoch": 0.8654912219135488, "grad_norm": 4.625, "learning_rate": 6.501805496347547e-06, "loss": 1.03265381, "memory(GiB)": 141.16, "step": 77380, "train_speed(iter/s)": 0.29109 }, { "acc": 0.73653183, "epoch": 0.8657149208595073, "grad_norm": 5.96875, "learning_rate": 6.500041368792948e-06, "loss": 1.06479721, "memory(GiB)": 141.16, "step": 77400, "train_speed(iter/s)": 0.291112 }, { "acc": 0.72142792, "epoch": 0.8659386198054658, "grad_norm": 7.6875, "learning_rate": 6.498277036011249e-06, "loss": 1.12672186, "memory(GiB)": 141.16, "step": 77420, "train_speed(iter/s)": 0.291138 }, { "acc": 0.74330359, "epoch": 0.8661623187514244, "grad_norm": 6.75, "learning_rate": 6.496512498243837e-06, "loss": 1.01428719, "memory(GiB)": 141.16, "step": 77440, "train_speed(iter/s)": 0.29116 }, { "acc": 0.73084459, "epoch": 0.8663860176973829, "grad_norm": 8.0625, "learning_rate": 6.494747755732126e-06, "loss": 1.05989552, "memory(GiB)": 141.16, "step": 77460, "train_speed(iter/s)": 0.291182 }, { "acc": 0.71668282, "epoch": 0.8666097166433414, "grad_norm": 6.40625, "learning_rate": 6.492982808717556e-06, "loss": 1.1464015, "memory(GiB)": 141.16, "step": 77480, "train_speed(iter/s)": 0.291206 }, { "acc": 0.73342819, "epoch": 0.8668334155892999, "grad_norm": 7.6875, "learning_rate": 6.491217657441598e-06, "loss": 1.06508789, "memory(GiB)": 141.16, "step": 77500, "train_speed(iter/s)": 0.29123 }, { "acc": 0.74038057, "epoch": 0.8670571145352585, "grad_norm": 7.65625, "learning_rate": 6.48945230214575e-06, "loss": 1.04405308, "memory(GiB)": 141.16, "step": 77520, "train_speed(iter/s)": 0.291255 }, { "acc": 0.7332448, "epoch": 0.867280813481217, "grad_norm": 7.25, "learning_rate": 6.4876867430715375e-06, "loss": 1.05948124, "memory(GiB)": 141.16, "step": 77540, "train_speed(iter/s)": 0.291278 }, { "acc": 0.73277912, "epoch": 0.8675045124271755, "grad_norm": 6.9375, "learning_rate": 6.485920980460516e-06, "loss": 1.0717083, "memory(GiB)": 141.16, "step": 77560, "train_speed(iter/s)": 0.291302 }, { "acc": 0.72580624, "epoch": 0.867728211373134, "grad_norm": 8.0625, "learning_rate": 6.4841550145542655e-06, "loss": 1.08564148, "memory(GiB)": 141.16, "step": 77580, "train_speed(iter/s)": 0.291329 }, { "acc": 0.73171577, "epoch": 0.8679519103190926, "grad_norm": 7.65625, "learning_rate": 6.4823888455943936e-06, "loss": 1.08195553, "memory(GiB)": 141.16, "step": 77600, "train_speed(iter/s)": 0.291355 }, { "acc": 0.7413177, "epoch": 0.8681756092650511, "grad_norm": 6.28125, "learning_rate": 6.480622473822541e-06, "loss": 1.03546171, "memory(GiB)": 141.16, "step": 77620, "train_speed(iter/s)": 0.291379 }, { "acc": 0.72798491, "epoch": 0.8683993082110096, "grad_norm": 7.4375, "learning_rate": 6.478855899480371e-06, "loss": 1.07539806, "memory(GiB)": 141.16, "step": 77640, "train_speed(iter/s)": 0.291402 }, { "acc": 0.72118616, "epoch": 0.8686230071569682, "grad_norm": 8.625, "learning_rate": 6.477089122809577e-06, "loss": 1.14183168, "memory(GiB)": 141.16, "step": 77660, "train_speed(iter/s)": 0.291426 }, { "acc": 0.7318099, "epoch": 0.8688467061029267, "grad_norm": 7.4375, "learning_rate": 6.475322144051877e-06, "loss": 1.07867966, "memory(GiB)": 141.16, "step": 77680, "train_speed(iter/s)": 0.291453 }, { "acc": 0.72604122, "epoch": 0.8690704050488852, "grad_norm": 9.125, "learning_rate": 6.473554963449021e-06, "loss": 1.09557066, "memory(GiB)": 141.16, "step": 77700, "train_speed(iter/s)": 0.291476 }, { "acc": 0.7436172, "epoch": 0.8692941039948437, "grad_norm": 6.875, "learning_rate": 6.471787581242784e-06, "loss": 1.00834999, "memory(GiB)": 141.16, "step": 77720, "train_speed(iter/s)": 0.291501 }, { "acc": 0.74822674, "epoch": 0.8695178029408023, "grad_norm": 7.9375, "learning_rate": 6.470019997674969e-06, "loss": 0.98855724, "memory(GiB)": 141.16, "step": 77740, "train_speed(iter/s)": 0.291526 }, { "acc": 0.73273735, "epoch": 0.8697415018867608, "grad_norm": 7.3125, "learning_rate": 6.468252212987408e-06, "loss": 1.06529922, "memory(GiB)": 141.16, "step": 77760, "train_speed(iter/s)": 0.291551 }, { "acc": 0.72213364, "epoch": 0.8699652008327193, "grad_norm": 7.15625, "learning_rate": 6.466484227421957e-06, "loss": 1.12825508, "memory(GiB)": 141.16, "step": 77780, "train_speed(iter/s)": 0.291574 }, { "acc": 0.73080959, "epoch": 0.8701888997786779, "grad_norm": 8.5, "learning_rate": 6.464716041220505e-06, "loss": 1.0722477, "memory(GiB)": 141.16, "step": 77800, "train_speed(iter/s)": 0.291599 }, { "acc": 0.72287884, "epoch": 0.8704125987246364, "grad_norm": 6.1875, "learning_rate": 6.46294765462496e-06, "loss": 1.1121542, "memory(GiB)": 141.16, "step": 77820, "train_speed(iter/s)": 0.291622 }, { "acc": 0.72838764, "epoch": 0.8706362976705949, "grad_norm": 7.21875, "learning_rate": 6.461179067877266e-06, "loss": 1.09019575, "memory(GiB)": 141.16, "step": 77840, "train_speed(iter/s)": 0.291646 }, { "acc": 0.74046283, "epoch": 0.8708599966165534, "grad_norm": 7.375, "learning_rate": 6.4594102812193916e-06, "loss": 1.02280998, "memory(GiB)": 141.16, "step": 77860, "train_speed(iter/s)": 0.291668 }, { "acc": 0.72504621, "epoch": 0.871083695562512, "grad_norm": 7.65625, "learning_rate": 6.457641294893331e-06, "loss": 1.09841366, "memory(GiB)": 141.16, "step": 77880, "train_speed(iter/s)": 0.291693 }, { "acc": 0.73604121, "epoch": 0.8713073945084705, "grad_norm": 7.78125, "learning_rate": 6.455872109141106e-06, "loss": 1.06968441, "memory(GiB)": 141.16, "step": 77900, "train_speed(iter/s)": 0.291714 }, { "acc": 0.73738971, "epoch": 0.871531093454429, "grad_norm": 8.125, "learning_rate": 6.454102724204767e-06, "loss": 1.03314533, "memory(GiB)": 141.16, "step": 77920, "train_speed(iter/s)": 0.29174 }, { "acc": 0.73771009, "epoch": 0.8717547924003876, "grad_norm": 7.15625, "learning_rate": 6.452333140326391e-06, "loss": 1.04604263, "memory(GiB)": 141.16, "step": 77940, "train_speed(iter/s)": 0.291764 }, { "acc": 0.7264905, "epoch": 0.8719784913463461, "grad_norm": 6.1875, "learning_rate": 6.450563357748084e-06, "loss": 1.08921175, "memory(GiB)": 141.16, "step": 77960, "train_speed(iter/s)": 0.291789 }, { "acc": 0.71990285, "epoch": 0.8722021902923046, "grad_norm": 6.53125, "learning_rate": 6.448793376711977e-06, "loss": 1.13171406, "memory(GiB)": 141.16, "step": 77980, "train_speed(iter/s)": 0.291812 }, { "acc": 0.74105816, "epoch": 0.8724258892382631, "grad_norm": 7.4375, "learning_rate": 6.447023197460226e-06, "loss": 1.03257141, "memory(GiB)": 141.16, "step": 78000, "train_speed(iter/s)": 0.291838 }, { "epoch": 0.8724258892382631, "eval_acc": 0.6899832450528834, "eval_loss": 1.0795847177505493, "eval_runtime": 2321.4508, "eval_samples_per_second": 32.429, "eval_steps_per_second": 16.215, "step": 78000 }, { "acc": 0.72230453, "epoch": 0.8726495881842217, "grad_norm": 7.9375, "learning_rate": 6.44525282023502e-06, "loss": 1.11989202, "memory(GiB)": 141.16, "step": 78020, "train_speed(iter/s)": 0.289292 }, { "acc": 0.73537283, "epoch": 0.8728732871301802, "grad_norm": 6.65625, "learning_rate": 6.443482245278571e-06, "loss": 1.05301495, "memory(GiB)": 141.16, "step": 78040, "train_speed(iter/s)": 0.289316 }, { "acc": 0.75061369, "epoch": 0.8730969860761387, "grad_norm": 7.09375, "learning_rate": 6.441711472833118e-06, "loss": 0.99952297, "memory(GiB)": 141.16, "step": 78060, "train_speed(iter/s)": 0.289343 }, { "acc": 0.73087931, "epoch": 0.8733206850220973, "grad_norm": 5.6875, "learning_rate": 6.439940503140929e-06, "loss": 1.08190117, "memory(GiB)": 141.16, "step": 78080, "train_speed(iter/s)": 0.289367 }, { "acc": 0.72875729, "epoch": 0.8735443839680558, "grad_norm": 6.96875, "learning_rate": 6.438169336444298e-06, "loss": 1.08650627, "memory(GiB)": 141.16, "step": 78100, "train_speed(iter/s)": 0.289392 }, { "acc": 0.72103224, "epoch": 0.8737680829140143, "grad_norm": 6.9375, "learning_rate": 6.436397972985544e-06, "loss": 1.12944813, "memory(GiB)": 141.16, "step": 78120, "train_speed(iter/s)": 0.289419 }, { "acc": 0.73035793, "epoch": 0.8739917818599728, "grad_norm": 6.28125, "learning_rate": 6.434626413007018e-06, "loss": 1.08866997, "memory(GiB)": 141.16, "step": 78140, "train_speed(iter/s)": 0.289444 }, { "acc": 0.74394255, "epoch": 0.8742154808059314, "grad_norm": 7.75, "learning_rate": 6.432854656751093e-06, "loss": 1.01249428, "memory(GiB)": 141.16, "step": 78160, "train_speed(iter/s)": 0.289468 }, { "acc": 0.72962003, "epoch": 0.8744391797518899, "grad_norm": 6.65625, "learning_rate": 6.431082704460172e-06, "loss": 1.08326645, "memory(GiB)": 141.16, "step": 78180, "train_speed(iter/s)": 0.289492 }, { "acc": 0.73359485, "epoch": 0.8746628786978484, "grad_norm": 8.4375, "learning_rate": 6.42931055637668e-06, "loss": 1.06476946, "memory(GiB)": 141.16, "step": 78200, "train_speed(iter/s)": 0.289518 }, { "acc": 0.73452673, "epoch": 0.874886577643807, "grad_norm": 7.84375, "learning_rate": 6.427538212743075e-06, "loss": 1.06248188, "memory(GiB)": 141.16, "step": 78220, "train_speed(iter/s)": 0.289545 }, { "acc": 0.73731127, "epoch": 0.8751102765897655, "grad_norm": 6.5, "learning_rate": 6.4257656738018385e-06, "loss": 1.02941494, "memory(GiB)": 141.16, "step": 78240, "train_speed(iter/s)": 0.289571 }, { "acc": 0.73600473, "epoch": 0.875333975535724, "grad_norm": 7.21875, "learning_rate": 6.423992939795478e-06, "loss": 1.06748524, "memory(GiB)": 141.16, "step": 78260, "train_speed(iter/s)": 0.289593 }, { "acc": 0.72941985, "epoch": 0.8755576744816825, "grad_norm": 6.46875, "learning_rate": 6.422220010966531e-06, "loss": 1.09912548, "memory(GiB)": 141.16, "step": 78280, "train_speed(iter/s)": 0.289616 }, { "acc": 0.725524, "epoch": 0.8757813734276411, "grad_norm": 6.84375, "learning_rate": 6.4204468875575585e-06, "loss": 1.10064125, "memory(GiB)": 141.16, "step": 78300, "train_speed(iter/s)": 0.289641 }, { "acc": 0.7223505, "epoch": 0.8760050723735996, "grad_norm": 7.5, "learning_rate": 6.418673569811148e-06, "loss": 1.11471729, "memory(GiB)": 141.16, "step": 78320, "train_speed(iter/s)": 0.289665 }, { "acc": 0.74022045, "epoch": 0.8762287713195581, "grad_norm": 9.125, "learning_rate": 6.416900057969916e-06, "loss": 1.0320899, "memory(GiB)": 141.16, "step": 78340, "train_speed(iter/s)": 0.289689 }, { "acc": 0.73763185, "epoch": 0.8764524702655166, "grad_norm": 6.5, "learning_rate": 6.415126352276504e-06, "loss": 1.03964367, "memory(GiB)": 141.16, "step": 78360, "train_speed(iter/s)": 0.289715 }, { "acc": 0.73037891, "epoch": 0.8766761692114752, "grad_norm": 7.65625, "learning_rate": 6.41335245297358e-06, "loss": 1.08306065, "memory(GiB)": 141.16, "step": 78380, "train_speed(iter/s)": 0.28974 }, { "acc": 0.73627882, "epoch": 0.8768998681574337, "grad_norm": 7.375, "learning_rate": 6.411578360303841e-06, "loss": 1.05677643, "memory(GiB)": 141.16, "step": 78400, "train_speed(iter/s)": 0.289763 }, { "acc": 0.73186202, "epoch": 0.8771235671033922, "grad_norm": 6.3125, "learning_rate": 6.409804074510003e-06, "loss": 1.06992989, "memory(GiB)": 141.16, "step": 78420, "train_speed(iter/s)": 0.289789 }, { "acc": 0.7349267, "epoch": 0.8773472660493508, "grad_norm": 7.78125, "learning_rate": 6.408029595834818e-06, "loss": 1.07226944, "memory(GiB)": 141.16, "step": 78440, "train_speed(iter/s)": 0.289816 }, { "acc": 0.73208766, "epoch": 0.8775709649953093, "grad_norm": 7.03125, "learning_rate": 6.4062549245210595e-06, "loss": 1.06597528, "memory(GiB)": 141.16, "step": 78460, "train_speed(iter/s)": 0.28984 }, { "acc": 0.72627006, "epoch": 0.8777946639412678, "grad_norm": 7.25, "learning_rate": 6.4044800608115265e-06, "loss": 1.09493771, "memory(GiB)": 141.16, "step": 78480, "train_speed(iter/s)": 0.289868 }, { "acc": 0.7365921, "epoch": 0.8780183628872263, "grad_norm": 6.375, "learning_rate": 6.402705004949047e-06, "loss": 1.05271587, "memory(GiB)": 141.16, "step": 78500, "train_speed(iter/s)": 0.289895 }, { "acc": 0.72527523, "epoch": 0.8782420618331849, "grad_norm": 6.59375, "learning_rate": 6.400929757176473e-06, "loss": 1.10820761, "memory(GiB)": 141.16, "step": 78520, "train_speed(iter/s)": 0.289919 }, { "acc": 0.73272867, "epoch": 0.8784657607791434, "grad_norm": 7.5, "learning_rate": 6.399154317736685e-06, "loss": 1.08150272, "memory(GiB)": 141.16, "step": 78540, "train_speed(iter/s)": 0.289942 }, { "acc": 0.73046379, "epoch": 0.8786894597251019, "grad_norm": 5.78125, "learning_rate": 6.397378686872587e-06, "loss": 1.08709116, "memory(GiB)": 141.16, "step": 78560, "train_speed(iter/s)": 0.289966 }, { "acc": 0.72221489, "epoch": 0.8789131586710605, "grad_norm": 7.1875, "learning_rate": 6.395602864827112e-06, "loss": 1.12623405, "memory(GiB)": 141.16, "step": 78580, "train_speed(iter/s)": 0.289991 }, { "acc": 0.73036036, "epoch": 0.879136857617019, "grad_norm": 7.0625, "learning_rate": 6.393826851843218e-06, "loss": 1.08635054, "memory(GiB)": 141.16, "step": 78600, "train_speed(iter/s)": 0.290011 }, { "acc": 0.73531961, "epoch": 0.8793605565629775, "grad_norm": 7.34375, "learning_rate": 6.392050648163888e-06, "loss": 1.07430553, "memory(GiB)": 141.16, "step": 78620, "train_speed(iter/s)": 0.290037 }, { "acc": 0.7316937, "epoch": 0.879584255508936, "grad_norm": 7.25, "learning_rate": 6.390274254032132e-06, "loss": 1.0634263, "memory(GiB)": 141.16, "step": 78640, "train_speed(iter/s)": 0.290061 }, { "acc": 0.7261941, "epoch": 0.8798079544548946, "grad_norm": 6.5625, "learning_rate": 6.388497669690985e-06, "loss": 1.10183392, "memory(GiB)": 141.16, "step": 78660, "train_speed(iter/s)": 0.290084 }, { "acc": 0.73648844, "epoch": 0.8800316534008531, "grad_norm": 7.21875, "learning_rate": 6.386720895383512e-06, "loss": 1.05238876, "memory(GiB)": 141.16, "step": 78680, "train_speed(iter/s)": 0.290109 }, { "acc": 0.72502074, "epoch": 0.8802553523468116, "grad_norm": 6.90625, "learning_rate": 6.384943931352801e-06, "loss": 1.09545527, "memory(GiB)": 141.16, "step": 78700, "train_speed(iter/s)": 0.290131 }, { "acc": 0.71872091, "epoch": 0.8804790512927702, "grad_norm": 5.53125, "learning_rate": 6.383166777841963e-06, "loss": 1.13887901, "memory(GiB)": 141.16, "step": 78720, "train_speed(iter/s)": 0.290156 }, { "acc": 0.71686411, "epoch": 0.8807027502387287, "grad_norm": 6.3125, "learning_rate": 6.38138943509414e-06, "loss": 1.14798326, "memory(GiB)": 141.16, "step": 78740, "train_speed(iter/s)": 0.290181 }, { "acc": 0.7327456, "epoch": 0.8809264491846872, "grad_norm": 8.75, "learning_rate": 6.379611903352498e-06, "loss": 1.06970272, "memory(GiB)": 141.16, "step": 78760, "train_speed(iter/s)": 0.290205 }, { "acc": 0.73662419, "epoch": 0.8811501481306457, "grad_norm": 6.3125, "learning_rate": 6.377834182860229e-06, "loss": 1.06639233, "memory(GiB)": 141.16, "step": 78780, "train_speed(iter/s)": 0.290231 }, { "acc": 0.74130659, "epoch": 0.8813738470766043, "grad_norm": 5.40625, "learning_rate": 6.376056273860549e-06, "loss": 1.03565464, "memory(GiB)": 141.16, "step": 78800, "train_speed(iter/s)": 0.290255 }, { "acc": 0.72797441, "epoch": 0.8815975460225628, "grad_norm": 8.125, "learning_rate": 6.374278176596703e-06, "loss": 1.08800745, "memory(GiB)": 141.16, "step": 78820, "train_speed(iter/s)": 0.290279 }, { "acc": 0.72323289, "epoch": 0.8818212449685213, "grad_norm": 6.09375, "learning_rate": 6.372499891311958e-06, "loss": 1.11377449, "memory(GiB)": 141.16, "step": 78840, "train_speed(iter/s)": 0.290305 }, { "acc": 0.73276987, "epoch": 0.8820449439144799, "grad_norm": 6.9375, "learning_rate": 6.370721418249612e-06, "loss": 1.06341267, "memory(GiB)": 141.16, "step": 78860, "train_speed(iter/s)": 0.290329 }, { "acc": 0.75256028, "epoch": 0.8822686428604384, "grad_norm": 8.5, "learning_rate": 6.368942757652984e-06, "loss": 0.96077328, "memory(GiB)": 141.16, "step": 78880, "train_speed(iter/s)": 0.290354 }, { "acc": 0.72771816, "epoch": 0.8824923418063969, "grad_norm": 5.53125, "learning_rate": 6.367163909765419e-06, "loss": 1.09260731, "memory(GiB)": 141.16, "step": 78900, "train_speed(iter/s)": 0.290379 }, { "acc": 0.72437735, "epoch": 0.8827160407523554, "grad_norm": 7.71875, "learning_rate": 6.365384874830291e-06, "loss": 1.09386387, "memory(GiB)": 141.16, "step": 78920, "train_speed(iter/s)": 0.290405 }, { "acc": 0.73228927, "epoch": 0.882939739698314, "grad_norm": 6.1875, "learning_rate": 6.3636056530909955e-06, "loss": 1.0679081, "memory(GiB)": 141.16, "step": 78940, "train_speed(iter/s)": 0.29043 }, { "acc": 0.72185912, "epoch": 0.8831634386442725, "grad_norm": 6.75, "learning_rate": 6.3618262447909565e-06, "loss": 1.11597395, "memory(GiB)": 141.16, "step": 78960, "train_speed(iter/s)": 0.290456 }, { "acc": 0.72750635, "epoch": 0.883387137590231, "grad_norm": 10.75, "learning_rate": 6.360046650173623e-06, "loss": 1.10680809, "memory(GiB)": 141.16, "step": 78980, "train_speed(iter/s)": 0.290482 }, { "acc": 0.72509608, "epoch": 0.8836108365361895, "grad_norm": 7.9375, "learning_rate": 6.358266869482466e-06, "loss": 1.08844328, "memory(GiB)": 141.16, "step": 79000, "train_speed(iter/s)": 0.290508 }, { "acc": 0.73546495, "epoch": 0.8838345354821481, "grad_norm": 8.25, "learning_rate": 6.3564869029609895e-06, "loss": 1.06617508, "memory(GiB)": 141.16, "step": 79020, "train_speed(iter/s)": 0.290533 }, { "acc": 0.72971506, "epoch": 0.8840582344281066, "grad_norm": 6.1875, "learning_rate": 6.354706750852715e-06, "loss": 1.07926903, "memory(GiB)": 141.16, "step": 79040, "train_speed(iter/s)": 0.290555 }, { "acc": 0.75169725, "epoch": 0.8842819333740651, "grad_norm": 5.90625, "learning_rate": 6.3529264134011935e-06, "loss": 0.98819504, "memory(GiB)": 141.16, "step": 79060, "train_speed(iter/s)": 0.290579 }, { "acc": 0.72969875, "epoch": 0.8845056323200237, "grad_norm": 6.09375, "learning_rate": 6.351145890850001e-06, "loss": 1.08388729, "memory(GiB)": 141.16, "step": 79080, "train_speed(iter/s)": 0.290604 }, { "acc": 0.73861723, "epoch": 0.8847293312659822, "grad_norm": 6.8125, "learning_rate": 6.349365183442738e-06, "loss": 1.03313198, "memory(GiB)": 141.16, "step": 79100, "train_speed(iter/s)": 0.290627 }, { "acc": 0.72790399, "epoch": 0.8849530302119407, "grad_norm": 7.375, "learning_rate": 6.347584291423033e-06, "loss": 1.08300381, "memory(GiB)": 141.16, "step": 79120, "train_speed(iter/s)": 0.290651 }, { "acc": 0.72101011, "epoch": 0.8851767291578992, "grad_norm": 8.5, "learning_rate": 6.3458032150345325e-06, "loss": 1.11154852, "memory(GiB)": 141.16, "step": 79140, "train_speed(iter/s)": 0.290676 }, { "acc": 0.73367538, "epoch": 0.8854004281038578, "grad_norm": 7.15625, "learning_rate": 6.344021954520918e-06, "loss": 1.07326565, "memory(GiB)": 141.16, "step": 79160, "train_speed(iter/s)": 0.290702 }, { "acc": 0.73802605, "epoch": 0.8856241270498164, "grad_norm": 6.3125, "learning_rate": 6.342240510125889e-06, "loss": 1.03632717, "memory(GiB)": 141.16, "step": 79180, "train_speed(iter/s)": 0.290728 }, { "acc": 0.73072462, "epoch": 0.8858478259957749, "grad_norm": 6.40625, "learning_rate": 6.340458882093173e-06, "loss": 1.10065765, "memory(GiB)": 141.16, "step": 79200, "train_speed(iter/s)": 0.290756 }, { "acc": 0.72821417, "epoch": 0.8860715249417335, "grad_norm": 6.4375, "learning_rate": 6.3386770706665235e-06, "loss": 1.08392067, "memory(GiB)": 141.16, "step": 79220, "train_speed(iter/s)": 0.290782 }, { "acc": 0.72667241, "epoch": 0.886295223887692, "grad_norm": 6.875, "learning_rate": 6.336895076089717e-06, "loss": 1.09309311, "memory(GiB)": 141.16, "step": 79240, "train_speed(iter/s)": 0.290808 }, { "acc": 0.73299541, "epoch": 0.8865189228336505, "grad_norm": 6.96875, "learning_rate": 6.335112898606553e-06, "loss": 1.07066431, "memory(GiB)": 141.16, "step": 79260, "train_speed(iter/s)": 0.290836 }, { "acc": 0.72345705, "epoch": 0.886742621779609, "grad_norm": 6.6875, "learning_rate": 6.333330538460863e-06, "loss": 1.11458073, "memory(GiB)": 141.16, "step": 79280, "train_speed(iter/s)": 0.290862 }, { "acc": 0.73599863, "epoch": 0.8869663207255676, "grad_norm": 8.5, "learning_rate": 6.331547995896496e-06, "loss": 1.04533081, "memory(GiB)": 141.16, "step": 79300, "train_speed(iter/s)": 0.290893 }, { "acc": 0.73941827, "epoch": 0.8871900196715261, "grad_norm": 7.21875, "learning_rate": 6.3297652711573345e-06, "loss": 1.03573799, "memory(GiB)": 141.16, "step": 79320, "train_speed(iter/s)": 0.290917 }, { "acc": 0.72477632, "epoch": 0.8874137186174846, "grad_norm": 7.21875, "learning_rate": 6.327982364487275e-06, "loss": 1.11168156, "memory(GiB)": 141.16, "step": 79340, "train_speed(iter/s)": 0.290943 }, { "acc": 0.7351089, "epoch": 0.8876374175634432, "grad_norm": 7.25, "learning_rate": 6.326199276130246e-06, "loss": 1.06449614, "memory(GiB)": 141.16, "step": 79360, "train_speed(iter/s)": 0.290966 }, { "acc": 0.72649484, "epoch": 0.8878611165094017, "grad_norm": 8.75, "learning_rate": 6.3244160063302e-06, "loss": 1.11261311, "memory(GiB)": 141.16, "step": 79380, "train_speed(iter/s)": 0.290991 }, { "acc": 0.73999062, "epoch": 0.8880848154553602, "grad_norm": 6.0625, "learning_rate": 6.322632555331116e-06, "loss": 1.02995701, "memory(GiB)": 141.16, "step": 79400, "train_speed(iter/s)": 0.291015 }, { "acc": 0.73899288, "epoch": 0.8883085144013187, "grad_norm": 7.5625, "learning_rate": 6.320848923376993e-06, "loss": 1.04372635, "memory(GiB)": 141.16, "step": 79420, "train_speed(iter/s)": 0.291038 }, { "acc": 0.73299389, "epoch": 0.8885322133472773, "grad_norm": 8.4375, "learning_rate": 6.319065110711858e-06, "loss": 1.06031418, "memory(GiB)": 141.16, "step": 79440, "train_speed(iter/s)": 0.291063 }, { "acc": 0.71898117, "epoch": 0.8887559122932358, "grad_norm": 7.4375, "learning_rate": 6.317281117579761e-06, "loss": 1.13191442, "memory(GiB)": 141.16, "step": 79460, "train_speed(iter/s)": 0.291085 }, { "acc": 0.73186493, "epoch": 0.8889796112391943, "grad_norm": 9.125, "learning_rate": 6.31549694422478e-06, "loss": 1.08252678, "memory(GiB)": 141.16, "step": 79480, "train_speed(iter/s)": 0.29111 }, { "acc": 0.73318243, "epoch": 0.8892033101851529, "grad_norm": 7.125, "learning_rate": 6.313712590891014e-06, "loss": 1.07562618, "memory(GiB)": 141.16, "step": 79500, "train_speed(iter/s)": 0.291133 }, { "acc": 0.72021918, "epoch": 0.8894270091311114, "grad_norm": 7.96875, "learning_rate": 6.311928057822589e-06, "loss": 1.1322691, "memory(GiB)": 141.16, "step": 79520, "train_speed(iter/s)": 0.291155 }, { "acc": 0.73748941, "epoch": 0.8896507080770699, "grad_norm": 8.625, "learning_rate": 6.3101433452636525e-06, "loss": 1.04857092, "memory(GiB)": 141.16, "step": 79540, "train_speed(iter/s)": 0.291179 }, { "acc": 0.71582165, "epoch": 0.8898744070230284, "grad_norm": 7.3125, "learning_rate": 6.308358453458381e-06, "loss": 1.14516611, "memory(GiB)": 141.16, "step": 79560, "train_speed(iter/s)": 0.291201 }, { "acc": 0.73182192, "epoch": 0.890098105968987, "grad_norm": 7.25, "learning_rate": 6.306573382650974e-06, "loss": 1.08025494, "memory(GiB)": 141.16, "step": 79580, "train_speed(iter/s)": 0.291224 }, { "acc": 0.72185383, "epoch": 0.8903218049149455, "grad_norm": 8.1875, "learning_rate": 6.30478813308565e-06, "loss": 1.13527269, "memory(GiB)": 141.16, "step": 79600, "train_speed(iter/s)": 0.291249 }, { "acc": 0.72316866, "epoch": 0.890545503860904, "grad_norm": 5.71875, "learning_rate": 6.30300270500666e-06, "loss": 1.10186405, "memory(GiB)": 141.16, "step": 79620, "train_speed(iter/s)": 0.291272 }, { "acc": 0.74402647, "epoch": 0.8907692028068626, "grad_norm": 7.65625, "learning_rate": 6.301217098658277e-06, "loss": 1.028689, "memory(GiB)": 141.16, "step": 79640, "train_speed(iter/s)": 0.291298 }, { "acc": 0.73765507, "epoch": 0.8909929017528211, "grad_norm": 9.3125, "learning_rate": 6.299431314284796e-06, "loss": 1.05201111, "memory(GiB)": 141.16, "step": 79660, "train_speed(iter/s)": 0.291321 }, { "acc": 0.73020267, "epoch": 0.8912166006987796, "grad_norm": 6.25, "learning_rate": 6.297645352130538e-06, "loss": 1.08567896, "memory(GiB)": 141.16, "step": 79680, "train_speed(iter/s)": 0.291345 }, { "acc": 0.74510188, "epoch": 0.8914402996447381, "grad_norm": 7.28125, "learning_rate": 6.295859212439847e-06, "loss": 1.02718754, "memory(GiB)": 141.16, "step": 79700, "train_speed(iter/s)": 0.29137 }, { "acc": 0.74594388, "epoch": 0.8916639985906967, "grad_norm": 6.34375, "learning_rate": 6.2940728954570955e-06, "loss": 1.01035881, "memory(GiB)": 141.16, "step": 79720, "train_speed(iter/s)": 0.291393 }, { "acc": 0.71959429, "epoch": 0.8918876975366552, "grad_norm": 6.90625, "learning_rate": 6.292286401426674e-06, "loss": 1.12840014, "memory(GiB)": 141.16, "step": 79740, "train_speed(iter/s)": 0.291418 }, { "acc": 0.7263968, "epoch": 0.8921113964826137, "grad_norm": 5.4375, "learning_rate": 6.2904997305930025e-06, "loss": 1.10289192, "memory(GiB)": 141.16, "step": 79760, "train_speed(iter/s)": 0.291442 }, { "acc": 0.73013082, "epoch": 0.8923350954285723, "grad_norm": 7.5, "learning_rate": 6.288712883200521e-06, "loss": 1.08575172, "memory(GiB)": 141.16, "step": 79780, "train_speed(iter/s)": 0.291467 }, { "acc": 0.72975783, "epoch": 0.8925587943745308, "grad_norm": 5.78125, "learning_rate": 6.286925859493699e-06, "loss": 1.07734985, "memory(GiB)": 141.16, "step": 79800, "train_speed(iter/s)": 0.291493 }, { "acc": 0.72865829, "epoch": 0.8927824933204893, "grad_norm": 7.96875, "learning_rate": 6.2851386597170235e-06, "loss": 1.08744354, "memory(GiB)": 141.16, "step": 79820, "train_speed(iter/s)": 0.291518 }, { "acc": 0.72366271, "epoch": 0.8930061922664478, "grad_norm": 6.40625, "learning_rate": 6.2833512841150116e-06, "loss": 1.12495947, "memory(GiB)": 141.16, "step": 79840, "train_speed(iter/s)": 0.291541 }, { "acc": 0.73706093, "epoch": 0.8932298912124064, "grad_norm": 5.84375, "learning_rate": 6.281563732932201e-06, "loss": 1.05715504, "memory(GiB)": 141.16, "step": 79860, "train_speed(iter/s)": 0.291566 }, { "acc": 0.74111023, "epoch": 0.8934535901583649, "grad_norm": 6.4375, "learning_rate": 6.279776006413153e-06, "loss": 1.03092747, "memory(GiB)": 141.16, "step": 79880, "train_speed(iter/s)": 0.291592 }, { "acc": 0.73216009, "epoch": 0.8936772891043234, "grad_norm": 5.5625, "learning_rate": 6.277988104802455e-06, "loss": 1.04582138, "memory(GiB)": 141.16, "step": 79900, "train_speed(iter/s)": 0.291615 }, { "acc": 0.72354727, "epoch": 0.893900988050282, "grad_norm": 6.4375, "learning_rate": 6.2762000283447185e-06, "loss": 1.11350851, "memory(GiB)": 141.16, "step": 79920, "train_speed(iter/s)": 0.291636 }, { "acc": 0.72867641, "epoch": 0.8941246869962405, "grad_norm": 6.40625, "learning_rate": 6.274411777284576e-06, "loss": 1.09326105, "memory(GiB)": 141.16, "step": 79940, "train_speed(iter/s)": 0.29166 }, { "acc": 0.72181253, "epoch": 0.894348385942199, "grad_norm": 5.75, "learning_rate": 6.272623351866688e-06, "loss": 1.13394089, "memory(GiB)": 141.16, "step": 79960, "train_speed(iter/s)": 0.291686 }, { "acc": 0.73115635, "epoch": 0.8945720848881575, "grad_norm": 7.03125, "learning_rate": 6.270834752335735e-06, "loss": 1.0712101, "memory(GiB)": 141.16, "step": 79980, "train_speed(iter/s)": 0.291708 }, { "acc": 0.72179317, "epoch": 0.8947957838341161, "grad_norm": 7.5, "learning_rate": 6.269045978936423e-06, "loss": 1.12593784, "memory(GiB)": 141.16, "step": 80000, "train_speed(iter/s)": 0.291731 }, { "epoch": 0.8947957838341161, "eval_acc": 0.6900112948496038, "eval_loss": 1.0794517993927002, "eval_runtime": 2317.1937, "eval_samples_per_second": 32.489, "eval_steps_per_second": 16.245, "step": 80000 }, { "acc": 0.73089705, "epoch": 0.8950194827800746, "grad_norm": 6.6875, "learning_rate": 6.267257031913483e-06, "loss": 1.08077278, "memory(GiB)": 141.16, "step": 80020, "train_speed(iter/s)": 0.28926 }, { "acc": 0.72515726, "epoch": 0.8952431817260331, "grad_norm": 6.5625, "learning_rate": 6.265467911511667e-06, "loss": 1.11671515, "memory(GiB)": 141.16, "step": 80040, "train_speed(iter/s)": 0.289285 }, { "acc": 0.73167362, "epoch": 0.8954668806719916, "grad_norm": 6.71875, "learning_rate": 6.263678617975754e-06, "loss": 1.07657776, "memory(GiB)": 141.16, "step": 80060, "train_speed(iter/s)": 0.289309 }, { "acc": 0.73703461, "epoch": 0.8956905796179502, "grad_norm": 7.6875, "learning_rate": 6.261889151550542e-06, "loss": 1.05029917, "memory(GiB)": 141.16, "step": 80080, "train_speed(iter/s)": 0.289334 }, { "acc": 0.72672815, "epoch": 0.8959142785639087, "grad_norm": 7.15625, "learning_rate": 6.260099512480859e-06, "loss": 1.0959444, "memory(GiB)": 141.16, "step": 80100, "train_speed(iter/s)": 0.289358 }, { "acc": 0.73909702, "epoch": 0.8961379775098672, "grad_norm": 8.6875, "learning_rate": 6.258309701011551e-06, "loss": 1.06279163, "memory(GiB)": 141.16, "step": 80120, "train_speed(iter/s)": 0.289378 }, { "acc": 0.74684176, "epoch": 0.8963616764558258, "grad_norm": 7.4375, "learning_rate": 6.256519717387492e-06, "loss": 1.00823736, "memory(GiB)": 141.16, "step": 80140, "train_speed(iter/s)": 0.2894 }, { "acc": 0.72742043, "epoch": 0.8965853754017843, "grad_norm": 6.4375, "learning_rate": 6.254729561853575e-06, "loss": 1.12094078, "memory(GiB)": 141.16, "step": 80160, "train_speed(iter/s)": 0.289425 }, { "acc": 0.73188744, "epoch": 0.8968090743477428, "grad_norm": 5.9375, "learning_rate": 6.252939234654721e-06, "loss": 1.07472744, "memory(GiB)": 141.16, "step": 80180, "train_speed(iter/s)": 0.28945 }, { "acc": 0.74284534, "epoch": 0.8970327732937013, "grad_norm": 5.125, "learning_rate": 6.251148736035869e-06, "loss": 1.0430438, "memory(GiB)": 141.16, "step": 80200, "train_speed(iter/s)": 0.289474 }, { "acc": 0.72692442, "epoch": 0.8972564722396599, "grad_norm": 6.3125, "learning_rate": 6.249358066241987e-06, "loss": 1.0953249, "memory(GiB)": 141.16, "step": 80220, "train_speed(iter/s)": 0.289498 }, { "acc": 0.72472458, "epoch": 0.8974801711856184, "grad_norm": 7.1875, "learning_rate": 6.247567225518064e-06, "loss": 1.09321394, "memory(GiB)": 141.16, "step": 80240, "train_speed(iter/s)": 0.289521 }, { "acc": 0.71332979, "epoch": 0.8977038701315769, "grad_norm": 6.46875, "learning_rate": 6.245776214109114e-06, "loss": 1.15019608, "memory(GiB)": 141.16, "step": 80260, "train_speed(iter/s)": 0.289545 }, { "acc": 0.728901, "epoch": 0.8979275690775355, "grad_norm": 7.875, "learning_rate": 6.243985032260171e-06, "loss": 1.09038401, "memory(GiB)": 141.16, "step": 80280, "train_speed(iter/s)": 0.289568 }, { "acc": 0.7235146, "epoch": 0.898151268023494, "grad_norm": 7.90625, "learning_rate": 6.242193680216295e-06, "loss": 1.10091324, "memory(GiB)": 141.16, "step": 80300, "train_speed(iter/s)": 0.289591 }, { "acc": 0.72214928, "epoch": 0.8983749669694525, "grad_norm": 7.90625, "learning_rate": 6.240402158222568e-06, "loss": 1.12289391, "memory(GiB)": 141.16, "step": 80320, "train_speed(iter/s)": 0.289615 }, { "acc": 0.74311838, "epoch": 0.898598665915411, "grad_norm": 7.15625, "learning_rate": 6.238610466524097e-06, "loss": 1.03157959, "memory(GiB)": 141.16, "step": 80340, "train_speed(iter/s)": 0.289638 }, { "acc": 0.73246965, "epoch": 0.8988223648613696, "grad_norm": 7.8125, "learning_rate": 6.2368186053660095e-06, "loss": 1.04593039, "memory(GiB)": 141.16, "step": 80360, "train_speed(iter/s)": 0.289666 }, { "acc": 0.70742002, "epoch": 0.8990460638073281, "grad_norm": 7.46875, "learning_rate": 6.23502657499346e-06, "loss": 1.19853697, "memory(GiB)": 141.16, "step": 80380, "train_speed(iter/s)": 0.289691 }, { "acc": 0.74373708, "epoch": 0.8992697627532866, "grad_norm": 6.34375, "learning_rate": 6.233234375651621e-06, "loss": 1.00524998, "memory(GiB)": 141.16, "step": 80400, "train_speed(iter/s)": 0.289716 }, { "acc": 0.73560095, "epoch": 0.8994934616992452, "grad_norm": 7.59375, "learning_rate": 6.2314420075856926e-06, "loss": 1.05579891, "memory(GiB)": 141.16, "step": 80420, "train_speed(iter/s)": 0.289741 }, { "acc": 0.73535204, "epoch": 0.8997171606452037, "grad_norm": 7.0, "learning_rate": 6.229649471040897e-06, "loss": 1.06155834, "memory(GiB)": 141.16, "step": 80440, "train_speed(iter/s)": 0.289765 }, { "acc": 0.72169294, "epoch": 0.8999408595911622, "grad_norm": 8.1875, "learning_rate": 6.227856766262478e-06, "loss": 1.13478374, "memory(GiB)": 141.16, "step": 80460, "train_speed(iter/s)": 0.289787 }, { "acc": 0.72828884, "epoch": 0.9001645585371207, "grad_norm": 6.96875, "learning_rate": 6.226063893495704e-06, "loss": 1.0939764, "memory(GiB)": 141.16, "step": 80480, "train_speed(iter/s)": 0.289814 }, { "acc": 0.73692589, "epoch": 0.9003882574830793, "grad_norm": 5.875, "learning_rate": 6.224270852985863e-06, "loss": 1.06349907, "memory(GiB)": 141.16, "step": 80500, "train_speed(iter/s)": 0.289837 }, { "acc": 0.73704691, "epoch": 0.9006119564290378, "grad_norm": 6.625, "learning_rate": 6.2224776449782705e-06, "loss": 1.0422905, "memory(GiB)": 141.16, "step": 80520, "train_speed(iter/s)": 0.289862 }, { "acc": 0.72531471, "epoch": 0.9008356553749963, "grad_norm": 6.46875, "learning_rate": 6.2206842697182645e-06, "loss": 1.12985497, "memory(GiB)": 141.16, "step": 80540, "train_speed(iter/s)": 0.289887 }, { "acc": 0.73008432, "epoch": 0.9010593543209549, "grad_norm": 8.5625, "learning_rate": 6.2188907274512015e-06, "loss": 1.08529644, "memory(GiB)": 141.16, "step": 80560, "train_speed(iter/s)": 0.289913 }, { "acc": 0.71961331, "epoch": 0.9012830532669134, "grad_norm": 5.375, "learning_rate": 6.217097018422466e-06, "loss": 1.13881559, "memory(GiB)": 141.16, "step": 80580, "train_speed(iter/s)": 0.289938 }, { "acc": 0.71169486, "epoch": 0.9015067522128719, "grad_norm": 5.09375, "learning_rate": 6.215303142877461e-06, "loss": 1.14926872, "memory(GiB)": 141.16, "step": 80600, "train_speed(iter/s)": 0.289964 }, { "acc": 0.73173604, "epoch": 0.9017304511588304, "grad_norm": 7.59375, "learning_rate": 6.213509101061616e-06, "loss": 1.0818512, "memory(GiB)": 141.16, "step": 80620, "train_speed(iter/s)": 0.28999 }, { "acc": 0.73343811, "epoch": 0.901954150104789, "grad_norm": 6.1875, "learning_rate": 6.211714893220381e-06, "loss": 1.05430584, "memory(GiB)": 141.16, "step": 80640, "train_speed(iter/s)": 0.290015 }, { "acc": 0.72636061, "epoch": 0.9021778490507475, "grad_norm": 7.59375, "learning_rate": 6.209920519599228e-06, "loss": 1.09021969, "memory(GiB)": 141.16, "step": 80660, "train_speed(iter/s)": 0.29004 }, { "acc": 0.7247364, "epoch": 0.902401547996706, "grad_norm": 7.90625, "learning_rate": 6.208125980443657e-06, "loss": 1.11070995, "memory(GiB)": 141.16, "step": 80680, "train_speed(iter/s)": 0.290065 }, { "acc": 0.7326694, "epoch": 0.9026252469426645, "grad_norm": 7.40625, "learning_rate": 6.206331275999182e-06, "loss": 1.05872993, "memory(GiB)": 141.16, "step": 80700, "train_speed(iter/s)": 0.29009 }, { "acc": 0.72614546, "epoch": 0.9028489458886231, "grad_norm": 6.90625, "learning_rate": 6.204536406511346e-06, "loss": 1.10255423, "memory(GiB)": 141.16, "step": 80720, "train_speed(iter/s)": 0.290113 }, { "acc": 0.71577134, "epoch": 0.9030726448345816, "grad_norm": 6.40625, "learning_rate": 6.202741372225713e-06, "loss": 1.14372063, "memory(GiB)": 141.16, "step": 80740, "train_speed(iter/s)": 0.290134 }, { "acc": 0.73302212, "epoch": 0.9032963437805401, "grad_norm": 8.0625, "learning_rate": 6.20094617338787e-06, "loss": 1.08585691, "memory(GiB)": 141.16, "step": 80760, "train_speed(iter/s)": 0.290159 }, { "acc": 0.72399354, "epoch": 0.9035200427264987, "grad_norm": 6.875, "learning_rate": 6.199150810243423e-06, "loss": 1.1209465, "memory(GiB)": 141.16, "step": 80780, "train_speed(iter/s)": 0.290184 }, { "acc": 0.71607237, "epoch": 0.9037437416724572, "grad_norm": 6.15625, "learning_rate": 6.197355283038007e-06, "loss": 1.14981222, "memory(GiB)": 141.16, "step": 80800, "train_speed(iter/s)": 0.290207 }, { "acc": 0.72672715, "epoch": 0.9039674406184157, "grad_norm": 7.0625, "learning_rate": 6.195559592017273e-06, "loss": 1.09467382, "memory(GiB)": 141.16, "step": 80820, "train_speed(iter/s)": 0.29023 }, { "acc": 0.73715782, "epoch": 0.9041911395643742, "grad_norm": 7.4375, "learning_rate": 6.193763737426899e-06, "loss": 1.04469709, "memory(GiB)": 141.16, "step": 80840, "train_speed(iter/s)": 0.290252 }, { "acc": 0.72812672, "epoch": 0.9044148385103328, "grad_norm": 6.90625, "learning_rate": 6.1919677195125825e-06, "loss": 1.09152699, "memory(GiB)": 141.16, "step": 80860, "train_speed(iter/s)": 0.290277 }, { "acc": 0.74191418, "epoch": 0.9046385374562913, "grad_norm": 8.25, "learning_rate": 6.190171538520045e-06, "loss": 1.02197838, "memory(GiB)": 141.16, "step": 80880, "train_speed(iter/s)": 0.290303 }, { "acc": 0.7291996, "epoch": 0.9048622364022498, "grad_norm": 7.8125, "learning_rate": 6.18837519469503e-06, "loss": 1.08890362, "memory(GiB)": 141.16, "step": 80900, "train_speed(iter/s)": 0.290326 }, { "acc": 0.71672192, "epoch": 0.9050859353482084, "grad_norm": 7.5625, "learning_rate": 6.186578688283302e-06, "loss": 1.14400826, "memory(GiB)": 141.16, "step": 80920, "train_speed(iter/s)": 0.290349 }, { "acc": 0.73405514, "epoch": 0.9053096342941669, "grad_norm": 6.4375, "learning_rate": 6.18478201953065e-06, "loss": 1.06993828, "memory(GiB)": 141.16, "step": 80940, "train_speed(iter/s)": 0.290371 }, { "acc": 0.72150097, "epoch": 0.9055333332401254, "grad_norm": 7.28125, "learning_rate": 6.182985188682882e-06, "loss": 1.12327557, "memory(GiB)": 141.16, "step": 80960, "train_speed(iter/s)": 0.290396 }, { "acc": 0.73033566, "epoch": 0.9057570321860839, "grad_norm": 7.46875, "learning_rate": 6.181188195985832e-06, "loss": 1.09126387, "memory(GiB)": 141.16, "step": 80980, "train_speed(iter/s)": 0.290422 }, { "acc": 0.71807132, "epoch": 0.9059807311320425, "grad_norm": 6.9375, "learning_rate": 6.179391041685354e-06, "loss": 1.13491983, "memory(GiB)": 141.16, "step": 81000, "train_speed(iter/s)": 0.290448 }, { "acc": 0.72889242, "epoch": 0.906204430078001, "grad_norm": 6.5625, "learning_rate": 6.177593726027325e-06, "loss": 1.09228287, "memory(GiB)": 141.16, "step": 81020, "train_speed(iter/s)": 0.290473 }, { "acc": 0.73143959, "epoch": 0.9064281290239595, "grad_norm": 8.0, "learning_rate": 6.175796249257641e-06, "loss": 1.06382847, "memory(GiB)": 141.16, "step": 81040, "train_speed(iter/s)": 0.290497 }, { "acc": 0.72611027, "epoch": 0.9066518279699181, "grad_norm": 6.46875, "learning_rate": 6.173998611622224e-06, "loss": 1.11330948, "memory(GiB)": 141.16, "step": 81060, "train_speed(iter/s)": 0.29052 }, { "acc": 0.71999187, "epoch": 0.9068755269158766, "grad_norm": 7.625, "learning_rate": 6.172200813367017e-06, "loss": 1.12126131, "memory(GiB)": 141.16, "step": 81080, "train_speed(iter/s)": 0.290546 }, { "acc": 0.72937841, "epoch": 0.9070992258618351, "grad_norm": 7.59375, "learning_rate": 6.170402854737986e-06, "loss": 1.07849474, "memory(GiB)": 141.16, "step": 81100, "train_speed(iter/s)": 0.290572 }, { "acc": 0.74173136, "epoch": 0.9073229248077936, "grad_norm": 7.875, "learning_rate": 6.1686047359811145e-06, "loss": 1.03309746, "memory(GiB)": 141.16, "step": 81120, "train_speed(iter/s)": 0.290597 }, { "acc": 0.71960936, "epoch": 0.9075466237537522, "grad_norm": 7.90625, "learning_rate": 6.1668064573424105e-06, "loss": 1.1046298, "memory(GiB)": 141.16, "step": 81140, "train_speed(iter/s)": 0.29062 }, { "acc": 0.73580647, "epoch": 0.9077703226997107, "grad_norm": 7.25, "learning_rate": 6.1650080190679064e-06, "loss": 1.05644493, "memory(GiB)": 141.16, "step": 81160, "train_speed(iter/s)": 0.290644 }, { "acc": 0.72304983, "epoch": 0.9079940216456692, "grad_norm": 7.34375, "learning_rate": 6.1632094214036534e-06, "loss": 1.09685822, "memory(GiB)": 141.16, "step": 81180, "train_speed(iter/s)": 0.290662 }, { "acc": 0.74005194, "epoch": 0.9082177205916278, "grad_norm": 8.5625, "learning_rate": 6.1614106645957265e-06, "loss": 1.03297539, "memory(GiB)": 141.16, "step": 81200, "train_speed(iter/s)": 0.290683 }, { "acc": 0.72815366, "epoch": 0.9084414195375863, "grad_norm": 7.3125, "learning_rate": 6.15961174889022e-06, "loss": 1.09833012, "memory(GiB)": 141.16, "step": 81220, "train_speed(iter/s)": 0.290708 }, { "acc": 0.73524637, "epoch": 0.9086651184835448, "grad_norm": 5.78125, "learning_rate": 6.15781267453325e-06, "loss": 1.07073917, "memory(GiB)": 141.16, "step": 81240, "train_speed(iter/s)": 0.290731 }, { "acc": 0.73968887, "epoch": 0.9088888174295033, "grad_norm": 5.9375, "learning_rate": 6.156013441770958e-06, "loss": 1.04284496, "memory(GiB)": 141.16, "step": 81260, "train_speed(iter/s)": 0.290754 }, { "acc": 0.74420462, "epoch": 0.9091125163754619, "grad_norm": 7.71875, "learning_rate": 6.154214050849504e-06, "loss": 1.0126255, "memory(GiB)": 141.16, "step": 81280, "train_speed(iter/s)": 0.290779 }, { "acc": 0.73514299, "epoch": 0.9093362153214204, "grad_norm": 8.3125, "learning_rate": 6.152414502015071e-06, "loss": 1.04472599, "memory(GiB)": 141.16, "step": 81300, "train_speed(iter/s)": 0.2908 }, { "acc": 0.74343109, "epoch": 0.9095599142673789, "grad_norm": 6.65625, "learning_rate": 6.1506147955138615e-06, "loss": 1.02724838, "memory(GiB)": 141.16, "step": 81320, "train_speed(iter/s)": 0.290822 }, { "acc": 0.72255025, "epoch": 0.9097836132133374, "grad_norm": 7.90625, "learning_rate": 6.148814931592102e-06, "loss": 1.09800415, "memory(GiB)": 141.16, "step": 81340, "train_speed(iter/s)": 0.290847 }, { "acc": 0.7475431, "epoch": 0.910007312159296, "grad_norm": 10.1875, "learning_rate": 6.147014910496041e-06, "loss": 1.01125937, "memory(GiB)": 141.16, "step": 81360, "train_speed(iter/s)": 0.29087 }, { "acc": 0.73083143, "epoch": 0.9102310111052545, "grad_norm": 5.90625, "learning_rate": 6.1452147324719444e-06, "loss": 1.08332968, "memory(GiB)": 141.16, "step": 81380, "train_speed(iter/s)": 0.290896 }, { "acc": 0.72214489, "epoch": 0.910454710051213, "grad_norm": 5.90625, "learning_rate": 6.143414397766103e-06, "loss": 1.10667152, "memory(GiB)": 141.16, "step": 81400, "train_speed(iter/s)": 0.29092 }, { "acc": 0.73093152, "epoch": 0.9106784089971716, "grad_norm": 6.21875, "learning_rate": 6.14161390662483e-06, "loss": 1.07944756, "memory(GiB)": 141.16, "step": 81420, "train_speed(iter/s)": 0.290945 }, { "acc": 0.74284439, "epoch": 0.9109021079431301, "grad_norm": 7.59375, "learning_rate": 6.139813259294456e-06, "loss": 1.02195778, "memory(GiB)": 141.16, "step": 81440, "train_speed(iter/s)": 0.290969 }, { "acc": 0.72898378, "epoch": 0.9111258068890886, "grad_norm": 7.53125, "learning_rate": 6.138012456021337e-06, "loss": 1.08534622, "memory(GiB)": 141.16, "step": 81460, "train_speed(iter/s)": 0.290993 }, { "acc": 0.72730026, "epoch": 0.9113495058350471, "grad_norm": 6.75, "learning_rate": 6.136211497051848e-06, "loss": 1.09834099, "memory(GiB)": 141.16, "step": 81480, "train_speed(iter/s)": 0.291017 }, { "acc": 0.71198463, "epoch": 0.9115732047810057, "grad_norm": 6.78125, "learning_rate": 6.134410382632385e-06, "loss": 1.17008801, "memory(GiB)": 141.16, "step": 81500, "train_speed(iter/s)": 0.291031 }, { "acc": 0.73968458, "epoch": 0.9117969037269642, "grad_norm": 7.875, "learning_rate": 6.13260911300937e-06, "loss": 1.04334354, "memory(GiB)": 141.16, "step": 81520, "train_speed(iter/s)": 0.291054 }, { "acc": 0.73031445, "epoch": 0.9120206026729227, "grad_norm": 7.5, "learning_rate": 6.130807688429237e-06, "loss": 1.07344027, "memory(GiB)": 141.16, "step": 81540, "train_speed(iter/s)": 0.291078 }, { "acc": 0.73690128, "epoch": 0.9122443016188813, "grad_norm": 9.1875, "learning_rate": 6.12900610913845e-06, "loss": 1.06309509, "memory(GiB)": 141.16, "step": 81560, "train_speed(iter/s)": 0.291102 }, { "acc": 0.72061014, "epoch": 0.9124680005648398, "grad_norm": 8.0625, "learning_rate": 6.12720437538349e-06, "loss": 1.11768723, "memory(GiB)": 141.16, "step": 81580, "train_speed(iter/s)": 0.291124 }, { "acc": 0.74184999, "epoch": 0.9126916995107983, "grad_norm": 7.34375, "learning_rate": 6.125402487410859e-06, "loss": 1.03099852, "memory(GiB)": 141.16, "step": 81600, "train_speed(iter/s)": 0.291149 }, { "acc": 0.7374403, "epoch": 0.9129153984567568, "grad_norm": 6.6875, "learning_rate": 6.123600445467085e-06, "loss": 1.03740082, "memory(GiB)": 141.16, "step": 81620, "train_speed(iter/s)": 0.291172 }, { "acc": 0.73473687, "epoch": 0.9131390974027154, "grad_norm": 7.0625, "learning_rate": 6.1217982497987075e-06, "loss": 1.07776499, "memory(GiB)": 141.16, "step": 81640, "train_speed(iter/s)": 0.291198 }, { "acc": 0.74095001, "epoch": 0.9133627963486739, "grad_norm": 6.5, "learning_rate": 6.119995900652296e-06, "loss": 1.02732458, "memory(GiB)": 141.16, "step": 81660, "train_speed(iter/s)": 0.291221 }, { "acc": 0.73071413, "epoch": 0.9135864952946325, "grad_norm": 7.6875, "learning_rate": 6.118193398274437e-06, "loss": 1.08509073, "memory(GiB)": 141.16, "step": 81680, "train_speed(iter/s)": 0.291242 }, { "acc": 0.72561903, "epoch": 0.9138101942405911, "grad_norm": 7.0, "learning_rate": 6.116390742911738e-06, "loss": 1.10378761, "memory(GiB)": 141.16, "step": 81700, "train_speed(iter/s)": 0.291267 }, { "acc": 0.72610731, "epoch": 0.9140338931865496, "grad_norm": 6.5, "learning_rate": 6.114587934810829e-06, "loss": 1.10703239, "memory(GiB)": 141.16, "step": 81720, "train_speed(iter/s)": 0.291292 }, { "acc": 0.73065376, "epoch": 0.9142575921325081, "grad_norm": 6.90625, "learning_rate": 6.112784974218358e-06, "loss": 1.0699832, "memory(GiB)": 141.16, "step": 81740, "train_speed(iter/s)": 0.291316 }, { "acc": 0.72577248, "epoch": 0.9144812910784667, "grad_norm": 8.5625, "learning_rate": 6.110981861380999e-06, "loss": 1.10649872, "memory(GiB)": 141.16, "step": 81760, "train_speed(iter/s)": 0.291338 }, { "acc": 0.73234825, "epoch": 0.9147049900244252, "grad_norm": 8.1875, "learning_rate": 6.109178596545441e-06, "loss": 1.07435169, "memory(GiB)": 141.16, "step": 81780, "train_speed(iter/s)": 0.291361 }, { "acc": 0.73531647, "epoch": 0.9149286889703837, "grad_norm": 7.84375, "learning_rate": 6.107375179958397e-06, "loss": 1.05182457, "memory(GiB)": 141.16, "step": 81800, "train_speed(iter/s)": 0.291384 }, { "acc": 0.74304848, "epoch": 0.9151523879163422, "grad_norm": 7.8125, "learning_rate": 6.105571611866601e-06, "loss": 1.02219677, "memory(GiB)": 141.16, "step": 81820, "train_speed(iter/s)": 0.29141 }, { "acc": 0.72930737, "epoch": 0.9153760868623008, "grad_norm": 7.78125, "learning_rate": 6.103767892516806e-06, "loss": 1.07589855, "memory(GiB)": 141.16, "step": 81840, "train_speed(iter/s)": 0.291433 }, { "acc": 0.7305644, "epoch": 0.9155997858082593, "grad_norm": 7.625, "learning_rate": 6.101964022155787e-06, "loss": 1.08947086, "memory(GiB)": 141.16, "step": 81860, "train_speed(iter/s)": 0.291456 }, { "acc": 0.72612886, "epoch": 0.9158234847542178, "grad_norm": 7.625, "learning_rate": 6.100160001030337e-06, "loss": 1.10561256, "memory(GiB)": 141.16, "step": 81880, "train_speed(iter/s)": 0.291478 }, { "acc": 0.72517657, "epoch": 0.9160471837001763, "grad_norm": 7.6875, "learning_rate": 6.098355829387277e-06, "loss": 1.12252865, "memory(GiB)": 141.16, "step": 81900, "train_speed(iter/s)": 0.291503 }, { "acc": 0.72923536, "epoch": 0.9162708826461349, "grad_norm": 6.1875, "learning_rate": 6.0965515074734395e-06, "loss": 1.07568245, "memory(GiB)": 141.16, "step": 81920, "train_speed(iter/s)": 0.291525 }, { "acc": 0.73720036, "epoch": 0.9164945815920934, "grad_norm": 6.40625, "learning_rate": 6.094747035535683e-06, "loss": 1.04618359, "memory(GiB)": 141.16, "step": 81940, "train_speed(iter/s)": 0.291549 }, { "acc": 0.72386875, "epoch": 0.9167182805380519, "grad_norm": 5.84375, "learning_rate": 6.092942413820883e-06, "loss": 1.11505508, "memory(GiB)": 141.16, "step": 81960, "train_speed(iter/s)": 0.291571 }, { "acc": 0.72926292, "epoch": 0.9169419794840105, "grad_norm": 7.78125, "learning_rate": 6.091137642575939e-06, "loss": 1.08778915, "memory(GiB)": 141.16, "step": 81980, "train_speed(iter/s)": 0.291593 }, { "acc": 0.7285315, "epoch": 0.917165678429969, "grad_norm": 5.65625, "learning_rate": 6.08933272204777e-06, "loss": 1.08960114, "memory(GiB)": 141.16, "step": 82000, "train_speed(iter/s)": 0.291615 }, { "epoch": 0.917165678429969, "eval_acc": 0.6900319501480674, "eval_loss": 1.079362392425537, "eval_runtime": 2319.5014, "eval_samples_per_second": 32.457, "eval_steps_per_second": 16.228, "step": 82000 }, { "acc": 0.73080864, "epoch": 0.9173893773759275, "grad_norm": 5.0625, "learning_rate": 6.087527652483315e-06, "loss": 1.09522419, "memory(GiB)": 141.16, "step": 82020, "train_speed(iter/s)": 0.289201 }, { "acc": 0.72099285, "epoch": 0.917613076321886, "grad_norm": 6.53125, "learning_rate": 6.085722434129533e-06, "loss": 1.12896214, "memory(GiB)": 141.16, "step": 82040, "train_speed(iter/s)": 0.289225 }, { "acc": 0.73495302, "epoch": 0.9178367752678446, "grad_norm": 7.9375, "learning_rate": 6.083917067233402e-06, "loss": 1.0699688, "memory(GiB)": 141.16, "step": 82060, "train_speed(iter/s)": 0.289249 }, { "acc": 0.73120346, "epoch": 0.9180604742138031, "grad_norm": 6.75, "learning_rate": 6.082111552041925e-06, "loss": 1.08501472, "memory(GiB)": 141.16, "step": 82080, "train_speed(iter/s)": 0.289274 }, { "acc": 0.71560082, "epoch": 0.9182841731597616, "grad_norm": 5.9375, "learning_rate": 6.080305888802119e-06, "loss": 1.16366129, "memory(GiB)": 141.16, "step": 82100, "train_speed(iter/s)": 0.289297 }, { "acc": 0.72352552, "epoch": 0.9185078721057202, "grad_norm": 6.46875, "learning_rate": 6.078500077761027e-06, "loss": 1.1266571, "memory(GiB)": 141.16, "step": 82120, "train_speed(iter/s)": 0.289322 }, { "acc": 0.73258996, "epoch": 0.9187315710516787, "grad_norm": 6.09375, "learning_rate": 6.07669411916571e-06, "loss": 1.06170235, "memory(GiB)": 141.16, "step": 82140, "train_speed(iter/s)": 0.289346 }, { "acc": 0.71946983, "epoch": 0.9189552699976372, "grad_norm": 8.3125, "learning_rate": 6.074888013263247e-06, "loss": 1.12696772, "memory(GiB)": 141.16, "step": 82160, "train_speed(iter/s)": 0.289372 }, { "acc": 0.73296452, "epoch": 0.9191789689435957, "grad_norm": 6.15625, "learning_rate": 6.073081760300741e-06, "loss": 1.07760429, "memory(GiB)": 141.16, "step": 82180, "train_speed(iter/s)": 0.289398 }, { "acc": 0.72782793, "epoch": 0.9194026678895543, "grad_norm": 6.40625, "learning_rate": 6.071275360525311e-06, "loss": 1.11142979, "memory(GiB)": 141.16, "step": 82200, "train_speed(iter/s)": 0.289421 }, { "acc": 0.73320508, "epoch": 0.9196263668355128, "grad_norm": 8.25, "learning_rate": 6.069468814184101e-06, "loss": 1.06677437, "memory(GiB)": 141.16, "step": 82220, "train_speed(iter/s)": 0.289443 }, { "acc": 0.72272921, "epoch": 0.9198500657814713, "grad_norm": 6.84375, "learning_rate": 6.067662121524271e-06, "loss": 1.11497946, "memory(GiB)": 141.16, "step": 82240, "train_speed(iter/s)": 0.289466 }, { "acc": 0.72623987, "epoch": 0.9200737647274299, "grad_norm": 6.6875, "learning_rate": 6.0658552827930016e-06, "loss": 1.09431276, "memory(GiB)": 141.16, "step": 82260, "train_speed(iter/s)": 0.28949 }, { "acc": 0.73324032, "epoch": 0.9202974636733884, "grad_norm": 6.5, "learning_rate": 6.064048298237495e-06, "loss": 1.04931755, "memory(GiB)": 141.16, "step": 82280, "train_speed(iter/s)": 0.289514 }, { "acc": 0.74189339, "epoch": 0.9205211626193469, "grad_norm": 6.59375, "learning_rate": 6.062241168104972e-06, "loss": 1.01392307, "memory(GiB)": 141.16, "step": 82300, "train_speed(iter/s)": 0.289539 }, { "acc": 0.71938705, "epoch": 0.9207448615653054, "grad_norm": 8.0, "learning_rate": 6.0604338926426745e-06, "loss": 1.12097273, "memory(GiB)": 141.16, "step": 82320, "train_speed(iter/s)": 0.289563 }, { "acc": 0.72306032, "epoch": 0.920968560511264, "grad_norm": 7.78125, "learning_rate": 6.058626472097865e-06, "loss": 1.10339832, "memory(GiB)": 141.16, "step": 82340, "train_speed(iter/s)": 0.289587 }, { "acc": 0.71851358, "epoch": 0.9211922594572225, "grad_norm": 6.15625, "learning_rate": 6.0568189067178206e-06, "loss": 1.13356409, "memory(GiB)": 141.16, "step": 82360, "train_speed(iter/s)": 0.28961 }, { "acc": 0.7386559, "epoch": 0.921415958403181, "grad_norm": 7.375, "learning_rate": 6.055011196749845e-06, "loss": 1.04394093, "memory(GiB)": 141.16, "step": 82380, "train_speed(iter/s)": 0.289635 }, { "acc": 0.72800002, "epoch": 0.9216396573491396, "grad_norm": 6.53125, "learning_rate": 6.053203342441259e-06, "loss": 1.09278355, "memory(GiB)": 141.16, "step": 82400, "train_speed(iter/s)": 0.289658 }, { "acc": 0.75225272, "epoch": 0.9218633562950981, "grad_norm": 7.84375, "learning_rate": 6.0513953440394e-06, "loss": 0.9961256, "memory(GiB)": 141.16, "step": 82420, "train_speed(iter/s)": 0.289684 }, { "acc": 0.73590307, "epoch": 0.9220870552410566, "grad_norm": 6.21875, "learning_rate": 6.049587201791631e-06, "loss": 1.07094574, "memory(GiB)": 141.16, "step": 82440, "train_speed(iter/s)": 0.289709 }, { "acc": 0.72877617, "epoch": 0.9223107541870151, "grad_norm": 7.65625, "learning_rate": 6.047778915945333e-06, "loss": 1.08469954, "memory(GiB)": 141.16, "step": 82460, "train_speed(iter/s)": 0.289734 }, { "acc": 0.73274879, "epoch": 0.9225344531329737, "grad_norm": 9.1875, "learning_rate": 6.0459704867479005e-06, "loss": 1.07233009, "memory(GiB)": 141.16, "step": 82480, "train_speed(iter/s)": 0.289757 }, { "acc": 0.7342052, "epoch": 0.9227581520789322, "grad_norm": 6.53125, "learning_rate": 6.044161914446756e-06, "loss": 1.05654488, "memory(GiB)": 141.16, "step": 82500, "train_speed(iter/s)": 0.289779 }, { "acc": 0.723452, "epoch": 0.9229818510248907, "grad_norm": 6.5625, "learning_rate": 6.042353199289337e-06, "loss": 1.11061945, "memory(GiB)": 141.16, "step": 82520, "train_speed(iter/s)": 0.289801 }, { "acc": 0.72874632, "epoch": 0.9232055499708492, "grad_norm": 6.5, "learning_rate": 6.040544341523103e-06, "loss": 1.08696327, "memory(GiB)": 141.16, "step": 82540, "train_speed(iter/s)": 0.289824 }, { "acc": 0.7271162, "epoch": 0.9234292489168078, "grad_norm": 6.375, "learning_rate": 6.038735341395528e-06, "loss": 1.07170935, "memory(GiB)": 141.16, "step": 82560, "train_speed(iter/s)": 0.28985 }, { "acc": 0.73406744, "epoch": 0.9236529478627663, "grad_norm": 7.03125, "learning_rate": 6.036926199154113e-06, "loss": 1.06137152, "memory(GiB)": 141.16, "step": 82580, "train_speed(iter/s)": 0.289876 }, { "acc": 0.71856613, "epoch": 0.9238766468087248, "grad_norm": 7.40625, "learning_rate": 6.035116915046372e-06, "loss": 1.13989058, "memory(GiB)": 141.16, "step": 82600, "train_speed(iter/s)": 0.2899 }, { "acc": 0.72576756, "epoch": 0.9241003457546834, "grad_norm": 6.65625, "learning_rate": 6.033307489319842e-06, "loss": 1.10069218, "memory(GiB)": 141.16, "step": 82620, "train_speed(iter/s)": 0.289921 }, { "acc": 0.7375967, "epoch": 0.9243240447006419, "grad_norm": 6.46875, "learning_rate": 6.031497922222077e-06, "loss": 1.03908825, "memory(GiB)": 141.16, "step": 82640, "train_speed(iter/s)": 0.289947 }, { "acc": 0.74084549, "epoch": 0.9245477436466004, "grad_norm": 7.875, "learning_rate": 6.029688214000653e-06, "loss": 1.04538698, "memory(GiB)": 141.16, "step": 82660, "train_speed(iter/s)": 0.28997 }, { "acc": 0.71991749, "epoch": 0.924771442592559, "grad_norm": 5.875, "learning_rate": 6.027878364903166e-06, "loss": 1.13742962, "memory(GiB)": 141.16, "step": 82680, "train_speed(iter/s)": 0.289994 }, { "acc": 0.72657318, "epoch": 0.9249951415385175, "grad_norm": 6.90625, "learning_rate": 6.0260683751772255e-06, "loss": 1.09606552, "memory(GiB)": 141.16, "step": 82700, "train_speed(iter/s)": 0.290015 }, { "acc": 0.72688947, "epoch": 0.925218840484476, "grad_norm": 8.0625, "learning_rate": 6.024258245070465e-06, "loss": 1.08936882, "memory(GiB)": 141.16, "step": 82720, "train_speed(iter/s)": 0.290037 }, { "acc": 0.72683172, "epoch": 0.9254425394304345, "grad_norm": 6.0, "learning_rate": 6.022447974830535e-06, "loss": 1.08812962, "memory(GiB)": 141.16, "step": 82740, "train_speed(iter/s)": 0.29006 }, { "acc": 0.72905922, "epoch": 0.9256662383763931, "grad_norm": 7.34375, "learning_rate": 6.02063756470511e-06, "loss": 1.08064365, "memory(GiB)": 141.16, "step": 82760, "train_speed(iter/s)": 0.290083 }, { "acc": 0.74166021, "epoch": 0.9258899373223516, "grad_norm": 7.78125, "learning_rate": 6.0188270149418784e-06, "loss": 1.04346876, "memory(GiB)": 141.16, "step": 82780, "train_speed(iter/s)": 0.290108 }, { "acc": 0.72417955, "epoch": 0.9261136362683101, "grad_norm": 6.15625, "learning_rate": 6.017016325788547e-06, "loss": 1.10498123, "memory(GiB)": 141.16, "step": 82800, "train_speed(iter/s)": 0.290134 }, { "acc": 0.7400548, "epoch": 0.9263373352142686, "grad_norm": 6.34375, "learning_rate": 6.0152054974928465e-06, "loss": 1.04578629, "memory(GiB)": 141.16, "step": 82820, "train_speed(iter/s)": 0.290157 }, { "acc": 0.72396221, "epoch": 0.9265610341602272, "grad_norm": 7.3125, "learning_rate": 6.013394530302523e-06, "loss": 1.10724134, "memory(GiB)": 141.16, "step": 82840, "train_speed(iter/s)": 0.290173 }, { "acc": 0.72728143, "epoch": 0.9267847331061857, "grad_norm": 8.3125, "learning_rate": 6.011583424465344e-06, "loss": 1.11320581, "memory(GiB)": 141.16, "step": 82860, "train_speed(iter/s)": 0.290197 }, { "acc": 0.73037949, "epoch": 0.9270084320521442, "grad_norm": 8.1875, "learning_rate": 6.009772180229094e-06, "loss": 1.09342957, "memory(GiB)": 141.16, "step": 82880, "train_speed(iter/s)": 0.290225 }, { "acc": 0.71980867, "epoch": 0.9272321309981028, "grad_norm": 6.78125, "learning_rate": 6.007960797841575e-06, "loss": 1.13686981, "memory(GiB)": 141.16, "step": 82900, "train_speed(iter/s)": 0.290251 }, { "acc": 0.72908325, "epoch": 0.9274558299440613, "grad_norm": 7.9375, "learning_rate": 6.006149277550613e-06, "loss": 1.09652176, "memory(GiB)": 141.16, "step": 82920, "train_speed(iter/s)": 0.290276 }, { "acc": 0.73641753, "epoch": 0.9276795288900198, "grad_norm": 6.375, "learning_rate": 6.0043376196040485e-06, "loss": 1.06037216, "memory(GiB)": 141.16, "step": 82940, "train_speed(iter/s)": 0.2903 }, { "acc": 0.73056927, "epoch": 0.9279032278359783, "grad_norm": 7.8125, "learning_rate": 6.002525824249741e-06, "loss": 1.08992367, "memory(GiB)": 141.16, "step": 82960, "train_speed(iter/s)": 0.290327 }, { "acc": 0.73234596, "epoch": 0.9281269267819369, "grad_norm": 7.71875, "learning_rate": 6.000713891735573e-06, "loss": 1.06812305, "memory(GiB)": 141.16, "step": 82980, "train_speed(iter/s)": 0.290353 }, { "acc": 0.74206076, "epoch": 0.9283506257278954, "grad_norm": 8.6875, "learning_rate": 5.998901822309441e-06, "loss": 1.02599096, "memory(GiB)": 141.16, "step": 83000, "train_speed(iter/s)": 0.290376 }, { "acc": 0.74354024, "epoch": 0.9285743246738539, "grad_norm": 7.875, "learning_rate": 5.9970896162192614e-06, "loss": 1.02782269, "memory(GiB)": 141.16, "step": 83020, "train_speed(iter/s)": 0.290397 }, { "acc": 0.73131199, "epoch": 0.9287980236198125, "grad_norm": 6.46875, "learning_rate": 5.9952772737129706e-06, "loss": 1.08551979, "memory(GiB)": 141.16, "step": 83040, "train_speed(iter/s)": 0.290419 }, { "acc": 0.73491678, "epoch": 0.929021722565771, "grad_norm": 7.3125, "learning_rate": 5.993464795038523e-06, "loss": 1.06371517, "memory(GiB)": 141.16, "step": 83060, "train_speed(iter/s)": 0.290442 }, { "acc": 0.72343602, "epoch": 0.9292454215117295, "grad_norm": 6.53125, "learning_rate": 5.991652180443893e-06, "loss": 1.13604813, "memory(GiB)": 141.16, "step": 83080, "train_speed(iter/s)": 0.290464 }, { "acc": 0.73283539, "epoch": 0.929469120457688, "grad_norm": 7.5, "learning_rate": 5.989839430177069e-06, "loss": 1.06110868, "memory(GiB)": 141.16, "step": 83100, "train_speed(iter/s)": 0.290487 }, { "acc": 0.7354383, "epoch": 0.9296928194036466, "grad_norm": 8.3125, "learning_rate": 5.988026544486063e-06, "loss": 1.06966534, "memory(GiB)": 141.16, "step": 83120, "train_speed(iter/s)": 0.290509 }, { "acc": 0.73138914, "epoch": 0.9299165183496051, "grad_norm": 8.0625, "learning_rate": 5.9862135236189045e-06, "loss": 1.07455816, "memory(GiB)": 141.16, "step": 83140, "train_speed(iter/s)": 0.290533 }, { "acc": 0.7230793, "epoch": 0.9301402172955636, "grad_norm": 5.78125, "learning_rate": 5.98440036782364e-06, "loss": 1.11471977, "memory(GiB)": 141.16, "step": 83160, "train_speed(iter/s)": 0.290557 }, { "acc": 0.72960749, "epoch": 0.9303639162415221, "grad_norm": 8.625, "learning_rate": 5.982587077348333e-06, "loss": 1.09418354, "memory(GiB)": 141.16, "step": 83180, "train_speed(iter/s)": 0.29058 }, { "acc": 0.72457628, "epoch": 0.9305876151874807, "grad_norm": 6.625, "learning_rate": 5.980773652441072e-06, "loss": 1.10988102, "memory(GiB)": 141.16, "step": 83200, "train_speed(iter/s)": 0.2906 }, { "acc": 0.74287543, "epoch": 0.9308113141334392, "grad_norm": 6.8125, "learning_rate": 5.978960093349955e-06, "loss": 1.03936062, "memory(GiB)": 141.16, "step": 83220, "train_speed(iter/s)": 0.290624 }, { "acc": 0.73502645, "epoch": 0.9310350130793977, "grad_norm": 7.03125, "learning_rate": 5.977146400323105e-06, "loss": 1.06415539, "memory(GiB)": 141.16, "step": 83240, "train_speed(iter/s)": 0.29065 }, { "acc": 0.7349185, "epoch": 0.9312587120253563, "grad_norm": 6.75, "learning_rate": 5.975332573608661e-06, "loss": 1.05886126, "memory(GiB)": 141.16, "step": 83260, "train_speed(iter/s)": 0.29067 }, { "acc": 0.73267946, "epoch": 0.9314824109713148, "grad_norm": 8.5625, "learning_rate": 5.97351861345478e-06, "loss": 1.06397839, "memory(GiB)": 141.16, "step": 83280, "train_speed(iter/s)": 0.290693 }, { "acc": 0.73020673, "epoch": 0.9317061099172733, "grad_norm": 7.5625, "learning_rate": 5.971704520109638e-06, "loss": 1.07998848, "memory(GiB)": 141.16, "step": 83300, "train_speed(iter/s)": 0.290715 }, { "acc": 0.73505421, "epoch": 0.9319298088632318, "grad_norm": 6.59375, "learning_rate": 5.9698902938214285e-06, "loss": 1.06981449, "memory(GiB)": 141.16, "step": 83320, "train_speed(iter/s)": 0.29074 }, { "acc": 0.72632632, "epoch": 0.9321535078091904, "grad_norm": 5.9375, "learning_rate": 5.968075934838364e-06, "loss": 1.10543079, "memory(GiB)": 141.16, "step": 83340, "train_speed(iter/s)": 0.290761 }, { "acc": 0.72163577, "epoch": 0.9323772067551489, "grad_norm": 7.34375, "learning_rate": 5.966261443408674e-06, "loss": 1.1217123, "memory(GiB)": 141.16, "step": 83360, "train_speed(iter/s)": 0.290786 }, { "acc": 0.72386322, "epoch": 0.9326009057011074, "grad_norm": 7.90625, "learning_rate": 5.964446819780608e-06, "loss": 1.10836124, "memory(GiB)": 141.16, "step": 83380, "train_speed(iter/s)": 0.29081 }, { "acc": 0.74519486, "epoch": 0.932824604647066, "grad_norm": 6.15625, "learning_rate": 5.962632064202434e-06, "loss": 1.01758814, "memory(GiB)": 141.16, "step": 83400, "train_speed(iter/s)": 0.290833 }, { "acc": 0.73582582, "epoch": 0.9330483035930245, "grad_norm": 7.34375, "learning_rate": 5.960817176922432e-06, "loss": 1.07168274, "memory(GiB)": 141.16, "step": 83420, "train_speed(iter/s)": 0.290857 }, { "acc": 0.73551378, "epoch": 0.933272002538983, "grad_norm": 6.3125, "learning_rate": 5.959002158188907e-06, "loss": 1.04545631, "memory(GiB)": 141.16, "step": 83440, "train_speed(iter/s)": 0.29088 }, { "acc": 0.73986473, "epoch": 0.9334957014849415, "grad_norm": 6.5625, "learning_rate": 5.9571870082501794e-06, "loss": 1.0420948, "memory(GiB)": 141.16, "step": 83460, "train_speed(iter/s)": 0.290904 }, { "acc": 0.73149905, "epoch": 0.9337194004309001, "grad_norm": 6.59375, "learning_rate": 5.9553717273545885e-06, "loss": 1.08280087, "memory(GiB)": 141.16, "step": 83480, "train_speed(iter/s)": 0.290929 }, { "acc": 0.73021121, "epoch": 0.9339430993768586, "grad_norm": 5.96875, "learning_rate": 5.953556315750491e-06, "loss": 1.08593664, "memory(GiB)": 141.16, "step": 83500, "train_speed(iter/s)": 0.290955 }, { "acc": 0.72078543, "epoch": 0.9341667983228171, "grad_norm": 7.53125, "learning_rate": 5.951740773686257e-06, "loss": 1.12726917, "memory(GiB)": 141.16, "step": 83520, "train_speed(iter/s)": 0.290977 }, { "acc": 0.71856365, "epoch": 0.9343904972687757, "grad_norm": 6.40625, "learning_rate": 5.949925101410284e-06, "loss": 1.14070625, "memory(GiB)": 141.16, "step": 83540, "train_speed(iter/s)": 0.291001 }, { "acc": 0.72964964, "epoch": 0.9346141962147342, "grad_norm": 7.125, "learning_rate": 5.9481092991709785e-06, "loss": 1.09461613, "memory(GiB)": 141.16, "step": 83560, "train_speed(iter/s)": 0.291025 }, { "acc": 0.72762055, "epoch": 0.9348378951606927, "grad_norm": 9.25, "learning_rate": 5.94629336721677e-06, "loss": 1.08012934, "memory(GiB)": 141.16, "step": 83580, "train_speed(iter/s)": 0.291051 }, { "acc": 0.73248005, "epoch": 0.9350615941066512, "grad_norm": 5.96875, "learning_rate": 5.944477305796104e-06, "loss": 1.0643713, "memory(GiB)": 141.16, "step": 83600, "train_speed(iter/s)": 0.291074 }, { "acc": 0.7166667, "epoch": 0.9352852930526098, "grad_norm": 6.71875, "learning_rate": 5.942661115157441e-06, "loss": 1.13986864, "memory(GiB)": 141.16, "step": 83620, "train_speed(iter/s)": 0.291095 }, { "acc": 0.74042282, "epoch": 0.9355089919985683, "grad_norm": 8.0, "learning_rate": 5.940844795549264e-06, "loss": 1.03884068, "memory(GiB)": 141.16, "step": 83640, "train_speed(iter/s)": 0.291119 }, { "acc": 0.72859988, "epoch": 0.9357326909445268, "grad_norm": 6.1875, "learning_rate": 5.939028347220072e-06, "loss": 1.10365677, "memory(GiB)": 141.16, "step": 83660, "train_speed(iter/s)": 0.291144 }, { "acc": 0.72886634, "epoch": 0.9359563898904854, "grad_norm": 6.8125, "learning_rate": 5.93721177041838e-06, "loss": 1.08813114, "memory(GiB)": 141.16, "step": 83680, "train_speed(iter/s)": 0.291167 }, { "acc": 0.73631492, "epoch": 0.9361800888364439, "grad_norm": 7.84375, "learning_rate": 5.935395065392723e-06, "loss": 1.05412998, "memory(GiB)": 141.16, "step": 83700, "train_speed(iter/s)": 0.291193 }, { "acc": 0.73464622, "epoch": 0.9364037877824024, "grad_norm": 6.875, "learning_rate": 5.93357823239165e-06, "loss": 1.04717159, "memory(GiB)": 141.16, "step": 83720, "train_speed(iter/s)": 0.291219 }, { "acc": 0.72118649, "epoch": 0.9366274867283609, "grad_norm": 8.0, "learning_rate": 5.931761271663732e-06, "loss": 1.13204403, "memory(GiB)": 141.16, "step": 83740, "train_speed(iter/s)": 0.291243 }, { "acc": 0.71844049, "epoch": 0.9368511856743195, "grad_norm": 5.5, "learning_rate": 5.929944183457552e-06, "loss": 1.139505, "memory(GiB)": 141.16, "step": 83760, "train_speed(iter/s)": 0.291264 }, { "acc": 0.72537813, "epoch": 0.937074884620278, "grad_norm": 7.03125, "learning_rate": 5.928126968021717e-06, "loss": 1.10744038, "memory(GiB)": 141.16, "step": 83780, "train_speed(iter/s)": 0.291288 }, { "acc": 0.72633877, "epoch": 0.9372985835662365, "grad_norm": 7.84375, "learning_rate": 5.926309625604847e-06, "loss": 1.10552416, "memory(GiB)": 141.16, "step": 83800, "train_speed(iter/s)": 0.29131 }, { "acc": 0.73658247, "epoch": 0.937522282512195, "grad_norm": 4.4375, "learning_rate": 5.924492156455581e-06, "loss": 1.05394287, "memory(GiB)": 141.16, "step": 83820, "train_speed(iter/s)": 0.291331 }, { "acc": 0.72516484, "epoch": 0.9377459814581536, "grad_norm": 6.75, "learning_rate": 5.9226745608225724e-06, "loss": 1.09883204, "memory(GiB)": 141.16, "step": 83840, "train_speed(iter/s)": 0.291356 }, { "acc": 0.72710323, "epoch": 0.9379696804041121, "grad_norm": 6.34375, "learning_rate": 5.920856838954496e-06, "loss": 1.09365807, "memory(GiB)": 141.16, "step": 83860, "train_speed(iter/s)": 0.29138 }, { "acc": 0.74302206, "epoch": 0.9381933793500706, "grad_norm": 7.21875, "learning_rate": 5.9190389911000415e-06, "loss": 1.04499168, "memory(GiB)": 141.16, "step": 83880, "train_speed(iter/s)": 0.291404 }, { "acc": 0.73556814, "epoch": 0.9384170782960292, "grad_norm": 6.96875, "learning_rate": 5.917221017507917e-06, "loss": 1.05313158, "memory(GiB)": 141.16, "step": 83900, "train_speed(iter/s)": 0.291428 }, { "acc": 0.74760723, "epoch": 0.9386407772419877, "grad_norm": 5.65625, "learning_rate": 5.9154029184268495e-06, "loss": 0.99903908, "memory(GiB)": 141.16, "step": 83920, "train_speed(iter/s)": 0.291449 }, { "acc": 0.73252559, "epoch": 0.9388644761879462, "grad_norm": 7.53125, "learning_rate": 5.913584694105576e-06, "loss": 1.0706151, "memory(GiB)": 141.16, "step": 83940, "train_speed(iter/s)": 0.291471 }, { "acc": 0.7325778, "epoch": 0.9390881751339047, "grad_norm": 6.53125, "learning_rate": 5.91176634479286e-06, "loss": 1.06679401, "memory(GiB)": 141.16, "step": 83960, "train_speed(iter/s)": 0.291496 }, { "acc": 0.72489281, "epoch": 0.9393118740798633, "grad_norm": 7.75, "learning_rate": 5.9099478707374745e-06, "loss": 1.11127071, "memory(GiB)": 141.16, "step": 83980, "train_speed(iter/s)": 0.291515 }, { "acc": 0.72164879, "epoch": 0.9395355730258218, "grad_norm": 7.78125, "learning_rate": 5.908129272188215e-06, "loss": 1.13499031, "memory(GiB)": 141.16, "step": 84000, "train_speed(iter/s)": 0.291538 }, { "epoch": 0.9395355730258218, "eval_acc": 0.6900623661842299, "eval_loss": 1.0792464017868042, "eval_runtime": 2320.4043, "eval_samples_per_second": 32.444, "eval_steps_per_second": 16.222, "step": 84000 }, { "acc": 0.73290777, "epoch": 0.9397592719717803, "grad_norm": 6.9375, "learning_rate": 5.906310549393891e-06, "loss": 1.06360636, "memory(GiB)": 141.16, "step": 84020, "train_speed(iter/s)": 0.28918 }, { "acc": 0.72983065, "epoch": 0.9399829709177389, "grad_norm": 7.6875, "learning_rate": 5.904491702603329e-06, "loss": 1.07983284, "memory(GiB)": 141.16, "step": 84040, "train_speed(iter/s)": 0.289202 }, { "acc": 0.74512243, "epoch": 0.9402066698636974, "grad_norm": 5.34375, "learning_rate": 5.902672732065374e-06, "loss": 1.02210617, "memory(GiB)": 141.16, "step": 84060, "train_speed(iter/s)": 0.289223 }, { "acc": 0.73141026, "epoch": 0.9404303688096559, "grad_norm": 7.84375, "learning_rate": 5.9008536380288875e-06, "loss": 1.08592329, "memory(GiB)": 141.16, "step": 84080, "train_speed(iter/s)": 0.289244 }, { "acc": 0.74462161, "epoch": 0.9406540677556144, "grad_norm": 5.1875, "learning_rate": 5.899034420742746e-06, "loss": 1.00379238, "memory(GiB)": 141.16, "step": 84100, "train_speed(iter/s)": 0.289268 }, { "acc": 0.74524651, "epoch": 0.940877766701573, "grad_norm": 9.0625, "learning_rate": 5.897215080455848e-06, "loss": 1.02555847, "memory(GiB)": 141.16, "step": 84120, "train_speed(iter/s)": 0.289293 }, { "acc": 0.72868462, "epoch": 0.9411014656475315, "grad_norm": 6.78125, "learning_rate": 5.895395617417101e-06, "loss": 1.09445343, "memory(GiB)": 141.16, "step": 84140, "train_speed(iter/s)": 0.289315 }, { "acc": 0.73289051, "epoch": 0.94132516459349, "grad_norm": 6.65625, "learning_rate": 5.893576031875435e-06, "loss": 1.06131725, "memory(GiB)": 141.16, "step": 84160, "train_speed(iter/s)": 0.289338 }, { "acc": 0.71587105, "epoch": 0.9415488635394486, "grad_norm": 9.875, "learning_rate": 5.891756324079797e-06, "loss": 1.15858698, "memory(GiB)": 141.16, "step": 84180, "train_speed(iter/s)": 0.289357 }, { "acc": 0.73337798, "epoch": 0.9417725624854072, "grad_norm": 6.625, "learning_rate": 5.889936494279147e-06, "loss": 1.06150131, "memory(GiB)": 141.16, "step": 84200, "train_speed(iter/s)": 0.289381 }, { "acc": 0.74334226, "epoch": 0.9419962614313657, "grad_norm": 8.3125, "learning_rate": 5.888116542722465e-06, "loss": 1.02956924, "memory(GiB)": 141.16, "step": 84220, "train_speed(iter/s)": 0.289406 }, { "acc": 0.72174206, "epoch": 0.9422199603773243, "grad_norm": 7.125, "learning_rate": 5.886296469658746e-06, "loss": 1.12070332, "memory(GiB)": 141.16, "step": 84240, "train_speed(iter/s)": 0.289429 }, { "acc": 0.73918886, "epoch": 0.9424436593232828, "grad_norm": 9.375, "learning_rate": 5.884476275337e-06, "loss": 1.04557219, "memory(GiB)": 141.16, "step": 84260, "train_speed(iter/s)": 0.289454 }, { "acc": 0.72987957, "epoch": 0.9426673582692413, "grad_norm": 7.8125, "learning_rate": 5.8826559600062595e-06, "loss": 1.0843173, "memory(GiB)": 141.16, "step": 84280, "train_speed(iter/s)": 0.289477 }, { "acc": 0.7178443, "epoch": 0.9428910572151998, "grad_norm": 6.15625, "learning_rate": 5.880835523915565e-06, "loss": 1.13739862, "memory(GiB)": 141.16, "step": 84300, "train_speed(iter/s)": 0.2895 }, { "acc": 0.73602705, "epoch": 0.9431147561611584, "grad_norm": 6.25, "learning_rate": 5.8790149673139855e-06, "loss": 1.06009932, "memory(GiB)": 141.16, "step": 84320, "train_speed(iter/s)": 0.289519 }, { "acc": 0.72288771, "epoch": 0.9433384551071169, "grad_norm": 6.40625, "learning_rate": 5.8771942904505915e-06, "loss": 1.12407398, "memory(GiB)": 141.16, "step": 84340, "train_speed(iter/s)": 0.289542 }, { "acc": 0.73306203, "epoch": 0.9435621540530754, "grad_norm": 6.8125, "learning_rate": 5.8753734935744814e-06, "loss": 1.04591026, "memory(GiB)": 141.16, "step": 84360, "train_speed(iter/s)": 0.289566 }, { "acc": 0.72890821, "epoch": 0.943785852999034, "grad_norm": 7.09375, "learning_rate": 5.8735525769347634e-06, "loss": 1.0808279, "memory(GiB)": 141.16, "step": 84380, "train_speed(iter/s)": 0.289591 }, { "acc": 0.72078476, "epoch": 0.9440095519449925, "grad_norm": 7.09375, "learning_rate": 5.8717315407805685e-06, "loss": 1.13007107, "memory(GiB)": 141.16, "step": 84400, "train_speed(iter/s)": 0.289615 }, { "acc": 0.74694085, "epoch": 0.944233250890951, "grad_norm": 8.25, "learning_rate": 5.869910385361039e-06, "loss": 1.01300554, "memory(GiB)": 141.16, "step": 84420, "train_speed(iter/s)": 0.289641 }, { "acc": 0.7292057, "epoch": 0.9444569498369095, "grad_norm": 8.6875, "learning_rate": 5.868089110925335e-06, "loss": 1.08185978, "memory(GiB)": 141.16, "step": 84440, "train_speed(iter/s)": 0.289664 }, { "acc": 0.7359242, "epoch": 0.9446806487828681, "grad_norm": 7.03125, "learning_rate": 5.866267717722632e-06, "loss": 1.05662804, "memory(GiB)": 141.16, "step": 84460, "train_speed(iter/s)": 0.289686 }, { "acc": 0.71707091, "epoch": 0.9449043477288266, "grad_norm": 6.03125, "learning_rate": 5.864446206002124e-06, "loss": 1.14346714, "memory(GiB)": 141.16, "step": 84480, "train_speed(iter/s)": 0.289709 }, { "acc": 0.72295284, "epoch": 0.9451280466747851, "grad_norm": 9.1875, "learning_rate": 5.862624576013019e-06, "loss": 1.1048954, "memory(GiB)": 141.16, "step": 84500, "train_speed(iter/s)": 0.28973 }, { "acc": 0.74280252, "epoch": 0.9453517456207436, "grad_norm": 8.3125, "learning_rate": 5.860802828004541e-06, "loss": 1.00436592, "memory(GiB)": 141.16, "step": 84520, "train_speed(iter/s)": 0.289754 }, { "acc": 0.71735067, "epoch": 0.9455754445667022, "grad_norm": 7.84375, "learning_rate": 5.858980962225935e-06, "loss": 1.14349985, "memory(GiB)": 141.16, "step": 84540, "train_speed(iter/s)": 0.289776 }, { "acc": 0.70978813, "epoch": 0.9457991435126607, "grad_norm": 7.125, "learning_rate": 5.857158978926454e-06, "loss": 1.18018646, "memory(GiB)": 141.16, "step": 84560, "train_speed(iter/s)": 0.289795 }, { "acc": 0.7342783, "epoch": 0.9460228424586192, "grad_norm": 7.4375, "learning_rate": 5.855336878355373e-06, "loss": 1.07129478, "memory(GiB)": 141.16, "step": 84580, "train_speed(iter/s)": 0.289818 }, { "acc": 0.72739968, "epoch": 0.9462465414045778, "grad_norm": 7.71875, "learning_rate": 5.853514660761982e-06, "loss": 1.08503246, "memory(GiB)": 141.16, "step": 84600, "train_speed(iter/s)": 0.289841 }, { "acc": 0.72179675, "epoch": 0.9464702403505363, "grad_norm": 7.03125, "learning_rate": 5.851692326395585e-06, "loss": 1.1185317, "memory(GiB)": 141.16, "step": 84620, "train_speed(iter/s)": 0.289864 }, { "acc": 0.72977643, "epoch": 0.9466939392964948, "grad_norm": 6.8125, "learning_rate": 5.8498698755055065e-06, "loss": 1.07606106, "memory(GiB)": 141.16, "step": 84640, "train_speed(iter/s)": 0.289886 }, { "acc": 0.7305995, "epoch": 0.9469176382424533, "grad_norm": 6.78125, "learning_rate": 5.84804730834108e-06, "loss": 1.07945566, "memory(GiB)": 141.16, "step": 84660, "train_speed(iter/s)": 0.28991 }, { "acc": 0.72882972, "epoch": 0.9471413371884119, "grad_norm": 6.34375, "learning_rate": 5.8462246251516594e-06, "loss": 1.08538704, "memory(GiB)": 141.16, "step": 84680, "train_speed(iter/s)": 0.289933 }, { "acc": 0.74420342, "epoch": 0.9473650361343704, "grad_norm": 6.75, "learning_rate": 5.844401826186616e-06, "loss": 1.0182579, "memory(GiB)": 141.16, "step": 84700, "train_speed(iter/s)": 0.289958 }, { "acc": 0.73395271, "epoch": 0.9475887350803289, "grad_norm": 8.625, "learning_rate": 5.842578911695333e-06, "loss": 1.06366673, "memory(GiB)": 141.16, "step": 84720, "train_speed(iter/s)": 0.289982 }, { "acc": 0.73619294, "epoch": 0.9478124340262875, "grad_norm": 6.53125, "learning_rate": 5.840755881927213e-06, "loss": 1.04459982, "memory(GiB)": 141.16, "step": 84740, "train_speed(iter/s)": 0.290006 }, { "acc": 0.72243094, "epoch": 0.948036132972246, "grad_norm": 6.90625, "learning_rate": 5.838932737131669e-06, "loss": 1.11967535, "memory(GiB)": 141.16, "step": 84760, "train_speed(iter/s)": 0.290027 }, { "acc": 0.73862476, "epoch": 0.9482598319182045, "grad_norm": 7.25, "learning_rate": 5.837109477558137e-06, "loss": 1.05079403, "memory(GiB)": 141.16, "step": 84780, "train_speed(iter/s)": 0.29005 }, { "acc": 0.74127855, "epoch": 0.948483530864163, "grad_norm": 4.90625, "learning_rate": 5.835286103456063e-06, "loss": 1.02733879, "memory(GiB)": 141.16, "step": 84800, "train_speed(iter/s)": 0.290073 }, { "acc": 0.74059882, "epoch": 0.9487072298101216, "grad_norm": 8.3125, "learning_rate": 5.83346261507491e-06, "loss": 1.02699718, "memory(GiB)": 141.16, "step": 84820, "train_speed(iter/s)": 0.290097 }, { "acc": 0.73056822, "epoch": 0.9489309287560801, "grad_norm": 6.90625, "learning_rate": 5.831639012664161e-06, "loss": 1.07818317, "memory(GiB)": 141.16, "step": 84840, "train_speed(iter/s)": 0.290118 }, { "acc": 0.72929153, "epoch": 0.9491546277020386, "grad_norm": 7.3125, "learning_rate": 5.829815296473306e-06, "loss": 1.08267097, "memory(GiB)": 141.16, "step": 84860, "train_speed(iter/s)": 0.290136 }, { "acc": 0.73710122, "epoch": 0.9493783266479972, "grad_norm": 7.875, "learning_rate": 5.827991466751858e-06, "loss": 1.05377264, "memory(GiB)": 141.16, "step": 84880, "train_speed(iter/s)": 0.290159 }, { "acc": 0.74630237, "epoch": 0.9496020255939557, "grad_norm": 9.4375, "learning_rate": 5.826167523749343e-06, "loss": 1.00632238, "memory(GiB)": 141.16, "step": 84900, "train_speed(iter/s)": 0.290185 }, { "acc": 0.74942656, "epoch": 0.9498257245399142, "grad_norm": 7.90625, "learning_rate": 5.824343467715302e-06, "loss": 1.00811644, "memory(GiB)": 141.16, "step": 84920, "train_speed(iter/s)": 0.290206 }, { "acc": 0.73424196, "epoch": 0.9500494234858727, "grad_norm": 7.8125, "learning_rate": 5.8225192988992916e-06, "loss": 1.0631588, "memory(GiB)": 141.16, "step": 84940, "train_speed(iter/s)": 0.290228 }, { "acc": 0.72603703, "epoch": 0.9502731224318313, "grad_norm": 7.21875, "learning_rate": 5.820695017550886e-06, "loss": 1.10072861, "memory(GiB)": 141.16, "step": 84960, "train_speed(iter/s)": 0.290252 }, { "acc": 0.73747568, "epoch": 0.9504968213777898, "grad_norm": 7.53125, "learning_rate": 5.81887062391967e-06, "loss": 1.05939465, "memory(GiB)": 141.16, "step": 84980, "train_speed(iter/s)": 0.290277 }, { "acc": 0.72724524, "epoch": 0.9507205203237483, "grad_norm": 7.53125, "learning_rate": 5.817046118255249e-06, "loss": 1.10685654, "memory(GiB)": 141.16, "step": 85000, "train_speed(iter/s)": 0.290301 }, { "acc": 0.73442106, "epoch": 0.9509442192697068, "grad_norm": 5.9375, "learning_rate": 5.81522150080724e-06, "loss": 1.06355705, "memory(GiB)": 141.16, "step": 85020, "train_speed(iter/s)": 0.290322 }, { "acc": 0.71933002, "epoch": 0.9511679182156654, "grad_norm": 8.3125, "learning_rate": 5.813396771825278e-06, "loss": 1.13639164, "memory(GiB)": 141.16, "step": 85040, "train_speed(iter/s)": 0.290345 }, { "acc": 0.74191127, "epoch": 0.9513916171616239, "grad_norm": 7.28125, "learning_rate": 5.811571931559012e-06, "loss": 1.01269302, "memory(GiB)": 141.16, "step": 85060, "train_speed(iter/s)": 0.290368 }, { "acc": 0.7198185, "epoch": 0.9516153161075824, "grad_norm": 7.3125, "learning_rate": 5.8097469802581055e-06, "loss": 1.13840446, "memory(GiB)": 141.16, "step": 85080, "train_speed(iter/s)": 0.290391 }, { "acc": 0.72761307, "epoch": 0.951839015053541, "grad_norm": 8.625, "learning_rate": 5.807921918172238e-06, "loss": 1.09854145, "memory(GiB)": 141.16, "step": 85100, "train_speed(iter/s)": 0.290415 }, { "acc": 0.73704977, "epoch": 0.9520627139994995, "grad_norm": 7.15625, "learning_rate": 5.806096745551104e-06, "loss": 1.05428705, "memory(GiB)": 141.16, "step": 85120, "train_speed(iter/s)": 0.290438 }, { "acc": 0.73274841, "epoch": 0.952286412945458, "grad_norm": 8.125, "learning_rate": 5.804271462644413e-06, "loss": 1.07260399, "memory(GiB)": 141.16, "step": 85140, "train_speed(iter/s)": 0.290461 }, { "acc": 0.7272191, "epoch": 0.9525101118914165, "grad_norm": 8.875, "learning_rate": 5.80244606970189e-06, "loss": 1.0917737, "memory(GiB)": 141.16, "step": 85160, "train_speed(iter/s)": 0.290485 }, { "acc": 0.73215408, "epoch": 0.9527338108373751, "grad_norm": 7.28125, "learning_rate": 5.8006205669732775e-06, "loss": 1.08220816, "memory(GiB)": 141.16, "step": 85180, "train_speed(iter/s)": 0.290511 }, { "acc": 0.73173161, "epoch": 0.9529575097833336, "grad_norm": 6.71875, "learning_rate": 5.798794954708326e-06, "loss": 1.08167114, "memory(GiB)": 141.16, "step": 85200, "train_speed(iter/s)": 0.290533 }, { "acc": 0.74051366, "epoch": 0.9531812087292921, "grad_norm": 7.5625, "learning_rate": 5.796969233156807e-06, "loss": 1.0413044, "memory(GiB)": 141.16, "step": 85220, "train_speed(iter/s)": 0.290552 }, { "acc": 0.74422302, "epoch": 0.9534049076752507, "grad_norm": 9.3125, "learning_rate": 5.795143402568506e-06, "loss": 1.0166111, "memory(GiB)": 141.16, "step": 85240, "train_speed(iter/s)": 0.290577 }, { "acc": 0.73456039, "epoch": 0.9536286066212092, "grad_norm": 6.84375, "learning_rate": 5.793317463193222e-06, "loss": 1.06692066, "memory(GiB)": 141.16, "step": 85260, "train_speed(iter/s)": 0.290601 }, { "acc": 0.74784555, "epoch": 0.9538523055671677, "grad_norm": 7.09375, "learning_rate": 5.791491415280772e-06, "loss": 1.01764193, "memory(GiB)": 141.16, "step": 85280, "train_speed(iter/s)": 0.290624 }, { "acc": 0.72757015, "epoch": 0.9540760045131262, "grad_norm": 7.0, "learning_rate": 5.789665259080981e-06, "loss": 1.08555984, "memory(GiB)": 141.16, "step": 85300, "train_speed(iter/s)": 0.290649 }, { "acc": 0.72917056, "epoch": 0.9542997034590848, "grad_norm": 8.5, "learning_rate": 5.787838994843696e-06, "loss": 1.09943123, "memory(GiB)": 141.16, "step": 85320, "train_speed(iter/s)": 0.290671 }, { "acc": 0.73793712, "epoch": 0.9545234024050433, "grad_norm": 7.15625, "learning_rate": 5.786012622818776e-06, "loss": 1.03986549, "memory(GiB)": 141.16, "step": 85340, "train_speed(iter/s)": 0.290692 }, { "acc": 0.72457914, "epoch": 0.9547471013510018, "grad_norm": 5.71875, "learning_rate": 5.784186143256094e-06, "loss": 1.11059446, "memory(GiB)": 141.16, "step": 85360, "train_speed(iter/s)": 0.290711 }, { "acc": 0.74940195, "epoch": 0.9549708002969604, "grad_norm": 7.46875, "learning_rate": 5.782359556405541e-06, "loss": 0.99376183, "memory(GiB)": 141.16, "step": 85380, "train_speed(iter/s)": 0.290733 }, { "acc": 0.72843595, "epoch": 0.9551944992429189, "grad_norm": 6.125, "learning_rate": 5.780532862517016e-06, "loss": 1.08975677, "memory(GiB)": 141.16, "step": 85400, "train_speed(iter/s)": 0.290754 }, { "acc": 0.72815666, "epoch": 0.9554181981888774, "grad_norm": 8.5, "learning_rate": 5.77870606184044e-06, "loss": 1.10701141, "memory(GiB)": 141.16, "step": 85420, "train_speed(iter/s)": 0.290776 }, { "acc": 0.72454996, "epoch": 0.9556418971348359, "grad_norm": 6.78125, "learning_rate": 5.776879154625744e-06, "loss": 1.09685879, "memory(GiB)": 141.16, "step": 85440, "train_speed(iter/s)": 0.290801 }, { "acc": 0.73333654, "epoch": 0.9558655960807945, "grad_norm": 7.875, "learning_rate": 5.775052141122876e-06, "loss": 1.07473907, "memory(GiB)": 141.16, "step": 85460, "train_speed(iter/s)": 0.290824 }, { "acc": 0.72195587, "epoch": 0.956089295026753, "grad_norm": 7.25, "learning_rate": 5.773225021581797e-06, "loss": 1.11381626, "memory(GiB)": 141.16, "step": 85480, "train_speed(iter/s)": 0.290848 }, { "acc": 0.72183571, "epoch": 0.9563129939727115, "grad_norm": 8.125, "learning_rate": 5.771397796252485e-06, "loss": 1.12842283, "memory(GiB)": 141.16, "step": 85500, "train_speed(iter/s)": 0.290872 }, { "acc": 0.72190971, "epoch": 0.95653669291867, "grad_norm": 7.21875, "learning_rate": 5.769570465384926e-06, "loss": 1.10942421, "memory(GiB)": 141.16, "step": 85520, "train_speed(iter/s)": 0.290896 }, { "acc": 0.73469529, "epoch": 0.9567603918646286, "grad_norm": 7.75, "learning_rate": 5.767743029229128e-06, "loss": 1.07870064, "memory(GiB)": 141.16, "step": 85540, "train_speed(iter/s)": 0.290915 }, { "acc": 0.73686562, "epoch": 0.9569840908105871, "grad_norm": 7.40625, "learning_rate": 5.76591548803511e-06, "loss": 1.0527523, "memory(GiB)": 141.16, "step": 85560, "train_speed(iter/s)": 0.290938 }, { "acc": 0.74431677, "epoch": 0.9572077897565456, "grad_norm": 7.34375, "learning_rate": 5.764087842052906e-06, "loss": 1.00929422, "memory(GiB)": 141.16, "step": 85580, "train_speed(iter/s)": 0.290958 }, { "acc": 0.73333921, "epoch": 0.9574314887025042, "grad_norm": 6.09375, "learning_rate": 5.762260091532564e-06, "loss": 1.06826267, "memory(GiB)": 141.16, "step": 85600, "train_speed(iter/s)": 0.29098 }, { "acc": 0.71723657, "epoch": 0.9576551876484627, "grad_norm": 9.625, "learning_rate": 5.760432236724146e-06, "loss": 1.14506931, "memory(GiB)": 141.16, "step": 85620, "train_speed(iter/s)": 0.291005 }, { "acc": 0.71619987, "epoch": 0.9578788865944212, "grad_norm": 6.40625, "learning_rate": 5.75860427787773e-06, "loss": 1.13578672, "memory(GiB)": 141.16, "step": 85640, "train_speed(iter/s)": 0.291026 }, { "acc": 0.73877039, "epoch": 0.9581025855403797, "grad_norm": 8.75, "learning_rate": 5.756776215243404e-06, "loss": 1.02629862, "memory(GiB)": 141.16, "step": 85660, "train_speed(iter/s)": 0.291051 }, { "acc": 0.72284951, "epoch": 0.9583262844863383, "grad_norm": 6.71875, "learning_rate": 5.754948049071276e-06, "loss": 1.12280674, "memory(GiB)": 141.16, "step": 85680, "train_speed(iter/s)": 0.291075 }, { "acc": 0.74793048, "epoch": 0.9585499834322968, "grad_norm": 7.0, "learning_rate": 5.7531197796114645e-06, "loss": 1.02096405, "memory(GiB)": 141.16, "step": 85700, "train_speed(iter/s)": 0.291098 }, { "acc": 0.724508, "epoch": 0.9587736823782553, "grad_norm": 8.4375, "learning_rate": 5.7512914071141014e-06, "loss": 1.10815287, "memory(GiB)": 141.16, "step": 85720, "train_speed(iter/s)": 0.291121 }, { "acc": 0.73819833, "epoch": 0.9589973813242139, "grad_norm": 5.53125, "learning_rate": 5.749462931829336e-06, "loss": 1.04820881, "memory(GiB)": 141.16, "step": 85740, "train_speed(iter/s)": 0.291142 }, { "acc": 0.73011856, "epoch": 0.9592210802701724, "grad_norm": 5.9375, "learning_rate": 5.74763435400733e-06, "loss": 1.097925, "memory(GiB)": 141.16, "step": 85760, "train_speed(iter/s)": 0.291163 }, { "acc": 0.74461679, "epoch": 0.9594447792161309, "grad_norm": 7.71875, "learning_rate": 5.745805673898257e-06, "loss": 1.00758152, "memory(GiB)": 141.16, "step": 85780, "train_speed(iter/s)": 0.291185 }, { "acc": 0.71500325, "epoch": 0.9596684781620894, "grad_norm": 6.40625, "learning_rate": 5.743976891752309e-06, "loss": 1.16768131, "memory(GiB)": 141.16, "step": 85800, "train_speed(iter/s)": 0.291207 }, { "acc": 0.7309885, "epoch": 0.959892177108048, "grad_norm": 6.59375, "learning_rate": 5.742148007819688e-06, "loss": 1.06728287, "memory(GiB)": 141.16, "step": 85820, "train_speed(iter/s)": 0.29123 }, { "acc": 0.72566252, "epoch": 0.9601158760540065, "grad_norm": 7.03125, "learning_rate": 5.740319022350611e-06, "loss": 1.10042009, "memory(GiB)": 141.16, "step": 85840, "train_speed(iter/s)": 0.291252 }, { "acc": 0.73041945, "epoch": 0.960339574999965, "grad_norm": 5.6875, "learning_rate": 5.738489935595311e-06, "loss": 1.08756371, "memory(GiB)": 141.16, "step": 85860, "train_speed(iter/s)": 0.291275 }, { "acc": 0.72158079, "epoch": 0.9605632739459236, "grad_norm": 5.75, "learning_rate": 5.7366607478040304e-06, "loss": 1.13937702, "memory(GiB)": 141.16, "step": 85880, "train_speed(iter/s)": 0.291298 }, { "acc": 0.74089375, "epoch": 0.9607869728918821, "grad_norm": 7.15625, "learning_rate": 5.734831459227032e-06, "loss": 1.0296876, "memory(GiB)": 141.16, "step": 85900, "train_speed(iter/s)": 0.291318 }, { "acc": 0.72895479, "epoch": 0.9610106718378406, "grad_norm": 6.90625, "learning_rate": 5.7330020701145876e-06, "loss": 1.09721222, "memory(GiB)": 141.16, "step": 85920, "train_speed(iter/s)": 0.29134 }, { "acc": 0.73740501, "epoch": 0.9612343707837991, "grad_norm": 5.75, "learning_rate": 5.7311725807169815e-06, "loss": 1.05264206, "memory(GiB)": 141.16, "step": 85940, "train_speed(iter/s)": 0.291361 }, { "acc": 0.73444042, "epoch": 0.9614580697297577, "grad_norm": 7.5, "learning_rate": 5.729342991284516e-06, "loss": 1.0639183, "memory(GiB)": 141.16, "step": 85960, "train_speed(iter/s)": 0.291383 }, { "acc": 0.74640903, "epoch": 0.9616817686757162, "grad_norm": 7.5, "learning_rate": 5.727513302067504e-06, "loss": 1.00760899, "memory(GiB)": 141.16, "step": 85980, "train_speed(iter/s)": 0.291407 }, { "acc": 0.72826924, "epoch": 0.9619054676216747, "grad_norm": 7.1875, "learning_rate": 5.725683513316276e-06, "loss": 1.07828674, "memory(GiB)": 141.16, "step": 86000, "train_speed(iter/s)": 0.291427 }, { "epoch": 0.9619054676216747, "eval_acc": 0.6900620211076446, "eval_loss": 1.0791908502578735, "eval_runtime": 2320.5148, "eval_samples_per_second": 32.442, "eval_steps_per_second": 16.221, "step": 86000 }, { "acc": 0.74067287, "epoch": 0.9621291665676333, "grad_norm": 5.96875, "learning_rate": 5.7238536252811685e-06, "loss": 1.03051367, "memory(GiB)": 141.16, "step": 86020, "train_speed(iter/s)": 0.289127 }, { "acc": 0.72359409, "epoch": 0.9623528655135918, "grad_norm": 7.53125, "learning_rate": 5.722023638212539e-06, "loss": 1.11157169, "memory(GiB)": 141.16, "step": 86040, "train_speed(iter/s)": 0.289146 }, { "acc": 0.73700962, "epoch": 0.9625765644595503, "grad_norm": 8.875, "learning_rate": 5.720193552360757e-06, "loss": 1.07126865, "memory(GiB)": 141.16, "step": 86060, "train_speed(iter/s)": 0.28917 }, { "acc": 0.72512379, "epoch": 0.9628002634055088, "grad_norm": 5.90625, "learning_rate": 5.718363367976202e-06, "loss": 1.10981178, "memory(GiB)": 141.16, "step": 86080, "train_speed(iter/s)": 0.289192 }, { "acc": 0.72796993, "epoch": 0.9630239623514674, "grad_norm": 7.21875, "learning_rate": 5.716533085309272e-06, "loss": 1.09589539, "memory(GiB)": 141.16, "step": 86100, "train_speed(iter/s)": 0.289213 }, { "acc": 0.73022642, "epoch": 0.9632476612974259, "grad_norm": 7.9375, "learning_rate": 5.714702704610373e-06, "loss": 1.07477207, "memory(GiB)": 141.16, "step": 86120, "train_speed(iter/s)": 0.289236 }, { "acc": 0.72356362, "epoch": 0.9634713602433844, "grad_norm": 8.0, "learning_rate": 5.712872226129929e-06, "loss": 1.12130299, "memory(GiB)": 141.16, "step": 86140, "train_speed(iter/s)": 0.289261 }, { "acc": 0.72361164, "epoch": 0.963695059189343, "grad_norm": 8.1875, "learning_rate": 5.711041650118374e-06, "loss": 1.10969601, "memory(GiB)": 141.16, "step": 86160, "train_speed(iter/s)": 0.289284 }, { "acc": 0.73742218, "epoch": 0.9639187581353015, "grad_norm": 6.9375, "learning_rate": 5.70921097682616e-06, "loss": 1.05112925, "memory(GiB)": 141.16, "step": 86180, "train_speed(iter/s)": 0.28931 }, { "acc": 0.73662319, "epoch": 0.96414245708126, "grad_norm": 7.65625, "learning_rate": 5.707380206503745e-06, "loss": 1.05692005, "memory(GiB)": 141.16, "step": 86200, "train_speed(iter/s)": 0.289333 }, { "acc": 0.72148266, "epoch": 0.9643661560272185, "grad_norm": 7.96875, "learning_rate": 5.705549339401609e-06, "loss": 1.11927338, "memory(GiB)": 141.16, "step": 86220, "train_speed(iter/s)": 0.289354 }, { "acc": 0.72779479, "epoch": 0.9645898549731771, "grad_norm": 7.21875, "learning_rate": 5.703718375770239e-06, "loss": 1.07690868, "memory(GiB)": 141.16, "step": 86240, "train_speed(iter/s)": 0.289376 }, { "acc": 0.73238721, "epoch": 0.9648135539191356, "grad_norm": 7.09375, "learning_rate": 5.701887315860135e-06, "loss": 1.07176018, "memory(GiB)": 141.16, "step": 86260, "train_speed(iter/s)": 0.289397 }, { "acc": 0.73211966, "epoch": 0.9650372528650941, "grad_norm": 8.125, "learning_rate": 5.7000561599218155e-06, "loss": 1.0705822, "memory(GiB)": 141.16, "step": 86280, "train_speed(iter/s)": 0.289418 }, { "acc": 0.74776354, "epoch": 0.9652609518110526, "grad_norm": 6.0, "learning_rate": 5.698224908205805e-06, "loss": 1.0123436, "memory(GiB)": 141.16, "step": 86300, "train_speed(iter/s)": 0.289441 }, { "acc": 0.73293123, "epoch": 0.9654846507570112, "grad_norm": 7.09375, "learning_rate": 5.69639356096265e-06, "loss": 1.07640553, "memory(GiB)": 141.16, "step": 86320, "train_speed(iter/s)": 0.289462 }, { "acc": 0.73198814, "epoch": 0.9657083497029697, "grad_norm": 8.0, "learning_rate": 5.6945621184429005e-06, "loss": 1.08227558, "memory(GiB)": 141.16, "step": 86340, "train_speed(iter/s)": 0.289484 }, { "acc": 0.72805805, "epoch": 0.9659320486489282, "grad_norm": 8.1875, "learning_rate": 5.692730580897126e-06, "loss": 1.10806942, "memory(GiB)": 141.16, "step": 86360, "train_speed(iter/s)": 0.289507 }, { "acc": 0.74945269, "epoch": 0.9661557475948868, "grad_norm": 7.6875, "learning_rate": 5.690898948575906e-06, "loss": 1.00120907, "memory(GiB)": 141.16, "step": 86380, "train_speed(iter/s)": 0.28953 }, { "acc": 0.72663808, "epoch": 0.9663794465408453, "grad_norm": 6.84375, "learning_rate": 5.689067221729835e-06, "loss": 1.1027153, "memory(GiB)": 141.16, "step": 86400, "train_speed(iter/s)": 0.289552 }, { "acc": 0.7400485, "epoch": 0.9666031454868038, "grad_norm": 6.9375, "learning_rate": 5.68723540060952e-06, "loss": 1.03621063, "memory(GiB)": 141.16, "step": 86420, "train_speed(iter/s)": 0.289574 }, { "acc": 0.72473049, "epoch": 0.9668268444327623, "grad_norm": 7.8125, "learning_rate": 5.685403485465578e-06, "loss": 1.08602562, "memory(GiB)": 141.16, "step": 86440, "train_speed(iter/s)": 0.289597 }, { "acc": 0.73089972, "epoch": 0.9670505433787209, "grad_norm": 6.5625, "learning_rate": 5.683571476548643e-06, "loss": 1.08085947, "memory(GiB)": 141.16, "step": 86460, "train_speed(iter/s)": 0.289617 }, { "acc": 0.72445083, "epoch": 0.9672742423246794, "grad_norm": 5.625, "learning_rate": 5.681739374109359e-06, "loss": 1.11585846, "memory(GiB)": 141.16, "step": 86480, "train_speed(iter/s)": 0.289641 }, { "acc": 0.73798542, "epoch": 0.9674979412706379, "grad_norm": 8.9375, "learning_rate": 5.679907178398385e-06, "loss": 1.05637951, "memory(GiB)": 141.16, "step": 86500, "train_speed(iter/s)": 0.28966 }, { "acc": 0.71846862, "epoch": 0.9677216402165965, "grad_norm": 5.875, "learning_rate": 5.67807488966639e-06, "loss": 1.13871403, "memory(GiB)": 141.16, "step": 86520, "train_speed(iter/s)": 0.289682 }, { "acc": 0.72946606, "epoch": 0.967945339162555, "grad_norm": 6.375, "learning_rate": 5.67624250816406e-06, "loss": 1.07804174, "memory(GiB)": 141.16, "step": 86540, "train_speed(iter/s)": 0.289703 }, { "acc": 0.72728858, "epoch": 0.9681690381085135, "grad_norm": 7.5, "learning_rate": 5.674410034142087e-06, "loss": 1.10287914, "memory(GiB)": 141.16, "step": 86560, "train_speed(iter/s)": 0.289726 }, { "acc": 0.74629049, "epoch": 0.968392737054472, "grad_norm": 7.6875, "learning_rate": 5.672577467851184e-06, "loss": 1.02062092, "memory(GiB)": 141.16, "step": 86580, "train_speed(iter/s)": 0.289748 }, { "acc": 0.72809, "epoch": 0.9686164360004306, "grad_norm": 7.6875, "learning_rate": 5.670744809542068e-06, "loss": 1.08788834, "memory(GiB)": 141.16, "step": 86600, "train_speed(iter/s)": 0.289772 }, { "acc": 0.72131634, "epoch": 0.9688401349463891, "grad_norm": 7.125, "learning_rate": 5.668912059465477e-06, "loss": 1.12219667, "memory(GiB)": 141.16, "step": 86620, "train_speed(iter/s)": 0.289791 }, { "acc": 0.72352266, "epoch": 0.9690638338923476, "grad_norm": 7.5625, "learning_rate": 5.667079217872153e-06, "loss": 1.10993938, "memory(GiB)": 141.16, "step": 86640, "train_speed(iter/s)": 0.289812 }, { "acc": 0.73503304, "epoch": 0.9692875328383062, "grad_norm": 7.1875, "learning_rate": 5.665246285012858e-06, "loss": 1.06349983, "memory(GiB)": 141.16, "step": 86660, "train_speed(iter/s)": 0.289833 }, { "acc": 0.73817816, "epoch": 0.9695112317842647, "grad_norm": 7.65625, "learning_rate": 5.663413261138364e-06, "loss": 1.03568087, "memory(GiB)": 141.16, "step": 86680, "train_speed(iter/s)": 0.289856 }, { "acc": 0.73232222, "epoch": 0.9697349307302232, "grad_norm": 7.3125, "learning_rate": 5.661580146499452e-06, "loss": 1.06468477, "memory(GiB)": 141.16, "step": 86700, "train_speed(iter/s)": 0.289879 }, { "acc": 0.71539798, "epoch": 0.9699586296761818, "grad_norm": 6.25, "learning_rate": 5.659746941346919e-06, "loss": 1.14806938, "memory(GiB)": 141.16, "step": 86720, "train_speed(iter/s)": 0.289903 }, { "acc": 0.72982793, "epoch": 0.9701823286221404, "grad_norm": 7.40625, "learning_rate": 5.657913645931578e-06, "loss": 1.0800581, "memory(GiB)": 141.16, "step": 86740, "train_speed(iter/s)": 0.289929 }, { "acc": 0.71945944, "epoch": 0.9704060275680989, "grad_norm": 8.1875, "learning_rate": 5.6560802605042445e-06, "loss": 1.14369745, "memory(GiB)": 141.16, "step": 86760, "train_speed(iter/s)": 0.289951 }, { "acc": 0.72461195, "epoch": 0.9706297265140574, "grad_norm": 7.8125, "learning_rate": 5.6542467853157525e-06, "loss": 1.09397278, "memory(GiB)": 141.16, "step": 86780, "train_speed(iter/s)": 0.289971 }, { "acc": 0.72688289, "epoch": 0.970853425460016, "grad_norm": 7.0, "learning_rate": 5.65241322061695e-06, "loss": 1.09586134, "memory(GiB)": 141.16, "step": 86800, "train_speed(iter/s)": 0.289994 }, { "acc": 0.7324152, "epoch": 0.9710771244059745, "grad_norm": 8.375, "learning_rate": 5.650579566658694e-06, "loss": 1.07215605, "memory(GiB)": 141.16, "step": 86820, "train_speed(iter/s)": 0.290015 }, { "acc": 0.73029957, "epoch": 0.971300823351933, "grad_norm": 8.125, "learning_rate": 5.6487458236918545e-06, "loss": 1.09231253, "memory(GiB)": 141.16, "step": 86840, "train_speed(iter/s)": 0.290039 }, { "acc": 0.7255394, "epoch": 0.9715245222978915, "grad_norm": 6.21875, "learning_rate": 5.646911991967313e-06, "loss": 1.09845562, "memory(GiB)": 141.16, "step": 86860, "train_speed(iter/s)": 0.290064 }, { "acc": 0.72352605, "epoch": 0.9717482212438501, "grad_norm": 4.8125, "learning_rate": 5.645078071735964e-06, "loss": 1.12096891, "memory(GiB)": 141.16, "step": 86880, "train_speed(iter/s)": 0.290086 }, { "acc": 0.73682432, "epoch": 0.9719719201898086, "grad_norm": 6.75, "learning_rate": 5.643244063248715e-06, "loss": 1.06085806, "memory(GiB)": 141.16, "step": 86900, "train_speed(iter/s)": 0.290107 }, { "acc": 0.71792383, "epoch": 0.9721956191357671, "grad_norm": 8.0625, "learning_rate": 5.641409966756483e-06, "loss": 1.15372143, "memory(GiB)": 141.16, "step": 86920, "train_speed(iter/s)": 0.290129 }, { "acc": 0.72462544, "epoch": 0.9724193180817257, "grad_norm": 6.71875, "learning_rate": 5.6395757825102025e-06, "loss": 1.10127163, "memory(GiB)": 141.16, "step": 86940, "train_speed(iter/s)": 0.290152 }, { "acc": 0.74021769, "epoch": 0.9726430170276842, "grad_norm": 7.59375, "learning_rate": 5.637741510760812e-06, "loss": 1.048034, "memory(GiB)": 141.16, "step": 86960, "train_speed(iter/s)": 0.290174 }, { "acc": 0.72453175, "epoch": 0.9728667159736427, "grad_norm": 6.9375, "learning_rate": 5.635907151759267e-06, "loss": 1.12626343, "memory(GiB)": 141.16, "step": 86980, "train_speed(iter/s)": 0.290195 }, { "acc": 0.73011932, "epoch": 0.9730904149196012, "grad_norm": 7.21875, "learning_rate": 5.634072705756535e-06, "loss": 1.08480301, "memory(GiB)": 141.16, "step": 87000, "train_speed(iter/s)": 0.290215 }, { "acc": 0.72952485, "epoch": 0.9733141138655598, "grad_norm": 6.90625, "learning_rate": 5.632238173003593e-06, "loss": 1.08839808, "memory(GiB)": 141.16, "step": 87020, "train_speed(iter/s)": 0.290237 }, { "acc": 0.74740639, "epoch": 0.9735378128115183, "grad_norm": 7.6875, "learning_rate": 5.630403553751433e-06, "loss": 0.99841356, "memory(GiB)": 141.16, "step": 87040, "train_speed(iter/s)": 0.290258 }, { "acc": 0.73720474, "epoch": 0.9737615117574768, "grad_norm": 6.53125, "learning_rate": 5.628568848251056e-06, "loss": 1.04637737, "memory(GiB)": 141.16, "step": 87060, "train_speed(iter/s)": 0.29028 }, { "acc": 0.72012496, "epoch": 0.9739852107034354, "grad_norm": 6.5625, "learning_rate": 5.626734056753475e-06, "loss": 1.12941704, "memory(GiB)": 141.16, "step": 87080, "train_speed(iter/s)": 0.290302 }, { "acc": 0.71790695, "epoch": 0.9742089096493939, "grad_norm": 8.0, "learning_rate": 5.624899179509719e-06, "loss": 1.13217812, "memory(GiB)": 141.16, "step": 87100, "train_speed(iter/s)": 0.290325 }, { "acc": 0.73357425, "epoch": 0.9744326085953524, "grad_norm": 6.40625, "learning_rate": 5.623064216770821e-06, "loss": 1.06458054, "memory(GiB)": 141.16, "step": 87120, "train_speed(iter/s)": 0.290348 }, { "acc": 0.7447022, "epoch": 0.9746563075413109, "grad_norm": 7.34375, "learning_rate": 5.621229168787836e-06, "loss": 1.02662582, "memory(GiB)": 141.16, "step": 87140, "train_speed(iter/s)": 0.29037 }, { "acc": 0.73205013, "epoch": 0.9748800064872695, "grad_norm": 5.875, "learning_rate": 5.61939403581182e-06, "loss": 1.06159325, "memory(GiB)": 141.16, "step": 87160, "train_speed(iter/s)": 0.29039 }, { "acc": 0.7272594, "epoch": 0.975103705433228, "grad_norm": 6.21875, "learning_rate": 5.617558818093844e-06, "loss": 1.09162521, "memory(GiB)": 141.16, "step": 87180, "train_speed(iter/s)": 0.29041 }, { "acc": 0.72144327, "epoch": 0.9753274043791865, "grad_norm": 5.40625, "learning_rate": 5.615723515884998e-06, "loss": 1.10535889, "memory(GiB)": 141.16, "step": 87200, "train_speed(iter/s)": 0.290432 }, { "acc": 0.73093958, "epoch": 0.975551103325145, "grad_norm": 5.625, "learning_rate": 5.613888129436372e-06, "loss": 1.09019985, "memory(GiB)": 141.16, "step": 87220, "train_speed(iter/s)": 0.290452 }, { "acc": 0.73313012, "epoch": 0.9757748022711036, "grad_norm": 6.84375, "learning_rate": 5.612052658999078e-06, "loss": 1.0707654, "memory(GiB)": 141.16, "step": 87240, "train_speed(iter/s)": 0.290475 }, { "acc": 0.73115497, "epoch": 0.9759985012170621, "grad_norm": 6.625, "learning_rate": 5.6102171048242294e-06, "loss": 1.08624887, "memory(GiB)": 141.16, "step": 87260, "train_speed(iter/s)": 0.290497 }, { "acc": 0.72435503, "epoch": 0.9762222001630206, "grad_norm": 6.46875, "learning_rate": 5.608381467162961e-06, "loss": 1.08670864, "memory(GiB)": 141.16, "step": 87280, "train_speed(iter/s)": 0.290518 }, { "acc": 0.72992349, "epoch": 0.9764458991089792, "grad_norm": 8.0625, "learning_rate": 5.606545746266411e-06, "loss": 1.09599934, "memory(GiB)": 141.16, "step": 87300, "train_speed(iter/s)": 0.29054 }, { "acc": 0.7331706, "epoch": 0.9766695980549377, "grad_norm": 9.0, "learning_rate": 5.6047099423857335e-06, "loss": 1.06709251, "memory(GiB)": 141.16, "step": 87320, "train_speed(iter/s)": 0.290563 }, { "acc": 0.7302247, "epoch": 0.9768932970008962, "grad_norm": 7.9375, "learning_rate": 5.6028740557720915e-06, "loss": 1.08215446, "memory(GiB)": 141.16, "step": 87340, "train_speed(iter/s)": 0.290586 }, { "acc": 0.74098287, "epoch": 0.9771169959468547, "grad_norm": 7.15625, "learning_rate": 5.601038086676663e-06, "loss": 1.02923183, "memory(GiB)": 141.16, "step": 87360, "train_speed(iter/s)": 0.290611 }, { "acc": 0.73168745, "epoch": 0.9773406948928133, "grad_norm": 5.0, "learning_rate": 5.599202035350634e-06, "loss": 1.07858744, "memory(GiB)": 141.16, "step": 87380, "train_speed(iter/s)": 0.290633 }, { "acc": 0.71967869, "epoch": 0.9775643938387718, "grad_norm": 7.59375, "learning_rate": 5.5973659020451995e-06, "loss": 1.12358246, "memory(GiB)": 141.16, "step": 87400, "train_speed(iter/s)": 0.290656 }, { "acc": 0.73246708, "epoch": 0.9777880927847303, "grad_norm": 6.875, "learning_rate": 5.595529687011574e-06, "loss": 1.0779213, "memory(GiB)": 141.16, "step": 87420, "train_speed(iter/s)": 0.290681 }, { "acc": 0.72480569, "epoch": 0.9780117917306889, "grad_norm": 8.375, "learning_rate": 5.593693390500973e-06, "loss": 1.10572224, "memory(GiB)": 141.16, "step": 87440, "train_speed(iter/s)": 0.290707 }, { "acc": 0.74338188, "epoch": 0.9782354906766474, "grad_norm": 8.875, "learning_rate": 5.591857012764632e-06, "loss": 1.02296171, "memory(GiB)": 141.16, "step": 87460, "train_speed(iter/s)": 0.290734 }, { "acc": 0.73203249, "epoch": 0.9784591896226059, "grad_norm": 7.09375, "learning_rate": 5.590020554053792e-06, "loss": 1.05395937, "memory(GiB)": 141.16, "step": 87480, "train_speed(iter/s)": 0.290758 }, { "acc": 0.71567311, "epoch": 0.9786828885685644, "grad_norm": 5.1875, "learning_rate": 5.588184014619705e-06, "loss": 1.1456872, "memory(GiB)": 141.16, "step": 87500, "train_speed(iter/s)": 0.290781 }, { "acc": 0.72639475, "epoch": 0.978906587514523, "grad_norm": 6.25, "learning_rate": 5.58634739471364e-06, "loss": 1.08396206, "memory(GiB)": 141.16, "step": 87520, "train_speed(iter/s)": 0.290804 }, { "acc": 0.72538948, "epoch": 0.9791302864604815, "grad_norm": 6.03125, "learning_rate": 5.584510694586869e-06, "loss": 1.10825291, "memory(GiB)": 141.16, "step": 87540, "train_speed(iter/s)": 0.290826 }, { "acc": 0.72648687, "epoch": 0.97935398540644, "grad_norm": 5.75, "learning_rate": 5.582673914490682e-06, "loss": 1.10372858, "memory(GiB)": 141.16, "step": 87560, "train_speed(iter/s)": 0.290847 }, { "acc": 0.72464008, "epoch": 0.9795776843523986, "grad_norm": 7.0625, "learning_rate": 5.5808370546763735e-06, "loss": 1.10465031, "memory(GiB)": 141.16, "step": 87580, "train_speed(iter/s)": 0.29087 }, { "acc": 0.72473202, "epoch": 0.9798013832983571, "grad_norm": 5.28125, "learning_rate": 5.579000115395254e-06, "loss": 1.12861137, "memory(GiB)": 141.16, "step": 87600, "train_speed(iter/s)": 0.290891 }, { "acc": 0.74015627, "epoch": 0.9800250822443156, "grad_norm": 7.6875, "learning_rate": 5.577163096898643e-06, "loss": 1.03341236, "memory(GiB)": 141.16, "step": 87620, "train_speed(iter/s)": 0.290915 }, { "acc": 0.73267159, "epoch": 0.9802487811902741, "grad_norm": 8.5, "learning_rate": 5.575325999437872e-06, "loss": 1.07347946, "memory(GiB)": 141.16, "step": 87640, "train_speed(iter/s)": 0.290934 }, { "acc": 0.73477221, "epoch": 0.9804724801362327, "grad_norm": 7.25, "learning_rate": 5.57348882326428e-06, "loss": 1.06029644, "memory(GiB)": 141.16, "step": 87660, "train_speed(iter/s)": 0.290956 }, { "acc": 0.72346687, "epoch": 0.9806961790821912, "grad_norm": 6.9375, "learning_rate": 5.57165156862922e-06, "loss": 1.10821762, "memory(GiB)": 141.16, "step": 87680, "train_speed(iter/s)": 0.290978 }, { "acc": 0.72886324, "epoch": 0.9809198780281497, "grad_norm": 9.4375, "learning_rate": 5.569814235784056e-06, "loss": 1.084972, "memory(GiB)": 141.16, "step": 87700, "train_speed(iter/s)": 0.291003 }, { "acc": 0.73518133, "epoch": 0.9811435769741083, "grad_norm": 7.28125, "learning_rate": 5.567976824980158e-06, "loss": 1.04345989, "memory(GiB)": 141.16, "step": 87720, "train_speed(iter/s)": 0.291023 }, { "acc": 0.72766962, "epoch": 0.9813672759200668, "grad_norm": 7.625, "learning_rate": 5.566139336468912e-06, "loss": 1.09375916, "memory(GiB)": 141.16, "step": 87740, "train_speed(iter/s)": 0.291046 }, { "acc": 0.73203707, "epoch": 0.9815909748660253, "grad_norm": 6.5, "learning_rate": 5.564301770501714e-06, "loss": 1.07493553, "memory(GiB)": 141.16, "step": 87760, "train_speed(iter/s)": 0.291066 }, { "acc": 0.72872057, "epoch": 0.9818146738119838, "grad_norm": 6.65625, "learning_rate": 5.562464127329968e-06, "loss": 1.09029808, "memory(GiB)": 141.16, "step": 87780, "train_speed(iter/s)": 0.291088 }, { "acc": 0.72506018, "epoch": 0.9820383727579424, "grad_norm": 7.625, "learning_rate": 5.56062640720509e-06, "loss": 1.1164814, "memory(GiB)": 141.16, "step": 87800, "train_speed(iter/s)": 0.291107 }, { "acc": 0.74193602, "epoch": 0.9822620717039009, "grad_norm": 7.96875, "learning_rate": 5.558788610378505e-06, "loss": 1.04898071, "memory(GiB)": 141.16, "step": 87820, "train_speed(iter/s)": 0.291127 }, { "acc": 0.72873969, "epoch": 0.9824857706498594, "grad_norm": 6.0, "learning_rate": 5.556950737101651e-06, "loss": 1.08005075, "memory(GiB)": 141.16, "step": 87840, "train_speed(iter/s)": 0.291147 }, { "acc": 0.72927256, "epoch": 0.982709469595818, "grad_norm": 6.21875, "learning_rate": 5.555112787625977e-06, "loss": 1.09904242, "memory(GiB)": 141.16, "step": 87860, "train_speed(iter/s)": 0.291169 }, { "acc": 0.74028893, "epoch": 0.9829331685417765, "grad_norm": 7.09375, "learning_rate": 5.55327476220294e-06, "loss": 1.04062672, "memory(GiB)": 141.16, "step": 87880, "train_speed(iter/s)": 0.291191 }, { "acc": 0.73432045, "epoch": 0.983156867487735, "grad_norm": 8.4375, "learning_rate": 5.551436661084008e-06, "loss": 1.06830311, "memory(GiB)": 141.16, "step": 87900, "train_speed(iter/s)": 0.291211 }, { "acc": 0.71190186, "epoch": 0.9833805664336935, "grad_norm": 8.0, "learning_rate": 5.549598484520656e-06, "loss": 1.17053404, "memory(GiB)": 141.16, "step": 87920, "train_speed(iter/s)": 0.291233 }, { "acc": 0.73938351, "epoch": 0.9836042653796521, "grad_norm": 6.625, "learning_rate": 5.547760232764376e-06, "loss": 1.04262581, "memory(GiB)": 141.16, "step": 87940, "train_speed(iter/s)": 0.291255 }, { "acc": 0.72526684, "epoch": 0.9838279643256106, "grad_norm": 6.8125, "learning_rate": 5.545921906066668e-06, "loss": 1.10069599, "memory(GiB)": 141.16, "step": 87960, "train_speed(iter/s)": 0.291277 }, { "acc": 0.72887745, "epoch": 0.9840516632715691, "grad_norm": 7.59375, "learning_rate": 5.5440835046790395e-06, "loss": 1.09085655, "memory(GiB)": 141.16, "step": 87980, "train_speed(iter/s)": 0.291297 }, { "acc": 0.72743506, "epoch": 0.9842753622175276, "grad_norm": 7.8125, "learning_rate": 5.5422450288530125e-06, "loss": 1.10966167, "memory(GiB)": 141.16, "step": 88000, "train_speed(iter/s)": 0.291316 }, { "epoch": 0.9842753622175276, "eval_acc": 0.6901071768436653, "eval_loss": 1.07905113697052, "eval_runtime": 2323.6408, "eval_samples_per_second": 32.399, "eval_steps_per_second": 16.2, "step": 88000 }, { "acc": 0.73974943, "epoch": 0.9844990611634862, "grad_norm": 6.4375, "learning_rate": 5.540406478840114e-06, "loss": 1.02975302, "memory(GiB)": 141.16, "step": 88020, "train_speed(iter/s)": 0.289067 }, { "acc": 0.72777891, "epoch": 0.9847227601094447, "grad_norm": 7.71875, "learning_rate": 5.5385678548918845e-06, "loss": 1.08913088, "memory(GiB)": 141.16, "step": 88040, "train_speed(iter/s)": 0.289086 }, { "acc": 0.72092781, "epoch": 0.9849464590554032, "grad_norm": 6.875, "learning_rate": 5.5367291572598744e-06, "loss": 1.12372599, "memory(GiB)": 141.16, "step": 88060, "train_speed(iter/s)": 0.289105 }, { "acc": 0.72838521, "epoch": 0.9851701580013618, "grad_norm": 6.125, "learning_rate": 5.534890386195645e-06, "loss": 1.08749237, "memory(GiB)": 141.16, "step": 88080, "train_speed(iter/s)": 0.289128 }, { "acc": 0.73413043, "epoch": 0.9853938569473203, "grad_norm": 7.65625, "learning_rate": 5.5330515419507656e-06, "loss": 1.05892315, "memory(GiB)": 141.16, "step": 88100, "train_speed(iter/s)": 0.289149 }, { "acc": 0.73425455, "epoch": 0.9856175558932788, "grad_norm": 6.4375, "learning_rate": 5.531212624776815e-06, "loss": 1.04607048, "memory(GiB)": 141.16, "step": 88120, "train_speed(iter/s)": 0.289174 }, { "acc": 0.73512669, "epoch": 0.9858412548392373, "grad_norm": 6.96875, "learning_rate": 5.529373634925385e-06, "loss": 1.05910082, "memory(GiB)": 141.16, "step": 88140, "train_speed(iter/s)": 0.289197 }, { "acc": 0.72712631, "epoch": 0.9860649537851959, "grad_norm": 6.4375, "learning_rate": 5.5275345726480756e-06, "loss": 1.08752499, "memory(GiB)": 141.16, "step": 88160, "train_speed(iter/s)": 0.289219 }, { "acc": 0.7368535, "epoch": 0.9862886527311544, "grad_norm": 8.125, "learning_rate": 5.525695438196496e-06, "loss": 1.04978666, "memory(GiB)": 141.16, "step": 88180, "train_speed(iter/s)": 0.289243 }, { "acc": 0.73564415, "epoch": 0.9865123516771129, "grad_norm": 7.34375, "learning_rate": 5.5238562318222665e-06, "loss": 1.04986763, "memory(GiB)": 141.16, "step": 88200, "train_speed(iter/s)": 0.289265 }, { "acc": 0.75305204, "epoch": 0.9867360506230715, "grad_norm": 6.1875, "learning_rate": 5.522016953777017e-06, "loss": 0.97314053, "memory(GiB)": 141.16, "step": 88220, "train_speed(iter/s)": 0.289288 }, { "acc": 0.72773209, "epoch": 0.98695974956903, "grad_norm": 7.0, "learning_rate": 5.520177604312386e-06, "loss": 1.08991089, "memory(GiB)": 141.16, "step": 88240, "train_speed(iter/s)": 0.289311 }, { "acc": 0.72858067, "epoch": 0.9871834485149885, "grad_norm": 7.875, "learning_rate": 5.5183381836800255e-06, "loss": 1.09092102, "memory(GiB)": 141.16, "step": 88260, "train_speed(iter/s)": 0.289333 }, { "acc": 0.73483019, "epoch": 0.987407147460947, "grad_norm": 6.40625, "learning_rate": 5.516498692131592e-06, "loss": 1.05752583, "memory(GiB)": 141.16, "step": 88280, "train_speed(iter/s)": 0.289354 }, { "acc": 0.74451475, "epoch": 0.9876308464069056, "grad_norm": 9.3125, "learning_rate": 5.514659129918756e-06, "loss": 1.02222147, "memory(GiB)": 141.16, "step": 88300, "train_speed(iter/s)": 0.289377 }, { "acc": 0.71588597, "epoch": 0.9878545453528641, "grad_norm": 7.09375, "learning_rate": 5.512819497293193e-06, "loss": 1.15507183, "memory(GiB)": 141.16, "step": 88320, "train_speed(iter/s)": 0.289398 }, { "acc": 0.72861271, "epoch": 0.9880782442988226, "grad_norm": 6.21875, "learning_rate": 5.510979794506593e-06, "loss": 1.08482437, "memory(GiB)": 141.16, "step": 88340, "train_speed(iter/s)": 0.289421 }, { "acc": 0.7351717, "epoch": 0.9883019432447812, "grad_norm": 7.90625, "learning_rate": 5.509140021810654e-06, "loss": 1.06292362, "memory(GiB)": 141.16, "step": 88360, "train_speed(iter/s)": 0.289443 }, { "acc": 0.75407834, "epoch": 0.9885256421907397, "grad_norm": 7.9375, "learning_rate": 5.507300179457082e-06, "loss": 0.97446089, "memory(GiB)": 141.16, "step": 88380, "train_speed(iter/s)": 0.289467 }, { "acc": 0.72807746, "epoch": 0.9887493411366982, "grad_norm": 6.84375, "learning_rate": 5.505460267697597e-06, "loss": 1.08493996, "memory(GiB)": 141.16, "step": 88400, "train_speed(iter/s)": 0.289489 }, { "acc": 0.74955106, "epoch": 0.9889730400826567, "grad_norm": 7.78125, "learning_rate": 5.503620286783921e-06, "loss": 1.0046711, "memory(GiB)": 141.16, "step": 88420, "train_speed(iter/s)": 0.28951 }, { "acc": 0.72255459, "epoch": 0.9891967390286153, "grad_norm": 7.0, "learning_rate": 5.5017802369677905e-06, "loss": 1.13620968, "memory(GiB)": 141.16, "step": 88440, "train_speed(iter/s)": 0.289529 }, { "acc": 0.72229624, "epoch": 0.9894204379745738, "grad_norm": 5.75, "learning_rate": 5.499940118500953e-06, "loss": 1.12679968, "memory(GiB)": 141.16, "step": 88460, "train_speed(iter/s)": 0.289548 }, { "acc": 0.73481331, "epoch": 0.9896441369205323, "grad_norm": 8.9375, "learning_rate": 5.49809993163516e-06, "loss": 1.05870476, "memory(GiB)": 141.16, "step": 88480, "train_speed(iter/s)": 0.28957 }, { "acc": 0.74065509, "epoch": 0.9898678358664909, "grad_norm": 7.25, "learning_rate": 5.496259676622178e-06, "loss": 1.04523335, "memory(GiB)": 141.16, "step": 88500, "train_speed(iter/s)": 0.28959 }, { "acc": 0.71930399, "epoch": 0.9900915348124494, "grad_norm": 6.4375, "learning_rate": 5.49441935371378e-06, "loss": 1.11391373, "memory(GiB)": 141.16, "step": 88520, "train_speed(iter/s)": 0.289613 }, { "acc": 0.73472548, "epoch": 0.9903152337584079, "grad_norm": 5.3125, "learning_rate": 5.492578963161746e-06, "loss": 1.06514149, "memory(GiB)": 141.16, "step": 88540, "train_speed(iter/s)": 0.289633 }, { "acc": 0.72954984, "epoch": 0.9905389327043664, "grad_norm": 6.8125, "learning_rate": 5.490738505217869e-06, "loss": 1.09297161, "memory(GiB)": 141.16, "step": 88560, "train_speed(iter/s)": 0.289654 }, { "acc": 0.73922787, "epoch": 0.990762631650325, "grad_norm": 8.3125, "learning_rate": 5.488897980133951e-06, "loss": 1.04961519, "memory(GiB)": 141.16, "step": 88580, "train_speed(iter/s)": 0.289675 }, { "acc": 0.73075175, "epoch": 0.9909863305962835, "grad_norm": 8.375, "learning_rate": 5.487057388161801e-06, "loss": 1.07539234, "memory(GiB)": 141.16, "step": 88600, "train_speed(iter/s)": 0.289696 }, { "acc": 0.74056706, "epoch": 0.991210029542242, "grad_norm": 7.96875, "learning_rate": 5.485216729553239e-06, "loss": 1.0499342, "memory(GiB)": 141.16, "step": 88620, "train_speed(iter/s)": 0.289719 }, { "acc": 0.74403, "epoch": 0.9914337284882005, "grad_norm": 6.21875, "learning_rate": 5.4833760045600926e-06, "loss": 1.00586338, "memory(GiB)": 141.16, "step": 88640, "train_speed(iter/s)": 0.289741 }, { "acc": 0.73747501, "epoch": 0.9916574274341591, "grad_norm": 7.6875, "learning_rate": 5.481535213434199e-06, "loss": 1.05719824, "memory(GiB)": 141.16, "step": 88660, "train_speed(iter/s)": 0.28976 }, { "acc": 0.73057203, "epoch": 0.9918811263801176, "grad_norm": 9.3125, "learning_rate": 5.479694356427407e-06, "loss": 1.08878622, "memory(GiB)": 141.16, "step": 88680, "train_speed(iter/s)": 0.289782 }, { "acc": 0.73447337, "epoch": 0.9921048253260761, "grad_norm": 7.0, "learning_rate": 5.47785343379157e-06, "loss": 1.06638641, "memory(GiB)": 141.16, "step": 88700, "train_speed(iter/s)": 0.289805 }, { "acc": 0.71161327, "epoch": 0.9923285242720347, "grad_norm": 7.3125, "learning_rate": 5.476012445778554e-06, "loss": 1.17026234, "memory(GiB)": 141.16, "step": 88720, "train_speed(iter/s)": 0.289828 }, { "acc": 0.73160458, "epoch": 0.9925522232179932, "grad_norm": 9.9375, "learning_rate": 5.47417139264023e-06, "loss": 1.07401791, "memory(GiB)": 141.16, "step": 88740, "train_speed(iter/s)": 0.289851 }, { "acc": 0.74030905, "epoch": 0.9927759221639517, "grad_norm": 6.96875, "learning_rate": 5.472330274628484e-06, "loss": 1.02461834, "memory(GiB)": 141.16, "step": 88760, "train_speed(iter/s)": 0.289873 }, { "acc": 0.73740215, "epoch": 0.9929996211099102, "grad_norm": 6.0, "learning_rate": 5.470489091995203e-06, "loss": 1.0532093, "memory(GiB)": 141.16, "step": 88780, "train_speed(iter/s)": 0.289896 }, { "acc": 0.72715034, "epoch": 0.9932233200558688, "grad_norm": 5.6875, "learning_rate": 5.46864784499229e-06, "loss": 1.10328388, "memory(GiB)": 141.16, "step": 88800, "train_speed(iter/s)": 0.289916 }, { "acc": 0.72905569, "epoch": 0.9934470190018273, "grad_norm": 7.03125, "learning_rate": 5.466806533871655e-06, "loss": 1.06352625, "memory(GiB)": 141.16, "step": 88820, "train_speed(iter/s)": 0.28994 }, { "acc": 0.72107744, "epoch": 0.9936707179477858, "grad_norm": 5.96875, "learning_rate": 5.464965158885212e-06, "loss": 1.11962519, "memory(GiB)": 141.16, "step": 88840, "train_speed(iter/s)": 0.289964 }, { "acc": 0.72922163, "epoch": 0.9938944168937444, "grad_norm": 7.4375, "learning_rate": 5.463123720284889e-06, "loss": 1.08339186, "memory(GiB)": 141.16, "step": 88860, "train_speed(iter/s)": 0.289984 }, { "acc": 0.73344707, "epoch": 0.9941181158397029, "grad_norm": 7.40625, "learning_rate": 5.461282218322623e-06, "loss": 1.07804508, "memory(GiB)": 141.16, "step": 88880, "train_speed(iter/s)": 0.290004 }, { "acc": 0.72687135, "epoch": 0.9943418147856614, "grad_norm": 6.1875, "learning_rate": 5.4594406532503564e-06, "loss": 1.11108685, "memory(GiB)": 141.16, "step": 88900, "train_speed(iter/s)": 0.290028 }, { "acc": 0.73269129, "epoch": 0.9945655137316199, "grad_norm": 6.40625, "learning_rate": 5.4575990253200415e-06, "loss": 1.07830791, "memory(GiB)": 141.16, "step": 88920, "train_speed(iter/s)": 0.290049 }, { "acc": 0.7403194, "epoch": 0.9947892126775785, "grad_norm": 7.34375, "learning_rate": 5.455757334783639e-06, "loss": 1.03698292, "memory(GiB)": 141.16, "step": 88940, "train_speed(iter/s)": 0.29007 }, { "acc": 0.72862272, "epoch": 0.995012911623537, "grad_norm": 7.03125, "learning_rate": 5.453915581893119e-06, "loss": 1.08108196, "memory(GiB)": 141.16, "step": 88960, "train_speed(iter/s)": 0.290092 }, { "acc": 0.73596478, "epoch": 0.9952366105694955, "grad_norm": 7.46875, "learning_rate": 5.4520737669004585e-06, "loss": 1.06645241, "memory(GiB)": 141.16, "step": 88980, "train_speed(iter/s)": 0.290109 }, { "acc": 0.73552685, "epoch": 0.9954603095154541, "grad_norm": 7.3125, "learning_rate": 5.450231890057646e-06, "loss": 1.06418676, "memory(GiB)": 141.16, "step": 89000, "train_speed(iter/s)": 0.290134 }, { "acc": 0.73296962, "epoch": 0.9956840084614126, "grad_norm": 5.65625, "learning_rate": 5.448389951616675e-06, "loss": 1.09763889, "memory(GiB)": 141.16, "step": 89020, "train_speed(iter/s)": 0.290159 }, { "acc": 0.74029684, "epoch": 0.9959077074073711, "grad_norm": 7.0625, "learning_rate": 5.4465479518295505e-06, "loss": 1.04750786, "memory(GiB)": 141.16, "step": 89040, "train_speed(iter/s)": 0.29018 }, { "acc": 0.72633848, "epoch": 0.9961314063533296, "grad_norm": 7.4375, "learning_rate": 5.4447058909482844e-06, "loss": 1.10926304, "memory(GiB)": 141.16, "step": 89060, "train_speed(iter/s)": 0.290203 }, { "acc": 0.72857294, "epoch": 0.9963551052992882, "grad_norm": 7.1875, "learning_rate": 5.442863769224894e-06, "loss": 1.09039402, "memory(GiB)": 141.16, "step": 89080, "train_speed(iter/s)": 0.290224 }, { "acc": 0.74060278, "epoch": 0.9965788042452467, "grad_norm": 6.3125, "learning_rate": 5.44102158691141e-06, "loss": 1.04313641, "memory(GiB)": 141.16, "step": 89100, "train_speed(iter/s)": 0.290245 }, { "acc": 0.7252667, "epoch": 0.9968025031912052, "grad_norm": 7.1875, "learning_rate": 5.4391793442598705e-06, "loss": 1.09625626, "memory(GiB)": 141.16, "step": 89120, "train_speed(iter/s)": 0.290268 }, { "acc": 0.74744983, "epoch": 0.9970262021371638, "grad_norm": 8.3125, "learning_rate": 5.437337041522319e-06, "loss": 1.01174507, "memory(GiB)": 141.16, "step": 89140, "train_speed(iter/s)": 0.290289 }, { "acc": 0.73865886, "epoch": 0.9972499010831223, "grad_norm": 5.75, "learning_rate": 5.435494678950809e-06, "loss": 1.03249149, "memory(GiB)": 141.16, "step": 89160, "train_speed(iter/s)": 0.29031 }, { "acc": 0.73514709, "epoch": 0.9974736000290808, "grad_norm": 7.03125, "learning_rate": 5.4336522567974025e-06, "loss": 1.05115995, "memory(GiB)": 141.16, "step": 89180, "train_speed(iter/s)": 0.290332 }, { "acc": 0.74410381, "epoch": 0.9976972989750393, "grad_norm": 7.34375, "learning_rate": 5.4318097753141686e-06, "loss": 1.03323383, "memory(GiB)": 141.16, "step": 89200, "train_speed(iter/s)": 0.290352 }, { "acc": 0.72436104, "epoch": 0.997920997920998, "grad_norm": 6.28125, "learning_rate": 5.429967234753185e-06, "loss": 1.11217995, "memory(GiB)": 141.16, "step": 89220, "train_speed(iter/s)": 0.290375 }, { "acc": 0.72288246, "epoch": 0.9981446968669565, "grad_norm": 6.59375, "learning_rate": 5.428124635366539e-06, "loss": 1.11038284, "memory(GiB)": 141.16, "step": 89240, "train_speed(iter/s)": 0.290399 }, { "acc": 0.73426185, "epoch": 0.998368395812915, "grad_norm": 5.59375, "learning_rate": 5.4262819774063244e-06, "loss": 1.06626453, "memory(GiB)": 141.16, "step": 89260, "train_speed(iter/s)": 0.29042 }, { "acc": 0.72110405, "epoch": 0.9985920947588736, "grad_norm": 6.78125, "learning_rate": 5.424439261124641e-06, "loss": 1.13870735, "memory(GiB)": 141.16, "step": 89280, "train_speed(iter/s)": 0.290439 }, { "acc": 0.73318605, "epoch": 0.9988157937048321, "grad_norm": 7.65625, "learning_rate": 5.422596486773599e-06, "loss": 1.06283846, "memory(GiB)": 141.16, "step": 89300, "train_speed(iter/s)": 0.290458 }, { "acc": 0.73654795, "epoch": 0.9990394926507906, "grad_norm": 7.53125, "learning_rate": 5.42075365460532e-06, "loss": 1.05532169, "memory(GiB)": 141.16, "step": 89320, "train_speed(iter/s)": 0.290482 }, { "acc": 0.72408171, "epoch": 0.9992631915967491, "grad_norm": 5.46875, "learning_rate": 5.418910764871925e-06, "loss": 1.1031477, "memory(GiB)": 141.16, "step": 89340, "train_speed(iter/s)": 0.290501 }, { "acc": 0.72261958, "epoch": 0.9994868905427077, "grad_norm": 8.375, "learning_rate": 5.417067817825551e-06, "loss": 1.13710022, "memory(GiB)": 141.16, "step": 89360, "train_speed(iter/s)": 0.290523 }, { "acc": 0.72596216, "epoch": 0.9997105894886662, "grad_norm": 6.28125, "learning_rate": 5.415224813718337e-06, "loss": 1.10319433, "memory(GiB)": 141.16, "step": 89380, "train_speed(iter/s)": 0.29054 }, { "acc": 0.72780499, "epoch": 0.9999342884346247, "grad_norm": 7.375, "learning_rate": 5.4133817528024345e-06, "loss": 1.08927193, "memory(GiB)": 141.16, "step": 89400, "train_speed(iter/s)": 0.290562 }, { "acc": 0.7322175, "epoch": 1.0001579873805833, "grad_norm": 5.6875, "learning_rate": 5.411538635329999e-06, "loss": 1.08337231, "memory(GiB)": 141.16, "step": 89420, "train_speed(iter/s)": 0.290581 }, { "acc": 0.72482748, "epoch": 1.0003816863265418, "grad_norm": 6.15625, "learning_rate": 5.409695461553197e-06, "loss": 1.11211147, "memory(GiB)": 141.16, "step": 89440, "train_speed(iter/s)": 0.290602 }, { "acc": 0.73842182, "epoch": 1.0006053852725003, "grad_norm": 6.96875, "learning_rate": 5.407852231724199e-06, "loss": 1.04597435, "memory(GiB)": 141.16, "step": 89460, "train_speed(iter/s)": 0.290624 }, { "acc": 0.74670753, "epoch": 1.0008290842184588, "grad_norm": 7.1875, "learning_rate": 5.406008946095186e-06, "loss": 1.0153141, "memory(GiB)": 141.16, "step": 89480, "train_speed(iter/s)": 0.290646 }, { "acc": 0.72669029, "epoch": 1.0010527831644174, "grad_norm": 6.71875, "learning_rate": 5.404165604918346e-06, "loss": 1.10507374, "memory(GiB)": 141.16, "step": 89500, "train_speed(iter/s)": 0.290667 }, { "acc": 0.72908163, "epoch": 1.001276482110376, "grad_norm": 6.96875, "learning_rate": 5.402322208445875e-06, "loss": 1.10999746, "memory(GiB)": 141.16, "step": 89520, "train_speed(iter/s)": 0.29069 }, { "acc": 0.74188781, "epoch": 1.0015001810563344, "grad_norm": 6.15625, "learning_rate": 5.400478756929977e-06, "loss": 1.02812462, "memory(GiB)": 141.16, "step": 89540, "train_speed(iter/s)": 0.29071 }, { "acc": 0.72669473, "epoch": 1.001723880002293, "grad_norm": 7.375, "learning_rate": 5.398635250622858e-06, "loss": 1.10877323, "memory(GiB)": 141.16, "step": 89560, "train_speed(iter/s)": 0.29073 }, { "acc": 0.74878902, "epoch": 1.0019475789482515, "grad_norm": 5.40625, "learning_rate": 5.396791689776739e-06, "loss": 1.00410118, "memory(GiB)": 141.16, "step": 89580, "train_speed(iter/s)": 0.290748 }, { "acc": 0.74607325, "epoch": 1.00217127789421, "grad_norm": 6.09375, "learning_rate": 5.394948074643846e-06, "loss": 1.00938034, "memory(GiB)": 141.16, "step": 89600, "train_speed(iter/s)": 0.29077 }, { "acc": 0.72567601, "epoch": 1.0023949768401685, "grad_norm": 7.75, "learning_rate": 5.393104405476413e-06, "loss": 1.10612125, "memory(GiB)": 141.16, "step": 89620, "train_speed(iter/s)": 0.290792 }, { "acc": 0.73251009, "epoch": 1.002618675786127, "grad_norm": 6.5625, "learning_rate": 5.3912606825266765e-06, "loss": 1.06621647, "memory(GiB)": 141.16, "step": 89640, "train_speed(iter/s)": 0.290815 }, { "acc": 0.7522644, "epoch": 1.0028423747320856, "grad_norm": 7.28125, "learning_rate": 5.389416906046888e-06, "loss": 0.99190016, "memory(GiB)": 141.16, "step": 89660, "train_speed(iter/s)": 0.290836 }, { "acc": 0.73916912, "epoch": 1.0030660736780441, "grad_norm": 8.0625, "learning_rate": 5.3875730762893e-06, "loss": 1.05387144, "memory(GiB)": 141.16, "step": 89680, "train_speed(iter/s)": 0.290859 }, { "acc": 0.72494965, "epoch": 1.0032897726240027, "grad_norm": 6.4375, "learning_rate": 5.385729193506175e-06, "loss": 1.11896667, "memory(GiB)": 141.16, "step": 89700, "train_speed(iter/s)": 0.290881 }, { "acc": 0.73329258, "epoch": 1.0035134715699612, "grad_norm": 7.3125, "learning_rate": 5.383885257949783e-06, "loss": 1.08443794, "memory(GiB)": 141.16, "step": 89720, "train_speed(iter/s)": 0.290901 }, { "acc": 0.72777405, "epoch": 1.0037371705159197, "grad_norm": 5.25, "learning_rate": 5.3820412698724e-06, "loss": 1.10467873, "memory(GiB)": 141.16, "step": 89740, "train_speed(iter/s)": 0.290923 }, { "acc": 0.72873278, "epoch": 1.0039608694618782, "grad_norm": 8.125, "learning_rate": 5.380197229526313e-06, "loss": 1.07953606, "memory(GiB)": 141.16, "step": 89760, "train_speed(iter/s)": 0.290947 }, { "acc": 0.74072428, "epoch": 1.0041845684078368, "grad_norm": 8.625, "learning_rate": 5.378353137163808e-06, "loss": 1.05073738, "memory(GiB)": 141.16, "step": 89780, "train_speed(iter/s)": 0.290967 }, { "acc": 0.74116817, "epoch": 1.0044082673537953, "grad_norm": 7.875, "learning_rate": 5.376508993037187e-06, "loss": 1.02526722, "memory(GiB)": 141.16, "step": 89800, "train_speed(iter/s)": 0.290989 }, { "acc": 0.73565989, "epoch": 1.0046319662997538, "grad_norm": 7.65625, "learning_rate": 5.374664797398754e-06, "loss": 1.05293722, "memory(GiB)": 141.16, "step": 89820, "train_speed(iter/s)": 0.291014 }, { "acc": 0.73279438, "epoch": 1.0048556652457123, "grad_norm": 9.6875, "learning_rate": 5.372820550500822e-06, "loss": 1.07059364, "memory(GiB)": 141.16, "step": 89840, "train_speed(iter/s)": 0.291037 }, { "acc": 0.74217873, "epoch": 1.0050793641916709, "grad_norm": 7.03125, "learning_rate": 5.3709762525957095e-06, "loss": 1.03265362, "memory(GiB)": 141.16, "step": 89860, "train_speed(iter/s)": 0.291058 }, { "acc": 0.74113111, "epoch": 1.0053030631376294, "grad_norm": 6.375, "learning_rate": 5.369131903935744e-06, "loss": 1.03983793, "memory(GiB)": 141.16, "step": 89880, "train_speed(iter/s)": 0.291079 }, { "acc": 0.72491894, "epoch": 1.005526762083588, "grad_norm": 6.84375, "learning_rate": 5.367287504773256e-06, "loss": 1.10212393, "memory(GiB)": 141.16, "step": 89900, "train_speed(iter/s)": 0.291102 }, { "acc": 0.74397745, "epoch": 1.0057504610295465, "grad_norm": 6.15625, "learning_rate": 5.36544305536059e-06, "loss": 1.01381197, "memory(GiB)": 141.16, "step": 89920, "train_speed(iter/s)": 0.291122 }, { "acc": 0.74940872, "epoch": 1.005974159975505, "grad_norm": 6.21875, "learning_rate": 5.3635985559500895e-06, "loss": 0.99498539, "memory(GiB)": 141.16, "step": 89940, "train_speed(iter/s)": 0.291144 }, { "acc": 0.73336506, "epoch": 1.0061978589214635, "grad_norm": 6.65625, "learning_rate": 5.36175400679411e-06, "loss": 1.06002274, "memory(GiB)": 141.16, "step": 89960, "train_speed(iter/s)": 0.291166 }, { "acc": 0.71817775, "epoch": 1.006421557867422, "grad_norm": 7.25, "learning_rate": 5.359909408145011e-06, "loss": 1.1392189, "memory(GiB)": 141.16, "step": 89980, "train_speed(iter/s)": 0.291186 }, { "acc": 0.72985878, "epoch": 1.0066452568133806, "grad_norm": 7.1875, "learning_rate": 5.358064760255161e-06, "loss": 1.06713314, "memory(GiB)": 141.16, "step": 90000, "train_speed(iter/s)": 0.291209 }, { "epoch": 1.0066452568133806, "eval_acc": 0.690120141863942, "eval_loss": 1.0790406465530396, "eval_runtime": 2323.7843, "eval_samples_per_second": 32.397, "eval_steps_per_second": 16.199, "step": 90000 }, { "acc": 0.72815266, "epoch": 1.006868955759339, "grad_norm": 5.59375, "learning_rate": 5.356220063376933e-06, "loss": 1.1088419, "memory(GiB)": 141.16, "step": 90020, "train_speed(iter/s)": 0.28901 }, { "acc": 0.73170671, "epoch": 1.0070926547052976, "grad_norm": 7.5, "learning_rate": 5.35437531776271e-06, "loss": 1.06910887, "memory(GiB)": 141.16, "step": 90040, "train_speed(iter/s)": 0.289035 }, { "acc": 0.7419776, "epoch": 1.0073163536512562, "grad_norm": 7.0625, "learning_rate": 5.352530523664878e-06, "loss": 1.03214836, "memory(GiB)": 141.16, "step": 90060, "train_speed(iter/s)": 0.289057 }, { "acc": 0.72947631, "epoch": 1.0075400525972147, "grad_norm": 6.375, "learning_rate": 5.350685681335831e-06, "loss": 1.08320684, "memory(GiB)": 141.16, "step": 90080, "train_speed(iter/s)": 0.289078 }, { "acc": 0.74204326, "epoch": 1.0077637515431732, "grad_norm": 6.75, "learning_rate": 5.348840791027971e-06, "loss": 1.02523022, "memory(GiB)": 141.16, "step": 90100, "train_speed(iter/s)": 0.2891 }, { "acc": 0.74573922, "epoch": 1.0079874504891317, "grad_norm": 8.875, "learning_rate": 5.346995852993704e-06, "loss": 1.01769724, "memory(GiB)": 141.16, "step": 90120, "train_speed(iter/s)": 0.289119 }, { "acc": 0.73159537, "epoch": 1.0082111494350903, "grad_norm": 6.53125, "learning_rate": 5.345150867485445e-06, "loss": 1.08179731, "memory(GiB)": 141.16, "step": 90140, "train_speed(iter/s)": 0.289137 }, { "acc": 0.73030305, "epoch": 1.0084348483810488, "grad_norm": 6.46875, "learning_rate": 5.343305834755615e-06, "loss": 1.07463312, "memory(GiB)": 141.16, "step": 90160, "train_speed(iter/s)": 0.289158 }, { "acc": 0.73857679, "epoch": 1.0086585473270073, "grad_norm": 8.5, "learning_rate": 5.341460755056639e-06, "loss": 1.0445199, "memory(GiB)": 141.16, "step": 90180, "train_speed(iter/s)": 0.28918 }, { "acc": 0.7471025, "epoch": 1.0088822462729659, "grad_norm": 8.3125, "learning_rate": 5.339615628640951e-06, "loss": 1.00659981, "memory(GiB)": 141.16, "step": 90200, "train_speed(iter/s)": 0.289202 }, { "acc": 0.72018156, "epoch": 1.0091059452189244, "grad_norm": 6.96875, "learning_rate": 5.33777045576099e-06, "loss": 1.12943172, "memory(GiB)": 141.16, "step": 90220, "train_speed(iter/s)": 0.289225 }, { "acc": 0.72880344, "epoch": 1.009329644164883, "grad_norm": 5.96875, "learning_rate": 5.335925236669205e-06, "loss": 1.0806673, "memory(GiB)": 141.16, "step": 90240, "train_speed(iter/s)": 0.289245 }, { "acc": 0.72295666, "epoch": 1.0095533431108414, "grad_norm": 7.75, "learning_rate": 5.334079971618045e-06, "loss": 1.10616055, "memory(GiB)": 141.16, "step": 90260, "train_speed(iter/s)": 0.289268 }, { "acc": 0.747651, "epoch": 1.0097770420568, "grad_norm": 7.125, "learning_rate": 5.332234660859969e-06, "loss": 0.98673582, "memory(GiB)": 141.16, "step": 90280, "train_speed(iter/s)": 0.289289 }, { "acc": 0.75317521, "epoch": 1.0100007410027585, "grad_norm": 7.71875, "learning_rate": 5.330389304647443e-06, "loss": 0.97675343, "memory(GiB)": 141.16, "step": 90300, "train_speed(iter/s)": 0.289311 }, { "acc": 0.72585487, "epoch": 1.010224439948717, "grad_norm": 7.0625, "learning_rate": 5.328543903232939e-06, "loss": 1.10810862, "memory(GiB)": 141.16, "step": 90320, "train_speed(iter/s)": 0.28933 }, { "acc": 0.74125004, "epoch": 1.0104481388946756, "grad_norm": 10.8125, "learning_rate": 5.326698456868931e-06, "loss": 1.04961319, "memory(GiB)": 141.16, "step": 90340, "train_speed(iter/s)": 0.289352 }, { "acc": 0.72957935, "epoch": 1.010671837840634, "grad_norm": 6.25, "learning_rate": 5.324852965807905e-06, "loss": 1.08265209, "memory(GiB)": 141.16, "step": 90360, "train_speed(iter/s)": 0.289377 }, { "acc": 0.7507504, "epoch": 1.0108955367865926, "grad_norm": 9.6875, "learning_rate": 5.3230074303023515e-06, "loss": 0.98313551, "memory(GiB)": 141.16, "step": 90380, "train_speed(iter/s)": 0.289396 }, { "acc": 0.7348011, "epoch": 1.0111192357325511, "grad_norm": 6.375, "learning_rate": 5.321161850604763e-06, "loss": 1.06805553, "memory(GiB)": 141.16, "step": 90400, "train_speed(iter/s)": 0.289416 }, { "acc": 0.72315197, "epoch": 1.0113429346785097, "grad_norm": 7.0625, "learning_rate": 5.319316226967645e-06, "loss": 1.12216806, "memory(GiB)": 141.16, "step": 90420, "train_speed(iter/s)": 0.289437 }, { "acc": 0.72749863, "epoch": 1.0115666336244682, "grad_norm": 9.125, "learning_rate": 5.3174705596435e-06, "loss": 1.10695362, "memory(GiB)": 141.16, "step": 90440, "train_speed(iter/s)": 0.289459 }, { "acc": 0.72296419, "epoch": 1.0117903325704267, "grad_norm": 6.0625, "learning_rate": 5.315624848884847e-06, "loss": 1.10902691, "memory(GiB)": 141.16, "step": 90460, "train_speed(iter/s)": 0.289479 }, { "acc": 0.73043318, "epoch": 1.0120140315163852, "grad_norm": 5.90625, "learning_rate": 5.3137790949442025e-06, "loss": 1.08190784, "memory(GiB)": 141.16, "step": 90480, "train_speed(iter/s)": 0.2895 }, { "acc": 0.73334084, "epoch": 1.0122377304623438, "grad_norm": 6.15625, "learning_rate": 5.311933298074094e-06, "loss": 1.08461342, "memory(GiB)": 141.16, "step": 90500, "train_speed(iter/s)": 0.289519 }, { "acc": 0.731359, "epoch": 1.0124614294083023, "grad_norm": 7.40625, "learning_rate": 5.310087458527051e-06, "loss": 1.06794186, "memory(GiB)": 141.16, "step": 90520, "train_speed(iter/s)": 0.28954 }, { "acc": 0.74594517, "epoch": 1.0126851283542608, "grad_norm": 7.625, "learning_rate": 5.308241576555612e-06, "loss": 1.00505886, "memory(GiB)": 141.16, "step": 90540, "train_speed(iter/s)": 0.289562 }, { "acc": 0.73898234, "epoch": 1.0129088273002194, "grad_norm": 5.40625, "learning_rate": 5.306395652412318e-06, "loss": 1.04711199, "memory(GiB)": 141.16, "step": 90560, "train_speed(iter/s)": 0.289584 }, { "acc": 0.72929111, "epoch": 1.013132526246178, "grad_norm": 7.40625, "learning_rate": 5.30454968634972e-06, "loss": 1.09275627, "memory(GiB)": 141.16, "step": 90580, "train_speed(iter/s)": 0.289606 }, { "acc": 0.73991861, "epoch": 1.0133562251921364, "grad_norm": 7.15625, "learning_rate": 5.302703678620374e-06, "loss": 1.03064327, "memory(GiB)": 141.16, "step": 90600, "train_speed(iter/s)": 0.289627 }, { "acc": 0.73854485, "epoch": 1.013579924138095, "grad_norm": 6.59375, "learning_rate": 5.300857629476835e-06, "loss": 1.04112959, "memory(GiB)": 141.16, "step": 90620, "train_speed(iter/s)": 0.289648 }, { "acc": 0.73284016, "epoch": 1.0138036230840535, "grad_norm": 8.0625, "learning_rate": 5.299011539171673e-06, "loss": 1.06655788, "memory(GiB)": 141.16, "step": 90640, "train_speed(iter/s)": 0.289669 }, { "acc": 0.74130173, "epoch": 1.014027322030012, "grad_norm": 8.25, "learning_rate": 5.29716540795746e-06, "loss": 1.03265476, "memory(GiB)": 141.16, "step": 90660, "train_speed(iter/s)": 0.28969 }, { "acc": 0.73812604, "epoch": 1.0142510209759705, "grad_norm": 6.25, "learning_rate": 5.29531923608677e-06, "loss": 1.04936161, "memory(GiB)": 141.16, "step": 90680, "train_speed(iter/s)": 0.289709 }, { "acc": 0.73858628, "epoch": 1.014474719921929, "grad_norm": 7.125, "learning_rate": 5.293473023812189e-06, "loss": 1.03745308, "memory(GiB)": 141.16, "step": 90700, "train_speed(iter/s)": 0.28973 }, { "acc": 0.72318025, "epoch": 1.0146984188678876, "grad_norm": 6.65625, "learning_rate": 5.291626771386302e-06, "loss": 1.11747646, "memory(GiB)": 141.16, "step": 90720, "train_speed(iter/s)": 0.289751 }, { "acc": 0.72360764, "epoch": 1.0149221178138461, "grad_norm": 6.75, "learning_rate": 5.289780479061706e-06, "loss": 1.11075792, "memory(GiB)": 141.16, "step": 90740, "train_speed(iter/s)": 0.289775 }, { "acc": 0.73055286, "epoch": 1.0151458167598046, "grad_norm": 5.96875, "learning_rate": 5.287934147090997e-06, "loss": 1.07858982, "memory(GiB)": 141.16, "step": 90760, "train_speed(iter/s)": 0.289796 }, { "acc": 0.74230323, "epoch": 1.0153695157057632, "grad_norm": 8.3125, "learning_rate": 5.286087775726782e-06, "loss": 1.04356918, "memory(GiB)": 141.16, "step": 90780, "train_speed(iter/s)": 0.289819 }, { "acc": 0.74624238, "epoch": 1.0155932146517217, "grad_norm": 7.4375, "learning_rate": 5.28424136522167e-06, "loss": 1.01259298, "memory(GiB)": 141.16, "step": 90800, "train_speed(iter/s)": 0.289841 }, { "acc": 0.72674046, "epoch": 1.0158169135976802, "grad_norm": 7.25, "learning_rate": 5.282394915828277e-06, "loss": 1.09439774, "memory(GiB)": 141.16, "step": 90820, "train_speed(iter/s)": 0.289861 }, { "acc": 0.73431554, "epoch": 1.0160406125436388, "grad_norm": 6.625, "learning_rate": 5.280548427799224e-06, "loss": 1.06313915, "memory(GiB)": 141.16, "step": 90840, "train_speed(iter/s)": 0.289882 }, { "acc": 0.72802849, "epoch": 1.0162643114895973, "grad_norm": 6.03125, "learning_rate": 5.278701901387135e-06, "loss": 1.11261225, "memory(GiB)": 141.16, "step": 90860, "train_speed(iter/s)": 0.289901 }, { "acc": 0.7302578, "epoch": 1.0164880104355558, "grad_norm": 6.34375, "learning_rate": 5.276855336844641e-06, "loss": 1.08071976, "memory(GiB)": 141.16, "step": 90880, "train_speed(iter/s)": 0.289922 }, { "acc": 0.73741384, "epoch": 1.0167117093815143, "grad_norm": 5.75, "learning_rate": 5.2750087344243805e-06, "loss": 1.04160919, "memory(GiB)": 141.16, "step": 90900, "train_speed(iter/s)": 0.289945 }, { "acc": 0.74930325, "epoch": 1.0169354083274729, "grad_norm": 7.1875, "learning_rate": 5.273162094378995e-06, "loss": 0.99218903, "memory(GiB)": 141.16, "step": 90920, "train_speed(iter/s)": 0.289966 }, { "acc": 0.73362617, "epoch": 1.0171591072734314, "grad_norm": 9.3125, "learning_rate": 5.271315416961131e-06, "loss": 1.06678638, "memory(GiB)": 141.16, "step": 90940, "train_speed(iter/s)": 0.289984 }, { "acc": 0.71782751, "epoch": 1.01738280621939, "grad_norm": 8.125, "learning_rate": 5.269468702423438e-06, "loss": 1.12810373, "memory(GiB)": 141.16, "step": 90960, "train_speed(iter/s)": 0.290006 }, { "acc": 0.73738418, "epoch": 1.0176065051653485, "grad_norm": 5.1875, "learning_rate": 5.267621951018577e-06, "loss": 1.05900936, "memory(GiB)": 141.16, "step": 90980, "train_speed(iter/s)": 0.290027 }, { "acc": 0.73587675, "epoch": 1.017830204111307, "grad_norm": 5.71875, "learning_rate": 5.265775162999206e-06, "loss": 1.04820518, "memory(GiB)": 141.16, "step": 91000, "train_speed(iter/s)": 0.290046 }, { "acc": 0.73343372, "epoch": 1.0180539030572655, "grad_norm": 6.0625, "learning_rate": 5.263928338617996e-06, "loss": 1.0652935, "memory(GiB)": 141.16, "step": 91020, "train_speed(iter/s)": 0.290067 }, { "acc": 0.73842068, "epoch": 1.018277602003224, "grad_norm": 5.875, "learning_rate": 5.262081478127616e-06, "loss": 1.03930264, "memory(GiB)": 141.16, "step": 91040, "train_speed(iter/s)": 0.290087 }, { "acc": 0.73531475, "epoch": 1.0185013009491826, "grad_norm": 7.09375, "learning_rate": 5.260234581780743e-06, "loss": 1.06104031, "memory(GiB)": 141.16, "step": 91060, "train_speed(iter/s)": 0.290107 }, { "acc": 0.7274497, "epoch": 1.018724999895141, "grad_norm": 8.3125, "learning_rate": 5.25838764983006e-06, "loss": 1.10000515, "memory(GiB)": 141.16, "step": 91080, "train_speed(iter/s)": 0.290127 }, { "acc": 0.74452004, "epoch": 1.0189486988410996, "grad_norm": 7.5, "learning_rate": 5.256540682528254e-06, "loss": 1.01954851, "memory(GiB)": 141.16, "step": 91100, "train_speed(iter/s)": 0.290148 }, { "acc": 0.74468107, "epoch": 1.0191723977870581, "grad_norm": 6.75, "learning_rate": 5.254693680128016e-06, "loss": 1.03317013, "memory(GiB)": 141.16, "step": 91120, "train_speed(iter/s)": 0.290168 }, { "acc": 0.74925265, "epoch": 1.0193960967330167, "grad_norm": 4.8125, "learning_rate": 5.252846642882041e-06, "loss": 1.00380039, "memory(GiB)": 141.16, "step": 91140, "train_speed(iter/s)": 0.29019 }, { "acc": 0.73091102, "epoch": 1.0196197956789752, "grad_norm": 7.0, "learning_rate": 5.250999571043031e-06, "loss": 1.08024178, "memory(GiB)": 141.16, "step": 91160, "train_speed(iter/s)": 0.290211 }, { "acc": 0.73753133, "epoch": 1.0198434946249337, "grad_norm": 7.28125, "learning_rate": 5.249152464863692e-06, "loss": 1.06267147, "memory(GiB)": 141.16, "step": 91180, "train_speed(iter/s)": 0.290231 }, { "acc": 0.73379469, "epoch": 1.0200671935708923, "grad_norm": 8.5, "learning_rate": 5.247305324596736e-06, "loss": 1.09334641, "memory(GiB)": 141.16, "step": 91200, "train_speed(iter/s)": 0.290252 }, { "acc": 0.7306602, "epoch": 1.0202908925168508, "grad_norm": 7.78125, "learning_rate": 5.245458150494877e-06, "loss": 1.09149704, "memory(GiB)": 141.16, "step": 91220, "train_speed(iter/s)": 0.290272 }, { "acc": 0.72461214, "epoch": 1.0205145914628093, "grad_norm": 6.4375, "learning_rate": 5.243610942810834e-06, "loss": 1.10361557, "memory(GiB)": 141.16, "step": 91240, "train_speed(iter/s)": 0.290292 }, { "acc": 0.72743793, "epoch": 1.0207382904087678, "grad_norm": 7.40625, "learning_rate": 5.2417637017973315e-06, "loss": 1.08652067, "memory(GiB)": 141.16, "step": 91260, "train_speed(iter/s)": 0.290313 }, { "acc": 0.73919973, "epoch": 1.0209619893547264, "grad_norm": 6.09375, "learning_rate": 5.239916427707099e-06, "loss": 1.04331875, "memory(GiB)": 141.16, "step": 91280, "train_speed(iter/s)": 0.290335 }, { "acc": 0.7601583, "epoch": 1.021185688300685, "grad_norm": 6.84375, "learning_rate": 5.23806912079287e-06, "loss": 0.95189533, "memory(GiB)": 141.16, "step": 91300, "train_speed(iter/s)": 0.290357 }, { "acc": 0.74549227, "epoch": 1.0214093872466434, "grad_norm": 7.5625, "learning_rate": 5.236221781307383e-06, "loss": 1.03257103, "memory(GiB)": 141.16, "step": 91320, "train_speed(iter/s)": 0.290376 }, { "acc": 0.75403214, "epoch": 1.021633086192602, "grad_norm": 8.4375, "learning_rate": 5.23437440950338e-06, "loss": 0.96725082, "memory(GiB)": 141.16, "step": 91340, "train_speed(iter/s)": 0.290394 }, { "acc": 0.74393425, "epoch": 1.0218567851385605, "grad_norm": 6.15625, "learning_rate": 5.232527005633608e-06, "loss": 1.01196442, "memory(GiB)": 141.16, "step": 91360, "train_speed(iter/s)": 0.290415 }, { "acc": 0.74226398, "epoch": 1.022080484084519, "grad_norm": 7.65625, "learning_rate": 5.230679569950817e-06, "loss": 1.03767433, "memory(GiB)": 141.16, "step": 91380, "train_speed(iter/s)": 0.290434 }, { "acc": 0.73899031, "epoch": 1.0223041830304775, "grad_norm": 6.40625, "learning_rate": 5.228832102707763e-06, "loss": 1.04513712, "memory(GiB)": 141.16, "step": 91400, "train_speed(iter/s)": 0.290456 }, { "acc": 0.73506322, "epoch": 1.022527881976436, "grad_norm": 7.75, "learning_rate": 5.226984604157209e-06, "loss": 1.05848875, "memory(GiB)": 141.16, "step": 91420, "train_speed(iter/s)": 0.290478 }, { "acc": 0.73732219, "epoch": 1.0227515809223946, "grad_norm": 6.46875, "learning_rate": 5.225137074551917e-06, "loss": 1.0602953, "memory(GiB)": 141.16, "step": 91440, "train_speed(iter/s)": 0.290502 }, { "acc": 0.73604856, "epoch": 1.0229752798683531, "grad_norm": 6.78125, "learning_rate": 5.223289514144654e-06, "loss": 1.04373665, "memory(GiB)": 141.16, "step": 91460, "train_speed(iter/s)": 0.290525 }, { "acc": 0.74725981, "epoch": 1.0231989788143117, "grad_norm": 7.40625, "learning_rate": 5.221441923188193e-06, "loss": 0.99745407, "memory(GiB)": 141.16, "step": 91480, "train_speed(iter/s)": 0.290546 }, { "acc": 0.73528819, "epoch": 1.0234226777602702, "grad_norm": 6.8125, "learning_rate": 5.219594301935313e-06, "loss": 1.06058874, "memory(GiB)": 141.16, "step": 91500, "train_speed(iter/s)": 0.290567 }, { "acc": 0.72611189, "epoch": 1.0236463767062287, "grad_norm": 8.625, "learning_rate": 5.217746650638793e-06, "loss": 1.10782137, "memory(GiB)": 141.16, "step": 91520, "train_speed(iter/s)": 0.290588 }, { "acc": 0.73472948, "epoch": 1.0238700756521872, "grad_norm": 7.03125, "learning_rate": 5.21589896955142e-06, "loss": 1.0786005, "memory(GiB)": 141.16, "step": 91540, "train_speed(iter/s)": 0.29061 }, { "acc": 0.72838697, "epoch": 1.0240937745981458, "grad_norm": 8.875, "learning_rate": 5.2140512589259804e-06, "loss": 1.09210377, "memory(GiB)": 141.16, "step": 91560, "train_speed(iter/s)": 0.290632 }, { "acc": 0.75065832, "epoch": 1.0243174735441043, "grad_norm": 6.53125, "learning_rate": 5.21220351901527e-06, "loss": 1.00720177, "memory(GiB)": 141.16, "step": 91580, "train_speed(iter/s)": 0.290653 }, { "acc": 0.73687048, "epoch": 1.0245411724900628, "grad_norm": 5.53125, "learning_rate": 5.210355750072085e-06, "loss": 1.05628338, "memory(GiB)": 141.16, "step": 91600, "train_speed(iter/s)": 0.290677 }, { "acc": 0.73112464, "epoch": 1.0247648714360214, "grad_norm": 6.0, "learning_rate": 5.208507952349227e-06, "loss": 1.0758852, "memory(GiB)": 141.16, "step": 91620, "train_speed(iter/s)": 0.290697 }, { "acc": 0.72862453, "epoch": 1.0249885703819799, "grad_norm": 6.84375, "learning_rate": 5.206660126099501e-06, "loss": 1.08211346, "memory(GiB)": 141.16, "step": 91640, "train_speed(iter/s)": 0.29072 }, { "acc": 0.73419933, "epoch": 1.0252122693279384, "grad_norm": 7.15625, "learning_rate": 5.2048122715757154e-06, "loss": 1.06230774, "memory(GiB)": 141.16, "step": 91660, "train_speed(iter/s)": 0.290738 }, { "acc": 0.72248597, "epoch": 1.025435968273897, "grad_norm": 5.34375, "learning_rate": 5.202964389030683e-06, "loss": 1.1259407, "memory(GiB)": 141.16, "step": 91680, "train_speed(iter/s)": 0.290757 }, { "acc": 0.73353558, "epoch": 1.0256596672198555, "grad_norm": 6.28125, "learning_rate": 5.201116478717222e-06, "loss": 1.06785688, "memory(GiB)": 141.16, "step": 91700, "train_speed(iter/s)": 0.290776 }, { "acc": 0.71932726, "epoch": 1.025883366165814, "grad_norm": 8.6875, "learning_rate": 5.1992685408881515e-06, "loss": 1.1408596, "memory(GiB)": 141.16, "step": 91720, "train_speed(iter/s)": 0.290799 }, { "acc": 0.73427544, "epoch": 1.0261070651117725, "grad_norm": 7.84375, "learning_rate": 5.197420575796298e-06, "loss": 1.06706276, "memory(GiB)": 141.16, "step": 91740, "train_speed(iter/s)": 0.290822 }, { "acc": 0.73099365, "epoch": 1.026330764057731, "grad_norm": 7.28125, "learning_rate": 5.1955725836944874e-06, "loss": 1.08322592, "memory(GiB)": 141.16, "step": 91760, "train_speed(iter/s)": 0.290845 }, { "acc": 0.72771196, "epoch": 1.0265544630036896, "grad_norm": 7.59375, "learning_rate": 5.19372456483555e-06, "loss": 1.1073123, "memory(GiB)": 141.16, "step": 91780, "train_speed(iter/s)": 0.290865 }, { "acc": 0.74471664, "epoch": 1.026778161949648, "grad_norm": 7.75, "learning_rate": 5.191876519472325e-06, "loss": 1.01391096, "memory(GiB)": 141.16, "step": 91800, "train_speed(iter/s)": 0.290888 }, { "acc": 0.74738083, "epoch": 1.0270018608956066, "grad_norm": 6.84375, "learning_rate": 5.190028447857649e-06, "loss": 1.02295341, "memory(GiB)": 141.16, "step": 91820, "train_speed(iter/s)": 0.290909 }, { "acc": 0.72485976, "epoch": 1.0272255598415652, "grad_norm": 6.71875, "learning_rate": 5.188180350244366e-06, "loss": 1.11491699, "memory(GiB)": 141.16, "step": 91840, "train_speed(iter/s)": 0.290928 }, { "acc": 0.73363667, "epoch": 1.0274492587875237, "grad_norm": 7.0625, "learning_rate": 5.18633222688532e-06, "loss": 1.07772121, "memory(GiB)": 141.16, "step": 91860, "train_speed(iter/s)": 0.29095 }, { "acc": 0.73338346, "epoch": 1.0276729577334822, "grad_norm": 6.5625, "learning_rate": 5.184484078033363e-06, "loss": 1.07268944, "memory(GiB)": 141.16, "step": 91880, "train_speed(iter/s)": 0.290971 }, { "acc": 0.72503767, "epoch": 1.0278966566794407, "grad_norm": 7.125, "learning_rate": 5.182635903941346e-06, "loss": 1.1087657, "memory(GiB)": 141.16, "step": 91900, "train_speed(iter/s)": 0.290995 }, { "acc": 0.73175154, "epoch": 1.0281203556253993, "grad_norm": 7.25, "learning_rate": 5.180787704862128e-06, "loss": 1.06598606, "memory(GiB)": 141.16, "step": 91920, "train_speed(iter/s)": 0.291015 }, { "acc": 0.72474289, "epoch": 1.0283440545713578, "grad_norm": 6.1875, "learning_rate": 5.17893948104857e-06, "loss": 1.11403561, "memory(GiB)": 141.16, "step": 91940, "train_speed(iter/s)": 0.291039 }, { "acc": 0.73048716, "epoch": 1.0285677535173163, "grad_norm": 6.8125, "learning_rate": 5.17709123275353e-06, "loss": 1.08382244, "memory(GiB)": 141.16, "step": 91960, "train_speed(iter/s)": 0.291058 }, { "acc": 0.73349037, "epoch": 1.0287914524632749, "grad_norm": 8.125, "learning_rate": 5.17524296022988e-06, "loss": 1.06537838, "memory(GiB)": 141.16, "step": 91980, "train_speed(iter/s)": 0.291081 }, { "acc": 0.74219866, "epoch": 1.0290151514092334, "grad_norm": 6.875, "learning_rate": 5.173394663730486e-06, "loss": 1.04066315, "memory(GiB)": 141.16, "step": 92000, "train_speed(iter/s)": 0.291102 }, { "epoch": 1.0290151514092334, "eval_acc": 0.6901174305479145, "eval_loss": 1.0791404247283936, "eval_runtime": 2318.9665, "eval_samples_per_second": 32.464, "eval_steps_per_second": 16.232, "step": 92000 }, { "acc": 0.74682388, "epoch": 1.029238850355192, "grad_norm": 7.9375, "learning_rate": 5.171546343508227e-06, "loss": 1.00441971, "memory(GiB)": 141.16, "step": 92020, "train_speed(iter/s)": 0.288957 }, { "acc": 0.73214369, "epoch": 1.0294625493011504, "grad_norm": 11.625, "learning_rate": 5.169697999815974e-06, "loss": 1.08029881, "memory(GiB)": 141.16, "step": 92040, "train_speed(iter/s)": 0.288978 }, { "acc": 0.73557434, "epoch": 1.029686248247109, "grad_norm": 6.5625, "learning_rate": 5.167849632906609e-06, "loss": 1.04979401, "memory(GiB)": 141.16, "step": 92060, "train_speed(iter/s)": 0.288999 }, { "acc": 0.73732862, "epoch": 1.0299099471930675, "grad_norm": 7.375, "learning_rate": 5.166001243033016e-06, "loss": 1.04399319, "memory(GiB)": 141.16, "step": 92080, "train_speed(iter/s)": 0.289019 }, { "acc": 0.72567291, "epoch": 1.030133646139026, "grad_norm": 7.34375, "learning_rate": 5.16415283044808e-06, "loss": 1.09092922, "memory(GiB)": 141.16, "step": 92100, "train_speed(iter/s)": 0.28904 }, { "acc": 0.73134508, "epoch": 1.0303573450849846, "grad_norm": 8.125, "learning_rate": 5.16230439540469e-06, "loss": 1.05950537, "memory(GiB)": 141.16, "step": 92120, "train_speed(iter/s)": 0.289064 }, { "acc": 0.73977737, "epoch": 1.030581044030943, "grad_norm": 6.53125, "learning_rate": 5.16045593815574e-06, "loss": 1.04529667, "memory(GiB)": 141.16, "step": 92140, "train_speed(iter/s)": 0.289084 }, { "acc": 0.73944941, "epoch": 1.0308047429769016, "grad_norm": 7.59375, "learning_rate": 5.158607458954123e-06, "loss": 1.0519537, "memory(GiB)": 141.16, "step": 92160, "train_speed(iter/s)": 0.289103 }, { "acc": 0.72840643, "epoch": 1.0310284419228601, "grad_norm": 7.53125, "learning_rate": 5.156758958052739e-06, "loss": 1.09392796, "memory(GiB)": 141.16, "step": 92180, "train_speed(iter/s)": 0.28912 }, { "acc": 0.73061628, "epoch": 1.0312521408688187, "grad_norm": 6.46875, "learning_rate": 5.1549104357044886e-06, "loss": 1.0698781, "memory(GiB)": 141.16, "step": 92200, "train_speed(iter/s)": 0.289142 }, { "acc": 0.74120898, "epoch": 1.0314758398147772, "grad_norm": 5.84375, "learning_rate": 5.153061892162276e-06, "loss": 1.03449802, "memory(GiB)": 141.16, "step": 92220, "train_speed(iter/s)": 0.289164 }, { "acc": 0.73364801, "epoch": 1.0316995387607357, "grad_norm": 6.4375, "learning_rate": 5.15121332767901e-06, "loss": 1.07806892, "memory(GiB)": 141.16, "step": 92240, "train_speed(iter/s)": 0.289186 }, { "acc": 0.71929469, "epoch": 1.0319232377066943, "grad_norm": 5.84375, "learning_rate": 5.1493647425076e-06, "loss": 1.12748919, "memory(GiB)": 141.16, "step": 92260, "train_speed(iter/s)": 0.289207 }, { "acc": 0.74274282, "epoch": 1.0321469366526528, "grad_norm": 8.75, "learning_rate": 5.147516136900957e-06, "loss": 1.02982197, "memory(GiB)": 141.16, "step": 92280, "train_speed(iter/s)": 0.289229 }, { "acc": 0.73861933, "epoch": 1.0323706355986113, "grad_norm": 6.625, "learning_rate": 5.145667511111998e-06, "loss": 1.04256229, "memory(GiB)": 141.16, "step": 92300, "train_speed(iter/s)": 0.289251 }, { "acc": 0.72938037, "epoch": 1.0325943345445698, "grad_norm": 8.0625, "learning_rate": 5.1438188653936415e-06, "loss": 1.08455887, "memory(GiB)": 141.16, "step": 92320, "train_speed(iter/s)": 0.289272 }, { "acc": 0.74162812, "epoch": 1.0328180334905284, "grad_norm": 5.5625, "learning_rate": 5.141970199998808e-06, "loss": 1.04627523, "memory(GiB)": 141.16, "step": 92340, "train_speed(iter/s)": 0.289292 }, { "acc": 0.73815689, "epoch": 1.033041732436487, "grad_norm": 7.5, "learning_rate": 5.140121515180424e-06, "loss": 1.04624157, "memory(GiB)": 141.16, "step": 92360, "train_speed(iter/s)": 0.289314 }, { "acc": 0.73682232, "epoch": 1.0332654313824454, "grad_norm": 7.03125, "learning_rate": 5.138272811191413e-06, "loss": 1.05196953, "memory(GiB)": 141.16, "step": 92380, "train_speed(iter/s)": 0.289337 }, { "acc": 0.73422461, "epoch": 1.033489130328404, "grad_norm": 6.125, "learning_rate": 5.136424088284704e-06, "loss": 1.07446365, "memory(GiB)": 141.16, "step": 92400, "train_speed(iter/s)": 0.289359 }, { "acc": 0.73938856, "epoch": 1.0337128292743625, "grad_norm": 7.84375, "learning_rate": 5.13457534671323e-06, "loss": 1.04537163, "memory(GiB)": 141.16, "step": 92420, "train_speed(iter/s)": 0.289377 }, { "acc": 0.73528047, "epoch": 1.033936528220321, "grad_norm": 5.53125, "learning_rate": 5.132726586729926e-06, "loss": 1.06438332, "memory(GiB)": 141.16, "step": 92440, "train_speed(iter/s)": 0.289398 }, { "acc": 0.74913239, "epoch": 1.0341602271662795, "grad_norm": 6.15625, "learning_rate": 5.130877808587728e-06, "loss": 0.98969421, "memory(GiB)": 141.16, "step": 92460, "train_speed(iter/s)": 0.289417 }, { "acc": 0.73873148, "epoch": 1.034383926112238, "grad_norm": 7.78125, "learning_rate": 5.129029012539574e-06, "loss": 1.04007444, "memory(GiB)": 141.16, "step": 92480, "train_speed(iter/s)": 0.289439 }, { "acc": 0.72444868, "epoch": 1.0346076250581966, "grad_norm": 6.625, "learning_rate": 5.127180198838407e-06, "loss": 1.115242, "memory(GiB)": 141.16, "step": 92500, "train_speed(iter/s)": 0.289461 }, { "acc": 0.73692026, "epoch": 1.0348313240041551, "grad_norm": 6.75, "learning_rate": 5.125331367737171e-06, "loss": 1.05778599, "memory(GiB)": 141.16, "step": 92520, "train_speed(iter/s)": 0.289482 }, { "acc": 0.72830372, "epoch": 1.0350550229501136, "grad_norm": 7.28125, "learning_rate": 5.1234825194888125e-06, "loss": 1.08833542, "memory(GiB)": 141.16, "step": 92540, "train_speed(iter/s)": 0.289502 }, { "acc": 0.73515677, "epoch": 1.0352787218960722, "grad_norm": 7.28125, "learning_rate": 5.121633654346282e-06, "loss": 1.07792444, "memory(GiB)": 141.16, "step": 92560, "train_speed(iter/s)": 0.289522 }, { "acc": 0.73292084, "epoch": 1.0355024208420307, "grad_norm": 7.4375, "learning_rate": 5.119784772562527e-06, "loss": 1.06400318, "memory(GiB)": 141.16, "step": 92580, "train_speed(iter/s)": 0.289542 }, { "acc": 0.73221035, "epoch": 1.0357261197879892, "grad_norm": 7.5, "learning_rate": 5.117935874390503e-06, "loss": 1.07151928, "memory(GiB)": 141.16, "step": 92600, "train_speed(iter/s)": 0.289564 }, { "acc": 0.74766464, "epoch": 1.0359498187339478, "grad_norm": 7.5625, "learning_rate": 5.116086960083168e-06, "loss": 1.00177803, "memory(GiB)": 141.16, "step": 92620, "train_speed(iter/s)": 0.289585 }, { "acc": 0.72692547, "epoch": 1.0361735176799063, "grad_norm": 7.3125, "learning_rate": 5.114238029893475e-06, "loss": 1.11251335, "memory(GiB)": 141.16, "step": 92640, "train_speed(iter/s)": 0.289606 }, { "acc": 0.74037371, "epoch": 1.0363972166258648, "grad_norm": 9.75, "learning_rate": 5.1123890840743875e-06, "loss": 1.04605579, "memory(GiB)": 141.16, "step": 92660, "train_speed(iter/s)": 0.289624 }, { "acc": 0.73632507, "epoch": 1.0366209155718233, "grad_norm": 6.46875, "learning_rate": 5.110540122878868e-06, "loss": 1.06558714, "memory(GiB)": 141.16, "step": 92680, "train_speed(iter/s)": 0.289643 }, { "acc": 0.7159905, "epoch": 1.0368446145177819, "grad_norm": 5.3125, "learning_rate": 5.108691146559878e-06, "loss": 1.15013542, "memory(GiB)": 141.16, "step": 92700, "train_speed(iter/s)": 0.289663 }, { "acc": 0.7365808, "epoch": 1.0370683134637404, "grad_norm": 7.28125, "learning_rate": 5.106842155370386e-06, "loss": 1.03763847, "memory(GiB)": 141.16, "step": 92720, "train_speed(iter/s)": 0.289683 }, { "acc": 0.7556869, "epoch": 1.037292012409699, "grad_norm": 6.09375, "learning_rate": 5.10499314956336e-06, "loss": 0.97644806, "memory(GiB)": 141.16, "step": 92740, "train_speed(iter/s)": 0.289705 }, { "acc": 0.73787508, "epoch": 1.0375157113556575, "grad_norm": 6.15625, "learning_rate": 5.10314412939177e-06, "loss": 1.03911514, "memory(GiB)": 141.16, "step": 92760, "train_speed(iter/s)": 0.289727 }, { "acc": 0.73792877, "epoch": 1.037739410301616, "grad_norm": 7.625, "learning_rate": 5.101295095108592e-06, "loss": 1.04777851, "memory(GiB)": 141.16, "step": 92780, "train_speed(iter/s)": 0.289751 }, { "acc": 0.7380981, "epoch": 1.0379631092475745, "grad_norm": 6.125, "learning_rate": 5.099446046966794e-06, "loss": 1.05970821, "memory(GiB)": 141.16, "step": 92800, "train_speed(iter/s)": 0.289769 }, { "acc": 0.74219027, "epoch": 1.038186808193533, "grad_norm": 8.25, "learning_rate": 5.097596985219355e-06, "loss": 1.03924885, "memory(GiB)": 141.16, "step": 92820, "train_speed(iter/s)": 0.289787 }, { "acc": 0.73353987, "epoch": 1.0384105071394916, "grad_norm": 7.25, "learning_rate": 5.095747910119255e-06, "loss": 1.07533407, "memory(GiB)": 141.16, "step": 92840, "train_speed(iter/s)": 0.289807 }, { "acc": 0.7301899, "epoch": 1.03863420608545, "grad_norm": 6.4375, "learning_rate": 5.0938988219194715e-06, "loss": 1.08763275, "memory(GiB)": 141.16, "step": 92860, "train_speed(iter/s)": 0.289828 }, { "acc": 0.72646213, "epoch": 1.0388579050314086, "grad_norm": 6.46875, "learning_rate": 5.092049720872988e-06, "loss": 1.09894447, "memory(GiB)": 141.16, "step": 92880, "train_speed(iter/s)": 0.28985 }, { "acc": 0.73465352, "epoch": 1.0390816039773672, "grad_norm": 6.71875, "learning_rate": 5.090200607232787e-06, "loss": 1.05470581, "memory(GiB)": 141.16, "step": 92900, "train_speed(iter/s)": 0.28987 }, { "acc": 0.73300533, "epoch": 1.0393053029233257, "grad_norm": 6.84375, "learning_rate": 5.088351481251852e-06, "loss": 1.09366341, "memory(GiB)": 141.16, "step": 92920, "train_speed(iter/s)": 0.28989 }, { "acc": 0.73233271, "epoch": 1.0395290018692842, "grad_norm": 6.4375, "learning_rate": 5.086502343183173e-06, "loss": 1.07892199, "memory(GiB)": 141.16, "step": 92940, "train_speed(iter/s)": 0.289913 }, { "acc": 0.72043567, "epoch": 1.0397527008152427, "grad_norm": 8.625, "learning_rate": 5.084653193279736e-06, "loss": 1.13144093, "memory(GiB)": 141.16, "step": 92960, "train_speed(iter/s)": 0.289935 }, { "acc": 0.72831087, "epoch": 1.0399763997612013, "grad_norm": 6.3125, "learning_rate": 5.082804031794534e-06, "loss": 1.09190502, "memory(GiB)": 141.16, "step": 92980, "train_speed(iter/s)": 0.289957 }, { "acc": 0.74133053, "epoch": 1.0402000987071598, "grad_norm": 7.25, "learning_rate": 5.0809548589805555e-06, "loss": 1.030721, "memory(GiB)": 141.16, "step": 93000, "train_speed(iter/s)": 0.289978 }, { "acc": 0.72044277, "epoch": 1.0404237976531183, "grad_norm": 6.90625, "learning_rate": 5.079105675090795e-06, "loss": 1.13375463, "memory(GiB)": 141.16, "step": 93020, "train_speed(iter/s)": 0.289997 }, { "acc": 0.74794607, "epoch": 1.0406474965990768, "grad_norm": 7.0, "learning_rate": 5.077256480378248e-06, "loss": 1.00828075, "memory(GiB)": 141.16, "step": 93040, "train_speed(iter/s)": 0.290019 }, { "acc": 0.73867302, "epoch": 1.0408711955450354, "grad_norm": 9.5625, "learning_rate": 5.0754072750959095e-06, "loss": 1.06334047, "memory(GiB)": 141.16, "step": 93060, "train_speed(iter/s)": 0.29004 }, { "acc": 0.74331398, "epoch": 1.0410948944909941, "grad_norm": 7.53125, "learning_rate": 5.073558059496779e-06, "loss": 1.03047943, "memory(GiB)": 141.16, "step": 93080, "train_speed(iter/s)": 0.290062 }, { "acc": 0.73664155, "epoch": 1.0413185934369527, "grad_norm": 6.84375, "learning_rate": 5.071708833833855e-06, "loss": 1.05485563, "memory(GiB)": 141.16, "step": 93100, "train_speed(iter/s)": 0.290083 }, { "acc": 0.7246973, "epoch": 1.0415422923829112, "grad_norm": 5.90625, "learning_rate": 5.069859598360136e-06, "loss": 1.10555916, "memory(GiB)": 141.16, "step": 93120, "train_speed(iter/s)": 0.290105 }, { "acc": 0.7332509, "epoch": 1.0417659913288697, "grad_norm": 7.96875, "learning_rate": 5.068010353328626e-06, "loss": 1.08105106, "memory(GiB)": 141.16, "step": 93140, "train_speed(iter/s)": 0.290127 }, { "acc": 0.73141022, "epoch": 1.0419896902748282, "grad_norm": 6.71875, "learning_rate": 5.066161098992327e-06, "loss": 1.09437809, "memory(GiB)": 141.16, "step": 93160, "train_speed(iter/s)": 0.290149 }, { "acc": 0.72760658, "epoch": 1.0422133892207868, "grad_norm": 7.625, "learning_rate": 5.064311835604245e-06, "loss": 1.10130997, "memory(GiB)": 141.16, "step": 93180, "train_speed(iter/s)": 0.290169 }, { "acc": 0.72194872, "epoch": 1.0424370881667453, "grad_norm": 6.21875, "learning_rate": 5.062462563417385e-06, "loss": 1.11948013, "memory(GiB)": 141.16, "step": 93200, "train_speed(iter/s)": 0.290188 }, { "acc": 0.73964128, "epoch": 1.0426607871127038, "grad_norm": 7.34375, "learning_rate": 5.060613282684754e-06, "loss": 1.02847576, "memory(GiB)": 141.16, "step": 93220, "train_speed(iter/s)": 0.290209 }, { "acc": 0.73470669, "epoch": 1.0428844860586624, "grad_norm": 7.5625, "learning_rate": 5.058763993659358e-06, "loss": 1.06262779, "memory(GiB)": 141.16, "step": 93240, "train_speed(iter/s)": 0.29023 }, { "acc": 0.74632716, "epoch": 1.0431081850046209, "grad_norm": 7.625, "learning_rate": 5.056914696594209e-06, "loss": 1.03495388, "memory(GiB)": 141.16, "step": 93260, "train_speed(iter/s)": 0.290249 }, { "acc": 0.72722311, "epoch": 1.0433318839505794, "grad_norm": 6.78125, "learning_rate": 5.055065391742314e-06, "loss": 1.08946323, "memory(GiB)": 141.16, "step": 93280, "train_speed(iter/s)": 0.29027 }, { "acc": 0.7269877, "epoch": 1.043555582896538, "grad_norm": 8.5, "learning_rate": 5.053216079356688e-06, "loss": 1.10027609, "memory(GiB)": 141.16, "step": 93300, "train_speed(iter/s)": 0.29029 }, { "acc": 0.73256731, "epoch": 1.0437792818424965, "grad_norm": 6.90625, "learning_rate": 5.051366759690342e-06, "loss": 1.06392651, "memory(GiB)": 141.16, "step": 93320, "train_speed(iter/s)": 0.29031 }, { "acc": 0.73082952, "epoch": 1.044002980788455, "grad_norm": 8.875, "learning_rate": 5.049517432996287e-06, "loss": 1.06963625, "memory(GiB)": 141.16, "step": 93340, "train_speed(iter/s)": 0.290332 }, { "acc": 0.74799051, "epoch": 1.0442266797344135, "grad_norm": 7.5, "learning_rate": 5.047668099527541e-06, "loss": 0.99838657, "memory(GiB)": 141.16, "step": 93360, "train_speed(iter/s)": 0.290355 }, { "acc": 0.74022169, "epoch": 1.044450378680372, "grad_norm": 6.6875, "learning_rate": 5.045818759537116e-06, "loss": 1.03360415, "memory(GiB)": 141.16, "step": 93380, "train_speed(iter/s)": 0.290379 }, { "acc": 0.74302745, "epoch": 1.0446740776263306, "grad_norm": 6.46875, "learning_rate": 5.043969413278033e-06, "loss": 1.03391209, "memory(GiB)": 141.16, "step": 93400, "train_speed(iter/s)": 0.290398 }, { "acc": 0.72652922, "epoch": 1.044897776572289, "grad_norm": 7.1875, "learning_rate": 5.042120061003304e-06, "loss": 1.10357389, "memory(GiB)": 141.16, "step": 93420, "train_speed(iter/s)": 0.290417 }, { "acc": 0.73280249, "epoch": 1.0451214755182476, "grad_norm": 8.5625, "learning_rate": 5.040270702965948e-06, "loss": 1.05199194, "memory(GiB)": 141.16, "step": 93440, "train_speed(iter/s)": 0.29044 }, { "acc": 0.72688265, "epoch": 1.0453451744642062, "grad_norm": 8.0625, "learning_rate": 5.038421339418985e-06, "loss": 1.11286545, "memory(GiB)": 141.16, "step": 93460, "train_speed(iter/s)": 0.290462 }, { "acc": 0.72278299, "epoch": 1.0455688734101647, "grad_norm": 7.28125, "learning_rate": 5.036571970615434e-06, "loss": 1.10326967, "memory(GiB)": 141.16, "step": 93480, "train_speed(iter/s)": 0.290483 }, { "acc": 0.74723167, "epoch": 1.0457925723561232, "grad_norm": 8.3125, "learning_rate": 5.034722596808314e-06, "loss": 1.00553493, "memory(GiB)": 141.16, "step": 93500, "train_speed(iter/s)": 0.290502 }, { "acc": 0.72960567, "epoch": 1.0460162713020817, "grad_norm": 6.34375, "learning_rate": 5.032873218250647e-06, "loss": 1.08426609, "memory(GiB)": 141.16, "step": 93520, "train_speed(iter/s)": 0.290521 }, { "acc": 0.73874307, "epoch": 1.0462399702480403, "grad_norm": 6.3125, "learning_rate": 5.031023835195454e-06, "loss": 1.04304352, "memory(GiB)": 141.16, "step": 93540, "train_speed(iter/s)": 0.290542 }, { "acc": 0.74479303, "epoch": 1.0464636691939988, "grad_norm": 9.0, "learning_rate": 5.0291744478957545e-06, "loss": 1.02033501, "memory(GiB)": 141.16, "step": 93560, "train_speed(iter/s)": 0.290562 }, { "acc": 0.73260612, "epoch": 1.0466873681399573, "grad_norm": 7.53125, "learning_rate": 5.027325056604575e-06, "loss": 1.08915415, "memory(GiB)": 141.16, "step": 93580, "train_speed(iter/s)": 0.290582 }, { "acc": 0.72685804, "epoch": 1.0469110670859159, "grad_norm": 6.875, "learning_rate": 5.025475661574938e-06, "loss": 1.07758169, "memory(GiB)": 141.16, "step": 93600, "train_speed(iter/s)": 0.290604 }, { "acc": 0.71994171, "epoch": 1.0471347660318744, "grad_norm": 8.125, "learning_rate": 5.023626263059866e-06, "loss": 1.12689571, "memory(GiB)": 141.16, "step": 93620, "train_speed(iter/s)": 0.290626 }, { "acc": 0.74251814, "epoch": 1.047358464977833, "grad_norm": 7.25, "learning_rate": 5.021776861312384e-06, "loss": 1.03028116, "memory(GiB)": 141.16, "step": 93640, "train_speed(iter/s)": 0.290647 }, { "acc": 0.72709312, "epoch": 1.0475821639237914, "grad_norm": 6.75, "learning_rate": 5.0199274565855146e-06, "loss": 1.09484768, "memory(GiB)": 141.16, "step": 93660, "train_speed(iter/s)": 0.290667 }, { "acc": 0.72622442, "epoch": 1.04780586286975, "grad_norm": 6.75, "learning_rate": 5.018078049132286e-06, "loss": 1.10854168, "memory(GiB)": 141.16, "step": 93680, "train_speed(iter/s)": 0.290686 }, { "acc": 0.74233961, "epoch": 1.0480295618157085, "grad_norm": 6.59375, "learning_rate": 5.01622863920572e-06, "loss": 1.02679625, "memory(GiB)": 141.16, "step": 93700, "train_speed(iter/s)": 0.290708 }, { "acc": 0.73144407, "epoch": 1.048253260761667, "grad_norm": 6.875, "learning_rate": 5.014379227058847e-06, "loss": 1.07393093, "memory(GiB)": 141.16, "step": 93720, "train_speed(iter/s)": 0.290726 }, { "acc": 0.73392916, "epoch": 1.0484769597076256, "grad_norm": 8.625, "learning_rate": 5.012529812944688e-06, "loss": 1.06366053, "memory(GiB)": 141.16, "step": 93740, "train_speed(iter/s)": 0.290746 }, { "acc": 0.73582602, "epoch": 1.048700658653584, "grad_norm": 8.0625, "learning_rate": 5.010680397116272e-06, "loss": 1.06291561, "memory(GiB)": 141.16, "step": 93760, "train_speed(iter/s)": 0.290769 }, { "acc": 0.7418427, "epoch": 1.0489243575995426, "grad_norm": 8.5, "learning_rate": 5.008830979826625e-06, "loss": 1.03066006, "memory(GiB)": 141.16, "step": 93780, "train_speed(iter/s)": 0.29079 }, { "acc": 0.72455053, "epoch": 1.0491480565455011, "grad_norm": 7.40625, "learning_rate": 5.006981561328774e-06, "loss": 1.11778183, "memory(GiB)": 141.16, "step": 93800, "train_speed(iter/s)": 0.290812 }, { "acc": 0.73727884, "epoch": 1.0493717554914597, "grad_norm": 6.96875, "learning_rate": 5.005132141875746e-06, "loss": 1.04747372, "memory(GiB)": 141.16, "step": 93820, "train_speed(iter/s)": 0.290834 }, { "acc": 0.73566928, "epoch": 1.0495954544374182, "grad_norm": 5.4375, "learning_rate": 5.003282721720568e-06, "loss": 1.06252995, "memory(GiB)": 141.16, "step": 93840, "train_speed(iter/s)": 0.290856 }, { "acc": 0.73318224, "epoch": 1.0498191533833767, "grad_norm": 7.28125, "learning_rate": 5.001433301116265e-06, "loss": 1.0793273, "memory(GiB)": 141.16, "step": 93860, "train_speed(iter/s)": 0.290879 }, { "acc": 0.72631683, "epoch": 1.0500428523293353, "grad_norm": 6.28125, "learning_rate": 4.9995838803158666e-06, "loss": 1.10019855, "memory(GiB)": 141.16, "step": 93880, "train_speed(iter/s)": 0.290903 }, { "acc": 0.73958421, "epoch": 1.0502665512752938, "grad_norm": 8.3125, "learning_rate": 4.9977344595724e-06, "loss": 1.03519039, "memory(GiB)": 141.16, "step": 93900, "train_speed(iter/s)": 0.290925 }, { "acc": 0.73623924, "epoch": 1.0504902502212523, "grad_norm": 6.3125, "learning_rate": 4.99588503913889e-06, "loss": 1.06048336, "memory(GiB)": 141.16, "step": 93920, "train_speed(iter/s)": 0.290946 }, { "acc": 0.74537015, "epoch": 1.0507139491672108, "grad_norm": 6.96875, "learning_rate": 4.9940356192683685e-06, "loss": 1.01522579, "memory(GiB)": 141.16, "step": 93940, "train_speed(iter/s)": 0.290966 }, { "acc": 0.73860168, "epoch": 1.0509376481131694, "grad_norm": 6.8125, "learning_rate": 4.992186200213857e-06, "loss": 1.02981396, "memory(GiB)": 141.16, "step": 93960, "train_speed(iter/s)": 0.290987 }, { "acc": 0.74109302, "epoch": 1.051161347059128, "grad_norm": 6.21875, "learning_rate": 4.990336782228386e-06, "loss": 1.04668217, "memory(GiB)": 141.16, "step": 93980, "train_speed(iter/s)": 0.291008 }, { "acc": 0.73860693, "epoch": 1.0513850460050864, "grad_norm": 7.1875, "learning_rate": 4.98848736556498e-06, "loss": 1.04486237, "memory(GiB)": 141.16, "step": 94000, "train_speed(iter/s)": 0.291029 }, { "epoch": 1.0513850460050864, "eval_acc": 0.6901336984440792, "eval_loss": 1.0792105197906494, "eval_runtime": 2321.8306, "eval_samples_per_second": 32.424, "eval_steps_per_second": 16.212, "step": 94000 }, { "acc": 0.7347579, "epoch": 1.051608744951045, "grad_norm": 6.71875, "learning_rate": 4.9866379504766674e-06, "loss": 1.06528778, "memory(GiB)": 141.16, "step": 94020, "train_speed(iter/s)": 0.288927 }, { "acc": 0.73199854, "epoch": 1.0518324438970035, "grad_norm": 7.65625, "learning_rate": 4.9847885372164766e-06, "loss": 1.0800333, "memory(GiB)": 141.16, "step": 94040, "train_speed(iter/s)": 0.288948 }, { "acc": 0.73333817, "epoch": 1.052056142842962, "grad_norm": 8.125, "learning_rate": 4.982939126037429e-06, "loss": 1.057833, "memory(GiB)": 141.16, "step": 94060, "train_speed(iter/s)": 0.28897 }, { "acc": 0.72624846, "epoch": 1.0522798417889205, "grad_norm": 6.9375, "learning_rate": 4.981089717192553e-06, "loss": 1.08053312, "memory(GiB)": 141.16, "step": 94080, "train_speed(iter/s)": 0.288992 }, { "acc": 0.73886862, "epoch": 1.052503540734879, "grad_norm": 7.6875, "learning_rate": 4.979240310934873e-06, "loss": 1.05677719, "memory(GiB)": 141.16, "step": 94100, "train_speed(iter/s)": 0.289015 }, { "acc": 0.74386587, "epoch": 1.0527272396808376, "grad_norm": 6.78125, "learning_rate": 4.977390907517416e-06, "loss": 1.00696077, "memory(GiB)": 141.16, "step": 94120, "train_speed(iter/s)": 0.289036 }, { "acc": 0.73487782, "epoch": 1.0529509386267961, "grad_norm": 8.625, "learning_rate": 4.975541507193208e-06, "loss": 1.05515127, "memory(GiB)": 141.16, "step": 94140, "train_speed(iter/s)": 0.289055 }, { "acc": 0.74003305, "epoch": 1.0531746375727546, "grad_norm": 7.78125, "learning_rate": 4.97369211021527e-06, "loss": 1.04025497, "memory(GiB)": 141.16, "step": 94160, "train_speed(iter/s)": 0.289078 }, { "acc": 0.74379864, "epoch": 1.0533983365187132, "grad_norm": 7.0, "learning_rate": 4.971842716836627e-06, "loss": 1.02477646, "memory(GiB)": 141.16, "step": 94180, "train_speed(iter/s)": 0.289099 }, { "acc": 0.73578362, "epoch": 1.0536220354646717, "grad_norm": 10.375, "learning_rate": 4.969993327310303e-06, "loss": 1.06446495, "memory(GiB)": 141.16, "step": 94200, "train_speed(iter/s)": 0.289122 }, { "acc": 0.73732176, "epoch": 1.0538457344106302, "grad_norm": 6.03125, "learning_rate": 4.968143941889319e-06, "loss": 1.0616025, "memory(GiB)": 141.16, "step": 94220, "train_speed(iter/s)": 0.289145 }, { "acc": 0.72647619, "epoch": 1.0540694333565888, "grad_norm": 7.40625, "learning_rate": 4.966294560826702e-06, "loss": 1.09650593, "memory(GiB)": 141.16, "step": 94240, "train_speed(iter/s)": 0.289166 }, { "acc": 0.74820738, "epoch": 1.0542931323025473, "grad_norm": 7.75, "learning_rate": 4.96444518437547e-06, "loss": 1.00236912, "memory(GiB)": 141.16, "step": 94260, "train_speed(iter/s)": 0.289188 }, { "acc": 0.74456487, "epoch": 1.0545168312485058, "grad_norm": 7.0, "learning_rate": 4.962595812788645e-06, "loss": 1.02608767, "memory(GiB)": 141.16, "step": 94280, "train_speed(iter/s)": 0.28921 }, { "acc": 0.73186097, "epoch": 1.0547405301944643, "grad_norm": 6.4375, "learning_rate": 4.960746446319246e-06, "loss": 1.07022572, "memory(GiB)": 141.16, "step": 94300, "train_speed(iter/s)": 0.289231 }, { "acc": 0.73679333, "epoch": 1.0549642291404229, "grad_norm": 7.0625, "learning_rate": 4.958897085220295e-06, "loss": 1.05593395, "memory(GiB)": 141.16, "step": 94320, "train_speed(iter/s)": 0.289249 }, { "acc": 0.73617544, "epoch": 1.0551879280863814, "grad_norm": 8.4375, "learning_rate": 4.957047729744811e-06, "loss": 1.05846996, "memory(GiB)": 141.16, "step": 94340, "train_speed(iter/s)": 0.289271 }, { "acc": 0.73628211, "epoch": 1.05541162703234, "grad_norm": 7.1875, "learning_rate": 4.955198380145811e-06, "loss": 1.06536484, "memory(GiB)": 141.16, "step": 94360, "train_speed(iter/s)": 0.289292 }, { "acc": 0.73812151, "epoch": 1.0556353259782985, "grad_norm": 8.6875, "learning_rate": 4.953349036676313e-06, "loss": 1.06187267, "memory(GiB)": 141.16, "step": 94380, "train_speed(iter/s)": 0.289311 }, { "acc": 0.73821087, "epoch": 1.055859024924257, "grad_norm": 9.6875, "learning_rate": 4.951499699589333e-06, "loss": 1.05629854, "memory(GiB)": 141.16, "step": 94400, "train_speed(iter/s)": 0.289331 }, { "acc": 0.74249387, "epoch": 1.0560827238702155, "grad_norm": 6.09375, "learning_rate": 4.949650369137888e-06, "loss": 1.01771612, "memory(GiB)": 141.16, "step": 94420, "train_speed(iter/s)": 0.289352 }, { "acc": 0.74238539, "epoch": 1.056306422816174, "grad_norm": 6.96875, "learning_rate": 4.947801045574993e-06, "loss": 1.0419795, "memory(GiB)": 141.16, "step": 94440, "train_speed(iter/s)": 0.289371 }, { "acc": 0.74153605, "epoch": 1.0565301217621326, "grad_norm": 7.25, "learning_rate": 4.945951729153659e-06, "loss": 1.02602816, "memory(GiB)": 141.16, "step": 94460, "train_speed(iter/s)": 0.289389 }, { "acc": 0.7349968, "epoch": 1.056753820708091, "grad_norm": 6.09375, "learning_rate": 4.944102420126902e-06, "loss": 1.05885763, "memory(GiB)": 141.16, "step": 94480, "train_speed(iter/s)": 0.28941 }, { "acc": 0.74907885, "epoch": 1.0569775196540496, "grad_norm": 6.03125, "learning_rate": 4.942253118747733e-06, "loss": 1.01227703, "memory(GiB)": 141.16, "step": 94500, "train_speed(iter/s)": 0.289431 }, { "acc": 0.74327202, "epoch": 1.0572012186000082, "grad_norm": 7.71875, "learning_rate": 4.9404038252691625e-06, "loss": 1.01549244, "memory(GiB)": 141.16, "step": 94520, "train_speed(iter/s)": 0.289452 }, { "acc": 0.73275347, "epoch": 1.0574249175459667, "grad_norm": 6.46875, "learning_rate": 4.938554539944201e-06, "loss": 1.07192106, "memory(GiB)": 141.16, "step": 94540, "train_speed(iter/s)": 0.289473 }, { "acc": 0.73456316, "epoch": 1.0576486164919252, "grad_norm": 6.3125, "learning_rate": 4.936705263025856e-06, "loss": 1.06583595, "memory(GiB)": 141.16, "step": 94560, "train_speed(iter/s)": 0.289494 }, { "acc": 0.73021712, "epoch": 1.0578723154378837, "grad_norm": 6.8125, "learning_rate": 4.934855994767136e-06, "loss": 1.09601707, "memory(GiB)": 141.16, "step": 94580, "train_speed(iter/s)": 0.289514 }, { "acc": 0.74465752, "epoch": 1.0580960143838423, "grad_norm": 6.6875, "learning_rate": 4.933006735421047e-06, "loss": 1.00437584, "memory(GiB)": 141.16, "step": 94600, "train_speed(iter/s)": 0.289534 }, { "acc": 0.73191633, "epoch": 1.0583197133298008, "grad_norm": 6.4375, "learning_rate": 4.931157485240594e-06, "loss": 1.08571987, "memory(GiB)": 141.16, "step": 94620, "train_speed(iter/s)": 0.289554 }, { "acc": 0.73911858, "epoch": 1.0585434122757593, "grad_norm": 8.625, "learning_rate": 4.929308244478782e-06, "loss": 1.04917707, "memory(GiB)": 141.16, "step": 94640, "train_speed(iter/s)": 0.289575 }, { "acc": 0.73130469, "epoch": 1.0587671112217178, "grad_norm": 7.875, "learning_rate": 4.927459013388612e-06, "loss": 1.07903862, "memory(GiB)": 141.16, "step": 94660, "train_speed(iter/s)": 0.289593 }, { "acc": 0.73313484, "epoch": 1.0589908101676764, "grad_norm": 7.84375, "learning_rate": 4.925609792223088e-06, "loss": 1.07410507, "memory(GiB)": 141.16, "step": 94680, "train_speed(iter/s)": 0.289612 }, { "acc": 0.72670584, "epoch": 1.059214509113635, "grad_norm": 5.71875, "learning_rate": 4.923760581235204e-06, "loss": 1.11018372, "memory(GiB)": 141.16, "step": 94700, "train_speed(iter/s)": 0.289634 }, { "acc": 0.7353014, "epoch": 1.0594382080595934, "grad_norm": 8.1875, "learning_rate": 4.921911380677964e-06, "loss": 1.05629864, "memory(GiB)": 141.16, "step": 94720, "train_speed(iter/s)": 0.289655 }, { "acc": 0.74129887, "epoch": 1.059661907005552, "grad_norm": 7.90625, "learning_rate": 4.920062190804363e-06, "loss": 1.03094482, "memory(GiB)": 141.16, "step": 94740, "train_speed(iter/s)": 0.289675 }, { "acc": 0.72808585, "epoch": 1.0598856059515105, "grad_norm": 7.0, "learning_rate": 4.918213011867396e-06, "loss": 1.10081425, "memory(GiB)": 141.16, "step": 94760, "train_speed(iter/s)": 0.289694 }, { "acc": 0.72900953, "epoch": 1.060109304897469, "grad_norm": 7.90625, "learning_rate": 4.91636384412006e-06, "loss": 1.08312092, "memory(GiB)": 141.16, "step": 94780, "train_speed(iter/s)": 0.289715 }, { "acc": 0.72805452, "epoch": 1.0603330038434275, "grad_norm": 7.0625, "learning_rate": 4.9145146878153435e-06, "loss": 1.0868577, "memory(GiB)": 141.16, "step": 94800, "train_speed(iter/s)": 0.289734 }, { "acc": 0.72813091, "epoch": 1.060556702789386, "grad_norm": 8.8125, "learning_rate": 4.91266554320624e-06, "loss": 1.08679199, "memory(GiB)": 141.16, "step": 94820, "train_speed(iter/s)": 0.289755 }, { "acc": 0.72630453, "epoch": 1.0607804017353446, "grad_norm": 6.9375, "learning_rate": 4.910816410545739e-06, "loss": 1.092906, "memory(GiB)": 141.16, "step": 94840, "train_speed(iter/s)": 0.289778 }, { "acc": 0.72458286, "epoch": 1.0610041006813031, "grad_norm": 8.4375, "learning_rate": 4.908967290086827e-06, "loss": 1.10345345, "memory(GiB)": 141.16, "step": 94860, "train_speed(iter/s)": 0.289799 }, { "acc": 0.74136701, "epoch": 1.0612277996272617, "grad_norm": 5.9375, "learning_rate": 4.907118182082493e-06, "loss": 1.04235382, "memory(GiB)": 141.16, "step": 94880, "train_speed(iter/s)": 0.289818 }, { "acc": 0.73756275, "epoch": 1.0614514985732202, "grad_norm": 6.15625, "learning_rate": 4.905269086785717e-06, "loss": 1.0650918, "memory(GiB)": 141.16, "step": 94900, "train_speed(iter/s)": 0.289839 }, { "acc": 0.7454875, "epoch": 1.0616751975191787, "grad_norm": 6.53125, "learning_rate": 4.9034200044494845e-06, "loss": 1.01411495, "memory(GiB)": 141.16, "step": 94920, "train_speed(iter/s)": 0.289858 }, { "acc": 0.72534142, "epoch": 1.0618988964651372, "grad_norm": 7.59375, "learning_rate": 4.901570935326776e-06, "loss": 1.10685158, "memory(GiB)": 141.16, "step": 94940, "train_speed(iter/s)": 0.289878 }, { "acc": 0.73560023, "epoch": 1.0621225954110958, "grad_norm": 7.5625, "learning_rate": 4.899721879670571e-06, "loss": 1.05738029, "memory(GiB)": 141.16, "step": 94960, "train_speed(iter/s)": 0.289899 }, { "acc": 0.73834567, "epoch": 1.0623462943570543, "grad_norm": 6.53125, "learning_rate": 4.897872837733845e-06, "loss": 1.03538876, "memory(GiB)": 141.16, "step": 94980, "train_speed(iter/s)": 0.289919 }, { "acc": 0.72702594, "epoch": 1.0625699933030128, "grad_norm": 6.84375, "learning_rate": 4.896023809769576e-06, "loss": 1.09383774, "memory(GiB)": 141.16, "step": 95000, "train_speed(iter/s)": 0.289937 }, { "acc": 0.72582827, "epoch": 1.0627936922489714, "grad_norm": 5.78125, "learning_rate": 4.894174796030735e-06, "loss": 1.11187191, "memory(GiB)": 141.16, "step": 95020, "train_speed(iter/s)": 0.289957 }, { "acc": 0.74113588, "epoch": 1.0630173911949299, "grad_norm": 6.5625, "learning_rate": 4.892325796770294e-06, "loss": 1.04052172, "memory(GiB)": 141.16, "step": 95040, "train_speed(iter/s)": 0.289979 }, { "acc": 0.7405951, "epoch": 1.0632410901408884, "grad_norm": 9.25, "learning_rate": 4.890476812241223e-06, "loss": 1.03888674, "memory(GiB)": 141.16, "step": 95060, "train_speed(iter/s)": 0.289998 }, { "acc": 0.73579984, "epoch": 1.063464789086847, "grad_norm": 7.90625, "learning_rate": 4.8886278426964916e-06, "loss": 1.05854988, "memory(GiB)": 141.16, "step": 95080, "train_speed(iter/s)": 0.290015 }, { "acc": 0.73883648, "epoch": 1.0636884880328055, "grad_norm": 5.28125, "learning_rate": 4.886778888389061e-06, "loss": 1.04605169, "memory(GiB)": 141.16, "step": 95100, "train_speed(iter/s)": 0.290034 }, { "acc": 0.73369613, "epoch": 1.063912186978764, "grad_norm": 7.375, "learning_rate": 4.884929949571898e-06, "loss": 1.07077713, "memory(GiB)": 141.16, "step": 95120, "train_speed(iter/s)": 0.290054 }, { "acc": 0.72969065, "epoch": 1.0641358859247225, "grad_norm": 8.5, "learning_rate": 4.883081026497962e-06, "loss": 1.10425491, "memory(GiB)": 141.16, "step": 95140, "train_speed(iter/s)": 0.290075 }, { "acc": 0.73049512, "epoch": 1.064359584870681, "grad_norm": 8.0625, "learning_rate": 4.881232119420212e-06, "loss": 1.09680462, "memory(GiB)": 141.16, "step": 95160, "train_speed(iter/s)": 0.290096 }, { "acc": 0.72949533, "epoch": 1.0645832838166396, "grad_norm": 7.75, "learning_rate": 4.879383228591608e-06, "loss": 1.09671268, "memory(GiB)": 141.16, "step": 95180, "train_speed(iter/s)": 0.290116 }, { "acc": 0.72563457, "epoch": 1.064806982762598, "grad_norm": 6.78125, "learning_rate": 4.8775343542651e-06, "loss": 1.106394, "memory(GiB)": 141.16, "step": 95200, "train_speed(iter/s)": 0.290136 }, { "acc": 0.71521397, "epoch": 1.0650306817085566, "grad_norm": 6.96875, "learning_rate": 4.875685496693643e-06, "loss": 1.1675312, "memory(GiB)": 141.16, "step": 95220, "train_speed(iter/s)": 0.290157 }, { "acc": 0.72692194, "epoch": 1.0652543806545152, "grad_norm": 7.125, "learning_rate": 4.873836656130188e-06, "loss": 1.09709978, "memory(GiB)": 141.16, "step": 95240, "train_speed(iter/s)": 0.290177 }, { "acc": 0.73949633, "epoch": 1.0654780796004737, "grad_norm": 6.6875, "learning_rate": 4.871987832827681e-06, "loss": 1.05167665, "memory(GiB)": 141.16, "step": 95260, "train_speed(iter/s)": 0.290197 }, { "acc": 0.73283482, "epoch": 1.0657017785464322, "grad_norm": 7.84375, "learning_rate": 4.87013902703907e-06, "loss": 1.06699953, "memory(GiB)": 141.16, "step": 95280, "train_speed(iter/s)": 0.290215 }, { "acc": 0.74010658, "epoch": 1.0659254774923907, "grad_norm": 6.4375, "learning_rate": 4.868290239017293e-06, "loss": 1.0404253, "memory(GiB)": 141.16, "step": 95300, "train_speed(iter/s)": 0.290236 }, { "acc": 0.74395733, "epoch": 1.0661491764383493, "grad_norm": 6.40625, "learning_rate": 4.866441469015296e-06, "loss": 1.02511091, "memory(GiB)": 141.16, "step": 95320, "train_speed(iter/s)": 0.290256 }, { "acc": 0.73581996, "epoch": 1.0663728753843078, "grad_norm": 8.375, "learning_rate": 4.864592717286015e-06, "loss": 1.06059494, "memory(GiB)": 141.16, "step": 95340, "train_speed(iter/s)": 0.290277 }, { "acc": 0.72711325, "epoch": 1.0665965743302663, "grad_norm": 7.15625, "learning_rate": 4.8627439840823845e-06, "loss": 1.10727158, "memory(GiB)": 141.16, "step": 95360, "train_speed(iter/s)": 0.290296 }, { "acc": 0.72967029, "epoch": 1.0668202732762249, "grad_norm": 8.0, "learning_rate": 4.860895269657341e-06, "loss": 1.09316883, "memory(GiB)": 141.16, "step": 95380, "train_speed(iter/s)": 0.290316 }, { "acc": 0.74360533, "epoch": 1.0670439722221834, "grad_norm": 7.8125, "learning_rate": 4.859046574263811e-06, "loss": 1.0145936, "memory(GiB)": 141.16, "step": 95400, "train_speed(iter/s)": 0.290338 }, { "acc": 0.73724327, "epoch": 1.067267671168142, "grad_norm": 8.5, "learning_rate": 4.857197898154725e-06, "loss": 1.03815784, "memory(GiB)": 141.16, "step": 95420, "train_speed(iter/s)": 0.290359 }, { "acc": 0.71961393, "epoch": 1.0674913701141004, "grad_norm": 6.8125, "learning_rate": 4.855349241583007e-06, "loss": 1.1401619, "memory(GiB)": 141.16, "step": 95440, "train_speed(iter/s)": 0.29038 }, { "acc": 0.72827826, "epoch": 1.067715069060059, "grad_norm": 6.5625, "learning_rate": 4.853500604801581e-06, "loss": 1.10021191, "memory(GiB)": 141.16, "step": 95460, "train_speed(iter/s)": 0.290399 }, { "acc": 0.73747816, "epoch": 1.0679387680060175, "grad_norm": 7.0625, "learning_rate": 4.851651988063367e-06, "loss": 1.07082062, "memory(GiB)": 141.16, "step": 95480, "train_speed(iter/s)": 0.290417 }, { "acc": 0.7351553, "epoch": 1.068162466951976, "grad_norm": 5.90625, "learning_rate": 4.849803391621279e-06, "loss": 1.06894999, "memory(GiB)": 141.16, "step": 95500, "train_speed(iter/s)": 0.290436 }, { "acc": 0.7309269, "epoch": 1.0683861658979346, "grad_norm": 6.03125, "learning_rate": 4.847954815728236e-06, "loss": 1.08150539, "memory(GiB)": 141.16, "step": 95520, "train_speed(iter/s)": 0.290456 }, { "acc": 0.72484674, "epoch": 1.068609864843893, "grad_norm": 6.96875, "learning_rate": 4.846106260637146e-06, "loss": 1.10203152, "memory(GiB)": 141.16, "step": 95540, "train_speed(iter/s)": 0.290479 }, { "acc": 0.74282656, "epoch": 1.0688335637898516, "grad_norm": 8.3125, "learning_rate": 4.84425772660092e-06, "loss": 1.03231907, "memory(GiB)": 141.16, "step": 95560, "train_speed(iter/s)": 0.290502 }, { "acc": 0.7379117, "epoch": 1.0690572627358101, "grad_norm": 8.4375, "learning_rate": 4.842409213872464e-06, "loss": 1.06218224, "memory(GiB)": 141.16, "step": 95580, "train_speed(iter/s)": 0.290521 }, { "acc": 0.73102465, "epoch": 1.0692809616817687, "grad_norm": 7.875, "learning_rate": 4.840560722704678e-06, "loss": 1.0949131, "memory(GiB)": 141.16, "step": 95600, "train_speed(iter/s)": 0.290541 }, { "acc": 0.7195117, "epoch": 1.0695046606277272, "grad_norm": 5.53125, "learning_rate": 4.838712253350465e-06, "loss": 1.12202883, "memory(GiB)": 141.16, "step": 95620, "train_speed(iter/s)": 0.290561 }, { "acc": 0.73063631, "epoch": 1.0697283595736857, "grad_norm": 6.1875, "learning_rate": 4.836863806062721e-06, "loss": 1.09057484, "memory(GiB)": 141.16, "step": 95640, "train_speed(iter/s)": 0.290581 }, { "acc": 0.71851931, "epoch": 1.0699520585196443, "grad_norm": 7.90625, "learning_rate": 4.83501538109434e-06, "loss": 1.14884901, "memory(GiB)": 141.16, "step": 95660, "train_speed(iter/s)": 0.290601 }, { "acc": 0.74073281, "epoch": 1.0701757574656028, "grad_norm": 7.5, "learning_rate": 4.8331669786982135e-06, "loss": 1.02718334, "memory(GiB)": 141.16, "step": 95680, "train_speed(iter/s)": 0.29062 }, { "acc": 0.72489872, "epoch": 1.0703994564115613, "grad_norm": 6.375, "learning_rate": 4.831318599127229e-06, "loss": 1.11981859, "memory(GiB)": 141.16, "step": 95700, "train_speed(iter/s)": 0.290639 }, { "acc": 0.73405714, "epoch": 1.0706231553575198, "grad_norm": 4.28125, "learning_rate": 4.8294702426342705e-06, "loss": 1.0600421, "memory(GiB)": 141.16, "step": 95720, "train_speed(iter/s)": 0.290659 }, { "acc": 0.72531238, "epoch": 1.0708468543034784, "grad_norm": 6.65625, "learning_rate": 4.827621909472221e-06, "loss": 1.10341711, "memory(GiB)": 141.16, "step": 95740, "train_speed(iter/s)": 0.290679 }, { "acc": 0.72572441, "epoch": 1.071070553249437, "grad_norm": 5.71875, "learning_rate": 4.825773599893956e-06, "loss": 1.08521233, "memory(GiB)": 141.16, "step": 95760, "train_speed(iter/s)": 0.290701 }, { "acc": 0.72823114, "epoch": 1.0712942521953954, "grad_norm": 7.46875, "learning_rate": 4.8239253141523565e-06, "loss": 1.0890337, "memory(GiB)": 141.16, "step": 95780, "train_speed(iter/s)": 0.290721 }, { "acc": 0.72828975, "epoch": 1.071517951141354, "grad_norm": 7.46875, "learning_rate": 4.822077052500288e-06, "loss": 1.09047565, "memory(GiB)": 141.16, "step": 95800, "train_speed(iter/s)": 0.290745 }, { "acc": 0.72686634, "epoch": 1.0717416500873125, "grad_norm": 6.4375, "learning_rate": 4.820228815190622e-06, "loss": 1.09205713, "memory(GiB)": 141.16, "step": 95820, "train_speed(iter/s)": 0.290765 }, { "acc": 0.74603386, "epoch": 1.071965349033271, "grad_norm": 7.1875, "learning_rate": 4.818380602476224e-06, "loss": 1.01411533, "memory(GiB)": 141.16, "step": 95840, "train_speed(iter/s)": 0.290785 }, { "acc": 0.73677344, "epoch": 1.0721890479792295, "grad_norm": 7.90625, "learning_rate": 4.816532414609956e-06, "loss": 1.04857378, "memory(GiB)": 141.16, "step": 95860, "train_speed(iter/s)": 0.290808 }, { "acc": 0.72921276, "epoch": 1.072412746925188, "grad_norm": 8.8125, "learning_rate": 4.814684251844678e-06, "loss": 1.08533421, "memory(GiB)": 141.16, "step": 95880, "train_speed(iter/s)": 0.29083 }, { "acc": 0.73843665, "epoch": 1.0726364458711466, "grad_norm": 6.5, "learning_rate": 4.81283611443324e-06, "loss": 1.05251293, "memory(GiB)": 141.16, "step": 95900, "train_speed(iter/s)": 0.29085 }, { "acc": 0.7400136, "epoch": 1.0728601448171051, "grad_norm": 7.21875, "learning_rate": 4.810988002628497e-06, "loss": 1.04551182, "memory(GiB)": 141.16, "step": 95920, "train_speed(iter/s)": 0.290871 }, { "acc": 0.73458319, "epoch": 1.0730838437630636, "grad_norm": 6.375, "learning_rate": 4.809139916683298e-06, "loss": 1.06344719, "memory(GiB)": 141.16, "step": 95940, "train_speed(iter/s)": 0.290892 }, { "acc": 0.72981358, "epoch": 1.0733075427090222, "grad_norm": 7.0, "learning_rate": 4.807291856850485e-06, "loss": 1.07034616, "memory(GiB)": 141.16, "step": 95960, "train_speed(iter/s)": 0.290911 }, { "acc": 0.73733606, "epoch": 1.0735312416549807, "grad_norm": 6.96875, "learning_rate": 4.805443823382901e-06, "loss": 1.04659271, "memory(GiB)": 141.16, "step": 95980, "train_speed(iter/s)": 0.290929 }, { "acc": 0.73129568, "epoch": 1.0737549406009392, "grad_norm": 6.125, "learning_rate": 4.8035958165333835e-06, "loss": 1.08170624, "memory(GiB)": 141.16, "step": 96000, "train_speed(iter/s)": 0.290951 }, { "epoch": 1.0737549406009392, "eval_acc": 0.6901353252336957, "eval_loss": 1.079198956489563, "eval_runtime": 2317.9442, "eval_samples_per_second": 32.478, "eval_steps_per_second": 16.239, "step": 96000 }, { "acc": 0.73769512, "epoch": 1.0739786395468978, "grad_norm": 7.9375, "learning_rate": 4.801747836554765e-06, "loss": 1.05808487, "memory(GiB)": 141.16, "step": 96020, "train_speed(iter/s)": 0.288895 }, { "acc": 0.72344012, "epoch": 1.0742023384928563, "grad_norm": 6.65625, "learning_rate": 4.799899883699876e-06, "loss": 1.10629387, "memory(GiB)": 141.16, "step": 96040, "train_speed(iter/s)": 0.288916 }, { "acc": 0.73641267, "epoch": 1.0744260374388148, "grad_norm": 10.0625, "learning_rate": 4.798051958221544e-06, "loss": 1.07600975, "memory(GiB)": 141.16, "step": 96060, "train_speed(iter/s)": 0.288935 }, { "acc": 0.732722, "epoch": 1.0746497363847733, "grad_norm": 7.3125, "learning_rate": 4.796204060372589e-06, "loss": 1.09047604, "memory(GiB)": 141.16, "step": 96080, "train_speed(iter/s)": 0.288954 }, { "acc": 0.72929049, "epoch": 1.0748734353307319, "grad_norm": 6.78125, "learning_rate": 4.794356190405832e-06, "loss": 1.08756828, "memory(GiB)": 141.16, "step": 96100, "train_speed(iter/s)": 0.288976 }, { "acc": 0.73103957, "epoch": 1.0750971342766904, "grad_norm": 8.75, "learning_rate": 4.792508348574088e-06, "loss": 1.08799057, "memory(GiB)": 141.16, "step": 96120, "train_speed(iter/s)": 0.288996 }, { "acc": 0.73470917, "epoch": 1.075320833222649, "grad_norm": 7.875, "learning_rate": 4.790660535130168e-06, "loss": 1.06522102, "memory(GiB)": 141.16, "step": 96140, "train_speed(iter/s)": 0.28902 }, { "acc": 0.7356122, "epoch": 1.0755445321686075, "grad_norm": 7.09375, "learning_rate": 4.788812750326878e-06, "loss": 1.07754745, "memory(GiB)": 141.16, "step": 96160, "train_speed(iter/s)": 0.28904 }, { "acc": 0.73588753, "epoch": 1.075768231114566, "grad_norm": 7.03125, "learning_rate": 4.786964994417023e-06, "loss": 1.03623552, "memory(GiB)": 141.16, "step": 96180, "train_speed(iter/s)": 0.289058 }, { "acc": 0.73560905, "epoch": 1.0759919300605245, "grad_norm": 7.125, "learning_rate": 4.7851172676534006e-06, "loss": 1.06250257, "memory(GiB)": 141.16, "step": 96200, "train_speed(iter/s)": 0.289078 }, { "acc": 0.73421583, "epoch": 1.076215629006483, "grad_norm": 6.8125, "learning_rate": 4.7832695702888085e-06, "loss": 1.06882057, "memory(GiB)": 141.16, "step": 96220, "train_speed(iter/s)": 0.289097 }, { "acc": 0.73534422, "epoch": 1.0764393279524416, "grad_norm": 6.8125, "learning_rate": 4.781421902576037e-06, "loss": 1.0574831, "memory(GiB)": 141.16, "step": 96240, "train_speed(iter/s)": 0.289118 }, { "acc": 0.72817192, "epoch": 1.0766630268984, "grad_norm": 6.71875, "learning_rate": 4.779574264767873e-06, "loss": 1.09702797, "memory(GiB)": 141.16, "step": 96260, "train_speed(iter/s)": 0.289139 }, { "acc": 0.73042345, "epoch": 1.0768867258443586, "grad_norm": 7.53125, "learning_rate": 4.7777266571171e-06, "loss": 1.08912201, "memory(GiB)": 141.16, "step": 96280, "train_speed(iter/s)": 0.289157 }, { "acc": 0.72099066, "epoch": 1.0771104247903172, "grad_norm": 8.5, "learning_rate": 4.775879079876497e-06, "loss": 1.14377394, "memory(GiB)": 141.16, "step": 96300, "train_speed(iter/s)": 0.289176 }, { "acc": 0.72771454, "epoch": 1.0773341237362757, "grad_norm": 5.40625, "learning_rate": 4.77403153329884e-06, "loss": 1.08843069, "memory(GiB)": 141.16, "step": 96320, "train_speed(iter/s)": 0.289196 }, { "acc": 0.74325218, "epoch": 1.0775578226822342, "grad_norm": 7.40625, "learning_rate": 4.7721840176369e-06, "loss": 1.00715504, "memory(GiB)": 141.16, "step": 96340, "train_speed(iter/s)": 0.289217 }, { "acc": 0.73764343, "epoch": 1.0777815216281927, "grad_norm": 7.5, "learning_rate": 4.770336533143442e-06, "loss": 1.05990276, "memory(GiB)": 141.16, "step": 96360, "train_speed(iter/s)": 0.289239 }, { "acc": 0.72826366, "epoch": 1.0780052205741513, "grad_norm": 8.1875, "learning_rate": 4.768489080071227e-06, "loss": 1.09947796, "memory(GiB)": 141.16, "step": 96380, "train_speed(iter/s)": 0.289258 }, { "acc": 0.72943068, "epoch": 1.0782289195201098, "grad_norm": 8.1875, "learning_rate": 4.766641658673017e-06, "loss": 1.09504147, "memory(GiB)": 141.16, "step": 96400, "train_speed(iter/s)": 0.289278 }, { "acc": 0.74463382, "epoch": 1.0784526184660683, "grad_norm": 7.15625, "learning_rate": 4.7647942692015625e-06, "loss": 1.01463051, "memory(GiB)": 141.16, "step": 96420, "train_speed(iter/s)": 0.289298 }, { "acc": 0.73645306, "epoch": 1.0786763174120269, "grad_norm": 6.1875, "learning_rate": 4.762946911909615e-06, "loss": 1.07107086, "memory(GiB)": 141.16, "step": 96440, "train_speed(iter/s)": 0.289318 }, { "acc": 0.73916664, "epoch": 1.0789000163579854, "grad_norm": 7.9375, "learning_rate": 4.761099587049918e-06, "loss": 1.05599127, "memory(GiB)": 141.16, "step": 96460, "train_speed(iter/s)": 0.289336 }, { "acc": 0.72538157, "epoch": 1.079123715303944, "grad_norm": 7.5, "learning_rate": 4.7592522948752115e-06, "loss": 1.12213612, "memory(GiB)": 141.16, "step": 96480, "train_speed(iter/s)": 0.289358 }, { "acc": 0.74229145, "epoch": 1.0793474142499024, "grad_norm": 7.53125, "learning_rate": 4.757405035638232e-06, "loss": 1.04626207, "memory(GiB)": 141.16, "step": 96500, "train_speed(iter/s)": 0.289379 }, { "acc": 0.73588696, "epoch": 1.079571113195861, "grad_norm": 7.96875, "learning_rate": 4.755557809591711e-06, "loss": 1.08284302, "memory(GiB)": 141.16, "step": 96520, "train_speed(iter/s)": 0.289399 }, { "acc": 0.73322897, "epoch": 1.0797948121418195, "grad_norm": 6.125, "learning_rate": 4.753710616988377e-06, "loss": 1.07891178, "memory(GiB)": 141.16, "step": 96540, "train_speed(iter/s)": 0.28942 }, { "acc": 0.73092356, "epoch": 1.080018511087778, "grad_norm": 8.0625, "learning_rate": 4.751863458080949e-06, "loss": 1.08523464, "memory(GiB)": 141.16, "step": 96560, "train_speed(iter/s)": 0.289437 }, { "acc": 0.74419212, "epoch": 1.0802422100337365, "grad_norm": 8.1875, "learning_rate": 4.750016333122147e-06, "loss": 1.02590122, "memory(GiB)": 141.16, "step": 96580, "train_speed(iter/s)": 0.289459 }, { "acc": 0.73164034, "epoch": 1.080465908979695, "grad_norm": 7.15625, "learning_rate": 4.748169242364684e-06, "loss": 1.09119358, "memory(GiB)": 141.16, "step": 96600, "train_speed(iter/s)": 0.289478 }, { "acc": 0.74073968, "epoch": 1.0806896079256536, "grad_norm": 6.90625, "learning_rate": 4.746322186061269e-06, "loss": 1.03503265, "memory(GiB)": 141.16, "step": 96620, "train_speed(iter/s)": 0.289499 }, { "acc": 0.73697443, "epoch": 1.0809133068716121, "grad_norm": 6.40625, "learning_rate": 4.7444751644646045e-06, "loss": 1.04597626, "memory(GiB)": 141.16, "step": 96640, "train_speed(iter/s)": 0.28952 }, { "acc": 0.7414834, "epoch": 1.0811370058175707, "grad_norm": 7.90625, "learning_rate": 4.7426281778273896e-06, "loss": 1.03453617, "memory(GiB)": 141.16, "step": 96660, "train_speed(iter/s)": 0.289541 }, { "acc": 0.72915373, "epoch": 1.0813607047635292, "grad_norm": 6.21875, "learning_rate": 4.740781226402318e-06, "loss": 1.08830357, "memory(GiB)": 141.16, "step": 96680, "train_speed(iter/s)": 0.28956 }, { "acc": 0.74258518, "epoch": 1.0815844037094877, "grad_norm": 5.90625, "learning_rate": 4.73893431044208e-06, "loss": 1.03097477, "memory(GiB)": 141.16, "step": 96700, "train_speed(iter/s)": 0.28958 }, { "acc": 0.73185062, "epoch": 1.0818081026554462, "grad_norm": 6.84375, "learning_rate": 4.73708743019936e-06, "loss": 1.07472057, "memory(GiB)": 141.16, "step": 96720, "train_speed(iter/s)": 0.289602 }, { "acc": 0.74292526, "epoch": 1.0820318016014048, "grad_norm": 5.625, "learning_rate": 4.735240585926838e-06, "loss": 1.03081779, "memory(GiB)": 141.16, "step": 96740, "train_speed(iter/s)": 0.289621 }, { "acc": 0.73736815, "epoch": 1.0822555005473633, "grad_norm": 6.96875, "learning_rate": 4.733393777877187e-06, "loss": 1.04339066, "memory(GiB)": 141.16, "step": 96760, "train_speed(iter/s)": 0.289642 }, { "acc": 0.7405664, "epoch": 1.0824791994933218, "grad_norm": 6.96875, "learning_rate": 4.7315470063030785e-06, "loss": 1.04788971, "memory(GiB)": 141.16, "step": 96780, "train_speed(iter/s)": 0.289662 }, { "acc": 0.74240675, "epoch": 1.0827028984392804, "grad_norm": 8.625, "learning_rate": 4.729700271457176e-06, "loss": 1.03121519, "memory(GiB)": 141.16, "step": 96800, "train_speed(iter/s)": 0.289681 }, { "acc": 0.7381566, "epoch": 1.0829265973852389, "grad_norm": 7.65625, "learning_rate": 4.7278535735921405e-06, "loss": 1.04294891, "memory(GiB)": 141.16, "step": 96820, "train_speed(iter/s)": 0.2897 }, { "acc": 0.72268915, "epoch": 1.0831502963311974, "grad_norm": 6.0625, "learning_rate": 4.7260069129606275e-06, "loss": 1.11916027, "memory(GiB)": 141.16, "step": 96840, "train_speed(iter/s)": 0.289719 }, { "acc": 0.74335155, "epoch": 1.083373995277156, "grad_norm": 7.40625, "learning_rate": 4.724160289815283e-06, "loss": 1.05261192, "memory(GiB)": 141.16, "step": 96860, "train_speed(iter/s)": 0.289741 }, { "acc": 0.74042835, "epoch": 1.0835976942231145, "grad_norm": 6.875, "learning_rate": 4.722313704408754e-06, "loss": 1.03323126, "memory(GiB)": 141.16, "step": 96880, "train_speed(iter/s)": 0.289763 }, { "acc": 0.73462372, "epoch": 1.083821393169073, "grad_norm": 6.21875, "learning_rate": 4.720467156993679e-06, "loss": 1.0508213, "memory(GiB)": 141.16, "step": 96900, "train_speed(iter/s)": 0.289784 }, { "acc": 0.72208109, "epoch": 1.0840450921150315, "grad_norm": 5.9375, "learning_rate": 4.718620647822692e-06, "loss": 1.1302557, "memory(GiB)": 141.16, "step": 96920, "train_speed(iter/s)": 0.289805 }, { "acc": 0.74133768, "epoch": 1.08426879106099, "grad_norm": 7.8125, "learning_rate": 4.716774177148424e-06, "loss": 1.03678474, "memory(GiB)": 141.16, "step": 96940, "train_speed(iter/s)": 0.289826 }, { "acc": 0.7326992, "epoch": 1.0844924900069486, "grad_norm": 6.8125, "learning_rate": 4.714927745223495e-06, "loss": 1.06883707, "memory(GiB)": 141.16, "step": 96960, "train_speed(iter/s)": 0.289845 }, { "acc": 0.73726692, "epoch": 1.0847161889529071, "grad_norm": 6.65625, "learning_rate": 4.7130813523005255e-06, "loss": 1.05653582, "memory(GiB)": 141.16, "step": 96980, "train_speed(iter/s)": 0.289865 }, { "acc": 0.74539199, "epoch": 1.0849398878988656, "grad_norm": 5.90625, "learning_rate": 4.711234998632128e-06, "loss": 1.02528267, "memory(GiB)": 141.16, "step": 97000, "train_speed(iter/s)": 0.289884 }, { "acc": 0.73216219, "epoch": 1.0851635868448242, "grad_norm": 7.34375, "learning_rate": 4.709388684470911e-06, "loss": 1.0695282, "memory(GiB)": 141.16, "step": 97020, "train_speed(iter/s)": 0.289902 }, { "acc": 0.7278811, "epoch": 1.0853872857907827, "grad_norm": 5.3125, "learning_rate": 4.707542410069476e-06, "loss": 1.09464874, "memory(GiB)": 141.16, "step": 97040, "train_speed(iter/s)": 0.289923 }, { "acc": 0.73413773, "epoch": 1.0856109847367412, "grad_norm": 8.125, "learning_rate": 4.705696175680419e-06, "loss": 1.05228195, "memory(GiB)": 141.16, "step": 97060, "train_speed(iter/s)": 0.289941 }, { "acc": 0.74181356, "epoch": 1.0858346836826998, "grad_norm": 5.46875, "learning_rate": 4.703849981556332e-06, "loss": 1.0373888, "memory(GiB)": 141.16, "step": 97080, "train_speed(iter/s)": 0.289959 }, { "acc": 0.72899141, "epoch": 1.0860583826286583, "grad_norm": 7.59375, "learning_rate": 4.7020038279498e-06, "loss": 1.08377724, "memory(GiB)": 141.16, "step": 97100, "train_speed(iter/s)": 0.289979 }, { "acc": 0.72770691, "epoch": 1.0862820815746168, "grad_norm": 6.375, "learning_rate": 4.700157715113403e-06, "loss": 1.09508591, "memory(GiB)": 141.16, "step": 97120, "train_speed(iter/s)": 0.289999 }, { "acc": 0.72584906, "epoch": 1.0865057805205753, "grad_norm": 8.4375, "learning_rate": 4.698311643299717e-06, "loss": 1.09298668, "memory(GiB)": 141.16, "step": 97140, "train_speed(iter/s)": 0.290019 }, { "acc": 0.73262434, "epoch": 1.0867294794665339, "grad_norm": 7.28125, "learning_rate": 4.69646561276131e-06, "loss": 1.06947575, "memory(GiB)": 141.16, "step": 97160, "train_speed(iter/s)": 0.29004 }, { "acc": 0.7319746, "epoch": 1.0869531784124924, "grad_norm": 6.1875, "learning_rate": 4.694619623750746e-06, "loss": 1.08576717, "memory(GiB)": 141.16, "step": 97180, "train_speed(iter/s)": 0.290061 }, { "acc": 0.7338809, "epoch": 1.087176877358451, "grad_norm": 6.40625, "learning_rate": 4.692773676520582e-06, "loss": 1.06365204, "memory(GiB)": 141.16, "step": 97200, "train_speed(iter/s)": 0.290082 }, { "acc": 0.74239588, "epoch": 1.0874005763044094, "grad_norm": 5.9375, "learning_rate": 4.69092777132337e-06, "loss": 1.02806435, "memory(GiB)": 141.16, "step": 97220, "train_speed(iter/s)": 0.290102 }, { "acc": 0.74348898, "epoch": 1.087624275250368, "grad_norm": 6.0, "learning_rate": 4.689081908411658e-06, "loss": 1.02264576, "memory(GiB)": 141.16, "step": 97240, "train_speed(iter/s)": 0.290122 }, { "acc": 0.71650343, "epoch": 1.0878479741963265, "grad_norm": 7.6875, "learning_rate": 4.687236088037983e-06, "loss": 1.16585464, "memory(GiB)": 141.16, "step": 97260, "train_speed(iter/s)": 0.290141 }, { "acc": 0.75107985, "epoch": 1.088071673142285, "grad_norm": 7.125, "learning_rate": 4.685390310454884e-06, "loss": 0.98992891, "memory(GiB)": 141.16, "step": 97280, "train_speed(iter/s)": 0.290161 }, { "acc": 0.74238377, "epoch": 1.0882953720882436, "grad_norm": 6.96875, "learning_rate": 4.683544575914886e-06, "loss": 1.04411945, "memory(GiB)": 141.16, "step": 97300, "train_speed(iter/s)": 0.290181 }, { "acc": 0.74160662, "epoch": 1.088519071034202, "grad_norm": 6.5625, "learning_rate": 4.681698884670512e-06, "loss": 1.02519665, "memory(GiB)": 141.16, "step": 97320, "train_speed(iter/s)": 0.290202 }, { "acc": 0.7239388, "epoch": 1.0887427699801606, "grad_norm": 7.90625, "learning_rate": 4.679853236974281e-06, "loss": 1.1121664, "memory(GiB)": 141.16, "step": 97340, "train_speed(iter/s)": 0.290221 }, { "acc": 0.74251986, "epoch": 1.0889664689261191, "grad_norm": 7.4375, "learning_rate": 4.678007633078703e-06, "loss": 1.03570108, "memory(GiB)": 141.16, "step": 97360, "train_speed(iter/s)": 0.290239 }, { "acc": 0.73876638, "epoch": 1.0891901678720777, "grad_norm": 8.0625, "learning_rate": 4.676162073236285e-06, "loss": 1.05819931, "memory(GiB)": 141.16, "step": 97380, "train_speed(iter/s)": 0.29026 }, { "acc": 0.72762289, "epoch": 1.0894138668180362, "grad_norm": 7.6875, "learning_rate": 4.674316557699522e-06, "loss": 1.10176296, "memory(GiB)": 141.16, "step": 97400, "train_speed(iter/s)": 0.290283 }, { "acc": 0.73011808, "epoch": 1.0896375657639947, "grad_norm": 8.3125, "learning_rate": 4.67247108672091e-06, "loss": 1.08871975, "memory(GiB)": 141.16, "step": 97420, "train_speed(iter/s)": 0.290304 }, { "acc": 0.74366689, "epoch": 1.0898612647099533, "grad_norm": 7.375, "learning_rate": 4.670625660552934e-06, "loss": 1.01338797, "memory(GiB)": 141.16, "step": 97440, "train_speed(iter/s)": 0.290323 }, { "acc": 0.73363276, "epoch": 1.0900849636559118, "grad_norm": 6.9375, "learning_rate": 4.668780279448076e-06, "loss": 1.06864443, "memory(GiB)": 141.16, "step": 97460, "train_speed(iter/s)": 0.29034 }, { "acc": 0.73560629, "epoch": 1.0903086626018703, "grad_norm": 7.21875, "learning_rate": 4.666934943658811e-06, "loss": 1.07149429, "memory(GiB)": 141.16, "step": 97480, "train_speed(iter/s)": 0.290359 }, { "acc": 0.73218784, "epoch": 1.0905323615478288, "grad_norm": 10.0, "learning_rate": 4.665089653437604e-06, "loss": 1.06703663, "memory(GiB)": 141.16, "step": 97500, "train_speed(iter/s)": 0.290378 }, { "acc": 0.74238567, "epoch": 1.0907560604937874, "grad_norm": 6.9375, "learning_rate": 4.6632444090369215e-06, "loss": 1.03020945, "memory(GiB)": 141.16, "step": 97520, "train_speed(iter/s)": 0.290395 }, { "acc": 0.73071318, "epoch": 1.090979759439746, "grad_norm": 7.34375, "learning_rate": 4.661399210709215e-06, "loss": 1.09695034, "memory(GiB)": 141.16, "step": 97540, "train_speed(iter/s)": 0.290416 }, { "acc": 0.73323889, "epoch": 1.0912034583857044, "grad_norm": 7.75, "learning_rate": 4.659554058706937e-06, "loss": 1.08273125, "memory(GiB)": 141.16, "step": 97560, "train_speed(iter/s)": 0.290435 }, { "acc": 0.72075763, "epoch": 1.091427157331663, "grad_norm": 5.96875, "learning_rate": 4.657708953282532e-06, "loss": 1.11919336, "memory(GiB)": 141.16, "step": 97580, "train_speed(iter/s)": 0.290457 }, { "acc": 0.74056082, "epoch": 1.0916508562776215, "grad_norm": 6.5, "learning_rate": 4.655863894688433e-06, "loss": 1.05301523, "memory(GiB)": 141.16, "step": 97600, "train_speed(iter/s)": 0.290475 }, { "acc": 0.73081269, "epoch": 1.09187455522358, "grad_norm": 9.125, "learning_rate": 4.654018883177071e-06, "loss": 1.0974369, "memory(GiB)": 141.16, "step": 97620, "train_speed(iter/s)": 0.290496 }, { "acc": 0.73094797, "epoch": 1.0920982541695385, "grad_norm": 5.84375, "learning_rate": 4.6521739190008725e-06, "loss": 1.07677145, "memory(GiB)": 141.16, "step": 97640, "train_speed(iter/s)": 0.290517 }, { "acc": 0.72597313, "epoch": 1.092321953115497, "grad_norm": 8.3125, "learning_rate": 4.650329002412253e-06, "loss": 1.09312143, "memory(GiB)": 141.16, "step": 97660, "train_speed(iter/s)": 0.290536 }, { "acc": 0.7329608, "epoch": 1.0925456520614556, "grad_norm": 6.75, "learning_rate": 4.6484841336636245e-06, "loss": 1.07913694, "memory(GiB)": 141.16, "step": 97680, "train_speed(iter/s)": 0.290554 }, { "acc": 0.72411666, "epoch": 1.0927693510074141, "grad_norm": 6.65625, "learning_rate": 4.64663931300739e-06, "loss": 1.11433945, "memory(GiB)": 141.16, "step": 97700, "train_speed(iter/s)": 0.290574 }, { "acc": 0.74831276, "epoch": 1.0929930499533727, "grad_norm": 7.28125, "learning_rate": 4.644794540695949e-06, "loss": 1.00189285, "memory(GiB)": 141.16, "step": 97720, "train_speed(iter/s)": 0.290595 }, { "acc": 0.73586121, "epoch": 1.0932167488993312, "grad_norm": 6.75, "learning_rate": 4.64294981698169e-06, "loss": 1.06092548, "memory(GiB)": 141.16, "step": 97740, "train_speed(iter/s)": 0.290614 }, { "acc": 0.73337994, "epoch": 1.0934404478452897, "grad_norm": 8.4375, "learning_rate": 4.641105142117e-06, "loss": 1.07919178, "memory(GiB)": 141.16, "step": 97760, "train_speed(iter/s)": 0.290635 }, { "acc": 0.73829775, "epoch": 1.0936641467912482, "grad_norm": 8.125, "learning_rate": 4.639260516354259e-06, "loss": 1.03350115, "memory(GiB)": 141.16, "step": 97780, "train_speed(iter/s)": 0.290654 }, { "acc": 0.73027411, "epoch": 1.0938878457372068, "grad_norm": 6.53125, "learning_rate": 4.637415939945833e-06, "loss": 1.0960453, "memory(GiB)": 141.16, "step": 97800, "train_speed(iter/s)": 0.290672 }, { "acc": 0.73638229, "epoch": 1.0941115446831653, "grad_norm": 7.53125, "learning_rate": 4.63557141314409e-06, "loss": 1.0634613, "memory(GiB)": 141.16, "step": 97820, "train_speed(iter/s)": 0.290694 }, { "acc": 0.72150812, "epoch": 1.0943352436291238, "grad_norm": 5.65625, "learning_rate": 4.633726936201385e-06, "loss": 1.11953888, "memory(GiB)": 141.16, "step": 97840, "train_speed(iter/s)": 0.290712 }, { "acc": 0.73958979, "epoch": 1.0945589425750826, "grad_norm": 6.5, "learning_rate": 4.631882509370072e-06, "loss": 1.0410881, "memory(GiB)": 141.16, "step": 97860, "train_speed(iter/s)": 0.290734 }, { "acc": 0.74433165, "epoch": 1.094782641521041, "grad_norm": 7.09375, "learning_rate": 4.630038132902494e-06, "loss": 1.01979332, "memory(GiB)": 141.16, "step": 97880, "train_speed(iter/s)": 0.290755 }, { "acc": 0.73290205, "epoch": 1.0950063404669996, "grad_norm": 6.9375, "learning_rate": 4.6281938070509855e-06, "loss": 1.07123289, "memory(GiB)": 141.16, "step": 97900, "train_speed(iter/s)": 0.290773 }, { "acc": 0.74267201, "epoch": 1.0952300394129582, "grad_norm": 7.625, "learning_rate": 4.626349532067879e-06, "loss": 1.02988567, "memory(GiB)": 141.16, "step": 97920, "train_speed(iter/s)": 0.290792 }, { "acc": 0.73038487, "epoch": 1.0954537383589167, "grad_norm": 7.25, "learning_rate": 4.6245053082054975e-06, "loss": 1.09774208, "memory(GiB)": 141.16, "step": 97940, "train_speed(iter/s)": 0.290813 }, { "acc": 0.74211979, "epoch": 1.0956774373048752, "grad_norm": 7.28125, "learning_rate": 4.622661135716157e-06, "loss": 1.02931032, "memory(GiB)": 141.16, "step": 97960, "train_speed(iter/s)": 0.290833 }, { "acc": 0.72761998, "epoch": 1.0959011362508337, "grad_norm": 5.96875, "learning_rate": 4.620817014852167e-06, "loss": 1.10610905, "memory(GiB)": 141.16, "step": 97980, "train_speed(iter/s)": 0.290852 }, { "acc": 0.72325068, "epoch": 1.0961248351967923, "grad_norm": 7.53125, "learning_rate": 4.618972945865828e-06, "loss": 1.13415308, "memory(GiB)": 141.16, "step": 98000, "train_speed(iter/s)": 0.290875 }, { "epoch": 1.0961248351967923, "eval_acc": 0.6901374942865177, "eval_loss": 1.0791937112808228, "eval_runtime": 2324.1912, "eval_samples_per_second": 32.391, "eval_steps_per_second": 16.196, "step": 98000 }, { "acc": 0.73044357, "epoch": 1.0963485341427508, "grad_norm": 5.75, "learning_rate": 4.617128929009436e-06, "loss": 1.08729181, "memory(GiB)": 141.16, "step": 98020, "train_speed(iter/s)": 0.288857 }, { "acc": 0.72489777, "epoch": 1.0965722330887093, "grad_norm": 7.65625, "learning_rate": 4.61528496453528e-06, "loss": 1.10777636, "memory(GiB)": 141.16, "step": 98040, "train_speed(iter/s)": 0.288878 }, { "acc": 0.73628826, "epoch": 1.0967959320346679, "grad_norm": 7.78125, "learning_rate": 4.613441052695639e-06, "loss": 1.04910078, "memory(GiB)": 141.16, "step": 98060, "train_speed(iter/s)": 0.288899 }, { "acc": 0.73947163, "epoch": 1.0970196309806264, "grad_norm": 7.1875, "learning_rate": 4.611597193742789e-06, "loss": 1.05386314, "memory(GiB)": 141.16, "step": 98080, "train_speed(iter/s)": 0.288919 }, { "acc": 0.73386912, "epoch": 1.097243329926585, "grad_norm": 7.5625, "learning_rate": 4.609753387928993e-06, "loss": 1.07453032, "memory(GiB)": 141.16, "step": 98100, "train_speed(iter/s)": 0.288942 }, { "acc": 0.74144812, "epoch": 1.0974670288725434, "grad_norm": 8.875, "learning_rate": 4.60790963550651e-06, "loss": 1.04252586, "memory(GiB)": 141.16, "step": 98120, "train_speed(iter/s)": 0.288963 }, { "acc": 0.74749942, "epoch": 1.097690727818502, "grad_norm": 7.6875, "learning_rate": 4.606065936727595e-06, "loss": 1.00533428, "memory(GiB)": 141.16, "step": 98140, "train_speed(iter/s)": 0.288982 }, { "acc": 0.73158612, "epoch": 1.0979144267644605, "grad_norm": 7.71875, "learning_rate": 4.60422229184449e-06, "loss": 1.07808638, "memory(GiB)": 141.16, "step": 98160, "train_speed(iter/s)": 0.289004 }, { "acc": 0.72794685, "epoch": 1.098138125710419, "grad_norm": 7.15625, "learning_rate": 4.602378701109433e-06, "loss": 1.09422474, "memory(GiB)": 141.16, "step": 98180, "train_speed(iter/s)": 0.289023 }, { "acc": 0.73073187, "epoch": 1.0983618246563776, "grad_norm": 6.5625, "learning_rate": 4.600535164774653e-06, "loss": 1.07344017, "memory(GiB)": 141.16, "step": 98200, "train_speed(iter/s)": 0.289042 }, { "acc": 0.72637606, "epoch": 1.098585523602336, "grad_norm": 7.0, "learning_rate": 4.598691683092371e-06, "loss": 1.1099308, "memory(GiB)": 141.16, "step": 98220, "train_speed(iter/s)": 0.289062 }, { "acc": 0.74452114, "epoch": 1.0988092225482946, "grad_norm": 7.46875, "learning_rate": 4.596848256314805e-06, "loss": 1.01920586, "memory(GiB)": 141.16, "step": 98240, "train_speed(iter/s)": 0.289082 }, { "acc": 0.72992144, "epoch": 1.0990329214942531, "grad_norm": 5.0, "learning_rate": 4.595004884694158e-06, "loss": 1.08809357, "memory(GiB)": 141.16, "step": 98260, "train_speed(iter/s)": 0.289102 }, { "acc": 0.72476978, "epoch": 1.0992566204402117, "grad_norm": 8.3125, "learning_rate": 4.5931615684826324e-06, "loss": 1.10061083, "memory(GiB)": 141.16, "step": 98280, "train_speed(iter/s)": 0.289122 }, { "acc": 0.74025922, "epoch": 1.0994803193861702, "grad_norm": 7.65625, "learning_rate": 4.591318307932418e-06, "loss": 1.04366665, "memory(GiB)": 141.16, "step": 98300, "train_speed(iter/s)": 0.289143 }, { "acc": 0.72523479, "epoch": 1.0997040183321287, "grad_norm": 7.8125, "learning_rate": 4.5894751032957024e-06, "loss": 1.11234474, "memory(GiB)": 141.16, "step": 98320, "train_speed(iter/s)": 0.289162 }, { "acc": 0.73748732, "epoch": 1.0999277172780872, "grad_norm": 6.40625, "learning_rate": 4.587631954824659e-06, "loss": 1.04442568, "memory(GiB)": 141.16, "step": 98340, "train_speed(iter/s)": 0.289183 }, { "acc": 0.73756843, "epoch": 1.1001514162240458, "grad_norm": 5.71875, "learning_rate": 4.585788862771458e-06, "loss": 1.03708382, "memory(GiB)": 141.16, "step": 98360, "train_speed(iter/s)": 0.289205 }, { "acc": 0.71977792, "epoch": 1.1003751151700043, "grad_norm": 6.21875, "learning_rate": 4.583945827388261e-06, "loss": 1.13374805, "memory(GiB)": 141.16, "step": 98380, "train_speed(iter/s)": 0.289225 }, { "acc": 0.73633432, "epoch": 1.1005988141159628, "grad_norm": 7.75, "learning_rate": 4.582102848927222e-06, "loss": 1.04924469, "memory(GiB)": 141.16, "step": 98400, "train_speed(iter/s)": 0.289244 }, { "acc": 0.71529694, "epoch": 1.1008225130619214, "grad_norm": 5.78125, "learning_rate": 4.580259927640488e-06, "loss": 1.15261841, "memory(GiB)": 141.16, "step": 98420, "train_speed(iter/s)": 0.289265 }, { "acc": 0.74259911, "epoch": 1.1010462120078799, "grad_norm": 7.34375, "learning_rate": 4.578417063780193e-06, "loss": 1.02828732, "memory(GiB)": 141.16, "step": 98440, "train_speed(iter/s)": 0.289285 }, { "acc": 0.74105387, "epoch": 1.1012699109538384, "grad_norm": 6.125, "learning_rate": 4.576574257598471e-06, "loss": 1.03189077, "memory(GiB)": 141.16, "step": 98460, "train_speed(iter/s)": 0.289305 }, { "acc": 0.74135914, "epoch": 1.101493609899797, "grad_norm": 7.6875, "learning_rate": 4.574731509347441e-06, "loss": 1.04011726, "memory(GiB)": 141.16, "step": 98480, "train_speed(iter/s)": 0.289325 }, { "acc": 0.73180175, "epoch": 1.1017173088457555, "grad_norm": 7.875, "learning_rate": 4.57288881927922e-06, "loss": 1.07947369, "memory(GiB)": 141.16, "step": 98500, "train_speed(iter/s)": 0.289345 }, { "acc": 0.73720703, "epoch": 1.101941007791714, "grad_norm": 9.1875, "learning_rate": 4.571046187645914e-06, "loss": 1.03213739, "memory(GiB)": 141.16, "step": 98520, "train_speed(iter/s)": 0.289362 }, { "acc": 0.74012122, "epoch": 1.1021647067376725, "grad_norm": 6.59375, "learning_rate": 4.56920361469962e-06, "loss": 1.03464699, "memory(GiB)": 141.16, "step": 98540, "train_speed(iter/s)": 0.289383 }, { "acc": 0.73380322, "epoch": 1.102388405683631, "grad_norm": 7.375, "learning_rate": 4.567361100692429e-06, "loss": 1.07311058, "memory(GiB)": 141.16, "step": 98560, "train_speed(iter/s)": 0.2894 }, { "acc": 0.7355298, "epoch": 1.1026121046295896, "grad_norm": 7.71875, "learning_rate": 4.565518645876424e-06, "loss": 1.07017136, "memory(GiB)": 141.16, "step": 98580, "train_speed(iter/s)": 0.28942 }, { "acc": 0.72631211, "epoch": 1.1028358035755481, "grad_norm": 8.875, "learning_rate": 4.563676250503677e-06, "loss": 1.09470854, "memory(GiB)": 141.16, "step": 98600, "train_speed(iter/s)": 0.289439 }, { "acc": 0.73392425, "epoch": 1.1030595025215066, "grad_norm": 7.40625, "learning_rate": 4.561833914826256e-06, "loss": 1.07468758, "memory(GiB)": 141.16, "step": 98620, "train_speed(iter/s)": 0.289458 }, { "acc": 0.73741131, "epoch": 1.1032832014674652, "grad_norm": 7.90625, "learning_rate": 4.55999163909622e-06, "loss": 1.04845333, "memory(GiB)": 141.16, "step": 98640, "train_speed(iter/s)": 0.289478 }, { "acc": 0.73648787, "epoch": 1.1035069004134237, "grad_norm": 5.375, "learning_rate": 4.5581494235656146e-06, "loss": 1.07669945, "memory(GiB)": 141.16, "step": 98660, "train_speed(iter/s)": 0.289499 }, { "acc": 0.73359356, "epoch": 1.1037305993593822, "grad_norm": 5.40625, "learning_rate": 4.556307268486484e-06, "loss": 1.06735191, "memory(GiB)": 141.16, "step": 98680, "train_speed(iter/s)": 0.289518 }, { "acc": 0.74040108, "epoch": 1.1039542983053408, "grad_norm": 8.0625, "learning_rate": 4.554465174110862e-06, "loss": 1.0270709, "memory(GiB)": 141.16, "step": 98700, "train_speed(iter/s)": 0.289539 }, { "acc": 0.73771906, "epoch": 1.1041779972512993, "grad_norm": 7.84375, "learning_rate": 4.5526231406907705e-06, "loss": 1.04240398, "memory(GiB)": 141.16, "step": 98720, "train_speed(iter/s)": 0.289559 }, { "acc": 0.7442935, "epoch": 1.1044016961972578, "grad_norm": 7.21875, "learning_rate": 4.55078116847823e-06, "loss": 1.02002449, "memory(GiB)": 141.16, "step": 98740, "train_speed(iter/s)": 0.28958 }, { "acc": 0.74246206, "epoch": 1.1046253951432163, "grad_norm": 6.3125, "learning_rate": 4.548939257725245e-06, "loss": 1.03119793, "memory(GiB)": 141.16, "step": 98760, "train_speed(iter/s)": 0.289599 }, { "acc": 0.74294672, "epoch": 1.1048490940891749, "grad_norm": 6.5625, "learning_rate": 4.547097408683817e-06, "loss": 1.02514582, "memory(GiB)": 141.16, "step": 98780, "train_speed(iter/s)": 0.289619 }, { "acc": 0.7393434, "epoch": 1.1050727930351334, "grad_norm": 8.125, "learning_rate": 4.545255621605937e-06, "loss": 1.04974604, "memory(GiB)": 141.16, "step": 98800, "train_speed(iter/s)": 0.289637 }, { "acc": 0.74707861, "epoch": 1.105296491981092, "grad_norm": 7.09375, "learning_rate": 4.543413896743587e-06, "loss": 1.00477772, "memory(GiB)": 141.16, "step": 98820, "train_speed(iter/s)": 0.289657 }, { "acc": 0.74447799, "epoch": 1.1055201909270505, "grad_norm": 9.4375, "learning_rate": 4.541572234348744e-06, "loss": 1.02366838, "memory(GiB)": 141.16, "step": 98840, "train_speed(iter/s)": 0.289679 }, { "acc": 0.74169626, "epoch": 1.105743889873009, "grad_norm": 7.40625, "learning_rate": 4.539730634673371e-06, "loss": 1.02923431, "memory(GiB)": 141.16, "step": 98860, "train_speed(iter/s)": 0.2897 }, { "acc": 0.73593698, "epoch": 1.1059675888189675, "grad_norm": 6.59375, "learning_rate": 4.537889097969425e-06, "loss": 1.06714764, "memory(GiB)": 141.16, "step": 98880, "train_speed(iter/s)": 0.28972 }, { "acc": 0.74650059, "epoch": 1.106191287764926, "grad_norm": 7.09375, "learning_rate": 4.536047624488856e-06, "loss": 1.00848217, "memory(GiB)": 141.16, "step": 98900, "train_speed(iter/s)": 0.289741 }, { "acc": 0.74299493, "epoch": 1.1064149867108846, "grad_norm": 6.65625, "learning_rate": 4.534206214483604e-06, "loss": 1.03221836, "memory(GiB)": 141.16, "step": 98920, "train_speed(iter/s)": 0.289759 }, { "acc": 0.7310338, "epoch": 1.106638685656843, "grad_norm": 5.625, "learning_rate": 4.5323648682055995e-06, "loss": 1.09036932, "memory(GiB)": 141.16, "step": 98940, "train_speed(iter/s)": 0.289778 }, { "acc": 0.73794422, "epoch": 1.1068623846028016, "grad_norm": 8.0, "learning_rate": 4.530523585906764e-06, "loss": 1.04827671, "memory(GiB)": 141.16, "step": 98960, "train_speed(iter/s)": 0.289799 }, { "acc": 0.73648233, "epoch": 1.1070860835487601, "grad_norm": 7.78125, "learning_rate": 4.528682367839013e-06, "loss": 1.0509264, "memory(GiB)": 141.16, "step": 98980, "train_speed(iter/s)": 0.289818 }, { "acc": 0.7342433, "epoch": 1.1073097824947187, "grad_norm": 7.25, "learning_rate": 4.526841214254251e-06, "loss": 1.06585197, "memory(GiB)": 141.16, "step": 99000, "train_speed(iter/s)": 0.289836 }, { "acc": 0.74306574, "epoch": 1.1075334814406772, "grad_norm": 8.375, "learning_rate": 4.525000125404373e-06, "loss": 1.0320837, "memory(GiB)": 141.16, "step": 99020, "train_speed(iter/s)": 0.289854 }, { "acc": 0.74540434, "epoch": 1.1077571803866357, "grad_norm": 7.1875, "learning_rate": 4.523159101541268e-06, "loss": 1.01052732, "memory(GiB)": 141.16, "step": 99040, "train_speed(iter/s)": 0.289872 }, { "acc": 0.73384275, "epoch": 1.1079808793325943, "grad_norm": 7.8125, "learning_rate": 4.521318142916813e-06, "loss": 1.0658287, "memory(GiB)": 141.16, "step": 99060, "train_speed(iter/s)": 0.289891 }, { "acc": 0.73272734, "epoch": 1.1082045782785528, "grad_norm": 5.21875, "learning_rate": 4.519477249782878e-06, "loss": 1.07961788, "memory(GiB)": 141.16, "step": 99080, "train_speed(iter/s)": 0.289911 }, { "acc": 0.73641253, "epoch": 1.1084282772245113, "grad_norm": 6.46875, "learning_rate": 4.517636422391324e-06, "loss": 1.06390762, "memory(GiB)": 141.16, "step": 99100, "train_speed(iter/s)": 0.28993 }, { "acc": 0.72847109, "epoch": 1.1086519761704698, "grad_norm": 8.0, "learning_rate": 4.515795660994002e-06, "loss": 1.09091816, "memory(GiB)": 141.16, "step": 99120, "train_speed(iter/s)": 0.289951 }, { "acc": 0.74599543, "epoch": 1.1088756751164284, "grad_norm": 5.84375, "learning_rate": 4.513954965842755e-06, "loss": 1.00484791, "memory(GiB)": 141.16, "step": 99140, "train_speed(iter/s)": 0.289971 }, { "acc": 0.72398949, "epoch": 1.109099374062387, "grad_norm": 6.28125, "learning_rate": 4.5121143371894146e-06, "loss": 1.1105648, "memory(GiB)": 141.16, "step": 99160, "train_speed(iter/s)": 0.289991 }, { "acc": 0.73465157, "epoch": 1.1093230730083454, "grad_norm": 9.9375, "learning_rate": 4.510273775285807e-06, "loss": 1.05360613, "memory(GiB)": 141.16, "step": 99180, "train_speed(iter/s)": 0.290008 }, { "acc": 0.7207459, "epoch": 1.109546771954304, "grad_norm": 6.34375, "learning_rate": 4.508433280383746e-06, "loss": 1.12404642, "memory(GiB)": 141.16, "step": 99200, "train_speed(iter/s)": 0.290028 }, { "acc": 0.73270063, "epoch": 1.1097704709002625, "grad_norm": 9.0625, "learning_rate": 4.506592852735039e-06, "loss": 1.07221737, "memory(GiB)": 141.16, "step": 99220, "train_speed(iter/s)": 0.290048 }, { "acc": 0.73344431, "epoch": 1.109994169846221, "grad_norm": 6.0, "learning_rate": 4.504752492591483e-06, "loss": 1.06159, "memory(GiB)": 141.16, "step": 99240, "train_speed(iter/s)": 0.29007 }, { "acc": 0.71998501, "epoch": 1.1102178687921795, "grad_norm": 6.625, "learning_rate": 4.502912200204863e-06, "loss": 1.12259865, "memory(GiB)": 141.16, "step": 99260, "train_speed(iter/s)": 0.290087 }, { "acc": 0.73095961, "epoch": 1.110441567738138, "grad_norm": 8.4375, "learning_rate": 4.50107197582696e-06, "loss": 1.08850193, "memory(GiB)": 141.16, "step": 99280, "train_speed(iter/s)": 0.290108 }, { "acc": 0.74003181, "epoch": 1.1106652666840966, "grad_norm": 6.875, "learning_rate": 4.499231819709542e-06, "loss": 1.02205334, "memory(GiB)": 141.16, "step": 99300, "train_speed(iter/s)": 0.290129 }, { "acc": 0.7199863, "epoch": 1.1108889656300551, "grad_norm": 9.5625, "learning_rate": 4.4973917321043684e-06, "loss": 1.14125862, "memory(GiB)": 141.16, "step": 99320, "train_speed(iter/s)": 0.290149 }, { "acc": 0.7422936, "epoch": 1.1111126645760137, "grad_norm": 5.75, "learning_rate": 4.49555171326319e-06, "loss": 1.02870293, "memory(GiB)": 141.16, "step": 99340, "train_speed(iter/s)": 0.290168 }, { "acc": 0.74002852, "epoch": 1.1113363635219722, "grad_norm": 8.1875, "learning_rate": 4.493711763437748e-06, "loss": 1.04064484, "memory(GiB)": 141.16, "step": 99360, "train_speed(iter/s)": 0.290186 }, { "acc": 0.75238094, "epoch": 1.1115600624679307, "grad_norm": 7.1875, "learning_rate": 4.491871882879772e-06, "loss": 0.98881893, "memory(GiB)": 141.16, "step": 99380, "train_speed(iter/s)": 0.290206 }, { "acc": 0.74949775, "epoch": 1.1117837614138892, "grad_norm": 6.21875, "learning_rate": 4.490032071840985e-06, "loss": 1.01442699, "memory(GiB)": 141.16, "step": 99400, "train_speed(iter/s)": 0.290227 }, { "acc": 0.73767471, "epoch": 1.1120074603598478, "grad_norm": 7.21875, "learning_rate": 4.488192330573104e-06, "loss": 1.04828358, "memory(GiB)": 141.16, "step": 99420, "train_speed(iter/s)": 0.290247 }, { "acc": 0.74034686, "epoch": 1.1122311593058063, "grad_norm": 7.5, "learning_rate": 4.486352659327823e-06, "loss": 1.03425131, "memory(GiB)": 141.16, "step": 99440, "train_speed(iter/s)": 0.290267 }, { "acc": 0.72415099, "epoch": 1.1124548582517648, "grad_norm": 7.53125, "learning_rate": 4.484513058356841e-06, "loss": 1.11684647, "memory(GiB)": 141.16, "step": 99460, "train_speed(iter/s)": 0.290286 }, { "acc": 0.74414983, "epoch": 1.1126785571977234, "grad_norm": 7.0, "learning_rate": 4.4826735279118425e-06, "loss": 1.03541946, "memory(GiB)": 141.16, "step": 99480, "train_speed(iter/s)": 0.290306 }, { "acc": 0.72867851, "epoch": 1.1129022561436819, "grad_norm": 8.375, "learning_rate": 4.480834068244498e-06, "loss": 1.09962797, "memory(GiB)": 141.16, "step": 99500, "train_speed(iter/s)": 0.290326 }, { "acc": 0.74586945, "epoch": 1.1131259550896404, "grad_norm": 7.84375, "learning_rate": 4.478994679606473e-06, "loss": 1.00884457, "memory(GiB)": 141.16, "step": 99520, "train_speed(iter/s)": 0.290343 }, { "acc": 0.73589048, "epoch": 1.113349654035599, "grad_norm": 6.40625, "learning_rate": 4.477155362249422e-06, "loss": 1.07223129, "memory(GiB)": 141.16, "step": 99540, "train_speed(iter/s)": 0.290363 }, { "acc": 0.74470553, "epoch": 1.1135733529815575, "grad_norm": 6.34375, "learning_rate": 4.475316116424992e-06, "loss": 1.00449696, "memory(GiB)": 141.16, "step": 99560, "train_speed(iter/s)": 0.290385 }, { "acc": 0.7305037, "epoch": 1.113797051927516, "grad_norm": 7.0, "learning_rate": 4.473476942384817e-06, "loss": 1.07552357, "memory(GiB)": 141.16, "step": 99580, "train_speed(iter/s)": 0.290406 }, { "acc": 0.72984023, "epoch": 1.1140207508734745, "grad_norm": 6.78125, "learning_rate": 4.471637840380522e-06, "loss": 1.07907276, "memory(GiB)": 141.16, "step": 99600, "train_speed(iter/s)": 0.290426 }, { "acc": 0.74001646, "epoch": 1.114244449819433, "grad_norm": 7.0, "learning_rate": 4.469798810663722e-06, "loss": 1.03485727, "memory(GiB)": 141.16, "step": 99620, "train_speed(iter/s)": 0.290448 }, { "acc": 0.72856598, "epoch": 1.1144681487653916, "grad_norm": 6.15625, "learning_rate": 4.467959853486023e-06, "loss": 1.08111124, "memory(GiB)": 141.16, "step": 99640, "train_speed(iter/s)": 0.290471 }, { "acc": 0.72566061, "epoch": 1.11469184771135, "grad_norm": 5.71875, "learning_rate": 4.4661209690990195e-06, "loss": 1.10237741, "memory(GiB)": 141.16, "step": 99660, "train_speed(iter/s)": 0.290494 }, { "acc": 0.73358717, "epoch": 1.1149155466573086, "grad_norm": 7.375, "learning_rate": 4.464282157754301e-06, "loss": 1.07457924, "memory(GiB)": 141.16, "step": 99680, "train_speed(iter/s)": 0.290511 }, { "acc": 0.73262806, "epoch": 1.1151392456032672, "grad_norm": 6.90625, "learning_rate": 4.462443419703439e-06, "loss": 1.07044439, "memory(GiB)": 141.16, "step": 99700, "train_speed(iter/s)": 0.290529 }, { "acc": 0.74040647, "epoch": 1.1153629445492257, "grad_norm": 6.3125, "learning_rate": 4.460604755198e-06, "loss": 1.041189, "memory(GiB)": 141.16, "step": 99720, "train_speed(iter/s)": 0.290548 }, { "acc": 0.73994951, "epoch": 1.1155866434951842, "grad_norm": 7.1875, "learning_rate": 4.458766164489541e-06, "loss": 1.05463753, "memory(GiB)": 141.16, "step": 99740, "train_speed(iter/s)": 0.29057 }, { "acc": 0.74128437, "epoch": 1.1158103424411427, "grad_norm": 5.46875, "learning_rate": 4.456927647829607e-06, "loss": 1.04247227, "memory(GiB)": 141.16, "step": 99760, "train_speed(iter/s)": 0.290589 }, { "acc": 0.73288994, "epoch": 1.1160340413871013, "grad_norm": 7.21875, "learning_rate": 4.455089205469733e-06, "loss": 1.05015011, "memory(GiB)": 141.16, "step": 99780, "train_speed(iter/s)": 0.29061 }, { "acc": 0.73417788, "epoch": 1.1162577403330598, "grad_norm": 6.1875, "learning_rate": 4.4532508376614434e-06, "loss": 1.05395708, "memory(GiB)": 141.16, "step": 99800, "train_speed(iter/s)": 0.29063 }, { "acc": 0.73895435, "epoch": 1.1164814392790183, "grad_norm": 8.125, "learning_rate": 4.451412544656255e-06, "loss": 1.05255241, "memory(GiB)": 141.16, "step": 99820, "train_speed(iter/s)": 0.290648 }, { "acc": 0.73289547, "epoch": 1.1167051382249769, "grad_norm": 8.25, "learning_rate": 4.449574326705671e-06, "loss": 1.07053614, "memory(GiB)": 141.16, "step": 99840, "train_speed(iter/s)": 0.290668 }, { "acc": 0.73194156, "epoch": 1.1169288371709354, "grad_norm": 6.6875, "learning_rate": 4.447736184061186e-06, "loss": 1.08078747, "memory(GiB)": 141.16, "step": 99860, "train_speed(iter/s)": 0.290686 }, { "acc": 0.73381853, "epoch": 1.117152536116894, "grad_norm": 8.375, "learning_rate": 4.4458981169742865e-06, "loss": 1.07309818, "memory(GiB)": 141.16, "step": 99880, "train_speed(iter/s)": 0.290703 }, { "acc": 0.74637032, "epoch": 1.1173762350628524, "grad_norm": 7.78125, "learning_rate": 4.444060125696444e-06, "loss": 1.00287495, "memory(GiB)": 141.16, "step": 99900, "train_speed(iter/s)": 0.290723 }, { "acc": 0.72189527, "epoch": 1.117599934008811, "grad_norm": 9.0, "learning_rate": 4.442222210479121e-06, "loss": 1.12625809, "memory(GiB)": 141.16, "step": 99920, "train_speed(iter/s)": 0.290742 }, { "acc": 0.73408685, "epoch": 1.1178236329547695, "grad_norm": 7.84375, "learning_rate": 4.4403843715737725e-06, "loss": 1.05496197, "memory(GiB)": 141.16, "step": 99940, "train_speed(iter/s)": 0.290764 }, { "acc": 0.72499909, "epoch": 1.118047331900728, "grad_norm": 6.59375, "learning_rate": 4.438546609231841e-06, "loss": 1.10690813, "memory(GiB)": 141.16, "step": 99960, "train_speed(iter/s)": 0.290781 }, { "acc": 0.73243723, "epoch": 1.1182710308466866, "grad_norm": 7.625, "learning_rate": 4.43670892370476e-06, "loss": 1.08168545, "memory(GiB)": 141.16, "step": 99980, "train_speed(iter/s)": 0.290799 }, { "acc": 0.73926258, "epoch": 1.118494729792645, "grad_norm": 7.1875, "learning_rate": 4.434871315243948e-06, "loss": 1.04520531, "memory(GiB)": 141.16, "step": 100000, "train_speed(iter/s)": 0.290819 }, { "epoch": 1.118494729792645, "eval_acc": 0.6901410436456809, "eval_loss": 1.079219102859497, "eval_runtime": 2320.1283, "eval_samples_per_second": 32.448, "eval_steps_per_second": 16.224, "step": 100000 }, { "acc": 0.73430672, "epoch": 1.1187184287386036, "grad_norm": 8.125, "learning_rate": 4.433033784100817e-06, "loss": 1.062463, "memory(GiB)": 141.16, "step": 100020, "train_speed(iter/s)": 0.288845 }, { "acc": 0.73487864, "epoch": 1.1189421276845621, "grad_norm": 6.65625, "learning_rate": 4.431196330526769e-06, "loss": 1.07677441, "memory(GiB)": 141.16, "step": 100040, "train_speed(iter/s)": 0.288863 }, { "acc": 0.72974606, "epoch": 1.1191658266305207, "grad_norm": 8.125, "learning_rate": 4.429358954773192e-06, "loss": 1.09039011, "memory(GiB)": 141.16, "step": 100060, "train_speed(iter/s)": 0.288882 }, { "acc": 0.73633366, "epoch": 1.1193895255764792, "grad_norm": 8.0, "learning_rate": 4.427521657091469e-06, "loss": 1.04261293, "memory(GiB)": 141.16, "step": 100080, "train_speed(iter/s)": 0.2889 }, { "acc": 0.73872547, "epoch": 1.1196132245224377, "grad_norm": 7.1875, "learning_rate": 4.425684437732964e-06, "loss": 1.0521184, "memory(GiB)": 141.16, "step": 100100, "train_speed(iter/s)": 0.288918 }, { "acc": 0.73806229, "epoch": 1.1198369234683963, "grad_norm": 7.1875, "learning_rate": 4.423847296949036e-06, "loss": 1.04794083, "memory(GiB)": 141.16, "step": 100120, "train_speed(iter/s)": 0.28894 }, { "acc": 0.73820639, "epoch": 1.1200606224143548, "grad_norm": 7.125, "learning_rate": 4.422010234991034e-06, "loss": 1.03632889, "memory(GiB)": 141.16, "step": 100140, "train_speed(iter/s)": 0.288961 }, { "acc": 0.72681789, "epoch": 1.1202843213603133, "grad_norm": 8.4375, "learning_rate": 4.4201732521102934e-06, "loss": 1.09621792, "memory(GiB)": 141.16, "step": 100160, "train_speed(iter/s)": 0.288979 }, { "acc": 0.73429785, "epoch": 1.1205080203062718, "grad_norm": 7.09375, "learning_rate": 4.4183363485581395e-06, "loss": 1.07018404, "memory(GiB)": 141.16, "step": 100180, "train_speed(iter/s)": 0.289 }, { "acc": 0.74019666, "epoch": 1.1207317192522304, "grad_norm": 6.21875, "learning_rate": 4.416499524585887e-06, "loss": 1.03003178, "memory(GiB)": 141.16, "step": 100200, "train_speed(iter/s)": 0.28902 }, { "acc": 0.73906698, "epoch": 1.120955418198189, "grad_norm": 8.0, "learning_rate": 4.414662780444839e-06, "loss": 1.0539854, "memory(GiB)": 141.16, "step": 100220, "train_speed(iter/s)": 0.289041 }, { "acc": 0.73258724, "epoch": 1.1211791171441474, "grad_norm": 6.40625, "learning_rate": 4.412826116386289e-06, "loss": 1.08022566, "memory(GiB)": 141.16, "step": 100240, "train_speed(iter/s)": 0.289059 }, { "acc": 0.71392546, "epoch": 1.121402816090106, "grad_norm": 8.0, "learning_rate": 4.4109895326615195e-06, "loss": 1.18231783, "memory(GiB)": 141.16, "step": 100260, "train_speed(iter/s)": 0.289081 }, { "acc": 0.73434153, "epoch": 1.1216265150360645, "grad_norm": 8.1875, "learning_rate": 4.409153029521802e-06, "loss": 1.06349564, "memory(GiB)": 141.16, "step": 100280, "train_speed(iter/s)": 0.2891 }, { "acc": 0.73537245, "epoch": 1.121850213982023, "grad_norm": 6.65625, "learning_rate": 4.407316607218394e-06, "loss": 1.05172739, "memory(GiB)": 141.16, "step": 100300, "train_speed(iter/s)": 0.289121 }, { "acc": 0.72919121, "epoch": 1.1220739129279815, "grad_norm": 6.78125, "learning_rate": 4.405480266002545e-06, "loss": 1.09760075, "memory(GiB)": 141.16, "step": 100320, "train_speed(iter/s)": 0.28914 }, { "acc": 0.73670855, "epoch": 1.12229761187394, "grad_norm": 6.875, "learning_rate": 4.403644006125494e-06, "loss": 1.04827213, "memory(GiB)": 141.16, "step": 100340, "train_speed(iter/s)": 0.289157 }, { "acc": 0.72961831, "epoch": 1.1225213108198986, "grad_norm": 7.125, "learning_rate": 4.401807827838466e-06, "loss": 1.07799797, "memory(GiB)": 141.16, "step": 100360, "train_speed(iter/s)": 0.289177 }, { "acc": 0.72771358, "epoch": 1.1227450097658571, "grad_norm": 5.21875, "learning_rate": 4.399971731392679e-06, "loss": 1.09905281, "memory(GiB)": 141.16, "step": 100380, "train_speed(iter/s)": 0.289198 }, { "acc": 0.7504859, "epoch": 1.1229687087118156, "grad_norm": 6.46875, "learning_rate": 4.398135717039334e-06, "loss": 1.00559111, "memory(GiB)": 141.16, "step": 100400, "train_speed(iter/s)": 0.289217 }, { "acc": 0.72195921, "epoch": 1.1231924076577742, "grad_norm": 6.5, "learning_rate": 4.396299785029626e-06, "loss": 1.11563492, "memory(GiB)": 141.16, "step": 100420, "train_speed(iter/s)": 0.289236 }, { "acc": 0.74328623, "epoch": 1.1234161066037327, "grad_norm": 6.09375, "learning_rate": 4.394463935614736e-06, "loss": 1.04209995, "memory(GiB)": 141.16, "step": 100440, "train_speed(iter/s)": 0.289256 }, { "acc": 0.72495947, "epoch": 1.1236398055496912, "grad_norm": 6.1875, "learning_rate": 4.392628169045835e-06, "loss": 1.10855656, "memory(GiB)": 141.16, "step": 100460, "train_speed(iter/s)": 0.289274 }, { "acc": 0.72772818, "epoch": 1.1238635044956498, "grad_norm": 7.1875, "learning_rate": 4.390792485574082e-06, "loss": 1.10041714, "memory(GiB)": 141.16, "step": 100480, "train_speed(iter/s)": 0.289293 }, { "acc": 0.72696924, "epoch": 1.1240872034416083, "grad_norm": 9.0, "learning_rate": 4.388956885450624e-06, "loss": 1.09814434, "memory(GiB)": 141.16, "step": 100500, "train_speed(iter/s)": 0.289312 }, { "acc": 0.73492765, "epoch": 1.1243109023875668, "grad_norm": 7.1875, "learning_rate": 4.387121368926598e-06, "loss": 1.05751553, "memory(GiB)": 141.16, "step": 100520, "train_speed(iter/s)": 0.28933 }, { "acc": 0.73781013, "epoch": 1.1245346013335253, "grad_norm": 8.125, "learning_rate": 4.385285936253129e-06, "loss": 1.04470081, "memory(GiB)": 141.16, "step": 100540, "train_speed(iter/s)": 0.289349 }, { "acc": 0.74162579, "epoch": 1.1247583002794839, "grad_norm": 9.3125, "learning_rate": 4.38345058768133e-06, "loss": 1.04104137, "memory(GiB)": 141.16, "step": 100560, "train_speed(iter/s)": 0.289367 }, { "acc": 0.72971611, "epoch": 1.1249819992254424, "grad_norm": 8.25, "learning_rate": 4.381615323462304e-06, "loss": 1.09253702, "memory(GiB)": 141.16, "step": 100580, "train_speed(iter/s)": 0.289384 }, { "acc": 0.73001914, "epoch": 1.125205698171401, "grad_norm": 7.5, "learning_rate": 4.37978014384714e-06, "loss": 1.08291168, "memory(GiB)": 141.16, "step": 100600, "train_speed(iter/s)": 0.289399 }, { "acc": 0.7261209, "epoch": 1.1254293971173595, "grad_norm": 6.15625, "learning_rate": 4.3779450490869194e-06, "loss": 1.11724749, "memory(GiB)": 141.16, "step": 100620, "train_speed(iter/s)": 0.289417 }, { "acc": 0.72074499, "epoch": 1.125653096063318, "grad_norm": 7.78125, "learning_rate": 4.376110039432704e-06, "loss": 1.12601051, "memory(GiB)": 141.16, "step": 100640, "train_speed(iter/s)": 0.289435 }, { "acc": 0.74276295, "epoch": 1.1258767950092765, "grad_norm": 5.84375, "learning_rate": 4.3742751151355535e-06, "loss": 1.03153639, "memory(GiB)": 141.16, "step": 100660, "train_speed(iter/s)": 0.289452 }, { "acc": 0.73386674, "epoch": 1.126100493955235, "grad_norm": 9.0, "learning_rate": 4.3724402764465116e-06, "loss": 1.07116709, "memory(GiB)": 141.16, "step": 100680, "train_speed(iter/s)": 0.289472 }, { "acc": 0.72694259, "epoch": 1.1263241929011936, "grad_norm": 7.1875, "learning_rate": 4.370605523616609e-06, "loss": 1.10090942, "memory(GiB)": 141.16, "step": 100700, "train_speed(iter/s)": 0.28949 }, { "acc": 0.73096356, "epoch": 1.126547891847152, "grad_norm": 7.5625, "learning_rate": 4.368770856896868e-06, "loss": 1.06788731, "memory(GiB)": 141.16, "step": 100720, "train_speed(iter/s)": 0.28951 }, { "acc": 0.73868966, "epoch": 1.1267715907931106, "grad_norm": 8.1875, "learning_rate": 4.366936276538295e-06, "loss": 1.03625965, "memory(GiB)": 141.16, "step": 100740, "train_speed(iter/s)": 0.289529 }, { "acc": 0.74699316, "epoch": 1.1269952897390692, "grad_norm": 8.125, "learning_rate": 4.3651017827918875e-06, "loss": 0.99173222, "memory(GiB)": 141.16, "step": 100760, "train_speed(iter/s)": 0.289551 }, { "acc": 0.7374898, "epoch": 1.1272189886850277, "grad_norm": 5.53125, "learning_rate": 4.363267375908631e-06, "loss": 1.04789429, "memory(GiB)": 141.16, "step": 100780, "train_speed(iter/s)": 0.28957 }, { "acc": 0.7526967, "epoch": 1.1274426876309862, "grad_norm": 7.65625, "learning_rate": 4.3614330561394995e-06, "loss": 0.9803196, "memory(GiB)": 141.16, "step": 100800, "train_speed(iter/s)": 0.289589 }, { "acc": 0.72691994, "epoch": 1.1276663865769447, "grad_norm": 7.9375, "learning_rate": 4.3595988237354535e-06, "loss": 1.10162096, "memory(GiB)": 141.16, "step": 100820, "train_speed(iter/s)": 0.289607 }, { "acc": 0.73641176, "epoch": 1.1278900855229033, "grad_norm": 6.9375, "learning_rate": 4.357764678947441e-06, "loss": 1.05169239, "memory(GiB)": 141.16, "step": 100840, "train_speed(iter/s)": 0.289626 }, { "acc": 0.73637104, "epoch": 1.1281137844688618, "grad_norm": 7.71875, "learning_rate": 4.3559306220264e-06, "loss": 1.04568834, "memory(GiB)": 141.16, "step": 100860, "train_speed(iter/s)": 0.289644 }, { "acc": 0.74171944, "epoch": 1.1283374834148203, "grad_norm": 6.4375, "learning_rate": 4.354096653223255e-06, "loss": 1.02990417, "memory(GiB)": 141.16, "step": 100880, "train_speed(iter/s)": 0.289666 }, { "acc": 0.73680973, "epoch": 1.1285611823607788, "grad_norm": 5.75, "learning_rate": 4.352262772788921e-06, "loss": 1.0698164, "memory(GiB)": 141.16, "step": 100900, "train_speed(iter/s)": 0.289685 }, { "acc": 0.73531237, "epoch": 1.1287848813067374, "grad_norm": 11.6875, "learning_rate": 4.350428980974299e-06, "loss": 1.06834202, "memory(GiB)": 141.16, "step": 100920, "train_speed(iter/s)": 0.289705 }, { "acc": 0.74790506, "epoch": 1.129008580252696, "grad_norm": 6.21875, "learning_rate": 4.348595278030276e-06, "loss": 1.01473274, "memory(GiB)": 141.16, "step": 100940, "train_speed(iter/s)": 0.289725 }, { "acc": 0.73804278, "epoch": 1.1292322791986544, "grad_norm": 8.0, "learning_rate": 4.346761664207728e-06, "loss": 1.04908228, "memory(GiB)": 141.16, "step": 100960, "train_speed(iter/s)": 0.289743 }, { "acc": 0.7410604, "epoch": 1.129455978144613, "grad_norm": 7.5, "learning_rate": 4.344928139757523e-06, "loss": 1.03141003, "memory(GiB)": 141.16, "step": 100980, "train_speed(iter/s)": 0.289764 }, { "acc": 0.73978844, "epoch": 1.1296796770905715, "grad_norm": 6.9375, "learning_rate": 4.343094704930512e-06, "loss": 1.04257717, "memory(GiB)": 141.16, "step": 101000, "train_speed(iter/s)": 0.289783 }, { "acc": 0.74038777, "epoch": 1.12990337603653, "grad_norm": 6.34375, "learning_rate": 4.341261359977534e-06, "loss": 1.03887749, "memory(GiB)": 141.16, "step": 101020, "train_speed(iter/s)": 0.289803 }, { "acc": 0.7325069, "epoch": 1.1301270749824885, "grad_norm": 7.90625, "learning_rate": 4.339428105149418e-06, "loss": 1.05609961, "memory(GiB)": 141.16, "step": 101040, "train_speed(iter/s)": 0.289824 }, { "acc": 0.72809563, "epoch": 1.130350773928447, "grad_norm": 5.71875, "learning_rate": 4.337594940696978e-06, "loss": 1.08660717, "memory(GiB)": 141.16, "step": 101060, "train_speed(iter/s)": 0.289844 }, { "acc": 0.74060488, "epoch": 1.1305744728744056, "grad_norm": 7.75, "learning_rate": 4.335761866871018e-06, "loss": 1.03387718, "memory(GiB)": 141.16, "step": 101080, "train_speed(iter/s)": 0.289864 }, { "acc": 0.73140469, "epoch": 1.1307981718203641, "grad_norm": 6.53125, "learning_rate": 4.333928883922329e-06, "loss": 1.07184582, "memory(GiB)": 141.16, "step": 101100, "train_speed(iter/s)": 0.289883 }, { "acc": 0.72882023, "epoch": 1.1310218707663227, "grad_norm": 7.46875, "learning_rate": 4.332095992101691e-06, "loss": 1.08683853, "memory(GiB)": 141.16, "step": 101120, "train_speed(iter/s)": 0.289901 }, { "acc": 0.74294214, "epoch": 1.1312455697122812, "grad_norm": 8.1875, "learning_rate": 4.330263191659866e-06, "loss": 1.03597107, "memory(GiB)": 141.16, "step": 101140, "train_speed(iter/s)": 0.28992 }, { "acc": 0.73473454, "epoch": 1.1314692686582397, "grad_norm": 5.59375, "learning_rate": 4.328430482847609e-06, "loss": 1.06937714, "memory(GiB)": 141.16, "step": 101160, "train_speed(iter/s)": 0.289939 }, { "acc": 0.74080219, "epoch": 1.1316929676041982, "grad_norm": 7.09375, "learning_rate": 4.326597865915661e-06, "loss": 1.04558392, "memory(GiB)": 141.16, "step": 101180, "train_speed(iter/s)": 0.289957 }, { "acc": 0.74640579, "epoch": 1.1319166665501568, "grad_norm": 8.125, "learning_rate": 4.32476534111475e-06, "loss": 1.03040047, "memory(GiB)": 141.16, "step": 101200, "train_speed(iter/s)": 0.289975 }, { "acc": 0.74337549, "epoch": 1.1321403654961153, "grad_norm": 6.4375, "learning_rate": 4.322932908695593e-06, "loss": 1.01858234, "memory(GiB)": 141.16, "step": 101220, "train_speed(iter/s)": 0.289993 }, { "acc": 0.72352304, "epoch": 1.1323640644420738, "grad_norm": 6.75, "learning_rate": 4.3211005689088904e-06, "loss": 1.11619778, "memory(GiB)": 141.16, "step": 101240, "train_speed(iter/s)": 0.290013 }, { "acc": 0.74164944, "epoch": 1.1325877633880324, "grad_norm": 8.5, "learning_rate": 4.319268322005333e-06, "loss": 1.03644142, "memory(GiB)": 141.16, "step": 101260, "train_speed(iter/s)": 0.290031 }, { "acc": 0.73366299, "epoch": 1.1328114623339909, "grad_norm": 7.375, "learning_rate": 4.3174361682356e-06, "loss": 1.06064644, "memory(GiB)": 141.16, "step": 101280, "train_speed(iter/s)": 0.290047 }, { "acc": 0.73058271, "epoch": 1.1330351612799494, "grad_norm": 6.25, "learning_rate": 4.315604107850355e-06, "loss": 1.07669878, "memory(GiB)": 141.16, "step": 101300, "train_speed(iter/s)": 0.290066 }, { "acc": 0.73355503, "epoch": 1.133258860225908, "grad_norm": 7.59375, "learning_rate": 4.313772141100251e-06, "loss": 1.07397213, "memory(GiB)": 141.16, "step": 101320, "train_speed(iter/s)": 0.290081 }, { "acc": 0.72436085, "epoch": 1.1334825591718665, "grad_norm": 8.1875, "learning_rate": 4.311940268235926e-06, "loss": 1.10824671, "memory(GiB)": 141.16, "step": 101340, "train_speed(iter/s)": 0.290102 }, { "acc": 0.73941708, "epoch": 1.133706258117825, "grad_norm": 8.3125, "learning_rate": 4.310108489508007e-06, "loss": 1.04013815, "memory(GiB)": 141.16, "step": 101360, "train_speed(iter/s)": 0.290122 }, { "acc": 0.7406064, "epoch": 1.1339299570637835, "grad_norm": 7.0625, "learning_rate": 4.308276805167107e-06, "loss": 1.02334137, "memory(GiB)": 141.16, "step": 101380, "train_speed(iter/s)": 0.290142 }, { "acc": 0.745785, "epoch": 1.134153656009742, "grad_norm": 7.25, "learning_rate": 4.306445215463827e-06, "loss": 1.00052681, "memory(GiB)": 141.16, "step": 101400, "train_speed(iter/s)": 0.290162 }, { "acc": 0.72962689, "epoch": 1.1343773549557006, "grad_norm": 5.28125, "learning_rate": 4.304613720648756e-06, "loss": 1.10466957, "memory(GiB)": 141.16, "step": 101420, "train_speed(iter/s)": 0.290179 }, { "acc": 0.7280448, "epoch": 1.134601053901659, "grad_norm": 7.21875, "learning_rate": 4.302782320972467e-06, "loss": 1.09900036, "memory(GiB)": 141.16, "step": 101440, "train_speed(iter/s)": 0.290197 }, { "acc": 0.732864, "epoch": 1.1348247528476176, "grad_norm": 7.8125, "learning_rate": 4.300951016685521e-06, "loss": 1.04770193, "memory(GiB)": 141.16, "step": 101460, "train_speed(iter/s)": 0.290219 }, { "acc": 0.72575297, "epoch": 1.1350484517935762, "grad_norm": 6.34375, "learning_rate": 4.299119808038468e-06, "loss": 1.09562969, "memory(GiB)": 141.16, "step": 101480, "train_speed(iter/s)": 0.290237 }, { "acc": 0.73546724, "epoch": 1.1352721507395347, "grad_norm": 5.6875, "learning_rate": 4.297288695281843e-06, "loss": 1.05944176, "memory(GiB)": 141.16, "step": 101500, "train_speed(iter/s)": 0.290255 }, { "acc": 0.73132, "epoch": 1.1354958496854932, "grad_norm": 7.125, "learning_rate": 4.295457678666169e-06, "loss": 1.07572212, "memory(GiB)": 141.16, "step": 101520, "train_speed(iter/s)": 0.290275 }, { "acc": 0.73405428, "epoch": 1.1357195486314517, "grad_norm": 9.6875, "learning_rate": 4.293626758441955e-06, "loss": 1.06363373, "memory(GiB)": 141.16, "step": 101540, "train_speed(iter/s)": 0.290294 }, { "acc": 0.74578762, "epoch": 1.1359432475774103, "grad_norm": 5.34375, "learning_rate": 4.291795934859697e-06, "loss": 1.00621843, "memory(GiB)": 141.16, "step": 101560, "train_speed(iter/s)": 0.290314 }, { "acc": 0.73240309, "epoch": 1.1361669465233688, "grad_norm": 5.53125, "learning_rate": 4.289965208169877e-06, "loss": 1.07532778, "memory(GiB)": 141.16, "step": 101580, "train_speed(iter/s)": 0.290332 }, { "acc": 0.74510775, "epoch": 1.1363906454693273, "grad_norm": 6.25, "learning_rate": 4.288134578622965e-06, "loss": 1.01298141, "memory(GiB)": 141.16, "step": 101600, "train_speed(iter/s)": 0.29035 }, { "acc": 0.73949852, "epoch": 1.1366143444152859, "grad_norm": 5.34375, "learning_rate": 4.286304046469418e-06, "loss": 1.04794426, "memory(GiB)": 141.16, "step": 101620, "train_speed(iter/s)": 0.29037 }, { "acc": 0.72736645, "epoch": 1.1368380433612444, "grad_norm": 5.71875, "learning_rate": 4.284473611959679e-06, "loss": 1.0858757, "memory(GiB)": 141.16, "step": 101640, "train_speed(iter/s)": 0.290389 }, { "acc": 0.74424934, "epoch": 1.137061742307203, "grad_norm": 7.125, "learning_rate": 4.2826432753441764e-06, "loss": 1.0257267, "memory(GiB)": 141.16, "step": 101660, "train_speed(iter/s)": 0.29041 }, { "acc": 0.72635489, "epoch": 1.1372854412531614, "grad_norm": 7.46875, "learning_rate": 4.280813036873327e-06, "loss": 1.09551811, "memory(GiB)": 141.16, "step": 101680, "train_speed(iter/s)": 0.290428 }, { "acc": 0.7339838, "epoch": 1.13750914019912, "grad_norm": 6.375, "learning_rate": 4.278982896797535e-06, "loss": 1.07627964, "memory(GiB)": 141.16, "step": 101700, "train_speed(iter/s)": 0.290449 }, { "acc": 0.74295788, "epoch": 1.1377328391450785, "grad_norm": 6.03125, "learning_rate": 4.277152855367186e-06, "loss": 1.04673166, "memory(GiB)": 141.16, "step": 101720, "train_speed(iter/s)": 0.290466 }, { "acc": 0.72695799, "epoch": 1.137956538091037, "grad_norm": 9.25, "learning_rate": 4.275322912832661e-06, "loss": 1.09577789, "memory(GiB)": 141.16, "step": 101740, "train_speed(iter/s)": 0.290487 }, { "acc": 0.73320107, "epoch": 1.1381802370369956, "grad_norm": 7.25, "learning_rate": 4.273493069444318e-06, "loss": 1.05545673, "memory(GiB)": 141.16, "step": 101760, "train_speed(iter/s)": 0.290505 }, { "acc": 0.72040281, "epoch": 1.138403935982954, "grad_norm": 7.09375, "learning_rate": 4.271663325452508e-06, "loss": 1.12452183, "memory(GiB)": 141.16, "step": 101780, "train_speed(iter/s)": 0.290522 }, { "acc": 0.74339414, "epoch": 1.1386276349289126, "grad_norm": 6.65625, "learning_rate": 4.269833681107567e-06, "loss": 1.02790899, "memory(GiB)": 141.16, "step": 101800, "train_speed(iter/s)": 0.29054 }, { "acc": 0.71986341, "epoch": 1.1388513338748711, "grad_norm": 6.46875, "learning_rate": 4.268004136659813e-06, "loss": 1.12799292, "memory(GiB)": 141.16, "step": 101820, "train_speed(iter/s)": 0.290557 }, { "acc": 0.71908045, "epoch": 1.1390750328208297, "grad_norm": 6.28125, "learning_rate": 4.2661746923595545e-06, "loss": 1.14404373, "memory(GiB)": 141.16, "step": 101840, "train_speed(iter/s)": 0.290575 }, { "acc": 0.73774767, "epoch": 1.1392987317667882, "grad_norm": 6.59375, "learning_rate": 4.2643453484570875e-06, "loss": 1.05780563, "memory(GiB)": 141.16, "step": 101860, "train_speed(iter/s)": 0.290588 }, { "acc": 0.72204909, "epoch": 1.1395224307127467, "grad_norm": 4.4375, "learning_rate": 4.262516105202694e-06, "loss": 1.10857925, "memory(GiB)": 141.16, "step": 101880, "train_speed(iter/s)": 0.290605 }, { "acc": 0.74389839, "epoch": 1.1397461296587053, "grad_norm": 6.53125, "learning_rate": 4.260686962846636e-06, "loss": 1.01657219, "memory(GiB)": 141.16, "step": 101900, "train_speed(iter/s)": 0.290625 }, { "acc": 0.7387414, "epoch": 1.1399698286046638, "grad_norm": 5.96875, "learning_rate": 4.258857921639169e-06, "loss": 1.03573265, "memory(GiB)": 141.16, "step": 101920, "train_speed(iter/s)": 0.290645 }, { "acc": 0.73529034, "epoch": 1.1401935275506223, "grad_norm": 5.875, "learning_rate": 4.257028981830532e-06, "loss": 1.06795435, "memory(GiB)": 141.16, "step": 101940, "train_speed(iter/s)": 0.290666 }, { "acc": 0.73716383, "epoch": 1.1404172264965808, "grad_norm": 6.28125, "learning_rate": 4.25520014367095e-06, "loss": 1.04687729, "memory(GiB)": 141.16, "step": 101960, "train_speed(iter/s)": 0.290686 }, { "acc": 0.73747473, "epoch": 1.1406409254425394, "grad_norm": 6.46875, "learning_rate": 4.253371407410634e-06, "loss": 1.04900894, "memory(GiB)": 141.16, "step": 101980, "train_speed(iter/s)": 0.290704 }, { "acc": 0.74284749, "epoch": 1.140864624388498, "grad_norm": 7.28125, "learning_rate": 4.251542773299781e-06, "loss": 1.03391476, "memory(GiB)": 141.16, "step": 102000, "train_speed(iter/s)": 0.290722 }, { "epoch": 1.140864624388498, "eval_acc": 0.6901338956306994, "eval_loss": 1.079257845878601, "eval_runtime": 2320.5314, "eval_samples_per_second": 32.442, "eval_steps_per_second": 16.221, "step": 102000 }, { "acc": 0.72912164, "epoch": 1.1410883233344564, "grad_norm": 7.71875, "learning_rate": 4.249714241588575e-06, "loss": 1.0825511, "memory(GiB)": 141.16, "step": 102020, "train_speed(iter/s)": 0.288789 }, { "acc": 0.72994518, "epoch": 1.141312022280415, "grad_norm": 8.3125, "learning_rate": 4.247885812527184e-06, "loss": 1.09581747, "memory(GiB)": 141.16, "step": 102040, "train_speed(iter/s)": 0.28881 }, { "acc": 0.7475358, "epoch": 1.1415357212263735, "grad_norm": 6.71875, "learning_rate": 4.246057486365764e-06, "loss": 1.0124815, "memory(GiB)": 141.16, "step": 102060, "train_speed(iter/s)": 0.288829 }, { "acc": 0.7199316, "epoch": 1.141759420172332, "grad_norm": 6.34375, "learning_rate": 4.244229263354458e-06, "loss": 1.12517891, "memory(GiB)": 141.16, "step": 102080, "train_speed(iter/s)": 0.28885 }, { "acc": 0.73988123, "epoch": 1.1419831191182905, "grad_norm": 7.15625, "learning_rate": 4.242401143743389e-06, "loss": 1.0161231, "memory(GiB)": 141.16, "step": 102100, "train_speed(iter/s)": 0.288869 }, { "acc": 0.73618431, "epoch": 1.142206818064249, "grad_norm": 7.0, "learning_rate": 4.240573127782673e-06, "loss": 1.05545807, "memory(GiB)": 141.16, "step": 102120, "train_speed(iter/s)": 0.288885 }, { "acc": 0.7242878, "epoch": 1.1424305170102076, "grad_norm": 7.59375, "learning_rate": 4.238745215722407e-06, "loss": 1.11068859, "memory(GiB)": 141.16, "step": 102140, "train_speed(iter/s)": 0.288903 }, { "acc": 0.73200884, "epoch": 1.1426542159561661, "grad_norm": 7.25, "learning_rate": 4.2369174078126775e-06, "loss": 1.06883831, "memory(GiB)": 141.16, "step": 102160, "train_speed(iter/s)": 0.28892 }, { "acc": 0.73080702, "epoch": 1.1428779149021246, "grad_norm": 5.75, "learning_rate": 4.235089704303554e-06, "loss": 1.0804512, "memory(GiB)": 141.16, "step": 102180, "train_speed(iter/s)": 0.288938 }, { "acc": 0.73521066, "epoch": 1.1431016138480832, "grad_norm": 6.84375, "learning_rate": 4.23326210544509e-06, "loss": 1.06459999, "memory(GiB)": 141.16, "step": 102200, "train_speed(iter/s)": 0.288958 }, { "acc": 0.72583227, "epoch": 1.1433253127940417, "grad_norm": 8.625, "learning_rate": 4.23143461148733e-06, "loss": 1.10744991, "memory(GiB)": 141.16, "step": 102220, "train_speed(iter/s)": 0.288977 }, { "acc": 0.7373415, "epoch": 1.1435490117400002, "grad_norm": 8.5, "learning_rate": 4.2296072226803016e-06, "loss": 1.06617184, "memory(GiB)": 141.16, "step": 102240, "train_speed(iter/s)": 0.288996 }, { "acc": 0.74645867, "epoch": 1.1437727106859588, "grad_norm": 7.3125, "learning_rate": 4.227779939274016e-06, "loss": 1.01386757, "memory(GiB)": 141.16, "step": 102260, "train_speed(iter/s)": 0.289015 }, { "acc": 0.73575172, "epoch": 1.1439964096319173, "grad_norm": 6.40625, "learning_rate": 4.225952761518472e-06, "loss": 1.04703226, "memory(GiB)": 141.16, "step": 102280, "train_speed(iter/s)": 0.289035 }, { "acc": 0.72588253, "epoch": 1.1442201085778758, "grad_norm": 5.84375, "learning_rate": 4.224125689663655e-06, "loss": 1.09783964, "memory(GiB)": 141.16, "step": 102300, "train_speed(iter/s)": 0.289054 }, { "acc": 0.73922882, "epoch": 1.1444438075238343, "grad_norm": 7.6875, "learning_rate": 4.2222987239595316e-06, "loss": 1.02518272, "memory(GiB)": 141.16, "step": 102320, "train_speed(iter/s)": 0.289071 }, { "acc": 0.73871193, "epoch": 1.1446675064697929, "grad_norm": 6.8125, "learning_rate": 4.220471864656059e-06, "loss": 1.04754219, "memory(GiB)": 141.16, "step": 102340, "train_speed(iter/s)": 0.289092 }, { "acc": 0.72708292, "epoch": 1.1448912054157514, "grad_norm": 8.125, "learning_rate": 4.218645112003178e-06, "loss": 1.10227642, "memory(GiB)": 141.16, "step": 102360, "train_speed(iter/s)": 0.289109 }, { "acc": 0.73917432, "epoch": 1.14511490436171, "grad_norm": 6.09375, "learning_rate": 4.216818466250815e-06, "loss": 1.04522953, "memory(GiB)": 141.16, "step": 102380, "train_speed(iter/s)": 0.28913 }, { "acc": 0.73832245, "epoch": 1.1453386033076685, "grad_norm": 7.90625, "learning_rate": 4.214991927648878e-06, "loss": 1.05552292, "memory(GiB)": 141.16, "step": 102400, "train_speed(iter/s)": 0.289149 }, { "acc": 0.74191661, "epoch": 1.145562302253627, "grad_norm": 6.1875, "learning_rate": 4.213165496447267e-06, "loss": 1.04477081, "memory(GiB)": 141.16, "step": 102420, "train_speed(iter/s)": 0.289166 }, { "acc": 0.72842884, "epoch": 1.1457860011995855, "grad_norm": 5.96875, "learning_rate": 4.211339172895861e-06, "loss": 1.09162607, "memory(GiB)": 141.16, "step": 102440, "train_speed(iter/s)": 0.289183 }, { "acc": 0.7193161, "epoch": 1.146009700145544, "grad_norm": 6.625, "learning_rate": 4.2095129572445295e-06, "loss": 1.15498991, "memory(GiB)": 141.16, "step": 102460, "train_speed(iter/s)": 0.289202 }, { "acc": 0.71875954, "epoch": 1.1462333990915026, "grad_norm": 7.0625, "learning_rate": 4.207686849743125e-06, "loss": 1.12690315, "memory(GiB)": 141.16, "step": 102480, "train_speed(iter/s)": 0.289221 }, { "acc": 0.72926202, "epoch": 1.146457098037461, "grad_norm": 5.625, "learning_rate": 4.205860850641484e-06, "loss": 1.09490223, "memory(GiB)": 141.16, "step": 102500, "train_speed(iter/s)": 0.289241 }, { "acc": 0.73244338, "epoch": 1.1466807969834196, "grad_norm": 7.125, "learning_rate": 4.204034960189428e-06, "loss": 1.06289549, "memory(GiB)": 141.16, "step": 102520, "train_speed(iter/s)": 0.289262 }, { "acc": 0.73427491, "epoch": 1.1469044959293782, "grad_norm": 7.5, "learning_rate": 4.202209178636768e-06, "loss": 1.08285923, "memory(GiB)": 141.16, "step": 102540, "train_speed(iter/s)": 0.289279 }, { "acc": 0.73768644, "epoch": 1.1471281948753367, "grad_norm": 6.65625, "learning_rate": 4.200383506233295e-06, "loss": 1.04861059, "memory(GiB)": 141.16, "step": 102560, "train_speed(iter/s)": 0.289299 }, { "acc": 0.72641296, "epoch": 1.1473518938212952, "grad_norm": 6.65625, "learning_rate": 4.198557943228787e-06, "loss": 1.10197468, "memory(GiB)": 141.16, "step": 102580, "train_speed(iter/s)": 0.289318 }, { "acc": 0.72636185, "epoch": 1.1475755927672537, "grad_norm": 6.96875, "learning_rate": 4.1967324898730085e-06, "loss": 1.09402313, "memory(GiB)": 141.16, "step": 102600, "train_speed(iter/s)": 0.289339 }, { "acc": 0.74071341, "epoch": 1.1477992917132123, "grad_norm": 9.1875, "learning_rate": 4.194907146415706e-06, "loss": 1.02298117, "memory(GiB)": 141.16, "step": 102620, "train_speed(iter/s)": 0.289358 }, { "acc": 0.73453026, "epoch": 1.1480229906591708, "grad_norm": 7.96875, "learning_rate": 4.193081913106613e-06, "loss": 1.07652607, "memory(GiB)": 141.16, "step": 102640, "train_speed(iter/s)": 0.289375 }, { "acc": 0.73757668, "epoch": 1.1482466896051293, "grad_norm": 5.375, "learning_rate": 4.191256790195448e-06, "loss": 1.04836082, "memory(GiB)": 141.16, "step": 102660, "train_speed(iter/s)": 0.289391 }, { "acc": 0.72925873, "epoch": 1.1484703885510879, "grad_norm": 8.125, "learning_rate": 4.189431777931915e-06, "loss": 1.0962554, "memory(GiB)": 141.16, "step": 102680, "train_speed(iter/s)": 0.28941 }, { "acc": 0.73033032, "epoch": 1.1486940874970464, "grad_norm": 6.75, "learning_rate": 4.1876068765657e-06, "loss": 1.08234482, "memory(GiB)": 141.16, "step": 102700, "train_speed(iter/s)": 0.289429 }, { "acc": 0.72545576, "epoch": 1.148917786443005, "grad_norm": 7.5625, "learning_rate": 4.185782086346475e-06, "loss": 1.12189779, "memory(GiB)": 141.16, "step": 102720, "train_speed(iter/s)": 0.289445 }, { "acc": 0.74758844, "epoch": 1.1491414853889634, "grad_norm": 8.4375, "learning_rate": 4.183957407523899e-06, "loss": 1.00023966, "memory(GiB)": 141.16, "step": 102740, "train_speed(iter/s)": 0.289466 }, { "acc": 0.73263836, "epoch": 1.149365184334922, "grad_norm": 5.84375, "learning_rate": 4.182132840347613e-06, "loss": 1.07702208, "memory(GiB)": 141.16, "step": 102760, "train_speed(iter/s)": 0.289486 }, { "acc": 0.73929424, "epoch": 1.1495888832808805, "grad_norm": 7.28125, "learning_rate": 4.180308385067246e-06, "loss": 1.03597984, "memory(GiB)": 141.16, "step": 102780, "train_speed(iter/s)": 0.289506 }, { "acc": 0.73252897, "epoch": 1.149812582226839, "grad_norm": 6.4375, "learning_rate": 4.178484041932406e-06, "loss": 1.08025923, "memory(GiB)": 141.16, "step": 102800, "train_speed(iter/s)": 0.289522 }, { "acc": 0.73053617, "epoch": 1.1500362811727975, "grad_norm": 7.09375, "learning_rate": 4.1766598111926926e-06, "loss": 1.08908682, "memory(GiB)": 141.16, "step": 102820, "train_speed(iter/s)": 0.28954 }, { "acc": 0.72888069, "epoch": 1.150259980118756, "grad_norm": 7.34375, "learning_rate": 4.174835693097685e-06, "loss": 1.07869377, "memory(GiB)": 141.16, "step": 102840, "train_speed(iter/s)": 0.289561 }, { "acc": 0.7386755, "epoch": 1.1504836790647146, "grad_norm": 6.875, "learning_rate": 4.173011687896949e-06, "loss": 1.03244781, "memory(GiB)": 141.16, "step": 102860, "train_speed(iter/s)": 0.289582 }, { "acc": 0.72506289, "epoch": 1.1507073780106731, "grad_norm": 7.28125, "learning_rate": 4.171187795840035e-06, "loss": 1.11423912, "memory(GiB)": 141.16, "step": 102880, "train_speed(iter/s)": 0.289598 }, { "acc": 0.74091444, "epoch": 1.1509310769566317, "grad_norm": 8.4375, "learning_rate": 4.1693640171764756e-06, "loss": 1.04242725, "memory(GiB)": 141.16, "step": 102900, "train_speed(iter/s)": 0.289616 }, { "acc": 0.74950676, "epoch": 1.1511547759025902, "grad_norm": 7.125, "learning_rate": 4.1675403521557916e-06, "loss": 0.99447165, "memory(GiB)": 141.16, "step": 102920, "train_speed(iter/s)": 0.289634 }, { "acc": 0.74623938, "epoch": 1.1513784748485487, "grad_norm": 7.03125, "learning_rate": 4.165716801027486e-06, "loss": 1.00766554, "memory(GiB)": 141.16, "step": 102940, "train_speed(iter/s)": 0.289654 }, { "acc": 0.7246532, "epoch": 1.1516021737945072, "grad_norm": 7.625, "learning_rate": 4.1638933640410465e-06, "loss": 1.10356979, "memory(GiB)": 141.16, "step": 102960, "train_speed(iter/s)": 0.289673 }, { "acc": 0.73673673, "epoch": 1.1518258727404658, "grad_norm": 9.25, "learning_rate": 4.162070041445948e-06, "loss": 1.05924873, "memory(GiB)": 141.16, "step": 102980, "train_speed(iter/s)": 0.28969 }, { "acc": 0.74387674, "epoch": 1.1520495716864243, "grad_norm": 8.625, "learning_rate": 4.160246833491642e-06, "loss": 1.0221056, "memory(GiB)": 141.16, "step": 103000, "train_speed(iter/s)": 0.289707 }, { "acc": 0.73470402, "epoch": 1.1522732706323828, "grad_norm": 5.59375, "learning_rate": 4.158423740427574e-06, "loss": 1.06531849, "memory(GiB)": 141.16, "step": 103020, "train_speed(iter/s)": 0.289726 }, { "acc": 0.7560998, "epoch": 1.1524969695783414, "grad_norm": 7.40625, "learning_rate": 4.156600762503166e-06, "loss": 0.97226963, "memory(GiB)": 141.16, "step": 103040, "train_speed(iter/s)": 0.289748 }, { "acc": 0.72461319, "epoch": 1.1527206685242999, "grad_norm": 8.25, "learning_rate": 4.1547778999678275e-06, "loss": 1.10942659, "memory(GiB)": 141.16, "step": 103060, "train_speed(iter/s)": 0.289768 }, { "acc": 0.73143325, "epoch": 1.1529443674702584, "grad_norm": 6.8125, "learning_rate": 4.152955153070954e-06, "loss": 1.08016491, "memory(GiB)": 141.16, "step": 103080, "train_speed(iter/s)": 0.289785 }, { "acc": 0.72959328, "epoch": 1.153168066416217, "grad_norm": 7.0, "learning_rate": 4.151132522061923e-06, "loss": 1.10085335, "memory(GiB)": 141.16, "step": 103100, "train_speed(iter/s)": 0.289804 }, { "acc": 0.73237362, "epoch": 1.1533917653621755, "grad_norm": 7.65625, "learning_rate": 4.149310007190097e-06, "loss": 1.05942535, "memory(GiB)": 141.16, "step": 103120, "train_speed(iter/s)": 0.289821 }, { "acc": 0.74464598, "epoch": 1.153615464308134, "grad_norm": 6.84375, "learning_rate": 4.14748760870482e-06, "loss": 1.01979094, "memory(GiB)": 141.16, "step": 103140, "train_speed(iter/s)": 0.28984 }, { "acc": 0.73493104, "epoch": 1.1538391632540925, "grad_norm": 7.15625, "learning_rate": 4.145665326855423e-06, "loss": 1.06970158, "memory(GiB)": 141.16, "step": 103160, "train_speed(iter/s)": 0.289858 }, { "acc": 0.74716845, "epoch": 1.154062862200051, "grad_norm": 6.125, "learning_rate": 4.14384316189122e-06, "loss": 1.02028484, "memory(GiB)": 141.16, "step": 103180, "train_speed(iter/s)": 0.289873 }, { "acc": 0.7356513, "epoch": 1.1542865611460096, "grad_norm": 7.25, "learning_rate": 4.142021114061511e-06, "loss": 1.04424572, "memory(GiB)": 141.16, "step": 103200, "train_speed(iter/s)": 0.289891 }, { "acc": 0.73185692, "epoch": 1.154510260091968, "grad_norm": 9.0, "learning_rate": 4.140199183615578e-06, "loss": 1.08443451, "memory(GiB)": 141.16, "step": 103220, "train_speed(iter/s)": 0.289909 }, { "acc": 0.74474511, "epoch": 1.1547339590379266, "grad_norm": 6.09375, "learning_rate": 4.138377370802684e-06, "loss": 1.0111433, "memory(GiB)": 141.16, "step": 103240, "train_speed(iter/s)": 0.289929 }, { "acc": 0.72997503, "epoch": 1.1549576579838852, "grad_norm": 8.6875, "learning_rate": 4.136555675872082e-06, "loss": 1.07617073, "memory(GiB)": 141.16, "step": 103260, "train_speed(iter/s)": 0.289948 }, { "acc": 0.74311352, "epoch": 1.1551813569298437, "grad_norm": 6.28125, "learning_rate": 4.134734099073005e-06, "loss": 1.01908846, "memory(GiB)": 141.16, "step": 103280, "train_speed(iter/s)": 0.289966 }, { "acc": 0.73929081, "epoch": 1.1554050558758022, "grad_norm": 7.53125, "learning_rate": 4.132912640654671e-06, "loss": 1.04566116, "memory(GiB)": 141.16, "step": 103300, "train_speed(iter/s)": 0.289982 }, { "acc": 0.73611355, "epoch": 1.1556287548217608, "grad_norm": 7.71875, "learning_rate": 4.131091300866281e-06, "loss": 1.06423855, "memory(GiB)": 141.16, "step": 103320, "train_speed(iter/s)": 0.290002 }, { "acc": 0.74579592, "epoch": 1.1558524537677193, "grad_norm": 7.78125, "learning_rate": 4.12927007995702e-06, "loss": 1.00936279, "memory(GiB)": 141.16, "step": 103340, "train_speed(iter/s)": 0.290018 }, { "acc": 0.72989597, "epoch": 1.156076152713678, "grad_norm": 7.34375, "learning_rate": 4.127448978176058e-06, "loss": 1.07766752, "memory(GiB)": 141.16, "step": 103360, "train_speed(iter/s)": 0.290037 }, { "acc": 0.73425317, "epoch": 1.1562998516596366, "grad_norm": 6.625, "learning_rate": 4.125627995772547e-06, "loss": 1.07471333, "memory(GiB)": 141.16, "step": 103380, "train_speed(iter/s)": 0.290057 }, { "acc": 0.73990049, "epoch": 1.156523550605595, "grad_norm": 8.0625, "learning_rate": 4.123807132995625e-06, "loss": 1.0381484, "memory(GiB)": 141.16, "step": 103400, "train_speed(iter/s)": 0.290076 }, { "acc": 0.73318548, "epoch": 1.1567472495515536, "grad_norm": 6.6875, "learning_rate": 4.121986390094412e-06, "loss": 1.08283577, "memory(GiB)": 141.16, "step": 103420, "train_speed(iter/s)": 0.290094 }, { "acc": 0.73136473, "epoch": 1.1569709484975121, "grad_norm": 6.15625, "learning_rate": 4.1201657673180075e-06, "loss": 1.08710804, "memory(GiB)": 141.16, "step": 103440, "train_speed(iter/s)": 0.290113 }, { "acc": 0.73332901, "epoch": 1.1571946474434707, "grad_norm": 6.71875, "learning_rate": 4.118345264915503e-06, "loss": 1.07193165, "memory(GiB)": 141.16, "step": 103460, "train_speed(iter/s)": 0.290131 }, { "acc": 0.72377901, "epoch": 1.1574183463894292, "grad_norm": 5.9375, "learning_rate": 4.1165248831359675e-06, "loss": 1.10417271, "memory(GiB)": 141.16, "step": 103480, "train_speed(iter/s)": 0.29015 }, { "acc": 0.73879924, "epoch": 1.1576420453353877, "grad_norm": 7.375, "learning_rate": 4.1147046222284564e-06, "loss": 1.05768394, "memory(GiB)": 141.16, "step": 103500, "train_speed(iter/s)": 0.29017 }, { "acc": 0.73721571, "epoch": 1.1578657442813463, "grad_norm": 7.75, "learning_rate": 4.1128844824420075e-06, "loss": 1.05281487, "memory(GiB)": 141.16, "step": 103520, "train_speed(iter/s)": 0.290189 }, { "acc": 0.73842888, "epoch": 1.1580894432273048, "grad_norm": 6.90625, "learning_rate": 4.111064464025641e-06, "loss": 1.03946762, "memory(GiB)": 141.16, "step": 103540, "train_speed(iter/s)": 0.290206 }, { "acc": 0.74844279, "epoch": 1.1583131421732633, "grad_norm": 7.96875, "learning_rate": 4.10924456722836e-06, "loss": 1.00973091, "memory(GiB)": 141.16, "step": 103560, "train_speed(iter/s)": 0.290226 }, { "acc": 0.73783636, "epoch": 1.1585368411192218, "grad_norm": 7.5, "learning_rate": 4.107424792299155e-06, "loss": 1.05706825, "memory(GiB)": 141.16, "step": 103580, "train_speed(iter/s)": 0.29024 }, { "acc": 0.7299263, "epoch": 1.1587605400651804, "grad_norm": 7.65625, "learning_rate": 4.105605139486997e-06, "loss": 1.1034008, "memory(GiB)": 141.16, "step": 103600, "train_speed(iter/s)": 0.290258 }, { "acc": 0.74133987, "epoch": 1.158984239011139, "grad_norm": 5.96875, "learning_rate": 4.10378560904084e-06, "loss": 1.03405075, "memory(GiB)": 141.16, "step": 103620, "train_speed(iter/s)": 0.290279 }, { "acc": 0.73634338, "epoch": 1.1592079379570974, "grad_norm": 5.875, "learning_rate": 4.10196620120962e-06, "loss": 1.07611828, "memory(GiB)": 141.16, "step": 103640, "train_speed(iter/s)": 0.290298 }, { "acc": 0.74909091, "epoch": 1.159431636903056, "grad_norm": 7.28125, "learning_rate": 4.10014691624226e-06, "loss": 0.99064541, "memory(GiB)": 141.16, "step": 103660, "train_speed(iter/s)": 0.290315 }, { "acc": 0.74696803, "epoch": 1.1596553358490145, "grad_norm": 8.125, "learning_rate": 4.098327754387664e-06, "loss": 1.01950397, "memory(GiB)": 141.16, "step": 103680, "train_speed(iter/s)": 0.290333 }, { "acc": 0.73187275, "epoch": 1.159879034794973, "grad_norm": 8.125, "learning_rate": 4.096508715894718e-06, "loss": 1.08600712, "memory(GiB)": 141.16, "step": 103700, "train_speed(iter/s)": 0.290351 }, { "acc": 0.74819064, "epoch": 1.1601027337409315, "grad_norm": 6.9375, "learning_rate": 4.094689801012296e-06, "loss": 1.02482224, "memory(GiB)": 141.16, "step": 103720, "train_speed(iter/s)": 0.290369 }, { "acc": 0.73769445, "epoch": 1.16032643268689, "grad_norm": 6.15625, "learning_rate": 4.092871009989247e-06, "loss": 1.05520344, "memory(GiB)": 141.16, "step": 103740, "train_speed(iter/s)": 0.290387 }, { "acc": 0.73727989, "epoch": 1.1605501316328486, "grad_norm": 8.0, "learning_rate": 4.09105234307441e-06, "loss": 1.07425575, "memory(GiB)": 141.16, "step": 103760, "train_speed(iter/s)": 0.290406 }, { "acc": 0.7181674, "epoch": 1.1607738305788071, "grad_norm": 7.53125, "learning_rate": 4.089233800516605e-06, "loss": 1.14294443, "memory(GiB)": 141.16, "step": 103780, "train_speed(iter/s)": 0.290423 }, { "acc": 0.73742399, "epoch": 1.1609975295247656, "grad_norm": 8.6875, "learning_rate": 4.087415382564633e-06, "loss": 1.06061392, "memory(GiB)": 141.16, "step": 103800, "train_speed(iter/s)": 0.290441 }, { "acc": 0.73667946, "epoch": 1.1612212284707242, "grad_norm": 7.15625, "learning_rate": 4.085597089467283e-06, "loss": 1.0511095, "memory(GiB)": 141.16, "step": 103820, "train_speed(iter/s)": 0.290459 }, { "acc": 0.73285003, "epoch": 1.1614449274166827, "grad_norm": 5.0, "learning_rate": 4.0837789214733185e-06, "loss": 1.06258059, "memory(GiB)": 141.16, "step": 103840, "train_speed(iter/s)": 0.290475 }, { "acc": 0.73552656, "epoch": 1.1616686263626412, "grad_norm": 8.125, "learning_rate": 4.081960878831493e-06, "loss": 1.05071621, "memory(GiB)": 141.16, "step": 103860, "train_speed(iter/s)": 0.290492 }, { "acc": 0.72268577, "epoch": 1.1618923253085998, "grad_norm": 6.59375, "learning_rate": 4.080142961790542e-06, "loss": 1.12989044, "memory(GiB)": 141.16, "step": 103880, "train_speed(iter/s)": 0.29051 }, { "acc": 0.73468962, "epoch": 1.1621160242545583, "grad_norm": 8.3125, "learning_rate": 4.078325170599182e-06, "loss": 1.08310165, "memory(GiB)": 141.16, "step": 103900, "train_speed(iter/s)": 0.290529 }, { "acc": 0.74428711, "epoch": 1.1623397232005168, "grad_norm": 7.03125, "learning_rate": 4.076507505506112e-06, "loss": 1.01162357, "memory(GiB)": 141.16, "step": 103920, "train_speed(iter/s)": 0.290548 }, { "acc": 0.7280345, "epoch": 1.1625634221464753, "grad_norm": 7.15625, "learning_rate": 4.074689966760015e-06, "loss": 1.08443308, "memory(GiB)": 141.16, "step": 103940, "train_speed(iter/s)": 0.290566 }, { "acc": 0.72598047, "epoch": 1.1627871210924339, "grad_norm": 8.5, "learning_rate": 4.072872554609556e-06, "loss": 1.1087141, "memory(GiB)": 141.16, "step": 103960, "train_speed(iter/s)": 0.290586 }, { "acc": 0.72510109, "epoch": 1.1630108200383924, "grad_norm": 7.59375, "learning_rate": 4.071055269303384e-06, "loss": 1.13142567, "memory(GiB)": 141.16, "step": 103980, "train_speed(iter/s)": 0.290603 }, { "acc": 0.73975, "epoch": 1.163234518984351, "grad_norm": 7.28125, "learning_rate": 4.069238111090128e-06, "loss": 1.05310221, "memory(GiB)": 141.16, "step": 104000, "train_speed(iter/s)": 0.290621 }, { "epoch": 1.163234518984351, "eval_acc": 0.6901336984440792, "eval_loss": 1.079216718673706, "eval_runtime": 2323.1769, "eval_samples_per_second": 32.405, "eval_steps_per_second": 16.203, "step": 104000 }, { "acc": 0.72311144, "epoch": 1.1634582179303095, "grad_norm": 7.28125, "learning_rate": 4.067421080218404e-06, "loss": 1.10726194, "memory(GiB)": 141.16, "step": 104020, "train_speed(iter/s)": 0.288729 }, { "acc": 0.72576876, "epoch": 1.163681916876268, "grad_norm": 6.40625, "learning_rate": 4.065604176936804e-06, "loss": 1.11462116, "memory(GiB)": 141.16, "step": 104040, "train_speed(iter/s)": 0.288749 }, { "acc": 0.72561054, "epoch": 1.1639056158222265, "grad_norm": 7.28125, "learning_rate": 4.063787401493908e-06, "loss": 1.1151577, "memory(GiB)": 141.16, "step": 104060, "train_speed(iter/s)": 0.288767 }, { "acc": 0.73085747, "epoch": 1.164129314768185, "grad_norm": 6.1875, "learning_rate": 4.061970754138277e-06, "loss": 1.08491726, "memory(GiB)": 141.16, "step": 104080, "train_speed(iter/s)": 0.288786 }, { "acc": 0.74306545, "epoch": 1.1643530137141436, "grad_norm": 6.875, "learning_rate": 4.060154235118454e-06, "loss": 1.02971497, "memory(GiB)": 141.16, "step": 104100, "train_speed(iter/s)": 0.288804 }, { "acc": 0.72656431, "epoch": 1.164576712660102, "grad_norm": 8.5, "learning_rate": 4.058337844682967e-06, "loss": 1.09151182, "memory(GiB)": 141.16, "step": 104120, "train_speed(iter/s)": 0.288821 }, { "acc": 0.73309059, "epoch": 1.1648004116060606, "grad_norm": 7.4375, "learning_rate": 4.056521583080322e-06, "loss": 1.06074467, "memory(GiB)": 141.16, "step": 104140, "train_speed(iter/s)": 0.288838 }, { "acc": 0.73600874, "epoch": 1.1650241105520192, "grad_norm": 7.375, "learning_rate": 4.054705450559009e-06, "loss": 1.0568038, "memory(GiB)": 141.16, "step": 104160, "train_speed(iter/s)": 0.288856 }, { "acc": 0.72980595, "epoch": 1.1652478094979777, "grad_norm": 6.15625, "learning_rate": 4.052889447367503e-06, "loss": 1.0796771, "memory(GiB)": 141.16, "step": 104180, "train_speed(iter/s)": 0.288874 }, { "acc": 0.74211092, "epoch": 1.1654715084439362, "grad_norm": 7.5625, "learning_rate": 4.051073573754257e-06, "loss": 1.02242231, "memory(GiB)": 141.16, "step": 104200, "train_speed(iter/s)": 0.288891 }, { "acc": 0.74456301, "epoch": 1.1656952073898947, "grad_norm": 5.875, "learning_rate": 4.049257829967709e-06, "loss": 1.00627022, "memory(GiB)": 141.16, "step": 104220, "train_speed(iter/s)": 0.288909 }, { "acc": 0.72962642, "epoch": 1.1659189063358533, "grad_norm": 5.71875, "learning_rate": 4.0474422162562785e-06, "loss": 1.06991138, "memory(GiB)": 141.16, "step": 104240, "train_speed(iter/s)": 0.288928 }, { "acc": 0.74883957, "epoch": 1.1661426052818118, "grad_norm": 7.78125, "learning_rate": 4.045626732868369e-06, "loss": 1.00131445, "memory(GiB)": 141.16, "step": 104260, "train_speed(iter/s)": 0.288947 }, { "acc": 0.72889595, "epoch": 1.1663663042277703, "grad_norm": 6.03125, "learning_rate": 4.043811380052364e-06, "loss": 1.0937994, "memory(GiB)": 141.16, "step": 104280, "train_speed(iter/s)": 0.288964 }, { "acc": 0.7313581, "epoch": 1.1665900031737289, "grad_norm": 6.40625, "learning_rate": 4.0419961580566295e-06, "loss": 1.06633148, "memory(GiB)": 141.16, "step": 104300, "train_speed(iter/s)": 0.288983 }, { "acc": 0.7400105, "epoch": 1.1668137021196874, "grad_norm": 7.21875, "learning_rate": 4.040181067129512e-06, "loss": 1.05180988, "memory(GiB)": 141.16, "step": 104320, "train_speed(iter/s)": 0.289002 }, { "acc": 0.72565813, "epoch": 1.167037401065646, "grad_norm": 5.71875, "learning_rate": 4.038366107519344e-06, "loss": 1.11169481, "memory(GiB)": 141.16, "step": 104340, "train_speed(iter/s)": 0.289021 }, { "acc": 0.72344398, "epoch": 1.1672611000116044, "grad_norm": 5.5625, "learning_rate": 4.036551279474438e-06, "loss": 1.12058258, "memory(GiB)": 141.16, "step": 104360, "train_speed(iter/s)": 0.28904 }, { "acc": 0.73451061, "epoch": 1.167484798957563, "grad_norm": 7.46875, "learning_rate": 4.034736583243088e-06, "loss": 1.07532692, "memory(GiB)": 141.16, "step": 104380, "train_speed(iter/s)": 0.289058 }, { "acc": 0.73667598, "epoch": 1.1677084979035215, "grad_norm": 6.6875, "learning_rate": 4.032922019073569e-06, "loss": 1.05339298, "memory(GiB)": 141.16, "step": 104400, "train_speed(iter/s)": 0.289075 }, { "acc": 0.74291949, "epoch": 1.16793219684948, "grad_norm": 8.625, "learning_rate": 4.031107587214142e-06, "loss": 1.02135496, "memory(GiB)": 141.16, "step": 104420, "train_speed(iter/s)": 0.289095 }, { "acc": 0.72534466, "epoch": 1.1681558957954385, "grad_norm": 8.75, "learning_rate": 4.029293287913044e-06, "loss": 1.10500603, "memory(GiB)": 141.16, "step": 104440, "train_speed(iter/s)": 0.289114 }, { "acc": 0.71557674, "epoch": 1.168379594741397, "grad_norm": 7.125, "learning_rate": 4.0274791214185e-06, "loss": 1.15447588, "memory(GiB)": 141.16, "step": 104460, "train_speed(iter/s)": 0.289131 }, { "acc": 0.73339634, "epoch": 1.1686032936873556, "grad_norm": 6.75, "learning_rate": 4.025665087978713e-06, "loss": 1.04754572, "memory(GiB)": 141.16, "step": 104480, "train_speed(iter/s)": 0.28915 }, { "acc": 0.73300714, "epoch": 1.1688269926333141, "grad_norm": 6.34375, "learning_rate": 4.0238511878418675e-06, "loss": 1.06966381, "memory(GiB)": 141.16, "step": 104500, "train_speed(iter/s)": 0.289169 }, { "acc": 0.73376541, "epoch": 1.1690506915792727, "grad_norm": 5.6875, "learning_rate": 4.0220374212561325e-06, "loss": 1.06832104, "memory(GiB)": 141.16, "step": 104520, "train_speed(iter/s)": 0.289191 }, { "acc": 0.75290103, "epoch": 1.1692743905252312, "grad_norm": 8.125, "learning_rate": 4.020223788469656e-06, "loss": 0.99724274, "memory(GiB)": 141.16, "step": 104540, "train_speed(iter/s)": 0.289206 }, { "acc": 0.72992277, "epoch": 1.1694980894711897, "grad_norm": 7.90625, "learning_rate": 4.01841028973057e-06, "loss": 1.09769754, "memory(GiB)": 141.16, "step": 104560, "train_speed(iter/s)": 0.289224 }, { "acc": 0.74035454, "epoch": 1.1697217884171482, "grad_norm": 5.84375, "learning_rate": 4.016596925286987e-06, "loss": 1.04787865, "memory(GiB)": 141.16, "step": 104580, "train_speed(iter/s)": 0.289244 }, { "acc": 0.73506651, "epoch": 1.1699454873631068, "grad_norm": 8.8125, "learning_rate": 4.014783695387e-06, "loss": 1.06579361, "memory(GiB)": 141.16, "step": 104600, "train_speed(iter/s)": 0.289261 }, { "acc": 0.75484705, "epoch": 1.1701691863090653, "grad_norm": 6.875, "learning_rate": 4.012970600278685e-06, "loss": 0.98481045, "memory(GiB)": 141.16, "step": 104620, "train_speed(iter/s)": 0.28928 }, { "acc": 0.74390941, "epoch": 1.1703928852550238, "grad_norm": 6.46875, "learning_rate": 4.0111576402101e-06, "loss": 1.02428951, "memory(GiB)": 141.16, "step": 104640, "train_speed(iter/s)": 0.289299 }, { "acc": 0.73442163, "epoch": 1.1706165842009824, "grad_norm": 5.09375, "learning_rate": 4.009344815429284e-06, "loss": 1.05972538, "memory(GiB)": 141.16, "step": 104660, "train_speed(iter/s)": 0.289314 }, { "acc": 0.74015865, "epoch": 1.1708402831469409, "grad_norm": 5.875, "learning_rate": 4.0075321261842585e-06, "loss": 1.04236183, "memory(GiB)": 141.16, "step": 104680, "train_speed(iter/s)": 0.289334 }, { "acc": 0.74526858, "epoch": 1.1710639820928994, "grad_norm": 5.96875, "learning_rate": 4.005719572723021e-06, "loss": 1.02175903, "memory(GiB)": 141.16, "step": 104700, "train_speed(iter/s)": 0.289352 }, { "acc": 0.7537529, "epoch": 1.171287681038858, "grad_norm": 7.3125, "learning_rate": 4.0039071552935585e-06, "loss": 0.97334185, "memory(GiB)": 141.16, "step": 104720, "train_speed(iter/s)": 0.289372 }, { "acc": 0.74688148, "epoch": 1.1715113799848165, "grad_norm": 9.0, "learning_rate": 4.002094874143835e-06, "loss": 1.01055393, "memory(GiB)": 141.16, "step": 104740, "train_speed(iter/s)": 0.28939 }, { "acc": 0.73798089, "epoch": 1.171735078930775, "grad_norm": 7.90625, "learning_rate": 4.000282729521795e-06, "loss": 1.05054302, "memory(GiB)": 141.16, "step": 104760, "train_speed(iter/s)": 0.289409 }, { "acc": 0.74606905, "epoch": 1.1719587778767335, "grad_norm": 5.5, "learning_rate": 3.998470721675369e-06, "loss": 1.0115984, "memory(GiB)": 141.16, "step": 104780, "train_speed(iter/s)": 0.289427 }, { "acc": 0.73669214, "epoch": 1.172182476822692, "grad_norm": 6.0625, "learning_rate": 3.996658850852461e-06, "loss": 1.05953722, "memory(GiB)": 141.16, "step": 104800, "train_speed(iter/s)": 0.289446 }, { "acc": 0.72993336, "epoch": 1.1724061757686506, "grad_norm": 5.75, "learning_rate": 3.994847117300965e-06, "loss": 1.07231169, "memory(GiB)": 141.16, "step": 104820, "train_speed(iter/s)": 0.289465 }, { "acc": 0.7346674, "epoch": 1.172629874714609, "grad_norm": 8.625, "learning_rate": 3.99303552126875e-06, "loss": 1.05638704, "memory(GiB)": 141.16, "step": 104840, "train_speed(iter/s)": 0.289483 }, { "acc": 0.72658787, "epoch": 1.1728535736605676, "grad_norm": 6.6875, "learning_rate": 3.991224063003667e-06, "loss": 1.0969861, "memory(GiB)": 141.16, "step": 104860, "train_speed(iter/s)": 0.289502 }, { "acc": 0.74136477, "epoch": 1.1730772726065262, "grad_norm": 7.375, "learning_rate": 3.989412742753554e-06, "loss": 1.0310957, "memory(GiB)": 141.16, "step": 104880, "train_speed(iter/s)": 0.289522 }, { "acc": 0.7284071, "epoch": 1.1733009715524847, "grad_norm": 6.25, "learning_rate": 3.9876015607662195e-06, "loss": 1.09843864, "memory(GiB)": 141.16, "step": 104900, "train_speed(iter/s)": 0.289538 }, { "acc": 0.74571805, "epoch": 1.1735246704984432, "grad_norm": 6.53125, "learning_rate": 3.985790517289464e-06, "loss": 1.0119257, "memory(GiB)": 141.16, "step": 104920, "train_speed(iter/s)": 0.289557 }, { "acc": 0.73357344, "epoch": 1.1737483694444018, "grad_norm": 7.28125, "learning_rate": 3.983979612571061e-06, "loss": 1.07052383, "memory(GiB)": 141.16, "step": 104940, "train_speed(iter/s)": 0.289574 }, { "acc": 0.72735758, "epoch": 1.1739720683903603, "grad_norm": 7.09375, "learning_rate": 3.982168846858768e-06, "loss": 1.09974155, "memory(GiB)": 141.16, "step": 104960, "train_speed(iter/s)": 0.28959 }, { "acc": 0.72667885, "epoch": 1.1741957673363188, "grad_norm": 6.71875, "learning_rate": 3.980358220400328e-06, "loss": 1.11070099, "memory(GiB)": 141.16, "step": 104980, "train_speed(iter/s)": 0.289611 }, { "acc": 0.74414177, "epoch": 1.1744194662822773, "grad_norm": 7.25, "learning_rate": 3.978547733443455e-06, "loss": 1.02685223, "memory(GiB)": 141.16, "step": 105000, "train_speed(iter/s)": 0.289629 }, { "acc": 0.7336298, "epoch": 1.1746431652282359, "grad_norm": 8.375, "learning_rate": 3.976737386235852e-06, "loss": 1.07741613, "memory(GiB)": 141.16, "step": 105020, "train_speed(iter/s)": 0.289647 }, { "acc": 0.73566227, "epoch": 1.1748668641741944, "grad_norm": 7.5625, "learning_rate": 3.974927179025202e-06, "loss": 1.06961479, "memory(GiB)": 141.16, "step": 105040, "train_speed(iter/s)": 0.289666 }, { "acc": 0.72356749, "epoch": 1.175090563120153, "grad_norm": 6.96875, "learning_rate": 3.973117112059165e-06, "loss": 1.10733776, "memory(GiB)": 141.16, "step": 105060, "train_speed(iter/s)": 0.289687 }, { "acc": 0.73635273, "epoch": 1.1753142620661114, "grad_norm": 7.21875, "learning_rate": 3.971307185585385e-06, "loss": 1.05992069, "memory(GiB)": 141.16, "step": 105080, "train_speed(iter/s)": 0.289706 }, { "acc": 0.73137093, "epoch": 1.17553796101207, "grad_norm": 8.1875, "learning_rate": 3.969497399851484e-06, "loss": 1.08611765, "memory(GiB)": 141.16, "step": 105100, "train_speed(iter/s)": 0.289724 }, { "acc": 0.73642702, "epoch": 1.1757616599580285, "grad_norm": 6.75, "learning_rate": 3.967687755105068e-06, "loss": 1.05197916, "memory(GiB)": 141.16, "step": 105120, "train_speed(iter/s)": 0.289745 }, { "acc": 0.73325891, "epoch": 1.175985358903987, "grad_norm": 7.96875, "learning_rate": 3.965878251593723e-06, "loss": 1.07381744, "memory(GiB)": 141.16, "step": 105140, "train_speed(iter/s)": 0.289762 }, { "acc": 0.73327622, "epoch": 1.1762090578499456, "grad_norm": 8.9375, "learning_rate": 3.964068889565014e-06, "loss": 1.07010307, "memory(GiB)": 141.16, "step": 105160, "train_speed(iter/s)": 0.289781 }, { "acc": 0.74111848, "epoch": 1.176432756795904, "grad_norm": 7.875, "learning_rate": 3.9622596692664896e-06, "loss": 1.03740387, "memory(GiB)": 141.16, "step": 105180, "train_speed(iter/s)": 0.2898 }, { "acc": 0.74199262, "epoch": 1.1766564557418626, "grad_norm": 7.71875, "learning_rate": 3.9604505909456735e-06, "loss": 1.03471918, "memory(GiB)": 141.16, "step": 105200, "train_speed(iter/s)": 0.289818 }, { "acc": 0.73079863, "epoch": 1.1768801546878211, "grad_norm": 6.15625, "learning_rate": 3.958641654850075e-06, "loss": 1.08024178, "memory(GiB)": 141.16, "step": 105220, "train_speed(iter/s)": 0.289837 }, { "acc": 0.72201872, "epoch": 1.1771038536337797, "grad_norm": 6.1875, "learning_rate": 3.956832861227182e-06, "loss": 1.12494354, "memory(GiB)": 141.16, "step": 105240, "train_speed(iter/s)": 0.289854 }, { "acc": 0.73092694, "epoch": 1.1773275525797382, "grad_norm": 8.4375, "learning_rate": 3.955024210324464e-06, "loss": 1.08778057, "memory(GiB)": 141.16, "step": 105260, "train_speed(iter/s)": 0.289872 }, { "acc": 0.73241529, "epoch": 1.1775512515256967, "grad_norm": 5.75, "learning_rate": 3.953215702389372e-06, "loss": 1.0575634, "memory(GiB)": 141.16, "step": 105280, "train_speed(iter/s)": 0.28989 }, { "acc": 0.73245597, "epoch": 1.1777749504716553, "grad_norm": 8.4375, "learning_rate": 3.951407337669332e-06, "loss": 1.07596321, "memory(GiB)": 141.16, "step": 105300, "train_speed(iter/s)": 0.289909 }, { "acc": 0.74349155, "epoch": 1.1779986494176138, "grad_norm": 6.03125, "learning_rate": 3.949599116411757e-06, "loss": 1.01747112, "memory(GiB)": 141.16, "step": 105320, "train_speed(iter/s)": 0.289927 }, { "acc": 0.74048619, "epoch": 1.1782223483635723, "grad_norm": 8.125, "learning_rate": 3.947791038864036e-06, "loss": 1.04444437, "memory(GiB)": 141.16, "step": 105340, "train_speed(iter/s)": 0.289946 }, { "acc": 0.74096079, "epoch": 1.1784460473095308, "grad_norm": 7.03125, "learning_rate": 3.9459831052735425e-06, "loss": 1.04010849, "memory(GiB)": 141.16, "step": 105360, "train_speed(iter/s)": 0.289965 }, { "acc": 0.72852478, "epoch": 1.1786697462554894, "grad_norm": 6.84375, "learning_rate": 3.944175315887624e-06, "loss": 1.09617348, "memory(GiB)": 141.16, "step": 105380, "train_speed(iter/s)": 0.289985 }, { "acc": 0.740728, "epoch": 1.178893445201448, "grad_norm": 6.71875, "learning_rate": 3.942367670953613e-06, "loss": 1.03992329, "memory(GiB)": 141.16, "step": 105400, "train_speed(iter/s)": 0.290003 }, { "acc": 0.73657064, "epoch": 1.1791171441474064, "grad_norm": 7.875, "learning_rate": 3.940560170718822e-06, "loss": 1.07439156, "memory(GiB)": 141.16, "step": 105420, "train_speed(iter/s)": 0.29002 }, { "acc": 0.72263622, "epoch": 1.179340843093365, "grad_norm": 6.625, "learning_rate": 3.938752815430543e-06, "loss": 1.11688805, "memory(GiB)": 141.16, "step": 105440, "train_speed(iter/s)": 0.29004 }, { "acc": 0.73623152, "epoch": 1.1795645420393235, "grad_norm": 5.46875, "learning_rate": 3.9369456053360464e-06, "loss": 1.05797949, "memory(GiB)": 141.16, "step": 105460, "train_speed(iter/s)": 0.290059 }, { "acc": 0.72616372, "epoch": 1.179788240985282, "grad_norm": 5.9375, "learning_rate": 3.935138540682587e-06, "loss": 1.09764748, "memory(GiB)": 141.16, "step": 105480, "train_speed(iter/s)": 0.290074 }, { "acc": 0.74190483, "epoch": 1.1800119399312405, "grad_norm": 8.0, "learning_rate": 3.933331621717394e-06, "loss": 1.04136505, "memory(GiB)": 141.16, "step": 105500, "train_speed(iter/s)": 0.290093 }, { "acc": 0.7383832, "epoch": 1.180235638877199, "grad_norm": 7.46875, "learning_rate": 3.931524848687683e-06, "loss": 1.04526558, "memory(GiB)": 141.16, "step": 105520, "train_speed(iter/s)": 0.290113 }, { "acc": 0.73488011, "epoch": 1.1804593378231576, "grad_norm": 6.1875, "learning_rate": 3.9297182218406435e-06, "loss": 1.07351179, "memory(GiB)": 141.16, "step": 105540, "train_speed(iter/s)": 0.290132 }, { "acc": 0.73888006, "epoch": 1.1806830367691161, "grad_norm": 7.5625, "learning_rate": 3.927911741423449e-06, "loss": 1.05215645, "memory(GiB)": 141.16, "step": 105560, "train_speed(iter/s)": 0.29015 }, { "acc": 0.73475885, "epoch": 1.1809067357150747, "grad_norm": 6.03125, "learning_rate": 3.9261054076832526e-06, "loss": 1.08413868, "memory(GiB)": 141.16, "step": 105580, "train_speed(iter/s)": 0.290169 }, { "acc": 0.72571921, "epoch": 1.1811304346610332, "grad_norm": 8.5625, "learning_rate": 3.9242992208671855e-06, "loss": 1.10774727, "memory(GiB)": 141.16, "step": 105600, "train_speed(iter/s)": 0.290187 }, { "acc": 0.73238573, "epoch": 1.1813541336069917, "grad_norm": 8.1875, "learning_rate": 3.922493181222361e-06, "loss": 1.07758999, "memory(GiB)": 141.16, "step": 105620, "train_speed(iter/s)": 0.290206 }, { "acc": 0.72125626, "epoch": 1.1815778325529502, "grad_norm": 5.1875, "learning_rate": 3.92068728899587e-06, "loss": 1.13678198, "memory(GiB)": 141.16, "step": 105640, "train_speed(iter/s)": 0.290226 }, { "acc": 0.73563728, "epoch": 1.1818015314989088, "grad_norm": 7.71875, "learning_rate": 3.918881544434785e-06, "loss": 1.07348042, "memory(GiB)": 141.16, "step": 105660, "train_speed(iter/s)": 0.290245 }, { "acc": 0.73673215, "epoch": 1.1820252304448673, "grad_norm": 7.5, "learning_rate": 3.917075947786156e-06, "loss": 1.06648827, "memory(GiB)": 141.16, "step": 105680, "train_speed(iter/s)": 0.290264 }, { "acc": 0.73694706, "epoch": 1.1822489293908258, "grad_norm": 6.65625, "learning_rate": 3.9152704992970174e-06, "loss": 1.0626009, "memory(GiB)": 141.16, "step": 105700, "train_speed(iter/s)": 0.29028 }, { "acc": 0.7392302, "epoch": 1.1824726283367843, "grad_norm": 7.875, "learning_rate": 3.913465199214379e-06, "loss": 1.05236378, "memory(GiB)": 141.16, "step": 105720, "train_speed(iter/s)": 0.290297 }, { "acc": 0.73158631, "epoch": 1.1826963272827429, "grad_norm": 6.34375, "learning_rate": 3.9116600477852315e-06, "loss": 1.06126413, "memory(GiB)": 141.16, "step": 105740, "train_speed(iter/s)": 0.290312 }, { "acc": 0.73357143, "epoch": 1.1829200262287014, "grad_norm": 6.25, "learning_rate": 3.909855045256545e-06, "loss": 1.06946869, "memory(GiB)": 141.16, "step": 105760, "train_speed(iter/s)": 0.290331 }, { "acc": 0.74241104, "epoch": 1.18314372517466, "grad_norm": 6.84375, "learning_rate": 3.90805019187527e-06, "loss": 1.03768482, "memory(GiB)": 141.16, "step": 105780, "train_speed(iter/s)": 0.290349 }, { "acc": 0.74434657, "epoch": 1.1833674241206185, "grad_norm": 8.3125, "learning_rate": 3.906245487888336e-06, "loss": 1.02195969, "memory(GiB)": 141.16, "step": 105800, "train_speed(iter/s)": 0.290369 }, { "acc": 0.72985649, "epoch": 1.183591123066577, "grad_norm": 6.9375, "learning_rate": 3.904440933542654e-06, "loss": 1.06577072, "memory(GiB)": 141.16, "step": 105820, "train_speed(iter/s)": 0.290386 }, { "acc": 0.72829084, "epoch": 1.1838148220125355, "grad_norm": 6.28125, "learning_rate": 3.902636529085109e-06, "loss": 1.10056953, "memory(GiB)": 141.16, "step": 105840, "train_speed(iter/s)": 0.290405 }, { "acc": 0.73344975, "epoch": 1.184038520958494, "grad_norm": 6.4375, "learning_rate": 3.9008322747625736e-06, "loss": 1.05869904, "memory(GiB)": 141.16, "step": 105860, "train_speed(iter/s)": 0.290424 }, { "acc": 0.74141541, "epoch": 1.1842622199044526, "grad_norm": 7.03125, "learning_rate": 3.899028170821894e-06, "loss": 1.04402933, "memory(GiB)": 141.16, "step": 105880, "train_speed(iter/s)": 0.290442 }, { "acc": 0.74019489, "epoch": 1.184485918850411, "grad_norm": 7.28125, "learning_rate": 3.897224217509896e-06, "loss": 1.04583225, "memory(GiB)": 141.16, "step": 105900, "train_speed(iter/s)": 0.290461 }, { "acc": 0.72981567, "epoch": 1.1847096177963696, "grad_norm": 7.46875, "learning_rate": 3.895420415073389e-06, "loss": 1.07834377, "memory(GiB)": 141.16, "step": 105920, "train_speed(iter/s)": 0.29048 }, { "acc": 0.72107344, "epoch": 1.1849333167423282, "grad_norm": 7.09375, "learning_rate": 3.893616763759155e-06, "loss": 1.12574129, "memory(GiB)": 141.16, "step": 105940, "train_speed(iter/s)": 0.290499 }, { "acc": 0.72766323, "epoch": 1.1851570156882867, "grad_norm": 7.90625, "learning_rate": 3.891813263813962e-06, "loss": 1.09970379, "memory(GiB)": 141.16, "step": 105960, "train_speed(iter/s)": 0.29052 }, { "acc": 0.72691927, "epoch": 1.1853807146342452, "grad_norm": 5.3125, "learning_rate": 3.890009915484556e-06, "loss": 1.11352844, "memory(GiB)": 141.16, "step": 105980, "train_speed(iter/s)": 0.290541 }, { "acc": 0.73847933, "epoch": 1.1856044135802037, "grad_norm": 8.625, "learning_rate": 3.888206719017657e-06, "loss": 1.03826313, "memory(GiB)": 141.16, "step": 106000, "train_speed(iter/s)": 0.290562 }, { "epoch": 1.1856044135802037, "eval_acc": 0.6901481423640073, "eval_loss": 1.0792373418807983, "eval_runtime": 2319.1002, "eval_samples_per_second": 32.462, "eval_steps_per_second": 16.231, "step": 106000 }, { "acc": 0.71876879, "epoch": 1.1858281125261623, "grad_norm": 7.5, "learning_rate": 3.886403674659972e-06, "loss": 1.1216053, "memory(GiB)": 141.16, "step": 106020, "train_speed(iter/s)": 0.288706 }, { "acc": 0.71560488, "epoch": 1.1860518114721208, "grad_norm": 8.625, "learning_rate": 3.88460078265818e-06, "loss": 1.14564648, "memory(GiB)": 141.16, "step": 106040, "train_speed(iter/s)": 0.288727 }, { "acc": 0.73441701, "epoch": 1.1862755104180793, "grad_norm": 7.15625, "learning_rate": 3.882798043258943e-06, "loss": 1.07936878, "memory(GiB)": 141.16, "step": 106060, "train_speed(iter/s)": 0.288747 }, { "acc": 0.73411837, "epoch": 1.1864992093640379, "grad_norm": 8.875, "learning_rate": 3.880995456708903e-06, "loss": 1.05378551, "memory(GiB)": 141.16, "step": 106080, "train_speed(iter/s)": 0.288764 }, { "acc": 0.74094682, "epoch": 1.1867229083099964, "grad_norm": 6.5625, "learning_rate": 3.879193023254678e-06, "loss": 1.03357601, "memory(GiB)": 141.16, "step": 106100, "train_speed(iter/s)": 0.28878 }, { "acc": 0.71721139, "epoch": 1.186946607255955, "grad_norm": 9.125, "learning_rate": 3.877390743142869e-06, "loss": 1.14899921, "memory(GiB)": 141.16, "step": 106120, "train_speed(iter/s)": 0.288798 }, { "acc": 0.73865023, "epoch": 1.1871703062019134, "grad_norm": 6.3125, "learning_rate": 3.875588616620052e-06, "loss": 1.04911537, "memory(GiB)": 141.16, "step": 106140, "train_speed(iter/s)": 0.288818 }, { "acc": 0.73287649, "epoch": 1.187394005147872, "grad_norm": 6.34375, "learning_rate": 3.873786643932782e-06, "loss": 1.07652779, "memory(GiB)": 141.16, "step": 106160, "train_speed(iter/s)": 0.288835 }, { "acc": 0.73808131, "epoch": 1.1876177040938305, "grad_norm": 6.28125, "learning_rate": 3.8719848253275975e-06, "loss": 1.05327606, "memory(GiB)": 141.16, "step": 106180, "train_speed(iter/s)": 0.288852 }, { "acc": 0.74301891, "epoch": 1.187841403039789, "grad_norm": 8.25, "learning_rate": 3.870183161051012e-06, "loss": 1.02802715, "memory(GiB)": 141.16, "step": 106200, "train_speed(iter/s)": 0.28887 }, { "acc": 0.74459114, "epoch": 1.1880651019857476, "grad_norm": 7.375, "learning_rate": 3.86838165134952e-06, "loss": 1.0207592, "memory(GiB)": 141.16, "step": 106220, "train_speed(iter/s)": 0.288889 }, { "acc": 0.72857676, "epoch": 1.188288800931706, "grad_norm": 7.09375, "learning_rate": 3.866580296469591e-06, "loss": 1.0919054, "memory(GiB)": 141.16, "step": 106240, "train_speed(iter/s)": 0.288908 }, { "acc": 0.73898878, "epoch": 1.1885124998776646, "grad_norm": 6.34375, "learning_rate": 3.864779096657678e-06, "loss": 1.04930906, "memory(GiB)": 141.16, "step": 106260, "train_speed(iter/s)": 0.288925 }, { "acc": 0.7283823, "epoch": 1.1887361988236231, "grad_norm": 7.0625, "learning_rate": 3.862978052160211e-06, "loss": 1.09802189, "memory(GiB)": 141.16, "step": 106280, "train_speed(iter/s)": 0.288943 }, { "acc": 0.73756809, "epoch": 1.1889598977695817, "grad_norm": 7.84375, "learning_rate": 3.861177163223597e-06, "loss": 1.05398016, "memory(GiB)": 141.16, "step": 106300, "train_speed(iter/s)": 0.28896 }, { "acc": 0.74086528, "epoch": 1.1891835967155402, "grad_norm": 5.65625, "learning_rate": 3.8593764300942274e-06, "loss": 1.03183861, "memory(GiB)": 141.16, "step": 106320, "train_speed(iter/s)": 0.288977 }, { "acc": 0.73557987, "epoch": 1.1894072956614987, "grad_norm": 6.96875, "learning_rate": 3.857575853018463e-06, "loss": 1.06013718, "memory(GiB)": 141.16, "step": 106340, "train_speed(iter/s)": 0.288996 }, { "acc": 0.74032078, "epoch": 1.1896309946074572, "grad_norm": 6.9375, "learning_rate": 3.8557754322426515e-06, "loss": 1.04895878, "memory(GiB)": 141.16, "step": 106360, "train_speed(iter/s)": 0.289015 }, { "acc": 0.73286543, "epoch": 1.1898546935534158, "grad_norm": 7.5625, "learning_rate": 3.853975168013115e-06, "loss": 1.07456703, "memory(GiB)": 141.16, "step": 106380, "train_speed(iter/s)": 0.289033 }, { "acc": 0.7332109, "epoch": 1.1900783924993743, "grad_norm": 7.5625, "learning_rate": 3.852175060576157e-06, "loss": 1.07321777, "memory(GiB)": 141.16, "step": 106400, "train_speed(iter/s)": 0.289051 }, { "acc": 0.72030478, "epoch": 1.1903020914453328, "grad_norm": 7.8125, "learning_rate": 3.8503751101780575e-06, "loss": 1.11924105, "memory(GiB)": 141.16, "step": 106420, "train_speed(iter/s)": 0.289069 }, { "acc": 0.74929724, "epoch": 1.1905257903912914, "grad_norm": 8.625, "learning_rate": 3.848575317065073e-06, "loss": 1.01018038, "memory(GiB)": 141.16, "step": 106440, "train_speed(iter/s)": 0.289088 }, { "acc": 0.7519937, "epoch": 1.1907494893372499, "grad_norm": 5.9375, "learning_rate": 3.846775681483444e-06, "loss": 1.00191708, "memory(GiB)": 141.16, "step": 106460, "train_speed(iter/s)": 0.289106 }, { "acc": 0.74438629, "epoch": 1.1909731882832084, "grad_norm": 6.5, "learning_rate": 3.844976203679385e-06, "loss": 1.02292709, "memory(GiB)": 141.16, "step": 106480, "train_speed(iter/s)": 0.289122 }, { "acc": 0.7281158, "epoch": 1.191196887229167, "grad_norm": 7.9375, "learning_rate": 3.84317688389909e-06, "loss": 1.08293858, "memory(GiB)": 141.16, "step": 106500, "train_speed(iter/s)": 0.28914 }, { "acc": 0.72007599, "epoch": 1.1914205861751255, "grad_norm": 5.78125, "learning_rate": 3.8413777223887335e-06, "loss": 1.11842899, "memory(GiB)": 141.16, "step": 106520, "train_speed(iter/s)": 0.289159 }, { "acc": 0.7252811, "epoch": 1.191644285121084, "grad_norm": 5.28125, "learning_rate": 3.839578719394464e-06, "loss": 1.11186905, "memory(GiB)": 141.16, "step": 106540, "train_speed(iter/s)": 0.289177 }, { "acc": 0.72757092, "epoch": 1.1918679840670425, "grad_norm": 5.8125, "learning_rate": 3.837779875162413e-06, "loss": 1.09654016, "memory(GiB)": 141.16, "step": 106560, "train_speed(iter/s)": 0.289195 }, { "acc": 0.73877058, "epoch": 1.192091683013001, "grad_norm": 7.1875, "learning_rate": 3.835981189938687e-06, "loss": 1.05002518, "memory(GiB)": 141.16, "step": 106580, "train_speed(iter/s)": 0.289213 }, { "acc": 0.72743845, "epoch": 1.1923153819589596, "grad_norm": 5.875, "learning_rate": 3.83418266396937e-06, "loss": 1.09077435, "memory(GiB)": 141.16, "step": 106600, "train_speed(iter/s)": 0.289232 }, { "acc": 0.72499323, "epoch": 1.1925390809049181, "grad_norm": 7.15625, "learning_rate": 3.832384297500529e-06, "loss": 1.11493969, "memory(GiB)": 141.16, "step": 106620, "train_speed(iter/s)": 0.289248 }, { "acc": 0.7276041, "epoch": 1.1927627798508766, "grad_norm": 6.15625, "learning_rate": 3.830586090778204e-06, "loss": 1.07251673, "memory(GiB)": 141.16, "step": 106640, "train_speed(iter/s)": 0.289268 }, { "acc": 0.73273439, "epoch": 1.1929864787968352, "grad_norm": 8.625, "learning_rate": 3.828788044048418e-06, "loss": 1.0741848, "memory(GiB)": 141.16, "step": 106660, "train_speed(iter/s)": 0.289287 }, { "acc": 0.74159641, "epoch": 1.1932101777427937, "grad_norm": 8.9375, "learning_rate": 3.826990157557169e-06, "loss": 1.03693018, "memory(GiB)": 141.16, "step": 106680, "train_speed(iter/s)": 0.289305 }, { "acc": 0.73399224, "epoch": 1.1934338766887522, "grad_norm": 7.0625, "learning_rate": 3.82519243155043e-06, "loss": 1.05750313, "memory(GiB)": 141.16, "step": 106700, "train_speed(iter/s)": 0.289322 }, { "acc": 0.73967481, "epoch": 1.1936575756347108, "grad_norm": 5.96875, "learning_rate": 3.8233948662741595e-06, "loss": 1.02882481, "memory(GiB)": 141.16, "step": 106720, "train_speed(iter/s)": 0.289341 }, { "acc": 0.73820543, "epoch": 1.1938812745806693, "grad_norm": 7.8125, "learning_rate": 3.821597461974289e-06, "loss": 1.05362625, "memory(GiB)": 141.16, "step": 106740, "train_speed(iter/s)": 0.289358 }, { "acc": 0.72815771, "epoch": 1.1941049735266278, "grad_norm": 8.0625, "learning_rate": 3.819800218896728e-06, "loss": 1.10502682, "memory(GiB)": 141.16, "step": 106760, "train_speed(iter/s)": 0.289378 }, { "acc": 0.73009281, "epoch": 1.1943286724725863, "grad_norm": 5.5625, "learning_rate": 3.818003137287367e-06, "loss": 1.07788134, "memory(GiB)": 141.16, "step": 106780, "train_speed(iter/s)": 0.289397 }, { "acc": 0.73903141, "epoch": 1.1945523714185449, "grad_norm": 7.46875, "learning_rate": 3.816206217392072e-06, "loss": 1.0489418, "memory(GiB)": 141.16, "step": 106800, "train_speed(iter/s)": 0.289415 }, { "acc": 0.74963541, "epoch": 1.1947760703645034, "grad_norm": 7.0625, "learning_rate": 3.8144094594566854e-06, "loss": 0.99557762, "memory(GiB)": 141.16, "step": 106820, "train_speed(iter/s)": 0.289432 }, { "acc": 0.72895336, "epoch": 1.194999769310462, "grad_norm": 6.3125, "learning_rate": 3.812612863727031e-06, "loss": 1.1004261, "memory(GiB)": 141.16, "step": 106840, "train_speed(iter/s)": 0.289449 }, { "acc": 0.72155094, "epoch": 1.1952234682564205, "grad_norm": 5.09375, "learning_rate": 3.8108164304489085e-06, "loss": 1.13203697, "memory(GiB)": 141.16, "step": 106860, "train_speed(iter/s)": 0.289468 }, { "acc": 0.73653326, "epoch": 1.195447167202379, "grad_norm": 8.5625, "learning_rate": 3.8090201598680972e-06, "loss": 1.06825657, "memory(GiB)": 141.16, "step": 106880, "train_speed(iter/s)": 0.289486 }, { "acc": 0.74204731, "epoch": 1.1956708661483375, "grad_norm": 8.4375, "learning_rate": 3.8072240522303495e-06, "loss": 1.03125725, "memory(GiB)": 141.16, "step": 106900, "train_speed(iter/s)": 0.289505 }, { "acc": 0.7336688, "epoch": 1.195894565094296, "grad_norm": 7.53125, "learning_rate": 3.8054281077814e-06, "loss": 1.06783218, "memory(GiB)": 141.16, "step": 106920, "train_speed(iter/s)": 0.289527 }, { "acc": 0.74376774, "epoch": 1.1961182640402546, "grad_norm": 7.75, "learning_rate": 3.8036323267669604e-06, "loss": 1.01382256, "memory(GiB)": 141.16, "step": 106940, "train_speed(iter/s)": 0.289545 }, { "acc": 0.74063339, "epoch": 1.196341962986213, "grad_norm": 5.34375, "learning_rate": 3.801836709432718e-06, "loss": 1.03042898, "memory(GiB)": 141.16, "step": 106960, "train_speed(iter/s)": 0.289563 }, { "acc": 0.75036993, "epoch": 1.1965656619321716, "grad_norm": 8.4375, "learning_rate": 3.8000412560243405e-06, "loss": 0.9886632, "memory(GiB)": 141.16, "step": 106980, "train_speed(iter/s)": 0.289582 }, { "acc": 0.74182482, "epoch": 1.1967893608781301, "grad_norm": 8.0625, "learning_rate": 3.79824596678747e-06, "loss": 1.02637243, "memory(GiB)": 141.16, "step": 107000, "train_speed(iter/s)": 0.289602 }, { "acc": 0.73400784, "epoch": 1.1970130598240887, "grad_norm": 7.84375, "learning_rate": 3.796450841967728e-06, "loss": 1.07539787, "memory(GiB)": 141.16, "step": 107020, "train_speed(iter/s)": 0.289622 }, { "acc": 0.74120245, "epoch": 1.1972367587700472, "grad_norm": 7.8125, "learning_rate": 3.7946558818107132e-06, "loss": 1.02479067, "memory(GiB)": 141.16, "step": 107040, "train_speed(iter/s)": 0.289642 }, { "acc": 0.7437994, "epoch": 1.1974604577160057, "grad_norm": 7.03125, "learning_rate": 3.7928610865620023e-06, "loss": 1.03995647, "memory(GiB)": 141.16, "step": 107060, "train_speed(iter/s)": 0.289661 }, { "acc": 0.74793587, "epoch": 1.1976841566619643, "grad_norm": 8.0625, "learning_rate": 3.7910664564671496e-06, "loss": 1.02384052, "memory(GiB)": 141.16, "step": 107080, "train_speed(iter/s)": 0.28968 }, { "acc": 0.73836222, "epoch": 1.1979078556079228, "grad_norm": 6.71875, "learning_rate": 3.7892719917716847e-06, "loss": 1.05430403, "memory(GiB)": 141.16, "step": 107100, "train_speed(iter/s)": 0.289697 }, { "acc": 0.73585186, "epoch": 1.1981315545538813, "grad_norm": 5.65625, "learning_rate": 3.7874776927211165e-06, "loss": 1.0604394, "memory(GiB)": 141.16, "step": 107120, "train_speed(iter/s)": 0.289717 }, { "acc": 0.74349251, "epoch": 1.1983552534998398, "grad_norm": 7.125, "learning_rate": 3.7856835595609304e-06, "loss": 1.01608639, "memory(GiB)": 141.16, "step": 107140, "train_speed(iter/s)": 0.289736 }, { "acc": 0.74270692, "epoch": 1.1985789524457984, "grad_norm": 8.375, "learning_rate": 3.7838895925365905e-06, "loss": 1.02222919, "memory(GiB)": 141.16, "step": 107160, "train_speed(iter/s)": 0.289754 }, { "acc": 0.73342171, "epoch": 1.198802651391757, "grad_norm": 7.09375, "learning_rate": 3.7820957918935374e-06, "loss": 1.05201435, "memory(GiB)": 141.16, "step": 107180, "train_speed(iter/s)": 0.289772 }, { "acc": 0.73219032, "epoch": 1.1990263503377154, "grad_norm": 6.6875, "learning_rate": 3.780302157877187e-06, "loss": 1.07309761, "memory(GiB)": 141.16, "step": 107200, "train_speed(iter/s)": 0.28979 }, { "acc": 0.72885413, "epoch": 1.199250049283674, "grad_norm": 7.9375, "learning_rate": 3.7785086907329345e-06, "loss": 1.07556992, "memory(GiB)": 141.16, "step": 107220, "train_speed(iter/s)": 0.289808 }, { "acc": 0.73820696, "epoch": 1.1994737482296325, "grad_norm": 7.21875, "learning_rate": 3.7767153907061522e-06, "loss": 1.04652557, "memory(GiB)": 141.16, "step": 107240, "train_speed(iter/s)": 0.289828 }, { "acc": 0.72729473, "epoch": 1.199697447175591, "grad_norm": 7.125, "learning_rate": 3.7749222580421896e-06, "loss": 1.09890327, "memory(GiB)": 141.16, "step": 107260, "train_speed(iter/s)": 0.289845 }, { "acc": 0.73537397, "epoch": 1.1999211461215495, "grad_norm": 7.4375, "learning_rate": 3.773129292986373e-06, "loss": 1.052318, "memory(GiB)": 141.16, "step": 107280, "train_speed(iter/s)": 0.289862 }, { "acc": 0.7488204, "epoch": 1.200144845067508, "grad_norm": 6.59375, "learning_rate": 3.771336495784005e-06, "loss": 1.00329781, "memory(GiB)": 141.16, "step": 107300, "train_speed(iter/s)": 0.289881 }, { "acc": 0.72496424, "epoch": 1.2003685440134666, "grad_norm": 8.5625, "learning_rate": 3.7695438666803654e-06, "loss": 1.11021423, "memory(GiB)": 141.16, "step": 107320, "train_speed(iter/s)": 0.289899 }, { "acc": 0.73507738, "epoch": 1.2005922429594251, "grad_norm": 6.28125, "learning_rate": 3.767751405920712e-06, "loss": 1.05364227, "memory(GiB)": 141.16, "step": 107340, "train_speed(iter/s)": 0.289918 }, { "acc": 0.75004902, "epoch": 1.2008159419053837, "grad_norm": 5.71875, "learning_rate": 3.765959113750279e-06, "loss": 0.99629955, "memory(GiB)": 141.16, "step": 107360, "train_speed(iter/s)": 0.289936 }, { "acc": 0.73743467, "epoch": 1.2010396408513422, "grad_norm": 7.09375, "learning_rate": 3.76416699041428e-06, "loss": 1.04285259, "memory(GiB)": 141.16, "step": 107380, "train_speed(iter/s)": 0.289953 }, { "acc": 0.75032578, "epoch": 1.2012633397973007, "grad_norm": 9.0625, "learning_rate": 3.7623750361578986e-06, "loss": 0.99359398, "memory(GiB)": 141.16, "step": 107400, "train_speed(iter/s)": 0.28997 }, { "acc": 0.73148146, "epoch": 1.2014870387432592, "grad_norm": 6.53125, "learning_rate": 3.7605832512263026e-06, "loss": 1.09072285, "memory(GiB)": 141.16, "step": 107420, "train_speed(iter/s)": 0.289987 }, { "acc": 0.73539591, "epoch": 1.2017107376892178, "grad_norm": 6.46875, "learning_rate": 3.7587916358646328e-06, "loss": 1.04401312, "memory(GiB)": 141.16, "step": 107440, "train_speed(iter/s)": 0.290005 }, { "acc": 0.73070517, "epoch": 1.2019344366351763, "grad_norm": 6.875, "learning_rate": 3.757000190318008e-06, "loss": 1.07888412, "memory(GiB)": 141.16, "step": 107460, "train_speed(iter/s)": 0.290023 }, { "acc": 0.72353687, "epoch": 1.2021581355811348, "grad_norm": 5.9375, "learning_rate": 3.755208914831525e-06, "loss": 1.09201899, "memory(GiB)": 141.16, "step": 107480, "train_speed(iter/s)": 0.29004 }, { "acc": 0.73463221, "epoch": 1.2023818345270934, "grad_norm": 8.125, "learning_rate": 3.7534178096502537e-06, "loss": 1.0608942, "memory(GiB)": 141.16, "step": 107500, "train_speed(iter/s)": 0.290058 }, { "acc": 0.74720545, "epoch": 1.2026055334730519, "grad_norm": 7.3125, "learning_rate": 3.7516268750192437e-06, "loss": 1.02255287, "memory(GiB)": 141.16, "step": 107520, "train_speed(iter/s)": 0.290078 }, { "acc": 0.72939072, "epoch": 1.2028292324190104, "grad_norm": 7.0625, "learning_rate": 3.74983611118352e-06, "loss": 1.07584457, "memory(GiB)": 141.16, "step": 107540, "train_speed(iter/s)": 0.290097 }, { "acc": 0.754531, "epoch": 1.203052931364969, "grad_norm": 6.78125, "learning_rate": 3.7480455183880865e-06, "loss": 0.97750397, "memory(GiB)": 141.16, "step": 107560, "train_speed(iter/s)": 0.290115 }, { "acc": 0.73203049, "epoch": 1.2032766303109275, "grad_norm": 7.15625, "learning_rate": 3.746255096877921e-06, "loss": 1.06866665, "memory(GiB)": 141.16, "step": 107580, "train_speed(iter/s)": 0.290132 }, { "acc": 0.73628039, "epoch": 1.203500329256886, "grad_norm": 6.03125, "learning_rate": 3.7444648468979774e-06, "loss": 1.05785255, "memory(GiB)": 141.16, "step": 107600, "train_speed(iter/s)": 0.290147 }, { "acc": 0.74234924, "epoch": 1.2037240282028445, "grad_norm": 7.5625, "learning_rate": 3.7426747686931886e-06, "loss": 1.04165344, "memory(GiB)": 141.16, "step": 107620, "train_speed(iter/s)": 0.290165 }, { "acc": 0.73888474, "epoch": 1.203947727148803, "grad_norm": 8.1875, "learning_rate": 3.7408848625084624e-06, "loss": 1.04160767, "memory(GiB)": 141.16, "step": 107640, "train_speed(iter/s)": 0.290184 }, { "acc": 0.73507466, "epoch": 1.2041714260947616, "grad_norm": 6.90625, "learning_rate": 3.7390951285886845e-06, "loss": 1.03955965, "memory(GiB)": 141.16, "step": 107660, "train_speed(iter/s)": 0.290201 }, { "acc": 0.73881269, "epoch": 1.20439512504072, "grad_norm": 7.78125, "learning_rate": 3.737305567178716e-06, "loss": 1.05650234, "memory(GiB)": 141.16, "step": 107680, "train_speed(iter/s)": 0.29022 }, { "acc": 0.73640842, "epoch": 1.2046188239866786, "grad_norm": 6.84375, "learning_rate": 3.7355161785233928e-06, "loss": 1.06802692, "memory(GiB)": 141.16, "step": 107700, "train_speed(iter/s)": 0.290239 }, { "acc": 0.74684987, "epoch": 1.2048425229326372, "grad_norm": 7.875, "learning_rate": 3.733726962867532e-06, "loss": 1.00702686, "memory(GiB)": 141.16, "step": 107720, "train_speed(iter/s)": 0.290257 }, { "acc": 0.7335433, "epoch": 1.2050662218785957, "grad_norm": 6.90625, "learning_rate": 3.7319379204559203e-06, "loss": 1.07042875, "memory(GiB)": 141.16, "step": 107740, "train_speed(iter/s)": 0.290275 }, { "acc": 0.74567633, "epoch": 1.2052899208245544, "grad_norm": 5.96875, "learning_rate": 3.730149051533326e-06, "loss": 1.00901918, "memory(GiB)": 141.16, "step": 107760, "train_speed(iter/s)": 0.290294 }, { "acc": 0.73238535, "epoch": 1.205513619770513, "grad_norm": 6.5, "learning_rate": 3.7283603563444916e-06, "loss": 1.07621212, "memory(GiB)": 141.16, "step": 107780, "train_speed(iter/s)": 0.290309 }, { "acc": 0.73082161, "epoch": 1.2057373187164715, "grad_norm": 7.90625, "learning_rate": 3.726571835134136e-06, "loss": 1.08194923, "memory(GiB)": 141.16, "step": 107800, "train_speed(iter/s)": 0.290327 }, { "acc": 0.73707094, "epoch": 1.20596101766243, "grad_norm": 6.4375, "learning_rate": 3.724783488146957e-06, "loss": 1.05317678, "memory(GiB)": 141.16, "step": 107820, "train_speed(iter/s)": 0.290344 }, { "acc": 0.72431874, "epoch": 1.2061847166083886, "grad_norm": 7.15625, "learning_rate": 3.7229953156276216e-06, "loss": 1.13298512, "memory(GiB)": 141.16, "step": 107840, "train_speed(iter/s)": 0.29036 }, { "acc": 0.72912149, "epoch": 1.206408415554347, "grad_norm": 8.4375, "learning_rate": 3.72120731782078e-06, "loss": 1.08966789, "memory(GiB)": 141.16, "step": 107860, "train_speed(iter/s)": 0.290378 }, { "acc": 0.7367475, "epoch": 1.2066321145003056, "grad_norm": 7.21875, "learning_rate": 3.7194194949710556e-06, "loss": 1.0726944, "memory(GiB)": 141.16, "step": 107880, "train_speed(iter/s)": 0.290394 }, { "acc": 0.72848759, "epoch": 1.2068558134462641, "grad_norm": 6.0, "learning_rate": 3.7176318473230476e-06, "loss": 1.0859251, "memory(GiB)": 141.16, "step": 107900, "train_speed(iter/s)": 0.290414 }, { "acc": 0.74235287, "epoch": 1.2070795123922227, "grad_norm": 8.0625, "learning_rate": 3.7158443751213334e-06, "loss": 1.04425259, "memory(GiB)": 141.16, "step": 107920, "train_speed(iter/s)": 0.290432 }, { "acc": 0.73742342, "epoch": 1.2073032113381812, "grad_norm": 5.125, "learning_rate": 3.714057078610463e-06, "loss": 1.05043049, "memory(GiB)": 141.16, "step": 107940, "train_speed(iter/s)": 0.29045 }, { "acc": 0.7328516, "epoch": 1.2075269102841397, "grad_norm": 7.71875, "learning_rate": 3.7122699580349643e-06, "loss": 1.07826042, "memory(GiB)": 141.16, "step": 107960, "train_speed(iter/s)": 0.290466 }, { "acc": 0.7292284, "epoch": 1.2077506092300982, "grad_norm": 7.59375, "learning_rate": 3.710483013639341e-06, "loss": 1.08998041, "memory(GiB)": 141.16, "step": 107980, "train_speed(iter/s)": 0.290485 }, { "acc": 0.71955051, "epoch": 1.2079743081760568, "grad_norm": 5.96875, "learning_rate": 3.708696245668073e-06, "loss": 1.12606382, "memory(GiB)": 141.16, "step": 108000, "train_speed(iter/s)": 0.290505 }, { "epoch": 1.2079743081760568, "eval_acc": 0.6901285715919546, "eval_loss": 1.0792056322097778, "eval_runtime": 2319.081, "eval_samples_per_second": 32.462, "eval_steps_per_second": 16.231, "step": 108000 }, { "acc": 0.73666015, "epoch": 1.2081980071220153, "grad_norm": 5.625, "learning_rate": 3.706909654365617e-06, "loss": 1.06311884, "memory(GiB)": 141.16, "step": 108020, "train_speed(iter/s)": 0.288681 }, { "acc": 0.72035036, "epoch": 1.2084217060679738, "grad_norm": 5.4375, "learning_rate": 3.7051232399764016e-06, "loss": 1.11979561, "memory(GiB)": 141.16, "step": 108040, "train_speed(iter/s)": 0.2887 }, { "acc": 0.72043605, "epoch": 1.2086454050139324, "grad_norm": 7.78125, "learning_rate": 3.7033370027448346e-06, "loss": 1.12806053, "memory(GiB)": 141.16, "step": 108060, "train_speed(iter/s)": 0.288719 }, { "acc": 0.7368916, "epoch": 1.208869103959891, "grad_norm": 6.71875, "learning_rate": 3.701550942915299e-06, "loss": 1.06079845, "memory(GiB)": 141.16, "step": 108080, "train_speed(iter/s)": 0.288737 }, { "acc": 0.74843388, "epoch": 1.2090928029058494, "grad_norm": 8.1875, "learning_rate": 3.6997650607321545e-06, "loss": 0.99133377, "memory(GiB)": 141.16, "step": 108100, "train_speed(iter/s)": 0.288757 }, { "acc": 0.73952556, "epoch": 1.209316501851808, "grad_norm": 6.21875, "learning_rate": 3.6979793564397343e-06, "loss": 1.04450607, "memory(GiB)": 141.16, "step": 108120, "train_speed(iter/s)": 0.288775 }, { "acc": 0.74067278, "epoch": 1.2095402007977665, "grad_norm": 7.1875, "learning_rate": 3.6961938302823476e-06, "loss": 1.03702335, "memory(GiB)": 141.16, "step": 108140, "train_speed(iter/s)": 0.288795 }, { "acc": 0.72959776, "epoch": 1.209763899743725, "grad_norm": 5.53125, "learning_rate": 3.6944084825042813e-06, "loss": 1.10279961, "memory(GiB)": 141.16, "step": 108160, "train_speed(iter/s)": 0.288812 }, { "acc": 0.73545923, "epoch": 1.2099875986896835, "grad_norm": 7.34375, "learning_rate": 3.6926233133497947e-06, "loss": 1.06829929, "memory(GiB)": 141.16, "step": 108180, "train_speed(iter/s)": 0.28883 }, { "acc": 0.73088732, "epoch": 1.210211297635642, "grad_norm": 6.4375, "learning_rate": 3.690838323063126e-06, "loss": 1.07289495, "memory(GiB)": 141.16, "step": 108200, "train_speed(iter/s)": 0.288848 }, { "acc": 0.7254056, "epoch": 1.2104349965816006, "grad_norm": 7.03125, "learning_rate": 3.6890535118884884e-06, "loss": 1.11133842, "memory(GiB)": 141.16, "step": 108220, "train_speed(iter/s)": 0.288865 }, { "acc": 0.74261513, "epoch": 1.2106586955275591, "grad_norm": 8.375, "learning_rate": 3.6872688800700674e-06, "loss": 1.02092838, "memory(GiB)": 141.16, "step": 108240, "train_speed(iter/s)": 0.288883 }, { "acc": 0.74746265, "epoch": 1.2108823944735176, "grad_norm": 8.5625, "learning_rate": 3.685484427852026e-06, "loss": 1.0044467, "memory(GiB)": 141.16, "step": 108260, "train_speed(iter/s)": 0.288902 }, { "acc": 0.75124121, "epoch": 1.2111060934194762, "grad_norm": 7.40625, "learning_rate": 3.6837001554785035e-06, "loss": 0.98525906, "memory(GiB)": 141.16, "step": 108280, "train_speed(iter/s)": 0.288921 }, { "acc": 0.71955233, "epoch": 1.2113297923654347, "grad_norm": 6.84375, "learning_rate": 3.6819160631936146e-06, "loss": 1.11702414, "memory(GiB)": 141.16, "step": 108300, "train_speed(iter/s)": 0.288938 }, { "acc": 0.74045944, "epoch": 1.2115534913113932, "grad_norm": 8.25, "learning_rate": 3.680132151241449e-06, "loss": 1.0465456, "memory(GiB)": 141.16, "step": 108320, "train_speed(iter/s)": 0.288955 }, { "acc": 0.7474329, "epoch": 1.2117771902573518, "grad_norm": 6.5, "learning_rate": 3.678348419866069e-06, "loss": 1.0140934, "memory(GiB)": 141.16, "step": 108340, "train_speed(iter/s)": 0.288973 }, { "acc": 0.73326931, "epoch": 1.2120008892033103, "grad_norm": 7.9375, "learning_rate": 3.676564869311516e-06, "loss": 1.07820435, "memory(GiB)": 141.16, "step": 108360, "train_speed(iter/s)": 0.288992 }, { "acc": 0.74160676, "epoch": 1.2122245881492688, "grad_norm": 6.3125, "learning_rate": 3.674781499821805e-06, "loss": 1.02247725, "memory(GiB)": 141.16, "step": 108380, "train_speed(iter/s)": 0.289007 }, { "acc": 0.75083971, "epoch": 1.2124482870952273, "grad_norm": 8.5, "learning_rate": 3.6729983116409267e-06, "loss": 0.98748856, "memory(GiB)": 141.16, "step": 108400, "train_speed(iter/s)": 0.289028 }, { "acc": 0.75249705, "epoch": 1.2126719860411859, "grad_norm": 6.78125, "learning_rate": 3.6712153050128474e-06, "loss": 0.98174248, "memory(GiB)": 141.16, "step": 108420, "train_speed(iter/s)": 0.289047 }, { "acc": 0.72694359, "epoch": 1.2128956849871444, "grad_norm": 6.9375, "learning_rate": 3.669432480181507e-06, "loss": 1.09514513, "memory(GiB)": 141.16, "step": 108440, "train_speed(iter/s)": 0.289062 }, { "acc": 0.73669806, "epoch": 1.213119383933103, "grad_norm": 8.9375, "learning_rate": 3.667649837390821e-06, "loss": 1.05328236, "memory(GiB)": 141.16, "step": 108460, "train_speed(iter/s)": 0.28908 }, { "acc": 0.74373455, "epoch": 1.2133430828790615, "grad_norm": 5.34375, "learning_rate": 3.6658673768846803e-06, "loss": 1.01906118, "memory(GiB)": 141.16, "step": 108480, "train_speed(iter/s)": 0.289101 }, { "acc": 0.74792423, "epoch": 1.21356678182502, "grad_norm": 8.0625, "learning_rate": 3.664085098906952e-06, "loss": 1.00914135, "memory(GiB)": 141.16, "step": 108500, "train_speed(iter/s)": 0.289117 }, { "acc": 0.73545661, "epoch": 1.2137904807709785, "grad_norm": 9.125, "learning_rate": 3.662303003701478e-06, "loss": 1.07127056, "memory(GiB)": 141.16, "step": 108520, "train_speed(iter/s)": 0.289133 }, { "acc": 0.73603764, "epoch": 1.214014179716937, "grad_norm": 9.125, "learning_rate": 3.6605210915120715e-06, "loss": 1.07004919, "memory(GiB)": 141.16, "step": 108540, "train_speed(iter/s)": 0.289153 }, { "acc": 0.741467, "epoch": 1.2142378786628956, "grad_norm": 6.71875, "learning_rate": 3.6587393625825262e-06, "loss": 1.04370518, "memory(GiB)": 141.16, "step": 108560, "train_speed(iter/s)": 0.289171 }, { "acc": 0.72339644, "epoch": 1.214461577608854, "grad_norm": 7.21875, "learning_rate": 3.6569578171566067e-06, "loss": 1.10677509, "memory(GiB)": 141.16, "step": 108580, "train_speed(iter/s)": 0.28919 }, { "acc": 0.73745742, "epoch": 1.2146852765548126, "grad_norm": 5.5, "learning_rate": 3.6551764554780544e-06, "loss": 1.04178143, "memory(GiB)": 141.16, "step": 108600, "train_speed(iter/s)": 0.289206 }, { "acc": 0.73211741, "epoch": 1.2149089755007711, "grad_norm": 5.15625, "learning_rate": 3.6533952777905856e-06, "loss": 1.08175526, "memory(GiB)": 141.16, "step": 108620, "train_speed(iter/s)": 0.289222 }, { "acc": 0.74128218, "epoch": 1.2151326744467297, "grad_norm": 8.1875, "learning_rate": 3.65161428433789e-06, "loss": 1.04096546, "memory(GiB)": 141.16, "step": 108640, "train_speed(iter/s)": 0.28924 }, { "acc": 0.71715884, "epoch": 1.2153563733926882, "grad_norm": 6.25, "learning_rate": 3.6498334753636323e-06, "loss": 1.14484749, "memory(GiB)": 141.16, "step": 108660, "train_speed(iter/s)": 0.289257 }, { "acc": 0.72831793, "epoch": 1.2155800723386467, "grad_norm": 6.53125, "learning_rate": 3.648052851111454e-06, "loss": 1.09007931, "memory(GiB)": 141.16, "step": 108680, "train_speed(iter/s)": 0.289275 }, { "acc": 0.74170656, "epoch": 1.2158037712846053, "grad_norm": 5.5625, "learning_rate": 3.646272411824969e-06, "loss": 1.04000549, "memory(GiB)": 141.16, "step": 108700, "train_speed(iter/s)": 0.289294 }, { "acc": 0.75188713, "epoch": 1.2160274702305638, "grad_norm": 5.15625, "learning_rate": 3.6444921577477686e-06, "loss": 0.96991425, "memory(GiB)": 141.16, "step": 108720, "train_speed(iter/s)": 0.289311 }, { "acc": 0.7421411, "epoch": 1.2162511691765223, "grad_norm": 5.71875, "learning_rate": 3.642712089123415e-06, "loss": 1.03771114, "memory(GiB)": 141.16, "step": 108740, "train_speed(iter/s)": 0.28933 }, { "acc": 0.73203168, "epoch": 1.2164748681224808, "grad_norm": 5.28125, "learning_rate": 3.640932206195447e-06, "loss": 1.08182898, "memory(GiB)": 141.16, "step": 108760, "train_speed(iter/s)": 0.28935 }, { "acc": 0.73752093, "epoch": 1.2166985670684394, "grad_norm": 7.0, "learning_rate": 3.6391525092073793e-06, "loss": 1.04655161, "memory(GiB)": 141.16, "step": 108780, "train_speed(iter/s)": 0.289368 }, { "acc": 0.74329529, "epoch": 1.216922266014398, "grad_norm": 6.375, "learning_rate": 3.637372998402699e-06, "loss": 1.02929268, "memory(GiB)": 141.16, "step": 108800, "train_speed(iter/s)": 0.289385 }, { "acc": 0.7395113, "epoch": 1.2171459649603564, "grad_norm": 7.15625, "learning_rate": 3.63559367402487e-06, "loss": 1.05313683, "memory(GiB)": 141.16, "step": 108820, "train_speed(iter/s)": 0.289404 }, { "acc": 0.7335278, "epoch": 1.217369663906315, "grad_norm": 7.53125, "learning_rate": 3.633814536317327e-06, "loss": 1.06987333, "memory(GiB)": 141.16, "step": 108840, "train_speed(iter/s)": 0.289423 }, { "acc": 0.74592166, "epoch": 1.2175933628522735, "grad_norm": 9.875, "learning_rate": 3.6320355855234837e-06, "loss": 1.02070131, "memory(GiB)": 141.16, "step": 108860, "train_speed(iter/s)": 0.289443 }, { "acc": 0.73779669, "epoch": 1.217817061798232, "grad_norm": 6.09375, "learning_rate": 3.630256821886724e-06, "loss": 1.04982777, "memory(GiB)": 141.16, "step": 108880, "train_speed(iter/s)": 0.289462 }, { "acc": 0.72868052, "epoch": 1.2180407607441905, "grad_norm": 7.125, "learning_rate": 3.628478245650412e-06, "loss": 1.10414429, "memory(GiB)": 141.16, "step": 108900, "train_speed(iter/s)": 0.289479 }, { "acc": 0.72810335, "epoch": 1.218264459690149, "grad_norm": 7.5625, "learning_rate": 3.626699857057877e-06, "loss": 1.08130636, "memory(GiB)": 141.16, "step": 108920, "train_speed(iter/s)": 0.289497 }, { "acc": 0.7298974, "epoch": 1.2184881586361076, "grad_norm": 7.21875, "learning_rate": 3.624921656352431e-06, "loss": 1.08747807, "memory(GiB)": 141.16, "step": 108940, "train_speed(iter/s)": 0.289514 }, { "acc": 0.7430357, "epoch": 1.2187118575820661, "grad_norm": 6.78125, "learning_rate": 3.623143643777357e-06, "loss": 1.02093945, "memory(GiB)": 141.16, "step": 108960, "train_speed(iter/s)": 0.289531 }, { "acc": 0.73834968, "epoch": 1.2189355565280247, "grad_norm": 6.65625, "learning_rate": 3.621365819575912e-06, "loss": 1.06056118, "memory(GiB)": 141.16, "step": 108980, "train_speed(iter/s)": 0.28955 }, { "acc": 0.73279552, "epoch": 1.2191592554739832, "grad_norm": 8.25, "learning_rate": 3.6195881839913285e-06, "loss": 1.07681141, "memory(GiB)": 141.16, "step": 109000, "train_speed(iter/s)": 0.28957 }, { "acc": 0.73829856, "epoch": 1.2193829544199417, "grad_norm": 7.0625, "learning_rate": 3.6178107372668113e-06, "loss": 1.05661011, "memory(GiB)": 141.16, "step": 109020, "train_speed(iter/s)": 0.289589 }, { "acc": 0.73935704, "epoch": 1.2196066533659002, "grad_norm": 6.34375, "learning_rate": 3.6160334796455414e-06, "loss": 1.04694214, "memory(GiB)": 141.16, "step": 109040, "train_speed(iter/s)": 0.289607 }, { "acc": 0.73461685, "epoch": 1.2198303523118588, "grad_norm": 7.5, "learning_rate": 3.614256411370674e-06, "loss": 1.06013699, "memory(GiB)": 141.16, "step": 109060, "train_speed(iter/s)": 0.289626 }, { "acc": 0.7375102, "epoch": 1.2200540512578173, "grad_norm": 6.65625, "learning_rate": 3.6124795326853356e-06, "loss": 1.05094566, "memory(GiB)": 141.16, "step": 109080, "train_speed(iter/s)": 0.289644 }, { "acc": 0.74822025, "epoch": 1.2202777502037758, "grad_norm": 7.28125, "learning_rate": 3.610702843832629e-06, "loss": 0.99525986, "memory(GiB)": 141.16, "step": 109100, "train_speed(iter/s)": 0.289664 }, { "acc": 0.74235134, "epoch": 1.2205014491497344, "grad_norm": 9.0625, "learning_rate": 3.608926345055631e-06, "loss": 1.04797544, "memory(GiB)": 141.16, "step": 109120, "train_speed(iter/s)": 0.289682 }, { "acc": 0.73561335, "epoch": 1.2207251480956929, "grad_norm": 7.4375, "learning_rate": 3.607150036597392e-06, "loss": 1.08670826, "memory(GiB)": 141.16, "step": 109140, "train_speed(iter/s)": 0.289701 }, { "acc": 0.72913971, "epoch": 1.2209488470416514, "grad_norm": 7.84375, "learning_rate": 3.605373918700938e-06, "loss": 1.08641901, "memory(GiB)": 141.16, "step": 109160, "train_speed(iter/s)": 0.289718 }, { "acc": 0.74390235, "epoch": 1.22117254598761, "grad_norm": 6.90625, "learning_rate": 3.6035979916092646e-06, "loss": 1.0109601, "memory(GiB)": 141.16, "step": 109180, "train_speed(iter/s)": 0.289735 }, { "acc": 0.73980122, "epoch": 1.2213962449335685, "grad_norm": 6.53125, "learning_rate": 3.601822255565345e-06, "loss": 1.05473309, "memory(GiB)": 141.16, "step": 109200, "train_speed(iter/s)": 0.289751 }, { "acc": 0.72308125, "epoch": 1.221619943879527, "grad_norm": 7.09375, "learning_rate": 3.6000467108121247e-06, "loss": 1.1184432, "memory(GiB)": 141.16, "step": 109220, "train_speed(iter/s)": 0.289769 }, { "acc": 0.72991199, "epoch": 1.2218436428254855, "grad_norm": 7.65625, "learning_rate": 3.598271357592525e-06, "loss": 1.08561344, "memory(GiB)": 141.16, "step": 109240, "train_speed(iter/s)": 0.289786 }, { "acc": 0.72912169, "epoch": 1.222067341771444, "grad_norm": 7.0, "learning_rate": 3.5964961961494394e-06, "loss": 1.10296288, "memory(GiB)": 141.16, "step": 109260, "train_speed(iter/s)": 0.289805 }, { "acc": 0.74357657, "epoch": 1.2222910407174026, "grad_norm": 7.375, "learning_rate": 3.5947212267257346e-06, "loss": 1.02997684, "memory(GiB)": 141.16, "step": 109280, "train_speed(iter/s)": 0.289823 }, { "acc": 0.72488804, "epoch": 1.222514739663361, "grad_norm": 7.15625, "learning_rate": 3.592946449564251e-06, "loss": 1.11994276, "memory(GiB)": 141.16, "step": 109300, "train_speed(iter/s)": 0.289842 }, { "acc": 0.7436944, "epoch": 1.2227384386093196, "grad_norm": 7.53125, "learning_rate": 3.5911718649078055e-06, "loss": 1.0284296, "memory(GiB)": 141.16, "step": 109320, "train_speed(iter/s)": 0.28986 }, { "acc": 0.73503675, "epoch": 1.2229621375552782, "grad_norm": 7.1875, "learning_rate": 3.5893974729991855e-06, "loss": 1.07001925, "memory(GiB)": 141.16, "step": 109340, "train_speed(iter/s)": 0.289878 }, { "acc": 0.72939787, "epoch": 1.2231858365012367, "grad_norm": 5.375, "learning_rate": 3.5876232740811543e-06, "loss": 1.09079704, "memory(GiB)": 141.16, "step": 109360, "train_speed(iter/s)": 0.289896 }, { "acc": 0.75236969, "epoch": 1.2234095354471952, "grad_norm": 7.5, "learning_rate": 3.5858492683964453e-06, "loss": 0.9822134, "memory(GiB)": 141.16, "step": 109380, "train_speed(iter/s)": 0.289913 }, { "acc": 0.73334846, "epoch": 1.2236332343931537, "grad_norm": 7.375, "learning_rate": 3.58407545618777e-06, "loss": 1.05970964, "memory(GiB)": 141.16, "step": 109400, "train_speed(iter/s)": 0.289931 }, { "acc": 0.73513041, "epoch": 1.2238569333391123, "grad_norm": 6.84375, "learning_rate": 3.5823018376978097e-06, "loss": 1.08146915, "memory(GiB)": 141.16, "step": 109420, "train_speed(iter/s)": 0.289946 }, { "acc": 0.7418982, "epoch": 1.2240806322850708, "grad_norm": 7.78125, "learning_rate": 3.580528413169222e-06, "loss": 1.03781013, "memory(GiB)": 141.16, "step": 109440, "train_speed(iter/s)": 0.289962 }, { "acc": 0.7281888, "epoch": 1.2243043312310293, "grad_norm": 7.0625, "learning_rate": 3.5787551828446377e-06, "loss": 1.0793169, "memory(GiB)": 141.16, "step": 109460, "train_speed(iter/s)": 0.289978 }, { "acc": 0.73963823, "epoch": 1.2245280301769879, "grad_norm": 6.28125, "learning_rate": 3.5769821469666565e-06, "loss": 1.04836178, "memory(GiB)": 141.16, "step": 109480, "train_speed(iter/s)": 0.289996 }, { "acc": 0.72945385, "epoch": 1.2247517291229464, "grad_norm": 5.53125, "learning_rate": 3.575209305777858e-06, "loss": 1.08956747, "memory(GiB)": 141.16, "step": 109500, "train_speed(iter/s)": 0.290012 }, { "acc": 0.73399382, "epoch": 1.224975428068905, "grad_norm": 7.9375, "learning_rate": 3.5734366595207915e-06, "loss": 1.07724953, "memory(GiB)": 141.16, "step": 109520, "train_speed(iter/s)": 0.290028 }, { "acc": 0.74050226, "epoch": 1.2251991270148634, "grad_norm": 6.59375, "learning_rate": 3.5716642084379806e-06, "loss": 1.05017223, "memory(GiB)": 141.16, "step": 109540, "train_speed(iter/s)": 0.290046 }, { "acc": 0.72882729, "epoch": 1.225422825960822, "grad_norm": 5.9375, "learning_rate": 3.569891952771921e-06, "loss": 1.09556694, "memory(GiB)": 141.16, "step": 109560, "train_speed(iter/s)": 0.290064 }, { "acc": 0.73633356, "epoch": 1.2256465249067805, "grad_norm": 7.5625, "learning_rate": 3.568119892765084e-06, "loss": 1.06461744, "memory(GiB)": 141.16, "step": 109580, "train_speed(iter/s)": 0.290082 }, { "acc": 0.73492517, "epoch": 1.225870223852739, "grad_norm": 8.125, "learning_rate": 3.5663480286599117e-06, "loss": 1.07363443, "memory(GiB)": 141.16, "step": 109600, "train_speed(iter/s)": 0.290099 }, { "acc": 0.73427, "epoch": 1.2260939227986976, "grad_norm": 6.5, "learning_rate": 3.56457636069882e-06, "loss": 1.07847061, "memory(GiB)": 141.16, "step": 109620, "train_speed(iter/s)": 0.290116 }, { "acc": 0.72843895, "epoch": 1.226317621744656, "grad_norm": 8.0, "learning_rate": 3.5628048891241994e-06, "loss": 1.09010735, "memory(GiB)": 141.16, "step": 109640, "train_speed(iter/s)": 0.290131 }, { "acc": 0.74347582, "epoch": 1.2265413206906146, "grad_norm": 7.25, "learning_rate": 3.561033614178412e-06, "loss": 1.01686287, "memory(GiB)": 141.16, "step": 109660, "train_speed(iter/s)": 0.29015 }, { "acc": 0.73523331, "epoch": 1.2267650196365731, "grad_norm": 8.625, "learning_rate": 3.5592625361037946e-06, "loss": 1.0729723, "memory(GiB)": 141.16, "step": 109680, "train_speed(iter/s)": 0.290167 }, { "acc": 0.71789174, "epoch": 1.2269887185825317, "grad_norm": 6.5, "learning_rate": 3.5574916551426553e-06, "loss": 1.1388154, "memory(GiB)": 141.16, "step": 109700, "train_speed(iter/s)": 0.290186 }, { "acc": 0.7215817, "epoch": 1.2272124175284902, "grad_norm": 9.4375, "learning_rate": 3.5557209715372743e-06, "loss": 1.12884722, "memory(GiB)": 141.16, "step": 109720, "train_speed(iter/s)": 0.290204 }, { "acc": 0.74096484, "epoch": 1.2274361164744487, "grad_norm": 6.90625, "learning_rate": 3.553950485529909e-06, "loss": 1.03006649, "memory(GiB)": 141.16, "step": 109740, "train_speed(iter/s)": 0.290219 }, { "acc": 0.73836365, "epoch": 1.2276598154204073, "grad_norm": 8.5625, "learning_rate": 3.5521801973627856e-06, "loss": 1.06086807, "memory(GiB)": 141.16, "step": 109760, "train_speed(iter/s)": 0.290238 }, { "acc": 0.73894849, "epoch": 1.2278835143663658, "grad_norm": 6.625, "learning_rate": 3.550410107278106e-06, "loss": 1.05935135, "memory(GiB)": 141.16, "step": 109780, "train_speed(iter/s)": 0.290257 }, { "acc": 0.73855143, "epoch": 1.2281072133123243, "grad_norm": 7.65625, "learning_rate": 3.548640215518043e-06, "loss": 1.04552574, "memory(GiB)": 141.16, "step": 109800, "train_speed(iter/s)": 0.290274 }, { "acc": 0.72708588, "epoch": 1.2283309122582828, "grad_norm": 8.375, "learning_rate": 3.5468705223247426e-06, "loss": 1.09579391, "memory(GiB)": 141.16, "step": 109820, "train_speed(iter/s)": 0.29029 }, { "acc": 0.73793173, "epoch": 1.2285546112042414, "grad_norm": 6.625, "learning_rate": 3.545101027940325e-06, "loss": 1.04781904, "memory(GiB)": 141.16, "step": 109840, "train_speed(iter/s)": 0.290308 }, { "acc": 0.72336941, "epoch": 1.2287783101502, "grad_norm": 6.28125, "learning_rate": 3.5433317326068817e-06, "loss": 1.11862345, "memory(GiB)": 141.16, "step": 109860, "train_speed(iter/s)": 0.290326 }, { "acc": 0.73372707, "epoch": 1.2290020090961584, "grad_norm": 7.34375, "learning_rate": 3.5415626365664792e-06, "loss": 1.07290421, "memory(GiB)": 141.16, "step": 109880, "train_speed(iter/s)": 0.290342 }, { "acc": 0.71287518, "epoch": 1.229225708042117, "grad_norm": 7.09375, "learning_rate": 3.5397937400611525e-06, "loss": 1.17140427, "memory(GiB)": 141.16, "step": 109900, "train_speed(iter/s)": 0.29036 }, { "acc": 0.73423929, "epoch": 1.2294494069880755, "grad_norm": 7.65625, "learning_rate": 3.5380250433329146e-06, "loss": 1.08752098, "memory(GiB)": 141.16, "step": 109920, "train_speed(iter/s)": 0.290376 }, { "acc": 0.73421335, "epoch": 1.229673105934034, "grad_norm": 6.375, "learning_rate": 3.536256546623746e-06, "loss": 1.0822094, "memory(GiB)": 141.16, "step": 109940, "train_speed(iter/s)": 0.290394 }, { "acc": 0.73434019, "epoch": 1.2298968048799925, "grad_norm": 7.59375, "learning_rate": 3.534488250175604e-06, "loss": 1.06899834, "memory(GiB)": 141.16, "step": 109960, "train_speed(iter/s)": 0.290411 }, { "acc": 0.71868658, "epoch": 1.230120503825951, "grad_norm": 5.125, "learning_rate": 3.532720154230417e-06, "loss": 1.135075, "memory(GiB)": 141.16, "step": 109980, "train_speed(iter/s)": 0.29043 }, { "acc": 0.73502359, "epoch": 1.2303442027719096, "grad_norm": 7.53125, "learning_rate": 3.5309522590300844e-06, "loss": 1.04698544, "memory(GiB)": 141.16, "step": 110000, "train_speed(iter/s)": 0.290448 }, { "epoch": 1.2303442027719096, "eval_acc": 0.6901581988816363, "eval_loss": 1.0791984796524048, "eval_runtime": 2323.8516, "eval_samples_per_second": 32.396, "eval_steps_per_second": 16.198, "step": 110000 }, { "acc": 0.72430687, "epoch": 1.2305679017178681, "grad_norm": 8.3125, "learning_rate": 3.5291845648164804e-06, "loss": 1.10895605, "memory(GiB)": 141.16, "step": 110020, "train_speed(iter/s)": 0.288656 }, { "acc": 0.73624315, "epoch": 1.2307916006638266, "grad_norm": 7.8125, "learning_rate": 3.5274170718314506e-06, "loss": 1.0507719, "memory(GiB)": 141.16, "step": 110040, "train_speed(iter/s)": 0.288673 }, { "acc": 0.73726387, "epoch": 1.2310152996097852, "grad_norm": 7.3125, "learning_rate": 3.525649780316813e-06, "loss": 1.05835896, "memory(GiB)": 141.16, "step": 110060, "train_speed(iter/s)": 0.288692 }, { "acc": 0.72478771, "epoch": 1.2312389985557437, "grad_norm": 7.0, "learning_rate": 3.5238826905143607e-06, "loss": 1.11563473, "memory(GiB)": 141.16, "step": 110080, "train_speed(iter/s)": 0.28871 }, { "acc": 0.73007431, "epoch": 1.2314626975017022, "grad_norm": 8.3125, "learning_rate": 3.5221158026658544e-06, "loss": 1.07832184, "memory(GiB)": 141.16, "step": 110100, "train_speed(iter/s)": 0.288727 }, { "acc": 0.74481668, "epoch": 1.2316863964476608, "grad_norm": 6.09375, "learning_rate": 3.52034911701303e-06, "loss": 1.03616848, "memory(GiB)": 141.16, "step": 110120, "train_speed(iter/s)": 0.288746 }, { "acc": 0.73952808, "epoch": 1.2319100953936193, "grad_norm": 7.53125, "learning_rate": 3.5185826337975947e-06, "loss": 1.03190041, "memory(GiB)": 141.16, "step": 110140, "train_speed(iter/s)": 0.288763 }, { "acc": 0.72903433, "epoch": 1.2321337943395778, "grad_norm": 5.65625, "learning_rate": 3.51681635326123e-06, "loss": 1.07671242, "memory(GiB)": 141.16, "step": 110160, "train_speed(iter/s)": 0.288781 }, { "acc": 0.73563776, "epoch": 1.2323574932855363, "grad_norm": 6.40625, "learning_rate": 3.5150502756455862e-06, "loss": 1.06125832, "memory(GiB)": 141.16, "step": 110180, "train_speed(iter/s)": 0.288799 }, { "acc": 0.74176311, "epoch": 1.2325811922314949, "grad_norm": 8.4375, "learning_rate": 3.513284401192291e-06, "loss": 1.02790432, "memory(GiB)": 141.16, "step": 110200, "train_speed(iter/s)": 0.288818 }, { "acc": 0.74469957, "epoch": 1.2328048911774534, "grad_norm": 8.125, "learning_rate": 3.51151873014294e-06, "loss": 1.01148548, "memory(GiB)": 141.16, "step": 110220, "train_speed(iter/s)": 0.288834 }, { "acc": 0.73967447, "epoch": 1.233028590123412, "grad_norm": 7.65625, "learning_rate": 3.5097532627391014e-06, "loss": 1.05379181, "memory(GiB)": 141.16, "step": 110240, "train_speed(iter/s)": 0.288852 }, { "acc": 0.73641958, "epoch": 1.2332522890693705, "grad_norm": 6.625, "learning_rate": 3.5079879992223164e-06, "loss": 1.0474659, "memory(GiB)": 141.16, "step": 110260, "train_speed(iter/s)": 0.288871 }, { "acc": 0.72775178, "epoch": 1.233475988015329, "grad_norm": 8.1875, "learning_rate": 3.5062229398340995e-06, "loss": 1.09256029, "memory(GiB)": 141.16, "step": 110280, "train_speed(iter/s)": 0.288889 }, { "acc": 0.72478828, "epoch": 1.2336996869612875, "grad_norm": 6.59375, "learning_rate": 3.5044580848159355e-06, "loss": 1.11203251, "memory(GiB)": 141.16, "step": 110300, "train_speed(iter/s)": 0.288908 }, { "acc": 0.73755841, "epoch": 1.233923385907246, "grad_norm": 7.03125, "learning_rate": 3.502693434409282e-06, "loss": 1.04270172, "memory(GiB)": 141.16, "step": 110320, "train_speed(iter/s)": 0.288925 }, { "acc": 0.74503622, "epoch": 1.2341470848532046, "grad_norm": 6.15625, "learning_rate": 3.5009289888555676e-06, "loss": 1.02244644, "memory(GiB)": 141.16, "step": 110340, "train_speed(iter/s)": 0.288941 }, { "acc": 0.74367313, "epoch": 1.234370783799163, "grad_norm": 7.71875, "learning_rate": 3.4991647483961945e-06, "loss": 1.01089106, "memory(GiB)": 141.16, "step": 110360, "train_speed(iter/s)": 0.288959 }, { "acc": 0.72662687, "epoch": 1.2345944827451216, "grad_norm": 8.75, "learning_rate": 3.497400713272535e-06, "loss": 1.10441942, "memory(GiB)": 141.16, "step": 110380, "train_speed(iter/s)": 0.288977 }, { "acc": 0.7324132, "epoch": 1.2348181816910802, "grad_norm": 6.09375, "learning_rate": 3.4956368837259357e-06, "loss": 1.07453918, "memory(GiB)": 141.16, "step": 110400, "train_speed(iter/s)": 0.288995 }, { "acc": 0.74396353, "epoch": 1.2350418806370387, "grad_norm": 8.0625, "learning_rate": 3.493873259997713e-06, "loss": 1.02922096, "memory(GiB)": 141.16, "step": 110420, "train_speed(iter/s)": 0.289013 }, { "acc": 0.74208517, "epoch": 1.2352655795829972, "grad_norm": 7.0625, "learning_rate": 3.492109842329156e-06, "loss": 1.02743864, "memory(GiB)": 141.16, "step": 110440, "train_speed(iter/s)": 0.289031 }, { "acc": 0.72654085, "epoch": 1.2354892785289557, "grad_norm": 6.25, "learning_rate": 3.4903466309615254e-06, "loss": 1.10560703, "memory(GiB)": 141.16, "step": 110460, "train_speed(iter/s)": 0.289051 }, { "acc": 0.74039259, "epoch": 1.2357129774749143, "grad_norm": 7.46875, "learning_rate": 3.488583626136053e-06, "loss": 1.04076214, "memory(GiB)": 141.16, "step": 110480, "train_speed(iter/s)": 0.289071 }, { "acc": 0.73794489, "epoch": 1.2359366764208728, "grad_norm": 6.5, "learning_rate": 3.486820828093943e-06, "loss": 1.04249163, "memory(GiB)": 141.16, "step": 110500, "train_speed(iter/s)": 0.28909 }, { "acc": 0.74031658, "epoch": 1.2361603753668313, "grad_norm": 9.5625, "learning_rate": 3.4850582370763743e-06, "loss": 1.04659472, "memory(GiB)": 141.16, "step": 110520, "train_speed(iter/s)": 0.289106 }, { "acc": 0.74198198, "epoch": 1.2363840743127898, "grad_norm": 9.0, "learning_rate": 3.4832958533244897e-06, "loss": 1.04431019, "memory(GiB)": 141.16, "step": 110540, "train_speed(iter/s)": 0.289124 }, { "acc": 0.73086815, "epoch": 1.2366077732587484, "grad_norm": 7.03125, "learning_rate": 3.481533677079413e-06, "loss": 1.06844864, "memory(GiB)": 141.16, "step": 110560, "train_speed(iter/s)": 0.289142 }, { "acc": 0.72941799, "epoch": 1.236831472204707, "grad_norm": 7.21875, "learning_rate": 3.4797717085822314e-06, "loss": 1.08410902, "memory(GiB)": 141.16, "step": 110580, "train_speed(iter/s)": 0.289159 }, { "acc": 0.73370829, "epoch": 1.2370551711506654, "grad_norm": 5.34375, "learning_rate": 3.4780099480740104e-06, "loss": 1.04895535, "memory(GiB)": 141.16, "step": 110600, "train_speed(iter/s)": 0.289176 }, { "acc": 0.73809204, "epoch": 1.237278870096624, "grad_norm": 6.4375, "learning_rate": 3.4762483957957834e-06, "loss": 1.05870762, "memory(GiB)": 141.16, "step": 110620, "train_speed(iter/s)": 0.289193 }, { "acc": 0.74217262, "epoch": 1.2375025690425825, "grad_norm": 5.40625, "learning_rate": 3.4744870519885544e-06, "loss": 1.03454666, "memory(GiB)": 141.16, "step": 110640, "train_speed(iter/s)": 0.289211 }, { "acc": 0.7415719, "epoch": 1.237726267988541, "grad_norm": 6.5, "learning_rate": 3.4727259168933002e-06, "loss": 1.0327776, "memory(GiB)": 141.16, "step": 110660, "train_speed(iter/s)": 0.28923 }, { "acc": 0.74020891, "epoch": 1.2379499669344995, "grad_norm": 7.4375, "learning_rate": 3.470964990750971e-06, "loss": 1.03944798, "memory(GiB)": 141.16, "step": 110680, "train_speed(iter/s)": 0.289246 }, { "acc": 0.72328644, "epoch": 1.238173665880458, "grad_norm": 6.875, "learning_rate": 3.4692042738024865e-06, "loss": 1.10467968, "memory(GiB)": 141.16, "step": 110700, "train_speed(iter/s)": 0.289265 }, { "acc": 0.728686, "epoch": 1.2383973648264166, "grad_norm": 8.0625, "learning_rate": 3.4674437662887385e-06, "loss": 1.11199017, "memory(GiB)": 141.16, "step": 110720, "train_speed(iter/s)": 0.289283 }, { "acc": 0.74465094, "epoch": 1.2386210637723751, "grad_norm": 6.46875, "learning_rate": 3.465683468450587e-06, "loss": 1.02040396, "memory(GiB)": 141.16, "step": 110740, "train_speed(iter/s)": 0.289302 }, { "acc": 0.73269844, "epoch": 1.2388447627183337, "grad_norm": 11.375, "learning_rate": 3.4639233805288676e-06, "loss": 1.08838654, "memory(GiB)": 141.16, "step": 110760, "train_speed(iter/s)": 0.289323 }, { "acc": 0.73521957, "epoch": 1.2390684616642922, "grad_norm": 8.6875, "learning_rate": 3.462163502764385e-06, "loss": 1.03792286, "memory(GiB)": 141.16, "step": 110780, "train_speed(iter/s)": 0.289342 }, { "acc": 0.72710662, "epoch": 1.2392921606102507, "grad_norm": 6.375, "learning_rate": 3.460403835397917e-06, "loss": 1.11176071, "memory(GiB)": 141.16, "step": 110800, "train_speed(iter/s)": 0.28936 }, { "acc": 0.74918575, "epoch": 1.2395158595562092, "grad_norm": 7.875, "learning_rate": 3.4586443786702106e-06, "loss": 0.99796247, "memory(GiB)": 141.16, "step": 110820, "train_speed(iter/s)": 0.289379 }, { "acc": 0.72920246, "epoch": 1.2397395585021678, "grad_norm": 7.46875, "learning_rate": 3.4568851328219834e-06, "loss": 1.09038591, "memory(GiB)": 141.16, "step": 110840, "train_speed(iter/s)": 0.289395 }, { "acc": 0.7170886, "epoch": 1.2399632574481263, "grad_norm": 7.59375, "learning_rate": 3.455126098093926e-06, "loss": 1.14982605, "memory(GiB)": 141.16, "step": 110860, "train_speed(iter/s)": 0.289411 }, { "acc": 0.74199624, "epoch": 1.2401869563940848, "grad_norm": 6.8125, "learning_rate": 3.4533672747267e-06, "loss": 1.03802071, "memory(GiB)": 141.16, "step": 110880, "train_speed(iter/s)": 0.289428 }, { "acc": 0.74426155, "epoch": 1.2404106553400434, "grad_norm": 6.8125, "learning_rate": 3.451608662960937e-06, "loss": 1.01142883, "memory(GiB)": 141.16, "step": 110900, "train_speed(iter/s)": 0.289446 }, { "acc": 0.74298224, "epoch": 1.2406343542860019, "grad_norm": 5.40625, "learning_rate": 3.449850263037241e-06, "loss": 1.03176289, "memory(GiB)": 141.16, "step": 110920, "train_speed(iter/s)": 0.289463 }, { "acc": 0.73204889, "epoch": 1.2408580532319604, "grad_norm": 6.03125, "learning_rate": 3.4480920751961853e-06, "loss": 1.08094349, "memory(GiB)": 141.16, "step": 110940, "train_speed(iter/s)": 0.289478 }, { "acc": 0.73704987, "epoch": 1.241081752177919, "grad_norm": 6.71875, "learning_rate": 3.4463340996783155e-06, "loss": 1.07262688, "memory(GiB)": 141.16, "step": 110960, "train_speed(iter/s)": 0.289493 }, { "acc": 0.72204428, "epoch": 1.2413054511238775, "grad_norm": 8.625, "learning_rate": 3.4445763367241485e-06, "loss": 1.11595879, "memory(GiB)": 141.16, "step": 110980, "train_speed(iter/s)": 0.289509 }, { "acc": 0.72574067, "epoch": 1.241529150069836, "grad_norm": 6.84375, "learning_rate": 3.4428187865741702e-06, "loss": 1.10766563, "memory(GiB)": 141.16, "step": 111000, "train_speed(iter/s)": 0.289526 }, { "acc": 0.73701515, "epoch": 1.2417528490157945, "grad_norm": 6.59375, "learning_rate": 3.4410614494688397e-06, "loss": 1.06698246, "memory(GiB)": 141.16, "step": 111020, "train_speed(iter/s)": 0.28954 }, { "acc": 0.73883252, "epoch": 1.241976547961753, "grad_norm": 7.21875, "learning_rate": 3.439304325648585e-06, "loss": 1.03962889, "memory(GiB)": 141.16, "step": 111040, "train_speed(iter/s)": 0.289557 }, { "acc": 0.74205456, "epoch": 1.2422002469077116, "grad_norm": 7.0625, "learning_rate": 3.4375474153538064e-06, "loss": 1.05614233, "memory(GiB)": 141.16, "step": 111060, "train_speed(iter/s)": 0.289575 }, { "acc": 0.74182091, "epoch": 1.24242394585367, "grad_norm": 7.0, "learning_rate": 3.435790718824873e-06, "loss": 1.03539734, "memory(GiB)": 141.16, "step": 111080, "train_speed(iter/s)": 0.289593 }, { "acc": 0.73741016, "epoch": 1.2426476447996286, "grad_norm": 8.75, "learning_rate": 3.434034236302127e-06, "loss": 1.05418987, "memory(GiB)": 141.16, "step": 111100, "train_speed(iter/s)": 0.289609 }, { "acc": 0.7492857, "epoch": 1.2428713437455872, "grad_norm": 6.34375, "learning_rate": 3.4322779680258822e-06, "loss": 1.00365009, "memory(GiB)": 141.16, "step": 111120, "train_speed(iter/s)": 0.289627 }, { "acc": 0.73819876, "epoch": 1.2430950426915457, "grad_norm": 8.25, "learning_rate": 3.4305219142364176e-06, "loss": 1.04085464, "memory(GiB)": 141.16, "step": 111140, "train_speed(iter/s)": 0.289647 }, { "acc": 0.7298461, "epoch": 1.2433187416375042, "grad_norm": 6.8125, "learning_rate": 3.428766075173988e-06, "loss": 1.0902976, "memory(GiB)": 141.16, "step": 111160, "train_speed(iter/s)": 0.289663 }, { "acc": 0.7302496, "epoch": 1.2435424405834627, "grad_norm": 7.46875, "learning_rate": 3.4270104510788184e-06, "loss": 1.09849949, "memory(GiB)": 141.16, "step": 111180, "train_speed(iter/s)": 0.289681 }, { "acc": 0.73911901, "epoch": 1.2437661395294213, "grad_norm": 6.0625, "learning_rate": 3.4252550421911015e-06, "loss": 1.03510618, "memory(GiB)": 141.16, "step": 111200, "train_speed(iter/s)": 0.289701 }, { "acc": 0.74165907, "epoch": 1.2439898384753798, "grad_norm": 6.0, "learning_rate": 3.423499848751004e-06, "loss": 1.03520432, "memory(GiB)": 141.16, "step": 111220, "train_speed(iter/s)": 0.289719 }, { "acc": 0.74323874, "epoch": 1.2442135374213383, "grad_norm": 6.34375, "learning_rate": 3.42174487099866e-06, "loss": 1.02883959, "memory(GiB)": 141.16, "step": 111240, "train_speed(iter/s)": 0.289736 }, { "acc": 0.73386083, "epoch": 1.2444372363672969, "grad_norm": 8.375, "learning_rate": 3.419990109174176e-06, "loss": 1.07032852, "memory(GiB)": 141.16, "step": 111260, "train_speed(iter/s)": 0.289755 }, { "acc": 0.75758252, "epoch": 1.2446609353132554, "grad_norm": 6.78125, "learning_rate": 3.41823556351763e-06, "loss": 0.95384216, "memory(GiB)": 141.16, "step": 111280, "train_speed(iter/s)": 0.289772 }, { "acc": 0.72478414, "epoch": 1.244884634259214, "grad_norm": 5.96875, "learning_rate": 3.416481234269066e-06, "loss": 1.10665321, "memory(GiB)": 141.16, "step": 111300, "train_speed(iter/s)": 0.289789 }, { "acc": 0.72840195, "epoch": 1.2451083332051724, "grad_norm": 8.3125, "learning_rate": 3.414727121668503e-06, "loss": 1.08010759, "memory(GiB)": 141.16, "step": 111320, "train_speed(iter/s)": 0.289805 }, { "acc": 0.73646131, "epoch": 1.245332032151131, "grad_norm": 7.03125, "learning_rate": 3.412973225955929e-06, "loss": 1.07489548, "memory(GiB)": 141.16, "step": 111340, "train_speed(iter/s)": 0.289821 }, { "acc": 0.73449812, "epoch": 1.2455557310970895, "grad_norm": 8.625, "learning_rate": 3.4112195473713015e-06, "loss": 1.07514973, "memory(GiB)": 141.16, "step": 111360, "train_speed(iter/s)": 0.289837 }, { "acc": 0.72822247, "epoch": 1.245779430043048, "grad_norm": 6.34375, "learning_rate": 3.409466086154548e-06, "loss": 1.11097832, "memory(GiB)": 141.16, "step": 111380, "train_speed(iter/s)": 0.289855 }, { "acc": 0.7369452, "epoch": 1.2460031289890066, "grad_norm": 7.125, "learning_rate": 3.4077128425455686e-06, "loss": 1.04902039, "memory(GiB)": 141.16, "step": 111400, "train_speed(iter/s)": 0.289871 }, { "acc": 0.73213024, "epoch": 1.246226827934965, "grad_norm": 6.90625, "learning_rate": 3.405959816784231e-06, "loss": 1.07933512, "memory(GiB)": 141.16, "step": 111420, "train_speed(iter/s)": 0.289887 }, { "acc": 0.72854395, "epoch": 1.2464505268809236, "grad_norm": 7.0625, "learning_rate": 3.404207009110374e-06, "loss": 1.11037827, "memory(GiB)": 141.16, "step": 111440, "train_speed(iter/s)": 0.289905 }, { "acc": 0.72419367, "epoch": 1.2466742258268821, "grad_norm": 6.90625, "learning_rate": 3.4024544197638085e-06, "loss": 1.1094305, "memory(GiB)": 141.16, "step": 111460, "train_speed(iter/s)": 0.289925 }, { "acc": 0.72720976, "epoch": 1.2468979247728407, "grad_norm": 6.125, "learning_rate": 3.400702048984312e-06, "loss": 1.09589252, "memory(GiB)": 141.16, "step": 111480, "train_speed(iter/s)": 0.289941 }, { "acc": 0.74724178, "epoch": 1.2471216237187992, "grad_norm": 6.875, "learning_rate": 3.3989498970116347e-06, "loss": 1.02230682, "memory(GiB)": 141.16, "step": 111500, "train_speed(iter/s)": 0.289961 }, { "acc": 0.73676643, "epoch": 1.2473453226647577, "grad_norm": 6.59375, "learning_rate": 3.3971979640854954e-06, "loss": 1.06182327, "memory(GiB)": 141.16, "step": 111520, "train_speed(iter/s)": 0.289979 }, { "acc": 0.73151631, "epoch": 1.2475690216107163, "grad_norm": 7.5625, "learning_rate": 3.3954462504455838e-06, "loss": 1.06408424, "memory(GiB)": 141.16, "step": 111540, "train_speed(iter/s)": 0.289995 }, { "acc": 0.73994837, "epoch": 1.2477927205566748, "grad_norm": 7.8125, "learning_rate": 3.3936947563315603e-06, "loss": 1.04020109, "memory(GiB)": 141.16, "step": 111560, "train_speed(iter/s)": 0.290013 }, { "acc": 0.73773327, "epoch": 1.2480164195026333, "grad_norm": 7.03125, "learning_rate": 3.391943481983053e-06, "loss": 1.04489412, "memory(GiB)": 141.16, "step": 111580, "train_speed(iter/s)": 0.29003 }, { "acc": 0.74122057, "epoch": 1.2482401184485918, "grad_norm": 6.875, "learning_rate": 3.3901924276396614e-06, "loss": 1.049119, "memory(GiB)": 141.16, "step": 111600, "train_speed(iter/s)": 0.290047 }, { "acc": 0.73893614, "epoch": 1.2484638173945504, "grad_norm": 8.625, "learning_rate": 3.3884415935409555e-06, "loss": 1.05685616, "memory(GiB)": 141.16, "step": 111620, "train_speed(iter/s)": 0.290064 }, { "acc": 0.72533917, "epoch": 1.248687516340509, "grad_norm": 8.3125, "learning_rate": 3.3866909799264737e-06, "loss": 1.11593218, "memory(GiB)": 141.16, "step": 111640, "train_speed(iter/s)": 0.29008 }, { "acc": 0.73331404, "epoch": 1.2489112152864674, "grad_norm": 7.8125, "learning_rate": 3.3849405870357265e-06, "loss": 1.08456879, "memory(GiB)": 141.16, "step": 111660, "train_speed(iter/s)": 0.290098 }, { "acc": 0.73184676, "epoch": 1.249134914232426, "grad_norm": 7.5625, "learning_rate": 3.383190415108191e-06, "loss": 1.07505703, "memory(GiB)": 141.16, "step": 111680, "train_speed(iter/s)": 0.290116 }, { "acc": 0.73712111, "epoch": 1.2493586131783845, "grad_norm": 6.75, "learning_rate": 3.3814404643833156e-06, "loss": 1.03477268, "memory(GiB)": 141.16, "step": 111700, "train_speed(iter/s)": 0.290134 }, { "acc": 0.73318796, "epoch": 1.249582312124343, "grad_norm": 7.6875, "learning_rate": 3.379690735100519e-06, "loss": 1.05942249, "memory(GiB)": 141.16, "step": 111720, "train_speed(iter/s)": 0.290153 }, { "acc": 0.73788424, "epoch": 1.2498060110703015, "grad_norm": 6.4375, "learning_rate": 3.37794122749919e-06, "loss": 1.04586792, "memory(GiB)": 141.16, "step": 111740, "train_speed(iter/s)": 0.29017 }, { "acc": 0.72295227, "epoch": 1.25002971001626, "grad_norm": 6.0625, "learning_rate": 3.376191941818686e-06, "loss": 1.12617769, "memory(GiB)": 141.16, "step": 111760, "train_speed(iter/s)": 0.290188 }, { "acc": 0.72962093, "epoch": 1.2502534089622186, "grad_norm": 8.125, "learning_rate": 3.374442878298334e-06, "loss": 1.07478275, "memory(GiB)": 141.16, "step": 111780, "train_speed(iter/s)": 0.290205 }, { "acc": 0.75588741, "epoch": 1.2504771079081771, "grad_norm": 6.75, "learning_rate": 3.37269403717743e-06, "loss": 0.98006191, "memory(GiB)": 141.16, "step": 111800, "train_speed(iter/s)": 0.290221 }, { "acc": 0.73361998, "epoch": 1.2507008068541356, "grad_norm": 6.09375, "learning_rate": 3.3709454186952417e-06, "loss": 1.07555256, "memory(GiB)": 141.16, "step": 111820, "train_speed(iter/s)": 0.290236 }, { "acc": 0.72744608, "epoch": 1.2509245058000942, "grad_norm": 9.4375, "learning_rate": 3.369197023091004e-06, "loss": 1.07627945, "memory(GiB)": 141.16, "step": 111840, "train_speed(iter/s)": 0.290252 }, { "acc": 0.73626318, "epoch": 1.2511482047460527, "grad_norm": 6.6875, "learning_rate": 3.367448850603925e-06, "loss": 1.05960255, "memory(GiB)": 141.16, "step": 111860, "train_speed(iter/s)": 0.290269 }, { "acc": 0.7220088, "epoch": 1.2513719036920112, "grad_norm": 6.28125, "learning_rate": 3.3657009014731763e-06, "loss": 1.10832787, "memory(GiB)": 141.16, "step": 111880, "train_speed(iter/s)": 0.290285 }, { "acc": 0.74324613, "epoch": 1.2515956026379698, "grad_norm": 8.875, "learning_rate": 3.3639531759379035e-06, "loss": 1.01655922, "memory(GiB)": 141.16, "step": 111900, "train_speed(iter/s)": 0.290301 }, { "acc": 0.73019209, "epoch": 1.2518193015839283, "grad_norm": 6.71875, "learning_rate": 3.362205674237221e-06, "loss": 1.0859045, "memory(GiB)": 141.16, "step": 111920, "train_speed(iter/s)": 0.290319 }, { "acc": 0.73370185, "epoch": 1.2520430005298868, "grad_norm": 8.125, "learning_rate": 3.3604583966102124e-06, "loss": 1.06757336, "memory(GiB)": 141.16, "step": 111940, "train_speed(iter/s)": 0.290337 }, { "acc": 0.73199396, "epoch": 1.2522666994758453, "grad_norm": 7.21875, "learning_rate": 3.3587113432959295e-06, "loss": 1.06534834, "memory(GiB)": 141.16, "step": 111960, "train_speed(iter/s)": 0.290355 }, { "acc": 0.73136115, "epoch": 1.2524903984218039, "grad_norm": 8.375, "learning_rate": 3.356964514533394e-06, "loss": 1.06178207, "memory(GiB)": 141.16, "step": 111980, "train_speed(iter/s)": 0.290373 }, { "acc": 0.73448668, "epoch": 1.2527140973677624, "grad_norm": 6.625, "learning_rate": 3.355217910561597e-06, "loss": 1.07677097, "memory(GiB)": 141.16, "step": 112000, "train_speed(iter/s)": 0.290391 }, { "epoch": 1.2527140973677624, "eval_acc": 0.6901432126985029, "eval_loss": 1.0791959762573242, "eval_runtime": 2322.1499, "eval_samples_per_second": 32.42, "eval_steps_per_second": 16.21, "step": 112000 }, { "acc": 0.73462954, "epoch": 1.252937796313721, "grad_norm": 6.90625, "learning_rate": 3.3534715316194986e-06, "loss": 1.06934862, "memory(GiB)": 141.16, "step": 112020, "train_speed(iter/s)": 0.288632 }, { "acc": 0.7406662, "epoch": 1.2531614952596795, "grad_norm": 6.5625, "learning_rate": 3.35172537794603e-06, "loss": 1.037679, "memory(GiB)": 141.16, "step": 112040, "train_speed(iter/s)": 0.288651 }, { "acc": 0.7369503, "epoch": 1.253385194205638, "grad_norm": 5.90625, "learning_rate": 3.34997944978009e-06, "loss": 1.04362574, "memory(GiB)": 141.16, "step": 112060, "train_speed(iter/s)": 0.28867 }, { "acc": 0.73186779, "epoch": 1.2536088931515965, "grad_norm": 7.75, "learning_rate": 3.3482337473605435e-06, "loss": 1.0632803, "memory(GiB)": 141.16, "step": 112080, "train_speed(iter/s)": 0.288688 }, { "acc": 0.73639183, "epoch": 1.253832592097555, "grad_norm": 7.46875, "learning_rate": 3.34648827092623e-06, "loss": 1.04281282, "memory(GiB)": 141.16, "step": 112100, "train_speed(iter/s)": 0.288704 }, { "acc": 0.72819443, "epoch": 1.2540562910435136, "grad_norm": 9.5625, "learning_rate": 3.344743020715955e-06, "loss": 1.07854633, "memory(GiB)": 141.16, "step": 112120, "train_speed(iter/s)": 0.288722 }, { "acc": 0.72867508, "epoch": 1.254279989989472, "grad_norm": 8.0625, "learning_rate": 3.3429979969684944e-06, "loss": 1.08170433, "memory(GiB)": 141.16, "step": 112140, "train_speed(iter/s)": 0.288741 }, { "acc": 0.72181873, "epoch": 1.2545036889354306, "grad_norm": 7.15625, "learning_rate": 3.3412531999225928e-06, "loss": 1.12817726, "memory(GiB)": 141.16, "step": 112160, "train_speed(iter/s)": 0.288758 }, { "acc": 0.72150888, "epoch": 1.2547273878813892, "grad_norm": 7.03125, "learning_rate": 3.339508629816961e-06, "loss": 1.1248271, "memory(GiB)": 141.16, "step": 112180, "train_speed(iter/s)": 0.288776 }, { "acc": 0.74193811, "epoch": 1.2549510868273477, "grad_norm": 6.625, "learning_rate": 3.3377642868902827e-06, "loss": 1.04235477, "memory(GiB)": 141.16, "step": 112200, "train_speed(iter/s)": 0.288793 }, { "acc": 0.72881222, "epoch": 1.2551747857733062, "grad_norm": 6.25, "learning_rate": 3.336020171381209e-06, "loss": 1.09998093, "memory(GiB)": 141.16, "step": 112220, "train_speed(iter/s)": 0.288812 }, { "acc": 0.73659472, "epoch": 1.2553984847192647, "grad_norm": 7.65625, "learning_rate": 3.3342762835283593e-06, "loss": 1.06894789, "memory(GiB)": 141.16, "step": 112240, "train_speed(iter/s)": 0.28883 }, { "acc": 0.733815, "epoch": 1.2556221836652233, "grad_norm": 7.3125, "learning_rate": 3.3325326235703235e-06, "loss": 1.06763649, "memory(GiB)": 141.16, "step": 112260, "train_speed(iter/s)": 0.288847 }, { "acc": 0.7442606, "epoch": 1.2558458826111818, "grad_norm": 5.71875, "learning_rate": 3.3307891917456573e-06, "loss": 1.03641901, "memory(GiB)": 141.16, "step": 112280, "train_speed(iter/s)": 0.288864 }, { "acc": 0.7340992, "epoch": 1.2560695815571403, "grad_norm": 6.90625, "learning_rate": 3.329045988292889e-06, "loss": 1.05487289, "memory(GiB)": 141.16, "step": 112300, "train_speed(iter/s)": 0.288881 }, { "acc": 0.75227757, "epoch": 1.2562932805030989, "grad_norm": 7.5, "learning_rate": 3.3273030134505124e-06, "loss": 1.00377598, "memory(GiB)": 141.16, "step": 112320, "train_speed(iter/s)": 0.288897 }, { "acc": 0.74209003, "epoch": 1.2565169794490574, "grad_norm": 8.5, "learning_rate": 3.325560267456992e-06, "loss": 1.03161278, "memory(GiB)": 141.16, "step": 112340, "train_speed(iter/s)": 0.288914 }, { "acc": 0.72341318, "epoch": 1.256740678395016, "grad_norm": 7.78125, "learning_rate": 3.323817750550761e-06, "loss": 1.10974331, "memory(GiB)": 141.16, "step": 112360, "train_speed(iter/s)": 0.288931 }, { "acc": 0.73255396, "epoch": 1.2569643773409744, "grad_norm": 7.4375, "learning_rate": 3.322075462970219e-06, "loss": 1.08797216, "memory(GiB)": 141.16, "step": 112380, "train_speed(iter/s)": 0.28895 }, { "acc": 0.73091536, "epoch": 1.257188076286933, "grad_norm": 8.4375, "learning_rate": 3.3203334049537373e-06, "loss": 1.08437252, "memory(GiB)": 141.16, "step": 112400, "train_speed(iter/s)": 0.288968 }, { "acc": 0.74838176, "epoch": 1.2574117752328915, "grad_norm": 7.25, "learning_rate": 3.318591576739653e-06, "loss": 1.01169729, "memory(GiB)": 141.16, "step": 112420, "train_speed(iter/s)": 0.288988 }, { "acc": 0.74946203, "epoch": 1.25763547417885, "grad_norm": 8.125, "learning_rate": 3.3168499785662745e-06, "loss": 0.98441353, "memory(GiB)": 141.16, "step": 112440, "train_speed(iter/s)": 0.289004 }, { "acc": 0.75399303, "epoch": 1.2578591731248085, "grad_norm": 8.25, "learning_rate": 3.3151086106718783e-06, "loss": 0.97375412, "memory(GiB)": 141.16, "step": 112460, "train_speed(iter/s)": 0.289023 }, { "acc": 0.74881105, "epoch": 1.258082872070767, "grad_norm": 7.78125, "learning_rate": 3.313367473294705e-06, "loss": 1.02098522, "memory(GiB)": 141.16, "step": 112480, "train_speed(iter/s)": 0.28904 }, { "acc": 0.72400999, "epoch": 1.2583065710167256, "grad_norm": 9.125, "learning_rate": 3.3116265666729687e-06, "loss": 1.10814161, "memory(GiB)": 141.16, "step": 112500, "train_speed(iter/s)": 0.289057 }, { "acc": 0.72752209, "epoch": 1.2585302699626841, "grad_norm": 5.6875, "learning_rate": 3.3098858910448517e-06, "loss": 1.09749651, "memory(GiB)": 141.16, "step": 112520, "train_speed(iter/s)": 0.289074 }, { "acc": 0.73033438, "epoch": 1.2587539689086427, "grad_norm": 6.0, "learning_rate": 3.3081454466485007e-06, "loss": 1.08390846, "memory(GiB)": 141.16, "step": 112540, "train_speed(iter/s)": 0.289091 }, { "acc": 0.7344501, "epoch": 1.2589776678546012, "grad_norm": 6.71875, "learning_rate": 3.3064052337220355e-06, "loss": 1.06954651, "memory(GiB)": 141.16, "step": 112560, "train_speed(iter/s)": 0.289107 }, { "acc": 0.73725662, "epoch": 1.2592013668005597, "grad_norm": 6.4375, "learning_rate": 3.3046652525035404e-06, "loss": 1.06136837, "memory(GiB)": 141.16, "step": 112580, "train_speed(iter/s)": 0.289125 }, { "acc": 0.733564, "epoch": 1.2594250657465182, "grad_norm": 5.28125, "learning_rate": 3.3029255032310715e-06, "loss": 1.06049232, "memory(GiB)": 141.16, "step": 112600, "train_speed(iter/s)": 0.289143 }, { "acc": 0.74609003, "epoch": 1.2596487646924768, "grad_norm": 6.9375, "learning_rate": 3.301185986142651e-06, "loss": 1.01636295, "memory(GiB)": 141.16, "step": 112620, "train_speed(iter/s)": 0.289162 }, { "acc": 0.73925104, "epoch": 1.2598724636384353, "grad_norm": 5.4375, "learning_rate": 3.299446701476269e-06, "loss": 1.01700878, "memory(GiB)": 141.16, "step": 112640, "train_speed(iter/s)": 0.28918 }, { "acc": 0.73837218, "epoch": 1.2600961625843938, "grad_norm": 7.28125, "learning_rate": 3.297707649469884e-06, "loss": 1.04020691, "memory(GiB)": 141.16, "step": 112660, "train_speed(iter/s)": 0.289198 }, { "acc": 0.73055658, "epoch": 1.2603198615303524, "grad_norm": 6.59375, "learning_rate": 3.295968830361424e-06, "loss": 1.09759064, "memory(GiB)": 141.16, "step": 112680, "train_speed(iter/s)": 0.289215 }, { "acc": 0.73618193, "epoch": 1.2605435604763109, "grad_norm": 5.78125, "learning_rate": 3.294230244388784e-06, "loss": 1.04700623, "memory(GiB)": 141.16, "step": 112700, "train_speed(iter/s)": 0.289233 }, { "acc": 0.73026705, "epoch": 1.2607672594222694, "grad_norm": 6.8125, "learning_rate": 3.2924918917898296e-06, "loss": 1.07847652, "memory(GiB)": 141.16, "step": 112720, "train_speed(iter/s)": 0.289249 }, { "acc": 0.72660155, "epoch": 1.260990958368228, "grad_norm": 8.75, "learning_rate": 3.2907537728023887e-06, "loss": 1.09780684, "memory(GiB)": 141.16, "step": 112740, "train_speed(iter/s)": 0.289264 }, { "acc": 0.72798767, "epoch": 1.2612146573141865, "grad_norm": 6.625, "learning_rate": 3.2890158876642618e-06, "loss": 1.11014519, "memory(GiB)": 141.16, "step": 112760, "train_speed(iter/s)": 0.28928 }, { "acc": 0.73361912, "epoch": 1.261438356260145, "grad_norm": 7.78125, "learning_rate": 3.2872782366132185e-06, "loss": 1.06673889, "memory(GiB)": 141.16, "step": 112780, "train_speed(iter/s)": 0.289296 }, { "acc": 0.73673668, "epoch": 1.2616620552061035, "grad_norm": 8.0, "learning_rate": 3.2855408198869922e-06, "loss": 1.05642414, "memory(GiB)": 141.16, "step": 112800, "train_speed(iter/s)": 0.289313 }, { "acc": 0.73246865, "epoch": 1.261885754152062, "grad_norm": 5.375, "learning_rate": 3.2838036377232875e-06, "loss": 1.07212877, "memory(GiB)": 141.16, "step": 112820, "train_speed(iter/s)": 0.28933 }, { "acc": 0.73763642, "epoch": 1.2621094530980206, "grad_norm": 7.0625, "learning_rate": 3.2820666903597747e-06, "loss": 1.04937782, "memory(GiB)": 141.16, "step": 112840, "train_speed(iter/s)": 0.289347 }, { "acc": 0.72275934, "epoch": 1.2623331520439791, "grad_norm": 5.21875, "learning_rate": 3.2803299780340938e-06, "loss": 1.10951061, "memory(GiB)": 141.16, "step": 112860, "train_speed(iter/s)": 0.289364 }, { "acc": 0.73472557, "epoch": 1.2625568509899376, "grad_norm": 8.1875, "learning_rate": 3.278593500983851e-06, "loss": 1.04128838, "memory(GiB)": 141.16, "step": 112880, "train_speed(iter/s)": 0.289382 }, { "acc": 0.73191137, "epoch": 1.2627805499358962, "grad_norm": 6.28125, "learning_rate": 3.2768572594466227e-06, "loss": 1.06660843, "memory(GiB)": 141.16, "step": 112900, "train_speed(iter/s)": 0.289399 }, { "acc": 0.73141022, "epoch": 1.2630042488818547, "grad_norm": 7.25, "learning_rate": 3.275121253659951e-06, "loss": 1.09420891, "memory(GiB)": 141.16, "step": 112920, "train_speed(iter/s)": 0.289415 }, { "acc": 0.72233491, "epoch": 1.2632279478278132, "grad_norm": 7.03125, "learning_rate": 3.2733854838613455e-06, "loss": 1.13236237, "memory(GiB)": 141.16, "step": 112940, "train_speed(iter/s)": 0.289433 }, { "acc": 0.73372669, "epoch": 1.2634516467737718, "grad_norm": 9.0625, "learning_rate": 3.271649950288284e-06, "loss": 1.06895771, "memory(GiB)": 141.16, "step": 112960, "train_speed(iter/s)": 0.289451 }, { "acc": 0.74069843, "epoch": 1.2636753457197303, "grad_norm": 7.6875, "learning_rate": 3.269914653178214e-06, "loss": 1.03363523, "memory(GiB)": 141.16, "step": 112980, "train_speed(iter/s)": 0.289466 }, { "acc": 0.72471399, "epoch": 1.2638990446656888, "grad_norm": 6.40625, "learning_rate": 3.2681795927685477e-06, "loss": 1.13313828, "memory(GiB)": 141.16, "step": 113000, "train_speed(iter/s)": 0.289485 }, { "acc": 0.74500661, "epoch": 1.2641227436116473, "grad_norm": 7.375, "learning_rate": 3.266444769296667e-06, "loss": 1.02496634, "memory(GiB)": 141.16, "step": 113020, "train_speed(iter/s)": 0.289503 }, { "acc": 0.73588309, "epoch": 1.2643464425576059, "grad_norm": 7.75, "learning_rate": 3.26471018299992e-06, "loss": 1.05153637, "memory(GiB)": 141.16, "step": 113040, "train_speed(iter/s)": 0.28952 }, { "acc": 0.73390417, "epoch": 1.2645701415035644, "grad_norm": 5.78125, "learning_rate": 3.2629758341156227e-06, "loss": 1.06292, "memory(GiB)": 141.16, "step": 113060, "train_speed(iter/s)": 0.289536 }, { "acc": 0.73243303, "epoch": 1.264793840449523, "grad_norm": 7.6875, "learning_rate": 3.261241722881059e-06, "loss": 1.06739674, "memory(GiB)": 141.16, "step": 113080, "train_speed(iter/s)": 0.289554 }, { "acc": 0.73645039, "epoch": 1.2650175393954814, "grad_norm": 6.6875, "learning_rate": 3.25950784953348e-06, "loss": 1.05582247, "memory(GiB)": 141.16, "step": 113100, "train_speed(iter/s)": 0.289573 }, { "acc": 0.7249754, "epoch": 1.26524123834144, "grad_norm": 7.21875, "learning_rate": 3.2577742143101053e-06, "loss": 1.09599266, "memory(GiB)": 141.16, "step": 113120, "train_speed(iter/s)": 0.289592 }, { "acc": 0.72646427, "epoch": 1.2654649372873985, "grad_norm": 7.09375, "learning_rate": 3.2560408174481202e-06, "loss": 1.10699291, "memory(GiB)": 141.16, "step": 113140, "train_speed(iter/s)": 0.289608 }, { "acc": 0.7469429, "epoch": 1.265688636233357, "grad_norm": 6.6875, "learning_rate": 3.254307659184678e-06, "loss": 1.02489071, "memory(GiB)": 141.16, "step": 113160, "train_speed(iter/s)": 0.289624 }, { "acc": 0.733564, "epoch": 1.2659123351793156, "grad_norm": 8.1875, "learning_rate": 3.2525747397568984e-06, "loss": 1.06650219, "memory(GiB)": 141.16, "step": 113180, "train_speed(iter/s)": 0.289641 }, { "acc": 0.73319702, "epoch": 1.266136034125274, "grad_norm": 8.5, "learning_rate": 3.2508420594018723e-06, "loss": 1.08131437, "memory(GiB)": 141.16, "step": 113200, "train_speed(iter/s)": 0.289658 }, { "acc": 0.72762156, "epoch": 1.2663597330712326, "grad_norm": 6.53125, "learning_rate": 3.249109618356654e-06, "loss": 1.09317932, "memory(GiB)": 141.16, "step": 113220, "train_speed(iter/s)": 0.289675 }, { "acc": 0.73277702, "epoch": 1.2665834320171911, "grad_norm": 7.0625, "learning_rate": 3.247377416858265e-06, "loss": 1.07360106, "memory(GiB)": 141.16, "step": 113240, "train_speed(iter/s)": 0.289693 }, { "acc": 0.74459262, "epoch": 1.2668071309631497, "grad_norm": 6.15625, "learning_rate": 3.2456454551436967e-06, "loss": 1.01807728, "memory(GiB)": 141.16, "step": 113260, "train_speed(iter/s)": 0.28971 }, { "acc": 0.73575134, "epoch": 1.2670308299091082, "grad_norm": 8.3125, "learning_rate": 3.243913733449905e-06, "loss": 1.05066109, "memory(GiB)": 141.16, "step": 113280, "train_speed(iter/s)": 0.289727 }, { "acc": 0.72883148, "epoch": 1.2672545288550667, "grad_norm": 7.84375, "learning_rate": 3.242182252013815e-06, "loss": 1.08333607, "memory(GiB)": 141.16, "step": 113300, "train_speed(iter/s)": 0.289744 }, { "acc": 0.74639144, "epoch": 1.2674782278010253, "grad_norm": 8.8125, "learning_rate": 3.2404510110723192e-06, "loss": 1.01695986, "memory(GiB)": 141.16, "step": 113320, "train_speed(iter/s)": 0.28976 }, { "acc": 0.7255095, "epoch": 1.2677019267469838, "grad_norm": 8.3125, "learning_rate": 3.2387200108622736e-06, "loss": 1.09962616, "memory(GiB)": 141.16, "step": 113340, "train_speed(iter/s)": 0.289776 }, { "acc": 0.71794844, "epoch": 1.2679256256929423, "grad_norm": 7.1875, "learning_rate": 3.2369892516205047e-06, "loss": 1.15132332, "memory(GiB)": 141.16, "step": 113360, "train_speed(iter/s)": 0.289794 }, { "acc": 0.73849301, "epoch": 1.2681493246389008, "grad_norm": 6.84375, "learning_rate": 3.235258733583806e-06, "loss": 1.045928, "memory(GiB)": 141.16, "step": 113380, "train_speed(iter/s)": 0.289812 }, { "acc": 0.73689103, "epoch": 1.2683730235848594, "grad_norm": 8.5625, "learning_rate": 3.233528456988936e-06, "loss": 1.02063456, "memory(GiB)": 141.16, "step": 113400, "train_speed(iter/s)": 0.28983 }, { "acc": 0.73711195, "epoch": 1.268596722530818, "grad_norm": 7.46875, "learning_rate": 3.231798422072623e-06, "loss": 1.06434822, "memory(GiB)": 141.16, "step": 113420, "train_speed(iter/s)": 0.289847 }, { "acc": 0.7420495, "epoch": 1.2688204214767764, "grad_norm": 7.5, "learning_rate": 3.2300686290715584e-06, "loss": 1.03518362, "memory(GiB)": 141.16, "step": 113440, "train_speed(iter/s)": 0.28986 }, { "acc": 0.73425174, "epoch": 1.269044120422735, "grad_norm": 8.1875, "learning_rate": 3.2283390782224035e-06, "loss": 1.07077122, "memory(GiB)": 141.16, "step": 113460, "train_speed(iter/s)": 0.289877 }, { "acc": 0.74384336, "epoch": 1.2692678193686935, "grad_norm": 7.1875, "learning_rate": 3.226609769761785e-06, "loss": 1.0216629, "memory(GiB)": 141.16, "step": 113480, "train_speed(iter/s)": 0.289894 }, { "acc": 0.73864775, "epoch": 1.269491518314652, "grad_norm": 7.28125, "learning_rate": 3.224880703926298e-06, "loss": 1.04200745, "memory(GiB)": 141.16, "step": 113500, "train_speed(iter/s)": 0.289914 }, { "acc": 0.73980145, "epoch": 1.2697152172606105, "grad_norm": 8.125, "learning_rate": 3.223151880952504e-06, "loss": 1.02813997, "memory(GiB)": 141.16, "step": 113520, "train_speed(iter/s)": 0.289932 }, { "acc": 0.72390728, "epoch": 1.269938916206569, "grad_norm": 6.75, "learning_rate": 3.221423301076929e-06, "loss": 1.10033369, "memory(GiB)": 141.16, "step": 113540, "train_speed(iter/s)": 0.289951 }, { "acc": 0.73441906, "epoch": 1.2701626151525276, "grad_norm": 7.5, "learning_rate": 3.2196949645360675e-06, "loss": 1.06743059, "memory(GiB)": 141.16, "step": 113560, "train_speed(iter/s)": 0.289969 }, { "acc": 0.72429953, "epoch": 1.2703863140984861, "grad_norm": 7.78125, "learning_rate": 3.2179668715663814e-06, "loss": 1.12140255, "memory(GiB)": 141.16, "step": 113580, "train_speed(iter/s)": 0.289989 }, { "acc": 0.73330841, "epoch": 1.2706100130444447, "grad_norm": 6.8125, "learning_rate": 3.2162390224042987e-06, "loss": 1.06949615, "memory(GiB)": 141.16, "step": 113600, "train_speed(iter/s)": 0.290009 }, { "acc": 0.73397131, "epoch": 1.2708337119904032, "grad_norm": 6.5625, "learning_rate": 3.2145114172862147e-06, "loss": 1.06035795, "memory(GiB)": 141.16, "step": 113620, "train_speed(iter/s)": 0.290029 }, { "acc": 0.74000282, "epoch": 1.2710574109363617, "grad_norm": 5.59375, "learning_rate": 3.2127840564484893e-06, "loss": 1.02320271, "memory(GiB)": 141.16, "step": 113640, "train_speed(iter/s)": 0.290047 }, { "acc": 0.72770038, "epoch": 1.2712811098823202, "grad_norm": 7.5625, "learning_rate": 3.2110569401274494e-06, "loss": 1.10390148, "memory(GiB)": 141.16, "step": 113660, "train_speed(iter/s)": 0.290067 }, { "acc": 0.74723525, "epoch": 1.2715048088282788, "grad_norm": 6.28125, "learning_rate": 3.2093300685593896e-06, "loss": 1.0303606, "memory(GiB)": 141.16, "step": 113680, "train_speed(iter/s)": 0.290085 }, { "acc": 0.73153648, "epoch": 1.2717285077742373, "grad_norm": 8.125, "learning_rate": 3.207603441980571e-06, "loss": 1.07516708, "memory(GiB)": 141.16, "step": 113700, "train_speed(iter/s)": 0.290103 }, { "acc": 0.74058971, "epoch": 1.2719522067201958, "grad_norm": 5.34375, "learning_rate": 3.205877060627221e-06, "loss": 1.03677883, "memory(GiB)": 141.16, "step": 113720, "train_speed(iter/s)": 0.29012 }, { "acc": 0.73216839, "epoch": 1.2721759056661546, "grad_norm": 7.3125, "learning_rate": 3.204150924735533e-06, "loss": 1.08317709, "memory(GiB)": 141.16, "step": 113740, "train_speed(iter/s)": 0.290138 }, { "acc": 0.74048953, "epoch": 1.272399604612113, "grad_norm": 7.125, "learning_rate": 3.2024250345416674e-06, "loss": 1.03051643, "memory(GiB)": 141.16, "step": 113760, "train_speed(iter/s)": 0.290158 }, { "acc": 0.73323541, "epoch": 1.2726233035580716, "grad_norm": 6.375, "learning_rate": 3.2006993902817497e-06, "loss": 1.06913166, "memory(GiB)": 141.16, "step": 113780, "train_speed(iter/s)": 0.290176 }, { "acc": 0.74485235, "epoch": 1.2728470025040302, "grad_norm": 5.125, "learning_rate": 3.198973992191874e-06, "loss": 1.01948967, "memory(GiB)": 141.16, "step": 113800, "train_speed(iter/s)": 0.290192 }, { "acc": 0.72779408, "epoch": 1.2730707014499887, "grad_norm": 8.25, "learning_rate": 3.197248840508098e-06, "loss": 1.09046621, "memory(GiB)": 141.16, "step": 113820, "train_speed(iter/s)": 0.29021 }, { "acc": 0.74376287, "epoch": 1.2732944003959472, "grad_norm": 11.5, "learning_rate": 3.195523935466448e-06, "loss": 1.02722263, "memory(GiB)": 141.16, "step": 113840, "train_speed(iter/s)": 0.290226 }, { "acc": 0.74112463, "epoch": 1.2735180993419057, "grad_norm": 6.5625, "learning_rate": 3.1937992773029164e-06, "loss": 1.02889738, "memory(GiB)": 141.16, "step": 113860, "train_speed(iter/s)": 0.290244 }, { "acc": 0.73794551, "epoch": 1.2737417982878643, "grad_norm": 6.5, "learning_rate": 3.1920748662534594e-06, "loss": 1.05727596, "memory(GiB)": 141.16, "step": 113880, "train_speed(iter/s)": 0.290261 }, { "acc": 0.73013163, "epoch": 1.2739654972338228, "grad_norm": 7.875, "learning_rate": 3.190350702554002e-06, "loss": 1.07037563, "memory(GiB)": 141.16, "step": 113900, "train_speed(iter/s)": 0.290279 }, { "acc": 0.72111673, "epoch": 1.2741891961797813, "grad_norm": 8.0625, "learning_rate": 3.188626786440434e-06, "loss": 1.10559616, "memory(GiB)": 141.16, "step": 113920, "train_speed(iter/s)": 0.290296 }, { "acc": 0.74247694, "epoch": 1.2744128951257399, "grad_norm": 6.21875, "learning_rate": 3.186903118148613e-06, "loss": 1.02021408, "memory(GiB)": 141.16, "step": 113940, "train_speed(iter/s)": 0.290313 }, { "acc": 0.72380571, "epoch": 1.2746365940716984, "grad_norm": 7.78125, "learning_rate": 3.18517969791436e-06, "loss": 1.11981936, "memory(GiB)": 141.16, "step": 113960, "train_speed(iter/s)": 0.290332 }, { "acc": 0.73272519, "epoch": 1.274860293017657, "grad_norm": 7.25, "learning_rate": 3.1834565259734647e-06, "loss": 1.07658939, "memory(GiB)": 141.16, "step": 113980, "train_speed(iter/s)": 0.290348 }, { "acc": 0.72472868, "epoch": 1.2750839919636154, "grad_norm": 5.375, "learning_rate": 3.1817336025616803e-06, "loss": 1.10191956, "memory(GiB)": 141.16, "step": 114000, "train_speed(iter/s)": 0.290366 }, { "epoch": 1.2750839919636154, "eval_acc": 0.6901602200444932, "eval_loss": 1.0791800022125244, "eval_runtime": 2320.0619, "eval_samples_per_second": 32.449, "eval_steps_per_second": 16.225, "step": 114000 }, { "acc": 0.75151443, "epoch": 1.275307690909574, "grad_norm": 6.84375, "learning_rate": 3.180010927914728e-06, "loss": 1.00241413, "memory(GiB)": 141.16, "step": 114020, "train_speed(iter/s)": 0.28864 }, { "acc": 0.73902569, "epoch": 1.2755313898555325, "grad_norm": 7.5625, "learning_rate": 3.178288502268294e-06, "loss": 1.03781376, "memory(GiB)": 141.16, "step": 114040, "train_speed(iter/s)": 0.288658 }, { "acc": 0.73737488, "epoch": 1.275755088801491, "grad_norm": 6.9375, "learning_rate": 3.1765663258580333e-06, "loss": 1.04127769, "memory(GiB)": 141.16, "step": 114060, "train_speed(iter/s)": 0.288673 }, { "acc": 0.74141388, "epoch": 1.2759787877474495, "grad_norm": 5.34375, "learning_rate": 3.1748443989195597e-06, "loss": 1.03934059, "memory(GiB)": 141.16, "step": 114080, "train_speed(iter/s)": 0.288691 }, { "acc": 0.7413301, "epoch": 1.276202486693408, "grad_norm": 8.875, "learning_rate": 3.1731227216884606e-06, "loss": 1.03492947, "memory(GiB)": 141.16, "step": 114100, "train_speed(iter/s)": 0.28871 }, { "acc": 0.73593416, "epoch": 1.2764261856393666, "grad_norm": 7.34375, "learning_rate": 3.171401294400286e-06, "loss": 1.08150101, "memory(GiB)": 141.16, "step": 114120, "train_speed(iter/s)": 0.288729 }, { "acc": 0.73141432, "epoch": 1.2766498845853251, "grad_norm": 4.625, "learning_rate": 3.16968011729055e-06, "loss": 1.09895153, "memory(GiB)": 141.16, "step": 114140, "train_speed(iter/s)": 0.288747 }, { "acc": 0.73727231, "epoch": 1.2768735835312837, "grad_norm": 7.0, "learning_rate": 3.1679591905947365e-06, "loss": 1.03976574, "memory(GiB)": 141.16, "step": 114160, "train_speed(iter/s)": 0.288766 }, { "acc": 0.74528074, "epoch": 1.2770972824772422, "grad_norm": 7.9375, "learning_rate": 3.1662385145482912e-06, "loss": 1.01771288, "memory(GiB)": 141.16, "step": 114180, "train_speed(iter/s)": 0.288784 }, { "acc": 0.72835016, "epoch": 1.2773209814232007, "grad_norm": 9.8125, "learning_rate": 3.1645180893866267e-06, "loss": 1.0998291, "memory(GiB)": 141.16, "step": 114200, "train_speed(iter/s)": 0.288801 }, { "acc": 0.743225, "epoch": 1.2775446803691592, "grad_norm": 7.3125, "learning_rate": 3.1627979153451225e-06, "loss": 1.03146229, "memory(GiB)": 141.16, "step": 114220, "train_speed(iter/s)": 0.288819 }, { "acc": 0.71921196, "epoch": 1.2777683793151178, "grad_norm": 7.46875, "learning_rate": 3.161077992659124e-06, "loss": 1.12827854, "memory(GiB)": 141.16, "step": 114240, "train_speed(iter/s)": 0.288837 }, { "acc": 0.72821913, "epoch": 1.2779920782610763, "grad_norm": 6.3125, "learning_rate": 3.159358321563941e-06, "loss": 1.08609009, "memory(GiB)": 141.16, "step": 114260, "train_speed(iter/s)": 0.288854 }, { "acc": 0.7353241, "epoch": 1.2782157772070348, "grad_norm": 7.09375, "learning_rate": 3.1576389022948474e-06, "loss": 1.06560307, "memory(GiB)": 141.16, "step": 114280, "train_speed(iter/s)": 0.288871 }, { "acc": 0.73414555, "epoch": 1.2784394761529934, "grad_norm": 7.4375, "learning_rate": 3.155919735087085e-06, "loss": 1.06093521, "memory(GiB)": 141.16, "step": 114300, "train_speed(iter/s)": 0.288888 }, { "acc": 0.73559656, "epoch": 1.2786631750989519, "grad_norm": 8.75, "learning_rate": 3.1542008201758616e-06, "loss": 1.05596533, "memory(GiB)": 141.16, "step": 114320, "train_speed(iter/s)": 0.288904 }, { "acc": 0.7341423, "epoch": 1.2788868740449104, "grad_norm": 7.84375, "learning_rate": 3.152482157796348e-06, "loss": 1.07570429, "memory(GiB)": 141.16, "step": 114340, "train_speed(iter/s)": 0.288919 }, { "acc": 0.73596458, "epoch": 1.279110572990869, "grad_norm": 7.25, "learning_rate": 3.150763748183684e-06, "loss": 1.05679026, "memory(GiB)": 141.16, "step": 114360, "train_speed(iter/s)": 0.288937 }, { "acc": 0.72675247, "epoch": 1.2793342719368275, "grad_norm": 7.1875, "learning_rate": 3.149045591572969e-06, "loss": 1.09451046, "memory(GiB)": 141.16, "step": 114380, "train_speed(iter/s)": 0.288953 }, { "acc": 0.73480272, "epoch": 1.279557970882786, "grad_norm": 9.0625, "learning_rate": 3.1473276881992742e-06, "loss": 1.05958099, "memory(GiB)": 141.16, "step": 114400, "train_speed(iter/s)": 0.288973 }, { "acc": 0.73795643, "epoch": 1.2797816698287445, "grad_norm": 7.0, "learning_rate": 3.145610038297632e-06, "loss": 1.04305553, "memory(GiB)": 141.16, "step": 114420, "train_speed(iter/s)": 0.288991 }, { "acc": 0.74170589, "epoch": 1.280005368774703, "grad_norm": 7.46875, "learning_rate": 3.1438926421030414e-06, "loss": 1.03428802, "memory(GiB)": 141.16, "step": 114440, "train_speed(iter/s)": 0.289008 }, { "acc": 0.72136078, "epoch": 1.2802290677206616, "grad_norm": 7.96875, "learning_rate": 3.142175499850469e-06, "loss": 1.1155139, "memory(GiB)": 141.16, "step": 114460, "train_speed(iter/s)": 0.289024 }, { "acc": 0.72127972, "epoch": 1.2804527666666201, "grad_norm": 7.4375, "learning_rate": 3.1404586117748413e-06, "loss": 1.12888527, "memory(GiB)": 141.16, "step": 114480, "train_speed(iter/s)": 0.289039 }, { "acc": 0.73545208, "epoch": 1.2806764656125786, "grad_norm": 6.21875, "learning_rate": 3.1387419781110546e-06, "loss": 1.05358315, "memory(GiB)": 141.16, "step": 114500, "train_speed(iter/s)": 0.289057 }, { "acc": 0.73925753, "epoch": 1.2809001645585372, "grad_norm": 9.3125, "learning_rate": 3.137025599093969e-06, "loss": 1.04625864, "memory(GiB)": 141.16, "step": 114520, "train_speed(iter/s)": 0.289074 }, { "acc": 0.74381452, "epoch": 1.2811238635044957, "grad_norm": 6.75, "learning_rate": 3.135309474958409e-06, "loss": 1.01812801, "memory(GiB)": 141.16, "step": 114540, "train_speed(iter/s)": 0.289091 }, { "acc": 0.72129726, "epoch": 1.2813475624504542, "grad_norm": 6.78125, "learning_rate": 3.1335936059391668e-06, "loss": 1.13189774, "memory(GiB)": 141.16, "step": 114560, "train_speed(iter/s)": 0.289106 }, { "acc": 0.73143764, "epoch": 1.2815712613964128, "grad_norm": 5.8125, "learning_rate": 3.1318779922709953e-06, "loss": 1.06459694, "memory(GiB)": 141.16, "step": 114580, "train_speed(iter/s)": 0.289122 }, { "acc": 0.73147202, "epoch": 1.2817949603423713, "grad_norm": 5.90625, "learning_rate": 3.130162634188616e-06, "loss": 1.07992706, "memory(GiB)": 141.16, "step": 114600, "train_speed(iter/s)": 0.289139 }, { "acc": 0.73716316, "epoch": 1.2820186592883298, "grad_norm": 8.6875, "learning_rate": 3.1284475319267143e-06, "loss": 1.05353737, "memory(GiB)": 141.16, "step": 114620, "train_speed(iter/s)": 0.289157 }, { "acc": 0.74628501, "epoch": 1.2822423582342883, "grad_norm": 9.0, "learning_rate": 3.126732685719941e-06, "loss": 1.00886078, "memory(GiB)": 141.16, "step": 114640, "train_speed(iter/s)": 0.289173 }, { "acc": 0.72757387, "epoch": 1.2824660571802469, "grad_norm": 6.28125, "learning_rate": 3.125018095802913e-06, "loss": 1.08958549, "memory(GiB)": 141.16, "step": 114660, "train_speed(iter/s)": 0.289187 }, { "acc": 0.73957825, "epoch": 1.2826897561262054, "grad_norm": 7.625, "learning_rate": 3.1233037624102067e-06, "loss": 1.04093885, "memory(GiB)": 141.16, "step": 114680, "train_speed(iter/s)": 0.289205 }, { "acc": 0.74923396, "epoch": 1.282913455072164, "grad_norm": 7.75, "learning_rate": 3.121589685776372e-06, "loss": 0.99175377, "memory(GiB)": 141.16, "step": 114700, "train_speed(iter/s)": 0.289221 }, { "acc": 0.74300761, "epoch": 1.2831371540181224, "grad_norm": 5.59375, "learning_rate": 3.1198758661359152e-06, "loss": 1.02584267, "memory(GiB)": 141.16, "step": 114720, "train_speed(iter/s)": 0.289239 }, { "acc": 0.73836913, "epoch": 1.283360852964081, "grad_norm": 6.4375, "learning_rate": 3.118162303723314e-06, "loss": 1.04055338, "memory(GiB)": 141.16, "step": 114740, "train_speed(iter/s)": 0.289255 }, { "acc": 0.72484598, "epoch": 1.2835845519100395, "grad_norm": 6.9375, "learning_rate": 3.1164489987730078e-06, "loss": 1.1249732, "memory(GiB)": 141.16, "step": 114760, "train_speed(iter/s)": 0.289269 }, { "acc": 0.72942629, "epoch": 1.283808250855998, "grad_norm": 5.6875, "learning_rate": 3.1147359515194e-06, "loss": 1.08595648, "memory(GiB)": 141.16, "step": 114780, "train_speed(iter/s)": 0.289287 }, { "acc": 0.74122076, "epoch": 1.2840319498019566, "grad_norm": 8.5, "learning_rate": 3.1130231621968602e-06, "loss": 1.03354425, "memory(GiB)": 141.16, "step": 114800, "train_speed(iter/s)": 0.289304 }, { "acc": 0.73579888, "epoch": 1.284255648747915, "grad_norm": 6.625, "learning_rate": 3.1113106310397236e-06, "loss": 1.07672749, "memory(GiB)": 141.16, "step": 114820, "train_speed(iter/s)": 0.289321 }, { "acc": 0.74888797, "epoch": 1.2844793476938736, "grad_norm": 7.46875, "learning_rate": 3.10959835828229e-06, "loss": 0.99664516, "memory(GiB)": 141.16, "step": 114840, "train_speed(iter/s)": 0.289339 }, { "acc": 0.73685989, "epoch": 1.2847030466398321, "grad_norm": 5.90625, "learning_rate": 3.107886344158819e-06, "loss": 1.02897568, "memory(GiB)": 141.16, "step": 114860, "train_speed(iter/s)": 0.289358 }, { "acc": 0.7284976, "epoch": 1.2849267455857907, "grad_norm": 5.875, "learning_rate": 3.106174588903541e-06, "loss": 1.07395563, "memory(GiB)": 141.16, "step": 114880, "train_speed(iter/s)": 0.289373 }, { "acc": 0.7423748, "epoch": 1.2851504445317492, "grad_norm": 6.15625, "learning_rate": 3.1044630927506483e-06, "loss": 1.03318367, "memory(GiB)": 141.16, "step": 114900, "train_speed(iter/s)": 0.28939 }, { "acc": 0.74265685, "epoch": 1.2853741434777077, "grad_norm": 6.96875, "learning_rate": 3.1027518559342982e-06, "loss": 1.0288476, "memory(GiB)": 141.16, "step": 114920, "train_speed(iter/s)": 0.289408 }, { "acc": 0.72676759, "epoch": 1.2855978424236663, "grad_norm": 6.21875, "learning_rate": 3.1010408786886114e-06, "loss": 1.09739361, "memory(GiB)": 141.16, "step": 114940, "train_speed(iter/s)": 0.289426 }, { "acc": 0.73699441, "epoch": 1.2858215413696248, "grad_norm": 7.25, "learning_rate": 3.0993301612476743e-06, "loss": 1.0413784, "memory(GiB)": 141.16, "step": 114960, "train_speed(iter/s)": 0.289443 }, { "acc": 0.72908306, "epoch": 1.2860452403155833, "grad_norm": 6.5, "learning_rate": 3.097619703845539e-06, "loss": 1.09214401, "memory(GiB)": 141.16, "step": 114980, "train_speed(iter/s)": 0.28946 }, { "acc": 0.73278723, "epoch": 1.2862689392615418, "grad_norm": 9.1875, "learning_rate": 3.095909506716219e-06, "loss": 1.07352581, "memory(GiB)": 141.16, "step": 115000, "train_speed(iter/s)": 0.289474 }, { "acc": 0.73259993, "epoch": 1.2864926382075004, "grad_norm": 8.5625, "learning_rate": 3.0941995700936957e-06, "loss": 1.07816925, "memory(GiB)": 141.16, "step": 115020, "train_speed(iter/s)": 0.289489 }, { "acc": 0.72921867, "epoch": 1.286716337153459, "grad_norm": 7.28125, "learning_rate": 3.09248989421191e-06, "loss": 1.08387804, "memory(GiB)": 141.16, "step": 115040, "train_speed(iter/s)": 0.289507 }, { "acc": 0.73093462, "epoch": 1.2869400360994174, "grad_norm": 6.75, "learning_rate": 3.0907804793047715e-06, "loss": 1.0809144, "memory(GiB)": 141.16, "step": 115060, "train_speed(iter/s)": 0.289525 }, { "acc": 0.74011583, "epoch": 1.287163735045376, "grad_norm": 9.4375, "learning_rate": 3.0890713256061523e-06, "loss": 1.06234627, "memory(GiB)": 141.16, "step": 115080, "train_speed(iter/s)": 0.289542 }, { "acc": 0.73862529, "epoch": 1.2873874339913345, "grad_norm": 8.9375, "learning_rate": 3.0873624333498884e-06, "loss": 1.05445843, "memory(GiB)": 141.16, "step": 115100, "train_speed(iter/s)": 0.289558 }, { "acc": 0.73162818, "epoch": 1.287611132937293, "grad_norm": 4.9375, "learning_rate": 3.0856538027697834e-06, "loss": 1.08904629, "memory(GiB)": 141.16, "step": 115120, "train_speed(iter/s)": 0.289577 }, { "acc": 0.74248013, "epoch": 1.2878348318832515, "grad_norm": 7.375, "learning_rate": 3.0839454340996e-06, "loss": 1.03897438, "memory(GiB)": 141.16, "step": 115140, "train_speed(iter/s)": 0.289592 }, { "acc": 0.74051161, "epoch": 1.28805853082921, "grad_norm": 7.1875, "learning_rate": 3.0822373275730672e-06, "loss": 1.03522778, "memory(GiB)": 141.16, "step": 115160, "train_speed(iter/s)": 0.289608 }, { "acc": 0.73737526, "epoch": 1.2882822297751686, "grad_norm": 6.3125, "learning_rate": 3.0805294834238793e-06, "loss": 1.04685097, "memory(GiB)": 141.16, "step": 115180, "train_speed(iter/s)": 0.289624 }, { "acc": 0.74865818, "epoch": 1.2885059287211271, "grad_norm": 7.09375, "learning_rate": 3.0788219018856934e-06, "loss": 1.00073986, "memory(GiB)": 141.16, "step": 115200, "train_speed(iter/s)": 0.28964 }, { "acc": 0.73956256, "epoch": 1.2887296276670857, "grad_norm": 6.25, "learning_rate": 3.0771145831921323e-06, "loss": 1.05084696, "memory(GiB)": 141.16, "step": 115220, "train_speed(iter/s)": 0.289658 }, { "acc": 0.73777933, "epoch": 1.2889533266130442, "grad_norm": 5.84375, "learning_rate": 3.0754075275767804e-06, "loss": 1.05004625, "memory(GiB)": 141.16, "step": 115240, "train_speed(iter/s)": 0.289674 }, { "acc": 0.74200692, "epoch": 1.2891770255590027, "grad_norm": 5.59375, "learning_rate": 3.073700735273186e-06, "loss": 1.02907333, "memory(GiB)": 141.16, "step": 115260, "train_speed(iter/s)": 0.28969 }, { "acc": 0.73108845, "epoch": 1.2894007245049612, "grad_norm": 8.0625, "learning_rate": 3.0719942065148655e-06, "loss": 1.08696938, "memory(GiB)": 141.16, "step": 115280, "train_speed(iter/s)": 0.289705 }, { "acc": 0.7403245, "epoch": 1.2896244234509198, "grad_norm": 5.96875, "learning_rate": 3.070287941535295e-06, "loss": 1.0383502, "memory(GiB)": 141.16, "step": 115300, "train_speed(iter/s)": 0.289721 }, { "acc": 0.74107647, "epoch": 1.2898481223968783, "grad_norm": 9.625, "learning_rate": 3.0685819405679164e-06, "loss": 1.0402277, "memory(GiB)": 141.16, "step": 115320, "train_speed(iter/s)": 0.289738 }, { "acc": 0.72571201, "epoch": 1.2900718213428368, "grad_norm": 6.15625, "learning_rate": 3.0668762038461342e-06, "loss": 1.11303444, "memory(GiB)": 141.16, "step": 115340, "train_speed(iter/s)": 0.289753 }, { "acc": 0.74188166, "epoch": 1.2902955202887954, "grad_norm": 5.46875, "learning_rate": 3.0651707316033176e-06, "loss": 1.05024185, "memory(GiB)": 141.16, "step": 115360, "train_speed(iter/s)": 0.289771 }, { "acc": 0.74833212, "epoch": 1.2905192192347539, "grad_norm": 7.46875, "learning_rate": 3.0634655240728002e-06, "loss": 1.00965195, "memory(GiB)": 141.16, "step": 115380, "train_speed(iter/s)": 0.289785 }, { "acc": 0.7428091, "epoch": 1.2907429181807124, "grad_norm": 7.375, "learning_rate": 3.061760581487878e-06, "loss": 1.01355267, "memory(GiB)": 141.16, "step": 115400, "train_speed(iter/s)": 0.289803 }, { "acc": 0.7290947, "epoch": 1.290966617126671, "grad_norm": 6.65625, "learning_rate": 3.060055904081814e-06, "loss": 1.09261017, "memory(GiB)": 141.16, "step": 115420, "train_speed(iter/s)": 0.289821 }, { "acc": 0.73162727, "epoch": 1.2911903160726295, "grad_norm": 6.9375, "learning_rate": 3.0583514920878293e-06, "loss": 1.0798111, "memory(GiB)": 141.16, "step": 115440, "train_speed(iter/s)": 0.289838 }, { "acc": 0.72178483, "epoch": 1.291414015018588, "grad_norm": 8.1875, "learning_rate": 3.0566473457391127e-06, "loss": 1.13374033, "memory(GiB)": 141.16, "step": 115460, "train_speed(iter/s)": 0.289854 }, { "acc": 0.73355637, "epoch": 1.2916377139645465, "grad_norm": 5.75, "learning_rate": 3.054943465268816e-06, "loss": 1.06997509, "memory(GiB)": 141.16, "step": 115480, "train_speed(iter/s)": 0.28987 }, { "acc": 0.72615409, "epoch": 1.291861412910505, "grad_norm": 6.71875, "learning_rate": 3.0532398509100545e-06, "loss": 1.10764103, "memory(GiB)": 141.16, "step": 115500, "train_speed(iter/s)": 0.289888 }, { "acc": 0.73778343, "epoch": 1.2920851118564636, "grad_norm": 6.03125, "learning_rate": 3.051536502895909e-06, "loss": 1.04225302, "memory(GiB)": 141.16, "step": 115520, "train_speed(iter/s)": 0.289905 }, { "acc": 0.74594326, "epoch": 1.292308810802422, "grad_norm": 6.875, "learning_rate": 3.0498334214594184e-06, "loss": 1.01569653, "memory(GiB)": 141.16, "step": 115540, "train_speed(iter/s)": 0.289921 }, { "acc": 0.7405654, "epoch": 1.2925325097483806, "grad_norm": 9.125, "learning_rate": 3.048130606833589e-06, "loss": 1.03352413, "memory(GiB)": 141.16, "step": 115560, "train_speed(iter/s)": 0.289936 }, { "acc": 0.73569818, "epoch": 1.2927562086943392, "grad_norm": 6.21875, "learning_rate": 3.046428059251393e-06, "loss": 1.05823917, "memory(GiB)": 141.16, "step": 115580, "train_speed(iter/s)": 0.289952 }, { "acc": 0.73366833, "epoch": 1.2929799076402977, "grad_norm": 8.5, "learning_rate": 3.0447257789457597e-06, "loss": 1.07669859, "memory(GiB)": 141.16, "step": 115600, "train_speed(iter/s)": 0.289969 }, { "acc": 0.73445611, "epoch": 1.2932036065862562, "grad_norm": 8.1875, "learning_rate": 3.0430237661495894e-06, "loss": 1.05414009, "memory(GiB)": 141.16, "step": 115620, "train_speed(iter/s)": 0.289985 }, { "acc": 0.72994757, "epoch": 1.2934273055322147, "grad_norm": 7.46875, "learning_rate": 3.0413220210957377e-06, "loss": 1.07558079, "memory(GiB)": 141.16, "step": 115640, "train_speed(iter/s)": 0.290001 }, { "acc": 0.73363581, "epoch": 1.2936510044781733, "grad_norm": 6.09375, "learning_rate": 3.03962054401703e-06, "loss": 1.06736727, "memory(GiB)": 141.16, "step": 115660, "train_speed(iter/s)": 0.290019 }, { "acc": 0.75082731, "epoch": 1.2938747034241318, "grad_norm": 9.625, "learning_rate": 3.037919335146252e-06, "loss": 0.98917856, "memory(GiB)": 141.16, "step": 115680, "train_speed(iter/s)": 0.290038 }, { "acc": 0.74015102, "epoch": 1.2940984023700903, "grad_norm": 7.46875, "learning_rate": 3.036218394716154e-06, "loss": 1.02404976, "memory(GiB)": 141.16, "step": 115700, "train_speed(iter/s)": 0.290055 }, { "acc": 0.74118428, "epoch": 1.2943221013160489, "grad_norm": 8.5625, "learning_rate": 3.0345177229594487e-06, "loss": 1.02337198, "memory(GiB)": 141.16, "step": 115720, "train_speed(iter/s)": 0.290072 }, { "acc": 0.7342495, "epoch": 1.2945458002620074, "grad_norm": 7.65625, "learning_rate": 3.0328173201088117e-06, "loss": 1.05678825, "memory(GiB)": 141.16, "step": 115740, "train_speed(iter/s)": 0.290088 }, { "acc": 0.7227561, "epoch": 1.294769499207966, "grad_norm": 7.65625, "learning_rate": 3.0311171863968823e-06, "loss": 1.10911989, "memory(GiB)": 141.16, "step": 115760, "train_speed(iter/s)": 0.290105 }, { "acc": 0.73660369, "epoch": 1.2949931981539244, "grad_norm": 6.46875, "learning_rate": 3.029417322056264e-06, "loss": 1.08075876, "memory(GiB)": 141.16, "step": 115780, "train_speed(iter/s)": 0.290123 }, { "acc": 0.72983971, "epoch": 1.295216897099883, "grad_norm": 8.8125, "learning_rate": 3.0277177273195223e-06, "loss": 1.08175526, "memory(GiB)": 141.16, "step": 115800, "train_speed(iter/s)": 0.290141 }, { "acc": 0.7352088, "epoch": 1.2954405960458415, "grad_norm": 8.875, "learning_rate": 3.0260184024191864e-06, "loss": 1.06741829, "memory(GiB)": 141.16, "step": 115820, "train_speed(iter/s)": 0.290159 }, { "acc": 0.72972679, "epoch": 1.2956642949918, "grad_norm": 6.625, "learning_rate": 3.0243193475877477e-06, "loss": 1.08463182, "memory(GiB)": 141.16, "step": 115840, "train_speed(iter/s)": 0.290175 }, { "acc": 0.7339994, "epoch": 1.2958879939377586, "grad_norm": 6.6875, "learning_rate": 3.02262056305766e-06, "loss": 1.07700863, "memory(GiB)": 141.16, "step": 115860, "train_speed(iter/s)": 0.290192 }, { "acc": 0.72880588, "epoch": 1.296111692883717, "grad_norm": 6.8125, "learning_rate": 3.0209220490613434e-06, "loss": 1.08508415, "memory(GiB)": 141.16, "step": 115880, "train_speed(iter/s)": 0.290211 }, { "acc": 0.74644351, "epoch": 1.2963353918296756, "grad_norm": 7.25, "learning_rate": 3.0192238058311774e-06, "loss": 1.01052914, "memory(GiB)": 141.16, "step": 115900, "train_speed(iter/s)": 0.290228 }, { "acc": 0.7476964, "epoch": 1.2965590907756341, "grad_norm": 6.9375, "learning_rate": 3.0175258335995082e-06, "loss": 1.00872269, "memory(GiB)": 141.16, "step": 115920, "train_speed(iter/s)": 0.290245 }, { "acc": 0.73560629, "epoch": 1.2967827897215927, "grad_norm": 8.4375, "learning_rate": 3.0158281325986392e-06, "loss": 1.04505491, "memory(GiB)": 141.16, "step": 115940, "train_speed(iter/s)": 0.290261 }, { "acc": 0.73888192, "epoch": 1.2970064886675512, "grad_norm": 7.5625, "learning_rate": 3.014130703060843e-06, "loss": 1.04483624, "memory(GiB)": 141.16, "step": 115960, "train_speed(iter/s)": 0.29028 }, { "acc": 0.7367486, "epoch": 1.2972301876135097, "grad_norm": 6.125, "learning_rate": 3.0124335452183505e-06, "loss": 1.04779005, "memory(GiB)": 141.16, "step": 115980, "train_speed(iter/s)": 0.290294 }, { "acc": 0.7383111, "epoch": 1.2974538865594683, "grad_norm": 6.78125, "learning_rate": 3.0107366593033584e-06, "loss": 1.03966255, "memory(GiB)": 141.16, "step": 116000, "train_speed(iter/s)": 0.290312 }, { "epoch": 1.2974538865594683, "eval_acc": 0.6901794950366156, "eval_loss": 1.0791640281677246, "eval_runtime": 2324.254, "eval_samples_per_second": 32.39, "eval_steps_per_second": 16.195, "step": 116000 }, { "acc": 0.73138685, "epoch": 1.2976775855054268, "grad_norm": 5.59375, "learning_rate": 3.0090400455480263e-06, "loss": 1.0874382, "memory(GiB)": 141.16, "step": 116020, "train_speed(iter/s)": 0.288613 }, { "acc": 0.73628292, "epoch": 1.2979012844513853, "grad_norm": 7.625, "learning_rate": 3.007343704184471e-06, "loss": 1.05927677, "memory(GiB)": 141.16, "step": 116040, "train_speed(iter/s)": 0.288629 }, { "acc": 0.72366333, "epoch": 1.2981249833973438, "grad_norm": 8.75, "learning_rate": 3.00564763544478e-06, "loss": 1.10582819, "memory(GiB)": 141.16, "step": 116060, "train_speed(iter/s)": 0.288647 }, { "acc": 0.74182968, "epoch": 1.2983486823433024, "grad_norm": 7.0625, "learning_rate": 3.0039518395609974e-06, "loss": 1.02143955, "memory(GiB)": 141.16, "step": 116080, "train_speed(iter/s)": 0.288664 }, { "acc": 0.73035183, "epoch": 1.298572381289261, "grad_norm": 6.15625, "learning_rate": 3.002256316765133e-06, "loss": 1.08746872, "memory(GiB)": 141.16, "step": 116100, "train_speed(iter/s)": 0.288682 }, { "acc": 0.7320466, "epoch": 1.2987960802352194, "grad_norm": 8.8125, "learning_rate": 3.000561067289159e-06, "loss": 1.10011864, "memory(GiB)": 141.16, "step": 116120, "train_speed(iter/s)": 0.288701 }, { "acc": 0.72982659, "epoch": 1.299019779181178, "grad_norm": 6.59375, "learning_rate": 2.998866091365009e-06, "loss": 1.07668915, "memory(GiB)": 141.16, "step": 116140, "train_speed(iter/s)": 0.288719 }, { "acc": 0.74370041, "epoch": 1.2992434781271365, "grad_norm": 7.65625, "learning_rate": 2.9971713892245825e-06, "loss": 1.02483864, "memory(GiB)": 141.16, "step": 116160, "train_speed(iter/s)": 0.288736 }, { "acc": 0.73632345, "epoch": 1.299467177073095, "grad_norm": 7.4375, "learning_rate": 2.995476961099735e-06, "loss": 1.06113205, "memory(GiB)": 141.16, "step": 116180, "train_speed(iter/s)": 0.288751 }, { "acc": 0.72763672, "epoch": 1.2996908760190535, "grad_norm": 7.9375, "learning_rate": 2.9937828072222907e-06, "loss": 1.09659462, "memory(GiB)": 141.16, "step": 116200, "train_speed(iter/s)": 0.288767 }, { "acc": 0.72333808, "epoch": 1.299914574965012, "grad_norm": 5.875, "learning_rate": 2.9920889278240338e-06, "loss": 1.12357426, "memory(GiB)": 141.16, "step": 116220, "train_speed(iter/s)": 0.288781 }, { "acc": 0.71413803, "epoch": 1.3001382739109706, "grad_norm": 6.75, "learning_rate": 2.990395323136712e-06, "loss": 1.15299263, "memory(GiB)": 141.16, "step": 116240, "train_speed(iter/s)": 0.288799 }, { "acc": 0.72703471, "epoch": 1.3003619728569291, "grad_norm": 8.1875, "learning_rate": 2.9887019933920337e-06, "loss": 1.08951263, "memory(GiB)": 141.16, "step": 116260, "train_speed(iter/s)": 0.288816 }, { "acc": 0.73968525, "epoch": 1.3005856718028876, "grad_norm": 6.0625, "learning_rate": 2.9870089388216706e-06, "loss": 1.02401142, "memory(GiB)": 141.16, "step": 116280, "train_speed(iter/s)": 0.288835 }, { "acc": 0.73365688, "epoch": 1.3008093707488462, "grad_norm": 7.71875, "learning_rate": 2.985316159657257e-06, "loss": 1.082162, "memory(GiB)": 141.16, "step": 116300, "train_speed(iter/s)": 0.288851 }, { "acc": 0.7417634, "epoch": 1.3010330696948047, "grad_norm": 9.125, "learning_rate": 2.983623656130389e-06, "loss": 1.01738873, "memory(GiB)": 141.16, "step": 116320, "train_speed(iter/s)": 0.288869 }, { "acc": 0.74321289, "epoch": 1.3012567686407632, "grad_norm": 7.5, "learning_rate": 2.981931428472625e-06, "loss": 1.0299077, "memory(GiB)": 141.16, "step": 116340, "train_speed(iter/s)": 0.288886 }, { "acc": 0.75006471, "epoch": 1.3014804675867218, "grad_norm": 6.96875, "learning_rate": 2.9802394769154875e-06, "loss": 0.98897629, "memory(GiB)": 141.16, "step": 116360, "train_speed(iter/s)": 0.288903 }, { "acc": 0.72346725, "epoch": 1.3017041665326803, "grad_norm": 8.75, "learning_rate": 2.978547801690458e-06, "loss": 1.11918736, "memory(GiB)": 141.16, "step": 116380, "train_speed(iter/s)": 0.28892 }, { "acc": 0.72268691, "epoch": 1.3019278654786388, "grad_norm": 5.375, "learning_rate": 2.9768564030289827e-06, "loss": 1.11524343, "memory(GiB)": 141.16, "step": 116400, "train_speed(iter/s)": 0.288939 }, { "acc": 0.73772831, "epoch": 1.3021515644245973, "grad_norm": 7.71875, "learning_rate": 2.9751652811624686e-06, "loss": 1.05986519, "memory(GiB)": 141.16, "step": 116420, "train_speed(iter/s)": 0.288956 }, { "acc": 0.73340492, "epoch": 1.3023752633705559, "grad_norm": 7.625, "learning_rate": 2.9734744363222855e-06, "loss": 1.07872286, "memory(GiB)": 141.16, "step": 116440, "train_speed(iter/s)": 0.288975 }, { "acc": 0.72914028, "epoch": 1.3025989623165144, "grad_norm": 6.03125, "learning_rate": 2.971783868739766e-06, "loss": 1.08876228, "memory(GiB)": 141.16, "step": 116460, "train_speed(iter/s)": 0.288992 }, { "acc": 0.73374367, "epoch": 1.302822661262473, "grad_norm": 7.28125, "learning_rate": 2.9700935786462027e-06, "loss": 1.06635761, "memory(GiB)": 141.16, "step": 116480, "train_speed(iter/s)": 0.289007 }, { "acc": 0.73671227, "epoch": 1.3030463602084315, "grad_norm": 8.3125, "learning_rate": 2.9684035662728516e-06, "loss": 1.0641983, "memory(GiB)": 141.16, "step": 116500, "train_speed(iter/s)": 0.289023 }, { "acc": 0.72616158, "epoch": 1.30327005915439, "grad_norm": 5.65625, "learning_rate": 2.9667138318509304e-06, "loss": 1.1204668, "memory(GiB)": 141.16, "step": 116520, "train_speed(iter/s)": 0.289038 }, { "acc": 0.73900309, "epoch": 1.3034937581003485, "grad_norm": 6.875, "learning_rate": 2.9650243756116196e-06, "loss": 1.03487263, "memory(GiB)": 141.16, "step": 116540, "train_speed(iter/s)": 0.289055 }, { "acc": 0.73130503, "epoch": 1.303717457046307, "grad_norm": 8.875, "learning_rate": 2.9633351977860624e-06, "loss": 1.0764802, "memory(GiB)": 141.16, "step": 116560, "train_speed(iter/s)": 0.289074 }, { "acc": 0.719245, "epoch": 1.3039411559922656, "grad_norm": 7.59375, "learning_rate": 2.961646298605359e-06, "loss": 1.13654289, "memory(GiB)": 141.16, "step": 116580, "train_speed(iter/s)": 0.289091 }, { "acc": 0.72892771, "epoch": 1.304164854938224, "grad_norm": 8.0625, "learning_rate": 2.959957678300577e-06, "loss": 1.08372536, "memory(GiB)": 141.16, "step": 116600, "train_speed(iter/s)": 0.289104 }, { "acc": 0.73769865, "epoch": 1.3043885538841826, "grad_norm": 6.6875, "learning_rate": 2.9582693371027436e-06, "loss": 1.05731468, "memory(GiB)": 141.16, "step": 116620, "train_speed(iter/s)": 0.289122 }, { "acc": 0.73528309, "epoch": 1.3046122528301412, "grad_norm": 8.5625, "learning_rate": 2.956581275242848e-06, "loss": 1.06179552, "memory(GiB)": 141.16, "step": 116640, "train_speed(iter/s)": 0.289138 }, { "acc": 0.73769126, "epoch": 1.3048359517760997, "grad_norm": 5.84375, "learning_rate": 2.954893492951842e-06, "loss": 1.06194963, "memory(GiB)": 141.16, "step": 116660, "train_speed(iter/s)": 0.289153 }, { "acc": 0.73176465, "epoch": 1.3050596507220582, "grad_norm": 6.28125, "learning_rate": 2.9532059904606363e-06, "loss": 1.07492018, "memory(GiB)": 141.16, "step": 116680, "train_speed(iter/s)": 0.289171 }, { "acc": 0.7291625, "epoch": 1.3052833496680167, "grad_norm": 6.0, "learning_rate": 2.9515187680001067e-06, "loss": 1.0790884, "memory(GiB)": 141.16, "step": 116700, "train_speed(iter/s)": 0.28919 }, { "acc": 0.73383079, "epoch": 1.3055070486139753, "grad_norm": 11.1875, "learning_rate": 2.9498318258010893e-06, "loss": 1.07046757, "memory(GiB)": 141.16, "step": 116720, "train_speed(iter/s)": 0.289209 }, { "acc": 0.73438153, "epoch": 1.3057307475599338, "grad_norm": 6.0625, "learning_rate": 2.9481451640943816e-06, "loss": 1.0716217, "memory(GiB)": 141.16, "step": 116740, "train_speed(iter/s)": 0.289225 }, { "acc": 0.74344745, "epoch": 1.3059544465058923, "grad_norm": 8.6875, "learning_rate": 2.9464587831107442e-06, "loss": 1.01736736, "memory(GiB)": 141.16, "step": 116760, "train_speed(iter/s)": 0.289242 }, { "acc": 0.74097252, "epoch": 1.3061781454518508, "grad_norm": 9.0625, "learning_rate": 2.9447726830808966e-06, "loss": 1.05201435, "memory(GiB)": 141.16, "step": 116780, "train_speed(iter/s)": 0.289259 }, { "acc": 0.73641362, "epoch": 1.3064018443978094, "grad_norm": 5.84375, "learning_rate": 2.9430868642355214e-06, "loss": 1.04490023, "memory(GiB)": 141.16, "step": 116800, "train_speed(iter/s)": 0.289277 }, { "acc": 0.72364655, "epoch": 1.306625543343768, "grad_norm": 8.4375, "learning_rate": 2.941401326805263e-06, "loss": 1.11168118, "memory(GiB)": 141.16, "step": 116820, "train_speed(iter/s)": 0.289294 }, { "acc": 0.72120857, "epoch": 1.3068492422897264, "grad_norm": 6.71875, "learning_rate": 2.9397160710207285e-06, "loss": 1.1291235, "memory(GiB)": 141.16, "step": 116840, "train_speed(iter/s)": 0.28931 }, { "acc": 0.72915859, "epoch": 1.307072941235685, "grad_norm": 8.0625, "learning_rate": 2.9380310971124836e-06, "loss": 1.08364124, "memory(GiB)": 141.16, "step": 116860, "train_speed(iter/s)": 0.289326 }, { "acc": 0.74465785, "epoch": 1.3072966401816435, "grad_norm": 6.96875, "learning_rate": 2.9363464053110557e-06, "loss": 1.01415558, "memory(GiB)": 141.16, "step": 116880, "train_speed(iter/s)": 0.289342 }, { "acc": 0.74158292, "epoch": 1.307520339127602, "grad_norm": 7.09375, "learning_rate": 2.9346619958469367e-06, "loss": 1.02298994, "memory(GiB)": 141.16, "step": 116900, "train_speed(iter/s)": 0.289357 }, { "acc": 0.72829452, "epoch": 1.3077440380735605, "grad_norm": 7.0, "learning_rate": 2.932977868950577e-06, "loss": 1.08095093, "memory(GiB)": 141.16, "step": 116920, "train_speed(iter/s)": 0.289373 }, { "acc": 0.72698722, "epoch": 1.307967737019519, "grad_norm": 7.0, "learning_rate": 2.9312940248523893e-06, "loss": 1.1069519, "memory(GiB)": 141.16, "step": 116940, "train_speed(iter/s)": 0.289391 }, { "acc": 0.72485971, "epoch": 1.3081914359654776, "grad_norm": 7.21875, "learning_rate": 2.929610463782749e-06, "loss": 1.10628338, "memory(GiB)": 141.16, "step": 116960, "train_speed(iter/s)": 0.289407 }, { "acc": 0.72898045, "epoch": 1.3084151349114361, "grad_norm": 6.75, "learning_rate": 2.9279271859719883e-06, "loss": 1.09001236, "memory(GiB)": 141.16, "step": 116980, "train_speed(iter/s)": 0.289424 }, { "acc": 0.73515921, "epoch": 1.3086388338573947, "grad_norm": 7.625, "learning_rate": 2.926244191650406e-06, "loss": 1.05445957, "memory(GiB)": 141.16, "step": 117000, "train_speed(iter/s)": 0.28944 }, { "acc": 0.73983579, "epoch": 1.3088625328033532, "grad_norm": 7.1875, "learning_rate": 2.9245614810482583e-06, "loss": 1.03670158, "memory(GiB)": 141.16, "step": 117020, "train_speed(iter/s)": 0.289458 }, { "acc": 0.72869301, "epoch": 1.3090862317493117, "grad_norm": 6.875, "learning_rate": 2.922879054395765e-06, "loss": 1.09266396, "memory(GiB)": 141.16, "step": 117040, "train_speed(iter/s)": 0.289474 }, { "acc": 0.73438702, "epoch": 1.3093099306952702, "grad_norm": 7.625, "learning_rate": 2.9211969119231075e-06, "loss": 1.07787457, "memory(GiB)": 141.16, "step": 117060, "train_speed(iter/s)": 0.289489 }, { "acc": 0.71463532, "epoch": 1.3095336296412288, "grad_norm": 7.34375, "learning_rate": 2.9195150538604237e-06, "loss": 1.17050047, "memory(GiB)": 141.16, "step": 117080, "train_speed(iter/s)": 0.289505 }, { "acc": 0.74076748, "epoch": 1.3097573285871873, "grad_norm": 6.71875, "learning_rate": 2.9178334804378184e-06, "loss": 1.02974453, "memory(GiB)": 141.16, "step": 117100, "train_speed(iter/s)": 0.289523 }, { "acc": 0.73706341, "epoch": 1.3099810275331458, "grad_norm": 5.9375, "learning_rate": 2.916152191885354e-06, "loss": 1.04504671, "memory(GiB)": 141.16, "step": 117120, "train_speed(iter/s)": 0.289541 }, { "acc": 0.72327824, "epoch": 1.3102047264791044, "grad_norm": 6.625, "learning_rate": 2.9144711884330535e-06, "loss": 1.11417007, "memory(GiB)": 141.16, "step": 117140, "train_speed(iter/s)": 0.289559 }, { "acc": 0.75121069, "epoch": 1.3104284254250629, "grad_norm": 5.90625, "learning_rate": 2.912790470310905e-06, "loss": 1.00656443, "memory(GiB)": 141.16, "step": 117160, "train_speed(iter/s)": 0.289575 }, { "acc": 0.73653531, "epoch": 1.3106521243710214, "grad_norm": 7.71875, "learning_rate": 2.9111100377488515e-06, "loss": 1.06111479, "memory(GiB)": 141.16, "step": 117180, "train_speed(iter/s)": 0.289594 }, { "acc": 0.73443193, "epoch": 1.31087582331698, "grad_norm": 8.125, "learning_rate": 2.909429890976806e-06, "loss": 1.0691143, "memory(GiB)": 141.16, "step": 117200, "train_speed(iter/s)": 0.289612 }, { "acc": 0.73704414, "epoch": 1.3110995222629385, "grad_norm": 6.0625, "learning_rate": 2.9077500302246286e-06, "loss": 1.06141624, "memory(GiB)": 141.16, "step": 117220, "train_speed(iter/s)": 0.289629 }, { "acc": 0.73169732, "epoch": 1.311323221208897, "grad_norm": 6.375, "learning_rate": 2.906070455722154e-06, "loss": 1.07965794, "memory(GiB)": 141.16, "step": 117240, "train_speed(iter/s)": 0.289646 }, { "acc": 0.72438469, "epoch": 1.3115469201548555, "grad_norm": 9.0625, "learning_rate": 2.9043911676991706e-06, "loss": 1.11788197, "memory(GiB)": 141.16, "step": 117260, "train_speed(iter/s)": 0.289664 }, { "acc": 0.74159694, "epoch": 1.311770619100814, "grad_norm": 10.0, "learning_rate": 2.9027121663854263e-06, "loss": 1.03371544, "memory(GiB)": 141.16, "step": 117280, "train_speed(iter/s)": 0.289679 }, { "acc": 0.72709327, "epoch": 1.3119943180467726, "grad_norm": 7.5, "learning_rate": 2.9010334520106367e-06, "loss": 1.09316578, "memory(GiB)": 141.16, "step": 117300, "train_speed(iter/s)": 0.289694 }, { "acc": 0.7314693, "epoch": 1.312218016992731, "grad_norm": 7.9375, "learning_rate": 2.89935502480447e-06, "loss": 1.07166786, "memory(GiB)": 141.16, "step": 117320, "train_speed(iter/s)": 0.289711 }, { "acc": 0.72997293, "epoch": 1.3124417159386896, "grad_norm": 7.25, "learning_rate": 2.897676884996563e-06, "loss": 1.08756733, "memory(GiB)": 141.16, "step": 117340, "train_speed(iter/s)": 0.289728 }, { "acc": 0.745017, "epoch": 1.3126654148846482, "grad_norm": 7.0625, "learning_rate": 2.8959990328165078e-06, "loss": 1.01652184, "memory(GiB)": 141.16, "step": 117360, "train_speed(iter/s)": 0.289743 }, { "acc": 0.73516998, "epoch": 1.3128891138306067, "grad_norm": 5.90625, "learning_rate": 2.8943214684938557e-06, "loss": 1.07219868, "memory(GiB)": 141.16, "step": 117380, "train_speed(iter/s)": 0.289759 }, { "acc": 0.73842583, "epoch": 1.3131128127765652, "grad_norm": 7.96875, "learning_rate": 2.8926441922581255e-06, "loss": 1.04196987, "memory(GiB)": 141.16, "step": 117400, "train_speed(iter/s)": 0.289776 }, { "acc": 0.73592033, "epoch": 1.3133365117225237, "grad_norm": 7.1875, "learning_rate": 2.8909672043387894e-06, "loss": 1.07019024, "memory(GiB)": 141.16, "step": 117420, "train_speed(iter/s)": 0.289792 }, { "acc": 0.7470767, "epoch": 1.3135602106684823, "grad_norm": 7.84375, "learning_rate": 2.8892905049652862e-06, "loss": 1.01264696, "memory(GiB)": 141.16, "step": 117440, "train_speed(iter/s)": 0.289808 }, { "acc": 0.72050638, "epoch": 1.3137839096144408, "grad_norm": 9.125, "learning_rate": 2.887614094367011e-06, "loss": 1.13916492, "memory(GiB)": 141.16, "step": 117460, "train_speed(iter/s)": 0.289825 }, { "acc": 0.73385906, "epoch": 1.3140076085603993, "grad_norm": 7.96875, "learning_rate": 2.885937972773319e-06, "loss": 1.0453701, "memory(GiB)": 141.16, "step": 117480, "train_speed(iter/s)": 0.289842 }, { "acc": 0.73208327, "epoch": 1.314231307506358, "grad_norm": 6.0, "learning_rate": 2.8842621404135308e-06, "loss": 1.07637873, "memory(GiB)": 141.16, "step": 117500, "train_speed(iter/s)": 0.289857 }, { "acc": 0.72952538, "epoch": 1.3144550064523166, "grad_norm": 7.5, "learning_rate": 2.882586597516921e-06, "loss": 1.0704649, "memory(GiB)": 141.16, "step": 117520, "train_speed(iter/s)": 0.289872 }, { "acc": 0.73672371, "epoch": 1.3146787053982751, "grad_norm": 6.8125, "learning_rate": 2.8809113443127312e-06, "loss": 1.04643927, "memory(GiB)": 141.16, "step": 117540, "train_speed(iter/s)": 0.289889 }, { "acc": 0.72997322, "epoch": 1.3149024043442337, "grad_norm": 5.53125, "learning_rate": 2.8792363810301587e-06, "loss": 1.09168987, "memory(GiB)": 141.16, "step": 117560, "train_speed(iter/s)": 0.289906 }, { "acc": 0.73737755, "epoch": 1.3151261032901922, "grad_norm": 8.1875, "learning_rate": 2.8775617078983596e-06, "loss": 1.0415493, "memory(GiB)": 141.16, "step": 117580, "train_speed(iter/s)": 0.289924 }, { "acc": 0.75117321, "epoch": 1.3153498022361507, "grad_norm": 6.28125, "learning_rate": 2.8758873251464583e-06, "loss": 0.98299084, "memory(GiB)": 141.16, "step": 117600, "train_speed(iter/s)": 0.289941 }, { "acc": 0.73301716, "epoch": 1.3155735011821093, "grad_norm": 9.3125, "learning_rate": 2.8742132330035283e-06, "loss": 1.07140417, "memory(GiB)": 141.16, "step": 117620, "train_speed(iter/s)": 0.289958 }, { "acc": 0.73028297, "epoch": 1.3157972001280678, "grad_norm": 6.84375, "learning_rate": 2.872539431698615e-06, "loss": 1.09100838, "memory(GiB)": 141.16, "step": 117640, "train_speed(iter/s)": 0.289975 }, { "acc": 0.74432259, "epoch": 1.3160208990740263, "grad_norm": 5.78125, "learning_rate": 2.870865921460716e-06, "loss": 1.01895084, "memory(GiB)": 141.16, "step": 117660, "train_speed(iter/s)": 0.289993 }, { "acc": 0.73495235, "epoch": 1.3162445980199848, "grad_norm": 7.53125, "learning_rate": 2.8691927025187886e-06, "loss": 1.07443151, "memory(GiB)": 141.16, "step": 117680, "train_speed(iter/s)": 0.290008 }, { "acc": 0.73214445, "epoch": 1.3164682969659434, "grad_norm": 6.625, "learning_rate": 2.8675197751017586e-06, "loss": 1.08497515, "memory(GiB)": 141.16, "step": 117700, "train_speed(iter/s)": 0.290024 }, { "acc": 0.72723045, "epoch": 1.316691995911902, "grad_norm": 6.71875, "learning_rate": 2.865847139438501e-06, "loss": 1.09633675, "memory(GiB)": 141.16, "step": 117720, "train_speed(iter/s)": 0.29004 }, { "acc": 0.74955301, "epoch": 1.3169156948578604, "grad_norm": 8.1875, "learning_rate": 2.8641747957578613e-06, "loss": 1.01285534, "memory(GiB)": 141.16, "step": 117740, "train_speed(iter/s)": 0.290054 }, { "acc": 0.74123998, "epoch": 1.317139393803819, "grad_norm": 7.25, "learning_rate": 2.862502744288637e-06, "loss": 1.03390102, "memory(GiB)": 141.16, "step": 117760, "train_speed(iter/s)": 0.29007 }, { "acc": 0.72829647, "epoch": 1.3173630927497775, "grad_norm": 8.8125, "learning_rate": 2.860830985259587e-06, "loss": 1.09389677, "memory(GiB)": 141.16, "step": 117780, "train_speed(iter/s)": 0.290086 }, { "acc": 0.73437672, "epoch": 1.317586791695736, "grad_norm": 8.125, "learning_rate": 2.859159518899437e-06, "loss": 1.06037045, "memory(GiB)": 141.16, "step": 117800, "train_speed(iter/s)": 0.290102 }, { "acc": 0.73198833, "epoch": 1.3178104906416945, "grad_norm": 7.25, "learning_rate": 2.8574883454368616e-06, "loss": 1.0665453, "memory(GiB)": 141.16, "step": 117820, "train_speed(iter/s)": 0.290118 }, { "acc": 0.74552937, "epoch": 1.318034189587653, "grad_norm": 7.0625, "learning_rate": 2.8558174651005068e-06, "loss": 1.00934658, "memory(GiB)": 141.16, "step": 117840, "train_speed(iter/s)": 0.290136 }, { "acc": 0.75381842, "epoch": 1.3182578885336116, "grad_norm": 7.1875, "learning_rate": 2.8541468781189695e-06, "loss": 0.97257299, "memory(GiB)": 141.16, "step": 117860, "train_speed(iter/s)": 0.290153 }, { "acc": 0.7432456, "epoch": 1.3184815874795701, "grad_norm": 6.0625, "learning_rate": 2.852476584720809e-06, "loss": 1.02187605, "memory(GiB)": 141.16, "step": 117880, "train_speed(iter/s)": 0.290172 }, { "acc": 0.74475551, "epoch": 1.3187052864255286, "grad_norm": 6.84375, "learning_rate": 2.8508065851345486e-06, "loss": 1.02052135, "memory(GiB)": 141.16, "step": 117900, "train_speed(iter/s)": 0.290188 }, { "acc": 0.73184991, "epoch": 1.3189289853714872, "grad_norm": 6.4375, "learning_rate": 2.849136879588664e-06, "loss": 1.09740076, "memory(GiB)": 141.16, "step": 117920, "train_speed(iter/s)": 0.290205 }, { "acc": 0.73313575, "epoch": 1.3191526843174457, "grad_norm": 9.875, "learning_rate": 2.8474674683116e-06, "loss": 1.07477112, "memory(GiB)": 141.16, "step": 117940, "train_speed(iter/s)": 0.290221 }, { "acc": 0.73869696, "epoch": 1.3193763832634042, "grad_norm": 8.875, "learning_rate": 2.8457983515317533e-06, "loss": 1.05189581, "memory(GiB)": 141.16, "step": 117960, "train_speed(iter/s)": 0.290238 }, { "acc": 0.74200659, "epoch": 1.3196000822093628, "grad_norm": 7.65625, "learning_rate": 2.8441295294774795e-06, "loss": 1.03689699, "memory(GiB)": 141.16, "step": 117980, "train_speed(iter/s)": 0.290254 }, { "acc": 0.72704468, "epoch": 1.3198237811553213, "grad_norm": 8.6875, "learning_rate": 2.842461002377104e-06, "loss": 1.1148963, "memory(GiB)": 141.16, "step": 118000, "train_speed(iter/s)": 0.290271 }, { "epoch": 1.3198237811553213, "eval_acc": 0.690173332954735, "eval_loss": 1.079169750213623, "eval_runtime": 2323.5154, "eval_samples_per_second": 32.4, "eval_steps_per_second": 16.2, "step": 118000 }, { "acc": 0.74275484, "epoch": 1.3200474801012798, "grad_norm": 6.1875, "learning_rate": 2.840792770458899e-06, "loss": 1.02981186, "memory(GiB)": 141.16, "step": 118020, "train_speed(iter/s)": 0.288602 }, { "acc": 0.74058619, "epoch": 1.3202711790472383, "grad_norm": 6.65625, "learning_rate": 2.839124833951107e-06, "loss": 1.03991489, "memory(GiB)": 141.16, "step": 118040, "train_speed(iter/s)": 0.288619 }, { "acc": 0.74207544, "epoch": 1.3204948779931969, "grad_norm": 6.5, "learning_rate": 2.8374571930819237e-06, "loss": 1.03727245, "memory(GiB)": 141.16, "step": 118060, "train_speed(iter/s)": 0.288636 }, { "acc": 0.73179779, "epoch": 1.3207185769391554, "grad_norm": 6.75, "learning_rate": 2.8357898480795047e-06, "loss": 1.06941261, "memory(GiB)": 141.16, "step": 118080, "train_speed(iter/s)": 0.28865 }, { "acc": 0.72696199, "epoch": 1.320942275885114, "grad_norm": 6.5, "learning_rate": 2.83412279917197e-06, "loss": 1.09587507, "memory(GiB)": 141.16, "step": 118100, "train_speed(iter/s)": 0.288666 }, { "acc": 0.72423182, "epoch": 1.3211659748310725, "grad_norm": 8.0625, "learning_rate": 2.832456046587392e-06, "loss": 1.11205015, "memory(GiB)": 141.16, "step": 118120, "train_speed(iter/s)": 0.288681 }, { "acc": 0.72445087, "epoch": 1.321389673777031, "grad_norm": 5.125, "learning_rate": 2.83078959055381e-06, "loss": 1.11238842, "memory(GiB)": 141.16, "step": 118140, "train_speed(iter/s)": 0.288697 }, { "acc": 0.7357965, "epoch": 1.3216133727229895, "grad_norm": 6.53125, "learning_rate": 2.829123431299217e-06, "loss": 1.05605783, "memory(GiB)": 141.16, "step": 118160, "train_speed(iter/s)": 0.288716 }, { "acc": 0.74087009, "epoch": 1.321837071668948, "grad_norm": 7.0, "learning_rate": 2.827457569051566e-06, "loss": 1.04261303, "memory(GiB)": 141.16, "step": 118180, "train_speed(iter/s)": 0.288732 }, { "acc": 0.7403017, "epoch": 1.3220607706149066, "grad_norm": 7.6875, "learning_rate": 2.825792004038774e-06, "loss": 1.02914562, "memory(GiB)": 141.16, "step": 118200, "train_speed(iter/s)": 0.288749 }, { "acc": 0.73137751, "epoch": 1.322284469560865, "grad_norm": 8.1875, "learning_rate": 2.8241267364887103e-06, "loss": 1.07936373, "memory(GiB)": 141.16, "step": 118220, "train_speed(iter/s)": 0.288765 }, { "acc": 0.74642782, "epoch": 1.3225081685068236, "grad_norm": 7.65625, "learning_rate": 2.822461766629212e-06, "loss": 1.00439796, "memory(GiB)": 141.16, "step": 118240, "train_speed(iter/s)": 0.288781 }, { "acc": 0.74147286, "epoch": 1.3227318674527822, "grad_norm": 7.375, "learning_rate": 2.820797094688068e-06, "loss": 1.03220196, "memory(GiB)": 141.16, "step": 118260, "train_speed(iter/s)": 0.288797 }, { "acc": 0.73347054, "epoch": 1.3229555663987407, "grad_norm": 7.28125, "learning_rate": 2.8191327208930276e-06, "loss": 1.07709522, "memory(GiB)": 141.16, "step": 118280, "train_speed(iter/s)": 0.288813 }, { "acc": 0.72945766, "epoch": 1.3231792653446992, "grad_norm": 6.3125, "learning_rate": 2.8174686454718048e-06, "loss": 1.08679066, "memory(GiB)": 141.16, "step": 118300, "train_speed(iter/s)": 0.288831 }, { "acc": 0.72956204, "epoch": 1.3234029642906577, "grad_norm": 8.4375, "learning_rate": 2.8158048686520647e-06, "loss": 1.10067863, "memory(GiB)": 141.16, "step": 118320, "train_speed(iter/s)": 0.288848 }, { "acc": 0.74394512, "epoch": 1.3236266632366163, "grad_norm": 7.09375, "learning_rate": 2.814141390661439e-06, "loss": 1.0268466, "memory(GiB)": 141.16, "step": 118340, "train_speed(iter/s)": 0.288864 }, { "acc": 0.7346632, "epoch": 1.3238503621825748, "grad_norm": 8.25, "learning_rate": 2.812478211727515e-06, "loss": 1.0680295, "memory(GiB)": 141.16, "step": 118360, "train_speed(iter/s)": 0.28888 }, { "acc": 0.73779573, "epoch": 1.3240740611285333, "grad_norm": 6.375, "learning_rate": 2.8108153320778385e-06, "loss": 1.05811758, "memory(GiB)": 141.16, "step": 118380, "train_speed(iter/s)": 0.288894 }, { "acc": 0.72814341, "epoch": 1.3242977600744918, "grad_norm": 8.5, "learning_rate": 2.809152751939915e-06, "loss": 1.10075321, "memory(GiB)": 141.16, "step": 118400, "train_speed(iter/s)": 0.28891 }, { "acc": 0.74157615, "epoch": 1.3245214590204504, "grad_norm": 6.6875, "learning_rate": 2.8074904715412084e-06, "loss": 1.0362608, "memory(GiB)": 141.16, "step": 118420, "train_speed(iter/s)": 0.288926 }, { "acc": 0.73647289, "epoch": 1.324745157966409, "grad_norm": 6.28125, "learning_rate": 2.805828491109145e-06, "loss": 1.04802094, "memory(GiB)": 141.16, "step": 118440, "train_speed(iter/s)": 0.288946 }, { "acc": 0.73590956, "epoch": 1.3249688569123674, "grad_norm": 8.5625, "learning_rate": 2.804166810871103e-06, "loss": 1.05821877, "memory(GiB)": 141.16, "step": 118460, "train_speed(iter/s)": 0.288964 }, { "acc": 0.74054446, "epoch": 1.325192555858326, "grad_norm": 6.8125, "learning_rate": 2.8025054310544297e-06, "loss": 1.03840218, "memory(GiB)": 141.16, "step": 118480, "train_speed(iter/s)": 0.288984 }, { "acc": 0.72536669, "epoch": 1.3254162548042845, "grad_norm": 6.78125, "learning_rate": 2.800844351886423e-06, "loss": 1.09873562, "memory(GiB)": 141.16, "step": 118500, "train_speed(iter/s)": 0.289002 }, { "acc": 0.72813702, "epoch": 1.325639953750243, "grad_norm": 6.78125, "learning_rate": 2.79918357359434e-06, "loss": 1.09973068, "memory(GiB)": 141.16, "step": 118520, "train_speed(iter/s)": 0.289021 }, { "acc": 0.73375111, "epoch": 1.3258636526962015, "grad_norm": 6.75, "learning_rate": 2.7975230964054033e-06, "loss": 1.0625926, "memory(GiB)": 141.16, "step": 118540, "train_speed(iter/s)": 0.289039 }, { "acc": 0.73718452, "epoch": 1.32608735164216, "grad_norm": 6.65625, "learning_rate": 2.795862920546785e-06, "loss": 1.04563274, "memory(GiB)": 141.16, "step": 118560, "train_speed(iter/s)": 0.289056 }, { "acc": 0.74347668, "epoch": 1.3263110505881186, "grad_norm": 6.9375, "learning_rate": 2.794203046245626e-06, "loss": 1.02306843, "memory(GiB)": 141.16, "step": 118580, "train_speed(iter/s)": 0.289073 }, { "acc": 0.73367839, "epoch": 1.3265347495340771, "grad_norm": 7.65625, "learning_rate": 2.792543473729018e-06, "loss": 1.07745028, "memory(GiB)": 141.16, "step": 118600, "train_speed(iter/s)": 0.289093 }, { "acc": 0.7493989, "epoch": 1.3267584484800357, "grad_norm": 7.28125, "learning_rate": 2.7908842032240133e-06, "loss": 1.01805706, "memory(GiB)": 141.16, "step": 118620, "train_speed(iter/s)": 0.289111 }, { "acc": 0.73754826, "epoch": 1.3269821474259942, "grad_norm": 5.6875, "learning_rate": 2.7892252349576264e-06, "loss": 1.05214071, "memory(GiB)": 141.16, "step": 118640, "train_speed(iter/s)": 0.289127 }, { "acc": 0.7234395, "epoch": 1.3272058463719527, "grad_norm": 7.09375, "learning_rate": 2.7875665691568256e-06, "loss": 1.11038809, "memory(GiB)": 141.16, "step": 118660, "train_speed(iter/s)": 0.289142 }, { "acc": 0.73627005, "epoch": 1.3274295453179112, "grad_norm": 7.90625, "learning_rate": 2.785908206048542e-06, "loss": 1.07446594, "memory(GiB)": 141.16, "step": 118680, "train_speed(iter/s)": 0.289159 }, { "acc": 0.74365387, "epoch": 1.3276532442638698, "grad_norm": 6.875, "learning_rate": 2.784250145859663e-06, "loss": 1.03245716, "memory(GiB)": 141.16, "step": 118700, "train_speed(iter/s)": 0.289176 }, { "acc": 0.72943583, "epoch": 1.3278769432098283, "grad_norm": 6.09375, "learning_rate": 2.7825923888170325e-06, "loss": 1.09248657, "memory(GiB)": 141.16, "step": 118720, "train_speed(iter/s)": 0.289192 }, { "acc": 0.74059849, "epoch": 1.3281006421557868, "grad_norm": 8.4375, "learning_rate": 2.7809349351474592e-06, "loss": 1.02667446, "memory(GiB)": 141.16, "step": 118740, "train_speed(iter/s)": 0.28921 }, { "acc": 0.73476567, "epoch": 1.3283243411017454, "grad_norm": 6.1875, "learning_rate": 2.7792777850777026e-06, "loss": 1.05584278, "memory(GiB)": 141.16, "step": 118760, "train_speed(iter/s)": 0.289228 }, { "acc": 0.73542166, "epoch": 1.3285480400477039, "grad_norm": 7.28125, "learning_rate": 2.777620938834488e-06, "loss": 1.04849329, "memory(GiB)": 141.16, "step": 118780, "train_speed(iter/s)": 0.289244 }, { "acc": 0.74648418, "epoch": 1.3287717389936624, "grad_norm": 7.625, "learning_rate": 2.775964396644495e-06, "loss": 1.00111485, "memory(GiB)": 141.16, "step": 118800, "train_speed(iter/s)": 0.289258 }, { "acc": 0.73335133, "epoch": 1.328995437939621, "grad_norm": 7.75, "learning_rate": 2.774308158734358e-06, "loss": 1.07279568, "memory(GiB)": 141.16, "step": 118820, "train_speed(iter/s)": 0.289274 }, { "acc": 0.73369837, "epoch": 1.3292191368855795, "grad_norm": 5.5, "learning_rate": 2.7726522253306804e-06, "loss": 1.05986099, "memory(GiB)": 141.16, "step": 118840, "train_speed(iter/s)": 0.289291 }, { "acc": 0.73399515, "epoch": 1.329442835831538, "grad_norm": 5.3125, "learning_rate": 2.7709965966600116e-06, "loss": 1.06339922, "memory(GiB)": 141.16, "step": 118860, "train_speed(iter/s)": 0.289307 }, { "acc": 0.74356928, "epoch": 1.3296665347774965, "grad_norm": 6.03125, "learning_rate": 2.769341272948871e-06, "loss": 1.02125044, "memory(GiB)": 141.16, "step": 118880, "train_speed(iter/s)": 0.289324 }, { "acc": 0.73407536, "epoch": 1.329890233723455, "grad_norm": 9.375, "learning_rate": 2.7676862544237275e-06, "loss": 1.07675018, "memory(GiB)": 141.16, "step": 118900, "train_speed(iter/s)": 0.289341 }, { "acc": 0.73231049, "epoch": 1.3301139326694136, "grad_norm": 5.59375, "learning_rate": 2.7660315413110096e-06, "loss": 1.05817986, "memory(GiB)": 141.16, "step": 118920, "train_speed(iter/s)": 0.289357 }, { "acc": 0.72771697, "epoch": 1.330337631615372, "grad_norm": 7.96875, "learning_rate": 2.7643771338371096e-06, "loss": 1.09258022, "memory(GiB)": 141.16, "step": 118940, "train_speed(iter/s)": 0.289372 }, { "acc": 0.74180775, "epoch": 1.3305613305613306, "grad_norm": 7.96875, "learning_rate": 2.7627230322283698e-06, "loss": 1.02967472, "memory(GiB)": 141.16, "step": 118960, "train_speed(iter/s)": 0.289388 }, { "acc": 0.72394567, "epoch": 1.3307850295072892, "grad_norm": 7.5, "learning_rate": 2.7610692367110993e-06, "loss": 1.12082615, "memory(GiB)": 141.16, "step": 118980, "train_speed(iter/s)": 0.289403 }, { "acc": 0.7373827, "epoch": 1.3310087284532477, "grad_norm": 8.875, "learning_rate": 2.759415747511559e-06, "loss": 1.05242977, "memory(GiB)": 141.16, "step": 119000, "train_speed(iter/s)": 0.289419 }, { "acc": 0.73368168, "epoch": 1.3312324273992062, "grad_norm": 6.8125, "learning_rate": 2.757762564855968e-06, "loss": 1.076618, "memory(GiB)": 141.16, "step": 119020, "train_speed(iter/s)": 0.289432 }, { "acc": 0.73404107, "epoch": 1.3314561263451647, "grad_norm": 6.625, "learning_rate": 2.756109688970509e-06, "loss": 1.0769434, "memory(GiB)": 141.16, "step": 119040, "train_speed(iter/s)": 0.289448 }, { "acc": 0.74274483, "epoch": 1.3316798252911233, "grad_norm": 7.46875, "learning_rate": 2.754457120081315e-06, "loss": 1.01482973, "memory(GiB)": 141.16, "step": 119060, "train_speed(iter/s)": 0.289463 }, { "acc": 0.74563093, "epoch": 1.3319035242370818, "grad_norm": 7.0625, "learning_rate": 2.752804858414485e-06, "loss": 1.02588844, "memory(GiB)": 141.16, "step": 119080, "train_speed(iter/s)": 0.289481 }, { "acc": 0.72553635, "epoch": 1.3321272231830403, "grad_norm": 8.3125, "learning_rate": 2.751152904196068e-06, "loss": 1.1131753, "memory(GiB)": 141.16, "step": 119100, "train_speed(iter/s)": 0.289498 }, { "acc": 0.72785273, "epoch": 1.3323509221289989, "grad_norm": 8.875, "learning_rate": 2.74950125765208e-06, "loss": 1.10260696, "memory(GiB)": 141.16, "step": 119120, "train_speed(iter/s)": 0.289515 }, { "acc": 0.73626986, "epoch": 1.3325746210749574, "grad_norm": 7.25, "learning_rate": 2.747849919008487e-06, "loss": 1.05151205, "memory(GiB)": 141.16, "step": 119140, "train_speed(iter/s)": 0.289531 }, { "acc": 0.73594956, "epoch": 1.332798320020916, "grad_norm": 7.28125, "learning_rate": 2.746198888491213e-06, "loss": 1.06404591, "memory(GiB)": 141.16, "step": 119160, "train_speed(iter/s)": 0.289547 }, { "acc": 0.72574511, "epoch": 1.3330220189668744, "grad_norm": 6.4375, "learning_rate": 2.7445481663261477e-06, "loss": 1.10293989, "memory(GiB)": 141.16, "step": 119180, "train_speed(iter/s)": 0.289562 }, { "acc": 0.74948583, "epoch": 1.333245717912833, "grad_norm": 7.84375, "learning_rate": 2.742897752739129e-06, "loss": 0.99431915, "memory(GiB)": 141.16, "step": 119200, "train_speed(iter/s)": 0.289578 }, { "acc": 0.73316126, "epoch": 1.3334694168587915, "grad_norm": 8.125, "learning_rate": 2.741247647955961e-06, "loss": 1.07770596, "memory(GiB)": 141.16, "step": 119220, "train_speed(iter/s)": 0.289594 }, { "acc": 0.73869305, "epoch": 1.33369311580475, "grad_norm": 7.34375, "learning_rate": 2.7395978522023996e-06, "loss": 1.06283913, "memory(GiB)": 141.16, "step": 119240, "train_speed(iter/s)": 0.28961 }, { "acc": 0.7354178, "epoch": 1.3339168147507086, "grad_norm": 8.4375, "learning_rate": 2.737948365704159e-06, "loss": 1.07402477, "memory(GiB)": 141.16, "step": 119260, "train_speed(iter/s)": 0.289626 }, { "acc": 0.72211189, "epoch": 1.334140513696667, "grad_norm": 6.71875, "learning_rate": 2.736299188686916e-06, "loss": 1.11813555, "memory(GiB)": 141.16, "step": 119280, "train_speed(iter/s)": 0.289642 }, { "acc": 0.73766589, "epoch": 1.3343642126426256, "grad_norm": 6.21875, "learning_rate": 2.7346503213762977e-06, "loss": 1.05245323, "memory(GiB)": 141.16, "step": 119300, "train_speed(iter/s)": 0.28966 }, { "acc": 0.73081684, "epoch": 1.3345879115885841, "grad_norm": 7.71875, "learning_rate": 2.7330017639978968e-06, "loss": 1.06816607, "memory(GiB)": 141.16, "step": 119320, "train_speed(iter/s)": 0.289678 }, { "acc": 0.73915453, "epoch": 1.3348116105345427, "grad_norm": 7.25, "learning_rate": 2.7313535167772575e-06, "loss": 1.06459503, "memory(GiB)": 141.16, "step": 119340, "train_speed(iter/s)": 0.289693 }, { "acc": 0.73329773, "epoch": 1.3350353094805012, "grad_norm": 7.65625, "learning_rate": 2.729705579939881e-06, "loss": 1.08687077, "memory(GiB)": 141.16, "step": 119360, "train_speed(iter/s)": 0.289708 }, { "acc": 0.72818871, "epoch": 1.3352590084264597, "grad_norm": 5.6875, "learning_rate": 2.728057953711234e-06, "loss": 1.11998196, "memory(GiB)": 141.16, "step": 119380, "train_speed(iter/s)": 0.289724 }, { "acc": 0.73788137, "epoch": 1.3354827073724183, "grad_norm": 7.34375, "learning_rate": 2.72641063831673e-06, "loss": 1.05385113, "memory(GiB)": 141.16, "step": 119400, "train_speed(iter/s)": 0.28974 }, { "acc": 0.74558978, "epoch": 1.3357064063183768, "grad_norm": 5.59375, "learning_rate": 2.7247636339817496e-06, "loss": 1.03093691, "memory(GiB)": 141.16, "step": 119420, "train_speed(iter/s)": 0.289756 }, { "acc": 0.73381252, "epoch": 1.3359301052643353, "grad_norm": 6.3125, "learning_rate": 2.723116940931625e-06, "loss": 1.06944122, "memory(GiB)": 141.16, "step": 119440, "train_speed(iter/s)": 0.289773 }, { "acc": 0.72143106, "epoch": 1.3361538042102938, "grad_norm": 6.84375, "learning_rate": 2.7214705593916453e-06, "loss": 1.12202969, "memory(GiB)": 141.16, "step": 119460, "train_speed(iter/s)": 0.289787 }, { "acc": 0.73347397, "epoch": 1.3363775031562524, "grad_norm": 6.53125, "learning_rate": 2.719824489587062e-06, "loss": 1.06486969, "memory(GiB)": 141.16, "step": 119480, "train_speed(iter/s)": 0.289804 }, { "acc": 0.7382967, "epoch": 1.336601202102211, "grad_norm": 6.84375, "learning_rate": 2.7181787317430784e-06, "loss": 1.04311142, "memory(GiB)": 141.16, "step": 119500, "train_speed(iter/s)": 0.289822 }, { "acc": 0.73485098, "epoch": 1.3368249010481694, "grad_norm": 6.5, "learning_rate": 2.716533286084861e-06, "loss": 1.06943798, "memory(GiB)": 141.16, "step": 119520, "train_speed(iter/s)": 0.289838 }, { "acc": 0.7275239, "epoch": 1.337048599994128, "grad_norm": 7.8125, "learning_rate": 2.7148881528375282e-06, "loss": 1.09688454, "memory(GiB)": 141.16, "step": 119540, "train_speed(iter/s)": 0.289855 }, { "acc": 0.74276791, "epoch": 1.3372722989400865, "grad_norm": 6.375, "learning_rate": 2.7132433322261554e-06, "loss": 1.02706308, "memory(GiB)": 141.16, "step": 119560, "train_speed(iter/s)": 0.289867 }, { "acc": 0.72259455, "epoch": 1.337495997886045, "grad_norm": 6.53125, "learning_rate": 2.7115988244757847e-06, "loss": 1.11760035, "memory(GiB)": 141.16, "step": 119580, "train_speed(iter/s)": 0.289882 }, { "acc": 0.73699656, "epoch": 1.3377196968320035, "grad_norm": 7.0625, "learning_rate": 2.7099546298113986e-06, "loss": 1.04891214, "memory(GiB)": 141.16, "step": 119600, "train_speed(iter/s)": 0.289898 }, { "acc": 0.7278966, "epoch": 1.337943395777962, "grad_norm": 7.875, "learning_rate": 2.7083107484579547e-06, "loss": 1.08638859, "memory(GiB)": 141.16, "step": 119620, "train_speed(iter/s)": 0.289913 }, { "acc": 0.73877659, "epoch": 1.3381670947239206, "grad_norm": 7.4375, "learning_rate": 2.7066671806403533e-06, "loss": 1.05506039, "memory(GiB)": 141.16, "step": 119640, "train_speed(iter/s)": 0.289929 }, { "acc": 0.73812585, "epoch": 1.3383907936698791, "grad_norm": 7.84375, "learning_rate": 2.705023926583463e-06, "loss": 1.04113674, "memory(GiB)": 141.16, "step": 119660, "train_speed(iter/s)": 0.289944 }, { "acc": 0.72500606, "epoch": 1.3386144926158376, "grad_norm": 5.84375, "learning_rate": 2.703380986512103e-06, "loss": 1.11532726, "memory(GiB)": 141.16, "step": 119680, "train_speed(iter/s)": 0.28996 }, { "acc": 0.73827868, "epoch": 1.3388381915617962, "grad_norm": 7.5, "learning_rate": 2.7017383606510483e-06, "loss": 1.05289955, "memory(GiB)": 141.16, "step": 119700, "train_speed(iter/s)": 0.289976 }, { "acc": 0.73840256, "epoch": 1.3390618905077547, "grad_norm": 6.21875, "learning_rate": 2.7000960492250365e-06, "loss": 1.04985943, "memory(GiB)": 141.16, "step": 119720, "train_speed(iter/s)": 0.289992 }, { "acc": 0.73889914, "epoch": 1.3392855894537132, "grad_norm": 7.0625, "learning_rate": 2.6984540524587576e-06, "loss": 1.05136185, "memory(GiB)": 141.16, "step": 119740, "train_speed(iter/s)": 0.290009 }, { "acc": 0.73252478, "epoch": 1.3395092883996718, "grad_norm": 8.625, "learning_rate": 2.6968123705768624e-06, "loss": 1.08499794, "memory(GiB)": 141.16, "step": 119760, "train_speed(iter/s)": 0.290026 }, { "acc": 0.74574914, "epoch": 1.3397329873456303, "grad_norm": 6.3125, "learning_rate": 2.6951710038039545e-06, "loss": 1.02021179, "memory(GiB)": 141.16, "step": 119780, "train_speed(iter/s)": 0.290042 }, { "acc": 0.74602056, "epoch": 1.3399566862915888, "grad_norm": 9.25, "learning_rate": 2.693529952364595e-06, "loss": 1.0152317, "memory(GiB)": 141.16, "step": 119800, "train_speed(iter/s)": 0.29006 }, { "acc": 0.73698254, "epoch": 1.3401803852375473, "grad_norm": 6.53125, "learning_rate": 2.6918892164833075e-06, "loss": 1.04807892, "memory(GiB)": 141.16, "step": 119820, "train_speed(iter/s)": 0.290079 }, { "acc": 0.74035435, "epoch": 1.3404040841835059, "grad_norm": 7.8125, "learning_rate": 2.690248796384564e-06, "loss": 1.01376419, "memory(GiB)": 141.16, "step": 119840, "train_speed(iter/s)": 0.290096 }, { "acc": 0.72081156, "epoch": 1.3406277831294644, "grad_norm": 5.75, "learning_rate": 2.6886086922928012e-06, "loss": 1.12401829, "memory(GiB)": 141.16, "step": 119860, "train_speed(iter/s)": 0.290114 }, { "acc": 0.73110409, "epoch": 1.340851482075423, "grad_norm": 9.3125, "learning_rate": 2.686968904432406e-06, "loss": 1.0721591, "memory(GiB)": 141.16, "step": 119880, "train_speed(iter/s)": 0.29013 }, { "acc": 0.72508183, "epoch": 1.3410751810213815, "grad_norm": 6.8125, "learning_rate": 2.6853294330277237e-06, "loss": 1.10733776, "memory(GiB)": 141.16, "step": 119900, "train_speed(iter/s)": 0.290148 }, { "acc": 0.73312874, "epoch": 1.34129887996734, "grad_norm": 5.6875, "learning_rate": 2.6836902783030615e-06, "loss": 1.06391315, "memory(GiB)": 141.16, "step": 119920, "train_speed(iter/s)": 0.290164 }, { "acc": 0.7294642, "epoch": 1.3415225789132985, "grad_norm": 6.25, "learning_rate": 2.6820514404826747e-06, "loss": 1.08943224, "memory(GiB)": 141.16, "step": 119940, "train_speed(iter/s)": 0.29018 }, { "acc": 0.72988515, "epoch": 1.341746277859257, "grad_norm": 7.53125, "learning_rate": 2.6804129197907833e-06, "loss": 1.08283501, "memory(GiB)": 141.16, "step": 119960, "train_speed(iter/s)": 0.290197 }, { "acc": 0.726366, "epoch": 1.3419699768052156, "grad_norm": 5.96875, "learning_rate": 2.6787747164515603e-06, "loss": 1.08063574, "memory(GiB)": 141.16, "step": 119980, "train_speed(iter/s)": 0.290214 }, { "acc": 0.74898911, "epoch": 1.342193675751174, "grad_norm": 8.1875, "learning_rate": 2.6771368306891318e-06, "loss": 0.99783325, "memory(GiB)": 141.16, "step": 120000, "train_speed(iter/s)": 0.290231 }, { "epoch": 1.342193675751174, "eval_acc": 0.690148931110488, "eval_loss": 1.079148769378662, "eval_runtime": 2325.3721, "eval_samples_per_second": 32.375, "eval_steps_per_second": 16.188, "step": 120000 }, { "acc": 0.73606329, "epoch": 1.3424173746971326, "grad_norm": 6.40625, "learning_rate": 2.675499262727588e-06, "loss": 1.05852852, "memory(GiB)": 141.16, "step": 120020, "train_speed(iter/s)": 0.288588 }, { "acc": 0.73523378, "epoch": 1.3426410736430912, "grad_norm": 7.84375, "learning_rate": 2.6738620127909676e-06, "loss": 1.07326813, "memory(GiB)": 141.16, "step": 120040, "train_speed(iter/s)": 0.288604 }, { "acc": 0.74132819, "epoch": 1.3428647725890497, "grad_norm": 7.75, "learning_rate": 2.6722250811032735e-06, "loss": 1.03491859, "memory(GiB)": 141.16, "step": 120060, "train_speed(iter/s)": 0.288621 }, { "acc": 0.74423881, "epoch": 1.3430884715350082, "grad_norm": 8.625, "learning_rate": 2.670588467888461e-06, "loss": 1.03218079, "memory(GiB)": 141.16, "step": 120080, "train_speed(iter/s)": 0.288638 }, { "acc": 0.73207297, "epoch": 1.3433121704809667, "grad_norm": 6.6875, "learning_rate": 2.6689521733704382e-06, "loss": 1.07012959, "memory(GiB)": 141.16, "step": 120100, "train_speed(iter/s)": 0.288655 }, { "acc": 0.74457407, "epoch": 1.3435358694269253, "grad_norm": 5.84375, "learning_rate": 2.667316197773079e-06, "loss": 1.0075119, "memory(GiB)": 141.16, "step": 120120, "train_speed(iter/s)": 0.288672 }, { "acc": 0.74386497, "epoch": 1.3437595683728838, "grad_norm": 5.9375, "learning_rate": 2.6656805413202036e-06, "loss": 1.03876343, "memory(GiB)": 141.16, "step": 120140, "train_speed(iter/s)": 0.288688 }, { "acc": 0.74373002, "epoch": 1.3439832673188423, "grad_norm": 7.6875, "learning_rate": 2.664045204235597e-06, "loss": 1.01476412, "memory(GiB)": 141.16, "step": 120160, "train_speed(iter/s)": 0.288705 }, { "acc": 0.73314362, "epoch": 1.3442069662648009, "grad_norm": 9.25, "learning_rate": 2.662410186742995e-06, "loss": 1.07195339, "memory(GiB)": 141.16, "step": 120180, "train_speed(iter/s)": 0.28872 }, { "acc": 0.71923733, "epoch": 1.3444306652107594, "grad_norm": 4.6875, "learning_rate": 2.6607754890660892e-06, "loss": 1.14061098, "memory(GiB)": 141.16, "step": 120200, "train_speed(iter/s)": 0.288735 }, { "acc": 0.72056522, "epoch": 1.344654364156718, "grad_norm": 6.6875, "learning_rate": 2.6591411114285337e-06, "loss": 1.11862793, "memory(GiB)": 141.16, "step": 120220, "train_speed(iter/s)": 0.288751 }, { "acc": 0.7417901, "epoch": 1.3448780631026764, "grad_norm": 8.6875, "learning_rate": 2.657507054053931e-06, "loss": 1.03294754, "memory(GiB)": 141.16, "step": 120240, "train_speed(iter/s)": 0.288767 }, { "acc": 0.73825998, "epoch": 1.345101762048635, "grad_norm": 5.84375, "learning_rate": 2.6558733171658473e-06, "loss": 1.03789005, "memory(GiB)": 141.16, "step": 120260, "train_speed(iter/s)": 0.288784 }, { "acc": 0.73727283, "epoch": 1.3453254609945935, "grad_norm": 7.25, "learning_rate": 2.654239900987799e-06, "loss": 1.05891647, "memory(GiB)": 141.16, "step": 120280, "train_speed(iter/s)": 0.2888 }, { "acc": 0.72526474, "epoch": 1.345549159940552, "grad_norm": 7.5625, "learning_rate": 2.6526068057432585e-06, "loss": 1.11082802, "memory(GiB)": 141.16, "step": 120300, "train_speed(iter/s)": 0.288817 }, { "acc": 0.73358402, "epoch": 1.3457728588865105, "grad_norm": 8.5, "learning_rate": 2.6509740316556616e-06, "loss": 1.06932516, "memory(GiB)": 141.16, "step": 120320, "train_speed(iter/s)": 0.288835 }, { "acc": 0.74214821, "epoch": 1.345996557832469, "grad_norm": 5.875, "learning_rate": 2.6493415789483902e-06, "loss": 1.021138, "memory(GiB)": 141.16, "step": 120340, "train_speed(iter/s)": 0.28885 }, { "acc": 0.75434532, "epoch": 1.3462202567784276, "grad_norm": 7.125, "learning_rate": 2.647709447844792e-06, "loss": 0.9617569, "memory(GiB)": 141.16, "step": 120360, "train_speed(iter/s)": 0.288868 }, { "acc": 0.7233057, "epoch": 1.3464439557243861, "grad_norm": 6.46875, "learning_rate": 2.646077638568162e-06, "loss": 1.10704098, "memory(GiB)": 141.16, "step": 120380, "train_speed(iter/s)": 0.288882 }, { "acc": 0.73503976, "epoch": 1.3466676546703447, "grad_norm": 8.5, "learning_rate": 2.644446151341755e-06, "loss": 1.05754776, "memory(GiB)": 141.16, "step": 120400, "train_speed(iter/s)": 0.288898 }, { "acc": 0.72623491, "epoch": 1.3468913536163032, "grad_norm": 5.21875, "learning_rate": 2.6428149863887854e-06, "loss": 1.10328007, "memory(GiB)": 141.16, "step": 120420, "train_speed(iter/s)": 0.288913 }, { "acc": 0.72240958, "epoch": 1.3471150525622617, "grad_norm": 7.90625, "learning_rate": 2.641184143932416e-06, "loss": 1.13411512, "memory(GiB)": 141.16, "step": 120440, "train_speed(iter/s)": 0.28893 }, { "acc": 0.75874109, "epoch": 1.3473387515082202, "grad_norm": 8.0625, "learning_rate": 2.639553624195772e-06, "loss": 0.94474773, "memory(GiB)": 141.16, "step": 120460, "train_speed(iter/s)": 0.288946 }, { "acc": 0.73382845, "epoch": 1.3475624504541788, "grad_norm": 8.0625, "learning_rate": 2.6379234274019313e-06, "loss": 1.06078529, "memory(GiB)": 141.16, "step": 120480, "train_speed(iter/s)": 0.288962 }, { "acc": 0.72233686, "epoch": 1.3477861494001373, "grad_norm": 7.75, "learning_rate": 2.6362935537739254e-06, "loss": 1.10789223, "memory(GiB)": 141.16, "step": 120500, "train_speed(iter/s)": 0.288977 }, { "acc": 0.73245168, "epoch": 1.3480098483460958, "grad_norm": 6.40625, "learning_rate": 2.6346640035347483e-06, "loss": 1.06858034, "memory(GiB)": 141.16, "step": 120520, "train_speed(iter/s)": 0.288994 }, { "acc": 0.73326111, "epoch": 1.3482335472920544, "grad_norm": 6.5, "learning_rate": 2.633034776907342e-06, "loss": 1.06328926, "memory(GiB)": 141.16, "step": 120540, "train_speed(iter/s)": 0.289009 }, { "acc": 0.7339067, "epoch": 1.3484572462380129, "grad_norm": 6.78125, "learning_rate": 2.631405874114612e-06, "loss": 1.08140087, "memory(GiB)": 141.16, "step": 120560, "train_speed(iter/s)": 0.289024 }, { "acc": 0.73826051, "epoch": 1.3486809451839714, "grad_norm": 5.53125, "learning_rate": 2.629777295379414e-06, "loss": 1.04148197, "memory(GiB)": 141.16, "step": 120580, "train_speed(iter/s)": 0.28904 }, { "acc": 0.73939152, "epoch": 1.34890464412993, "grad_norm": 7.84375, "learning_rate": 2.628149040924558e-06, "loss": 1.0406435, "memory(GiB)": 141.16, "step": 120600, "train_speed(iter/s)": 0.289057 }, { "acc": 0.74378386, "epoch": 1.3491283430758885, "grad_norm": 7.90625, "learning_rate": 2.626521110972816e-06, "loss": 1.02396364, "memory(GiB)": 141.16, "step": 120620, "train_speed(iter/s)": 0.289075 }, { "acc": 0.74512925, "epoch": 1.349352042021847, "grad_norm": 7.125, "learning_rate": 2.62489350574691e-06, "loss": 1.00051022, "memory(GiB)": 141.16, "step": 120640, "train_speed(iter/s)": 0.289093 }, { "acc": 0.73556271, "epoch": 1.3495757409678055, "grad_norm": 8.875, "learning_rate": 2.623266225469522e-06, "loss": 1.06066513, "memory(GiB)": 141.16, "step": 120660, "train_speed(iter/s)": 0.289109 }, { "acc": 0.73184347, "epoch": 1.349799439913764, "grad_norm": 7.125, "learning_rate": 2.621639270363285e-06, "loss": 1.07821741, "memory(GiB)": 141.16, "step": 120680, "train_speed(iter/s)": 0.289126 }, { "acc": 0.74779606, "epoch": 1.3500231388597226, "grad_norm": 9.9375, "learning_rate": 2.62001264065079e-06, "loss": 1.01210642, "memory(GiB)": 141.16, "step": 120700, "train_speed(iter/s)": 0.289143 }, { "acc": 0.73723183, "epoch": 1.350246837805681, "grad_norm": 6.03125, "learning_rate": 2.618386336554584e-06, "loss": 1.05046825, "memory(GiB)": 141.16, "step": 120720, "train_speed(iter/s)": 0.28916 }, { "acc": 0.72992034, "epoch": 1.3504705367516396, "grad_norm": 7.46875, "learning_rate": 2.616760358297167e-06, "loss": 1.09031229, "memory(GiB)": 141.16, "step": 120740, "train_speed(iter/s)": 0.289177 }, { "acc": 0.74202824, "epoch": 1.3506942356975982, "grad_norm": 7.59375, "learning_rate": 2.615134706101001e-06, "loss": 1.01985168, "memory(GiB)": 141.16, "step": 120760, "train_speed(iter/s)": 0.289193 }, { "acc": 0.73405933, "epoch": 1.3509179346435567, "grad_norm": 7.6875, "learning_rate": 2.6135093801884913e-06, "loss": 1.06122389, "memory(GiB)": 141.16, "step": 120780, "train_speed(iter/s)": 0.289208 }, { "acc": 0.72095633, "epoch": 1.3511416335895152, "grad_norm": 5.84375, "learning_rate": 2.6118843807820118e-06, "loss": 1.11493549, "memory(GiB)": 141.16, "step": 120800, "train_speed(iter/s)": 0.289224 }, { "acc": 0.73431797, "epoch": 1.3513653325354738, "grad_norm": 7.71875, "learning_rate": 2.6102597081038816e-06, "loss": 1.07399158, "memory(GiB)": 141.16, "step": 120820, "train_speed(iter/s)": 0.289241 }, { "acc": 0.73084464, "epoch": 1.3515890314814323, "grad_norm": 7.4375, "learning_rate": 2.6086353623763796e-06, "loss": 1.07211266, "memory(GiB)": 141.16, "step": 120840, "train_speed(iter/s)": 0.289258 }, { "acc": 0.72963142, "epoch": 1.3518127304273908, "grad_norm": 6.78125, "learning_rate": 2.6070113438217413e-06, "loss": 1.08642445, "memory(GiB)": 141.16, "step": 120860, "train_speed(iter/s)": 0.289275 }, { "acc": 0.74244728, "epoch": 1.3520364293733493, "grad_norm": 6.09375, "learning_rate": 2.6053876526621546e-06, "loss": 1.02920446, "memory(GiB)": 141.16, "step": 120880, "train_speed(iter/s)": 0.289293 }, { "acc": 0.73839579, "epoch": 1.3522601283193079, "grad_norm": 7.59375, "learning_rate": 2.6037642891197644e-06, "loss": 1.04440975, "memory(GiB)": 141.16, "step": 120900, "train_speed(iter/s)": 0.289311 }, { "acc": 0.74496965, "epoch": 1.3524838272652664, "grad_norm": 8.0625, "learning_rate": 2.60214125341667e-06, "loss": 1.00959988, "memory(GiB)": 141.16, "step": 120920, "train_speed(iter/s)": 0.289327 }, { "acc": 0.73396196, "epoch": 1.352707526211225, "grad_norm": 7.5625, "learning_rate": 2.600518545774924e-06, "loss": 1.06698885, "memory(GiB)": 141.16, "step": 120940, "train_speed(iter/s)": 0.289344 }, { "acc": 0.72168522, "epoch": 1.3529312251571834, "grad_norm": 6.125, "learning_rate": 2.598896166416539e-06, "loss": 1.13769093, "memory(GiB)": 141.16, "step": 120960, "train_speed(iter/s)": 0.289358 }, { "acc": 0.7363327, "epoch": 1.353154924103142, "grad_norm": 7.09375, "learning_rate": 2.5972741155634763e-06, "loss": 1.03665419, "memory(GiB)": 141.16, "step": 120980, "train_speed(iter/s)": 0.289375 }, { "acc": 0.74886026, "epoch": 1.3533786230491005, "grad_norm": 8.375, "learning_rate": 2.595652393437659e-06, "loss": 0.99698238, "memory(GiB)": 141.16, "step": 121000, "train_speed(iter/s)": 0.28939 }, { "acc": 0.72530513, "epoch": 1.353602321995059, "grad_norm": 5.53125, "learning_rate": 2.59403100026096e-06, "loss": 1.11254578, "memory(GiB)": 141.16, "step": 121020, "train_speed(iter/s)": 0.289403 }, { "acc": 0.727842, "epoch": 1.3538260209410176, "grad_norm": 6.28125, "learning_rate": 2.5924099362552085e-06, "loss": 1.09975185, "memory(GiB)": 141.16, "step": 121040, "train_speed(iter/s)": 0.28942 }, { "acc": 0.7283102, "epoch": 1.354049719886976, "grad_norm": 7.5625, "learning_rate": 2.590789201642192e-06, "loss": 1.08471756, "memory(GiB)": 141.16, "step": 121060, "train_speed(iter/s)": 0.289436 }, { "acc": 0.75758953, "epoch": 1.3542734188329346, "grad_norm": 9.5, "learning_rate": 2.589168796643645e-06, "loss": 0.94492798, "memory(GiB)": 141.16, "step": 121080, "train_speed(iter/s)": 0.289454 }, { "acc": 0.72620239, "epoch": 1.3544971177788931, "grad_norm": 7.6875, "learning_rate": 2.587548721481269e-06, "loss": 1.10801315, "memory(GiB)": 141.16, "step": 121100, "train_speed(iter/s)": 0.289471 }, { "acc": 0.7311522, "epoch": 1.3547208167248517, "grad_norm": 7.40625, "learning_rate": 2.5859289763767088e-06, "loss": 1.0824687, "memory(GiB)": 141.16, "step": 121120, "train_speed(iter/s)": 0.289487 }, { "acc": 0.74611673, "epoch": 1.3549445156708102, "grad_norm": 7.375, "learning_rate": 2.5843095615515678e-06, "loss": 1.00420055, "memory(GiB)": 141.16, "step": 121140, "train_speed(iter/s)": 0.289502 }, { "acc": 0.73567958, "epoch": 1.3551682146167687, "grad_norm": 7.125, "learning_rate": 2.582690477227409e-06, "loss": 1.07926884, "memory(GiB)": 141.16, "step": 121160, "train_speed(iter/s)": 0.289519 }, { "acc": 0.7281188, "epoch": 1.3553919135627273, "grad_norm": 7.8125, "learning_rate": 2.581071723625742e-06, "loss": 1.09045153, "memory(GiB)": 141.16, "step": 121180, "train_speed(iter/s)": 0.289536 }, { "acc": 0.73432064, "epoch": 1.3556156125086858, "grad_norm": 6.9375, "learning_rate": 2.57945330096804e-06, "loss": 1.06505737, "memory(GiB)": 141.16, "step": 121200, "train_speed(iter/s)": 0.28955 }, { "acc": 0.73481016, "epoch": 1.3558393114546443, "grad_norm": 6.75, "learning_rate": 2.577835209475724e-06, "loss": 1.07348804, "memory(GiB)": 141.16, "step": 121220, "train_speed(iter/s)": 0.289564 }, { "acc": 0.7441349, "epoch": 1.3560630104006028, "grad_norm": 8.5, "learning_rate": 2.5762174493701696e-06, "loss": 1.03538237, "memory(GiB)": 141.16, "step": 121240, "train_speed(iter/s)": 0.289581 }, { "acc": 0.73462963, "epoch": 1.3562867093465614, "grad_norm": 7.28125, "learning_rate": 2.5746000208727145e-06, "loss": 1.05540714, "memory(GiB)": 141.16, "step": 121260, "train_speed(iter/s)": 0.289596 }, { "acc": 0.73698673, "epoch": 1.35651040829252, "grad_norm": 7.6875, "learning_rate": 2.572982924204641e-06, "loss": 1.04040012, "memory(GiB)": 141.16, "step": 121280, "train_speed(iter/s)": 0.289612 }, { "acc": 0.73564072, "epoch": 1.3567341072384784, "grad_norm": 10.0625, "learning_rate": 2.5713661595871965e-06, "loss": 1.05028667, "memory(GiB)": 141.16, "step": 121300, "train_speed(iter/s)": 0.289628 }, { "acc": 0.72777438, "epoch": 1.356957806184437, "grad_norm": 8.8125, "learning_rate": 2.569749727241574e-06, "loss": 1.09279995, "memory(GiB)": 141.16, "step": 121320, "train_speed(iter/s)": 0.289643 }, { "acc": 0.73200083, "epoch": 1.3571815051303955, "grad_norm": 7.3125, "learning_rate": 2.5681336273889225e-06, "loss": 1.08460274, "memory(GiB)": 141.16, "step": 121340, "train_speed(iter/s)": 0.289658 }, { "acc": 0.73197069, "epoch": 1.357405204076354, "grad_norm": 8.1875, "learning_rate": 2.5665178602503528e-06, "loss": 1.06825228, "memory(GiB)": 141.16, "step": 121360, "train_speed(iter/s)": 0.289675 }, { "acc": 0.73971443, "epoch": 1.3576289030223125, "grad_norm": 7.03125, "learning_rate": 2.56490242604692e-06, "loss": 1.05329113, "memory(GiB)": 141.16, "step": 121380, "train_speed(iter/s)": 0.28969 }, { "acc": 0.7351234, "epoch": 1.357852601968271, "grad_norm": 6.375, "learning_rate": 2.563287324999643e-06, "loss": 1.06622343, "memory(GiB)": 141.16, "step": 121400, "train_speed(iter/s)": 0.289706 }, { "acc": 0.73447065, "epoch": 1.3580763009142296, "grad_norm": 7.09375, "learning_rate": 2.561672557329489e-06, "loss": 1.06385326, "memory(GiB)": 141.16, "step": 121420, "train_speed(iter/s)": 0.28972 }, { "acc": 0.73397026, "epoch": 1.3582999998601881, "grad_norm": 6.0, "learning_rate": 2.5600581232573782e-06, "loss": 1.08120642, "memory(GiB)": 141.16, "step": 121440, "train_speed(iter/s)": 0.289738 }, { "acc": 0.73770781, "epoch": 1.3585236988061467, "grad_norm": 10.0, "learning_rate": 2.558444023004193e-06, "loss": 1.04635277, "memory(GiB)": 141.16, "step": 121460, "train_speed(iter/s)": 0.289754 }, { "acc": 0.73144603, "epoch": 1.3587473977521052, "grad_norm": 8.0, "learning_rate": 2.5568302567907623e-06, "loss": 1.08456097, "memory(GiB)": 141.16, "step": 121480, "train_speed(iter/s)": 0.289771 }, { "acc": 0.7268816, "epoch": 1.3589710966980637, "grad_norm": 6.90625, "learning_rate": 2.5552168248378737e-06, "loss": 1.09272594, "memory(GiB)": 141.16, "step": 121500, "train_speed(iter/s)": 0.289788 }, { "acc": 0.73995857, "epoch": 1.3591947956440222, "grad_norm": 7.9375, "learning_rate": 2.5536037273662686e-06, "loss": 1.05897484, "memory(GiB)": 141.16, "step": 121520, "train_speed(iter/s)": 0.289803 }, { "acc": 0.73963766, "epoch": 1.3594184945899808, "grad_norm": 8.25, "learning_rate": 2.551990964596639e-06, "loss": 1.04647036, "memory(GiB)": 141.16, "step": 121540, "train_speed(iter/s)": 0.289818 }, { "acc": 0.7376152, "epoch": 1.3596421935359393, "grad_norm": 8.375, "learning_rate": 2.550378536749637e-06, "loss": 1.04189348, "memory(GiB)": 141.16, "step": 121560, "train_speed(iter/s)": 0.289834 }, { "acc": 0.7403975, "epoch": 1.3598658924818978, "grad_norm": 6.25, "learning_rate": 2.548766444045862e-06, "loss": 1.03262234, "memory(GiB)": 141.16, "step": 121580, "train_speed(iter/s)": 0.289849 }, { "acc": 0.73384051, "epoch": 1.3600895914278563, "grad_norm": 5.375, "learning_rate": 2.5471546867058763e-06, "loss": 1.05698547, "memory(GiB)": 141.16, "step": 121600, "train_speed(iter/s)": 0.289866 }, { "acc": 0.73637929, "epoch": 1.3603132903738149, "grad_norm": 9.125, "learning_rate": 2.5455432649501883e-06, "loss": 1.06318798, "memory(GiB)": 141.16, "step": 121620, "train_speed(iter/s)": 0.289882 }, { "acc": 0.73697968, "epoch": 1.3605369893197734, "grad_norm": 6.90625, "learning_rate": 2.543932178999262e-06, "loss": 1.06101933, "memory(GiB)": 141.16, "step": 121640, "train_speed(iter/s)": 0.289898 }, { "acc": 0.73117666, "epoch": 1.360760688265732, "grad_norm": 8.8125, "learning_rate": 2.542321429073521e-06, "loss": 1.08510189, "memory(GiB)": 141.16, "step": 121660, "train_speed(iter/s)": 0.289913 }, { "acc": 0.74159784, "epoch": 1.3609843872116905, "grad_norm": 7.34375, "learning_rate": 2.5407110153933345e-06, "loss": 1.03048096, "memory(GiB)": 141.16, "step": 121680, "train_speed(iter/s)": 0.289928 }, { "acc": 0.72572803, "epoch": 1.361208086157649, "grad_norm": 8.375, "learning_rate": 2.539100938179035e-06, "loss": 1.12645607, "memory(GiB)": 141.16, "step": 121700, "train_speed(iter/s)": 0.289942 }, { "acc": 0.72965984, "epoch": 1.3614317851036075, "grad_norm": 7.875, "learning_rate": 2.5374911976509008e-06, "loss": 1.07987728, "memory(GiB)": 141.16, "step": 121720, "train_speed(iter/s)": 0.289958 }, { "acc": 0.73195, "epoch": 1.361655484049566, "grad_norm": 7.40625, "learning_rate": 2.5358817940291667e-06, "loss": 1.07628822, "memory(GiB)": 141.16, "step": 121740, "train_speed(iter/s)": 0.289974 }, { "acc": 0.7333189, "epoch": 1.3618791829955246, "grad_norm": 6.65625, "learning_rate": 2.5342727275340258e-06, "loss": 1.07357216, "memory(GiB)": 141.16, "step": 121760, "train_speed(iter/s)": 0.289991 }, { "acc": 0.74402857, "epoch": 1.362102881941483, "grad_norm": 7.1875, "learning_rate": 2.532663998385617e-06, "loss": 1.02781429, "memory(GiB)": 141.16, "step": 121780, "train_speed(iter/s)": 0.290007 }, { "acc": 0.72681112, "epoch": 1.3623265808874416, "grad_norm": 8.5, "learning_rate": 2.531055606804041e-06, "loss": 1.09492168, "memory(GiB)": 141.16, "step": 121800, "train_speed(iter/s)": 0.290022 }, { "acc": 0.73180122, "epoch": 1.3625502798334002, "grad_norm": 7.9375, "learning_rate": 2.5294475530093477e-06, "loss": 1.07605038, "memory(GiB)": 141.16, "step": 121820, "train_speed(iter/s)": 0.290037 }, { "acc": 0.72939348, "epoch": 1.3627739787793587, "grad_norm": 7.25, "learning_rate": 2.5278398372215395e-06, "loss": 1.10657969, "memory(GiB)": 141.16, "step": 121840, "train_speed(iter/s)": 0.290053 }, { "acc": 0.73931966, "epoch": 1.3629976777253172, "grad_norm": 6.84375, "learning_rate": 2.526232459660578e-06, "loss": 1.04230547, "memory(GiB)": 141.16, "step": 121860, "train_speed(iter/s)": 0.29007 }, { "acc": 0.73854861, "epoch": 1.3632213766712757, "grad_norm": 7.15625, "learning_rate": 2.5246254205463738e-06, "loss": 1.0445013, "memory(GiB)": 141.16, "step": 121880, "train_speed(iter/s)": 0.290088 }, { "acc": 0.73628893, "epoch": 1.3634450756172343, "grad_norm": 7.0625, "learning_rate": 2.5230187200987945e-06, "loss": 1.0625782, "memory(GiB)": 141.16, "step": 121900, "train_speed(iter/s)": 0.290103 }, { "acc": 0.74107475, "epoch": 1.3636687745631928, "grad_norm": 6.78125, "learning_rate": 2.5214123585376582e-06, "loss": 1.04396152, "memory(GiB)": 141.16, "step": 121920, "train_speed(iter/s)": 0.290119 }, { "acc": 0.72211113, "epoch": 1.3638924735091513, "grad_norm": 6.59375, "learning_rate": 2.519806336082739e-06, "loss": 1.10509453, "memory(GiB)": 141.16, "step": 121940, "train_speed(iter/s)": 0.290135 }, { "acc": 0.72583599, "epoch": 1.3641161724551099, "grad_norm": 6.0625, "learning_rate": 2.5182006529537626e-06, "loss": 1.10501709, "memory(GiB)": 141.16, "step": 121960, "train_speed(iter/s)": 0.290151 }, { "acc": 0.72355399, "epoch": 1.3643398714010684, "grad_norm": 6.34375, "learning_rate": 2.5165953093704088e-06, "loss": 1.11351728, "memory(GiB)": 141.16, "step": 121980, "train_speed(iter/s)": 0.290166 }, { "acc": 0.73088946, "epoch": 1.364563570347027, "grad_norm": 7.71875, "learning_rate": 2.5149903055523145e-06, "loss": 1.07622414, "memory(GiB)": 141.16, "step": 122000, "train_speed(iter/s)": 0.290184 }, { "epoch": 1.364563570347027, "eval_acc": 0.69013995911927, "eval_loss": 1.079160451889038, "eval_runtime": 2321.6083, "eval_samples_per_second": 32.427, "eval_steps_per_second": 16.214, "step": 122000 }, { "acc": 0.73886395, "epoch": 1.3647872692929854, "grad_norm": 7.78125, "learning_rate": 2.5133856417190635e-06, "loss": 1.036063, "memory(GiB)": 141.16, "step": 122020, "train_speed(iter/s)": 0.288572 }, { "acc": 0.73294792, "epoch": 1.365010968238944, "grad_norm": 8.75, "learning_rate": 2.5117813180901997e-06, "loss": 1.08007956, "memory(GiB)": 141.16, "step": 122040, "train_speed(iter/s)": 0.288589 }, { "acc": 0.72672682, "epoch": 1.3652346671849025, "grad_norm": 6.96875, "learning_rate": 2.510177334885217e-06, "loss": 1.10909958, "memory(GiB)": 141.16, "step": 122060, "train_speed(iter/s)": 0.288603 }, { "acc": 0.73662558, "epoch": 1.365458366130861, "grad_norm": 5.5625, "learning_rate": 2.508573692323561e-06, "loss": 1.06270342, "memory(GiB)": 141.16, "step": 122080, "train_speed(iter/s)": 0.28862 }, { "acc": 0.74009371, "epoch": 1.3656820650768196, "grad_norm": 6.59375, "learning_rate": 2.5069703906246362e-06, "loss": 1.03169518, "memory(GiB)": 141.16, "step": 122100, "train_speed(iter/s)": 0.288635 }, { "acc": 0.7376399, "epoch": 1.365905764022778, "grad_norm": 8.375, "learning_rate": 2.5053674300077935e-06, "loss": 1.05590057, "memory(GiB)": 141.16, "step": 122120, "train_speed(iter/s)": 0.288652 }, { "acc": 0.73249979, "epoch": 1.3661294629687366, "grad_norm": 13.5, "learning_rate": 2.503764810692345e-06, "loss": 1.09296808, "memory(GiB)": 141.16, "step": 122140, "train_speed(iter/s)": 0.288663 }, { "acc": 0.7380919, "epoch": 1.3663531619146951, "grad_norm": 7.0, "learning_rate": 2.5021625328975495e-06, "loss": 1.05246716, "memory(GiB)": 141.16, "step": 122160, "train_speed(iter/s)": 0.288678 }, { "acc": 0.7325357, "epoch": 1.3665768608606537, "grad_norm": 7.1875, "learning_rate": 2.5005605968426204e-06, "loss": 1.07716169, "memory(GiB)": 141.16, "step": 122180, "train_speed(iter/s)": 0.288694 }, { "acc": 0.73650179, "epoch": 1.3668005598066122, "grad_norm": 7.1875, "learning_rate": 2.498959002746729e-06, "loss": 1.04998722, "memory(GiB)": 141.16, "step": 122200, "train_speed(iter/s)": 0.288708 }, { "acc": 0.73341217, "epoch": 1.3670242587525707, "grad_norm": 6.65625, "learning_rate": 2.4973577508289914e-06, "loss": 1.06153889, "memory(GiB)": 141.16, "step": 122220, "train_speed(iter/s)": 0.288725 }, { "acc": 0.7445116, "epoch": 1.3672479576985292, "grad_norm": 8.5, "learning_rate": 2.495756841308487e-06, "loss": 1.03186359, "memory(GiB)": 141.16, "step": 122240, "train_speed(iter/s)": 0.28874 }, { "acc": 0.73848095, "epoch": 1.3674716566444878, "grad_norm": 7.34375, "learning_rate": 2.4941562744042403e-06, "loss": 1.02739649, "memory(GiB)": 141.16, "step": 122260, "train_speed(iter/s)": 0.288755 }, { "acc": 0.72071877, "epoch": 1.3676953555904463, "grad_norm": 4.6875, "learning_rate": 2.4925560503352303e-06, "loss": 1.11834679, "memory(GiB)": 141.16, "step": 122280, "train_speed(iter/s)": 0.288769 }, { "acc": 0.73723645, "epoch": 1.3679190545364048, "grad_norm": 8.375, "learning_rate": 2.490956169320394e-06, "loss": 1.05213127, "memory(GiB)": 141.16, "step": 122300, "train_speed(iter/s)": 0.288785 }, { "acc": 0.73493781, "epoch": 1.3681427534823634, "grad_norm": 7.59375, "learning_rate": 2.4893566315786143e-06, "loss": 1.05773621, "memory(GiB)": 141.16, "step": 122320, "train_speed(iter/s)": 0.288798 }, { "acc": 0.74675713, "epoch": 1.3683664524283219, "grad_norm": 6.78125, "learning_rate": 2.487757437328735e-06, "loss": 0.98702669, "memory(GiB)": 141.16, "step": 122340, "train_speed(iter/s)": 0.288813 }, { "acc": 0.72347479, "epoch": 1.3685901513742804, "grad_norm": 6.71875, "learning_rate": 2.486158586789546e-06, "loss": 1.09631014, "memory(GiB)": 141.16, "step": 122360, "train_speed(iter/s)": 0.288829 }, { "acc": 0.72897158, "epoch": 1.368813850320239, "grad_norm": 6.4375, "learning_rate": 2.484560080179792e-06, "loss": 1.0943924, "memory(GiB)": 141.16, "step": 122380, "train_speed(iter/s)": 0.288845 }, { "acc": 0.73393402, "epoch": 1.3690375492661975, "grad_norm": 5.59375, "learning_rate": 2.4829619177181747e-06, "loss": 1.05501804, "memory(GiB)": 141.16, "step": 122400, "train_speed(iter/s)": 0.288862 }, { "acc": 0.7392344, "epoch": 1.369261248212156, "grad_norm": 6.75, "learning_rate": 2.4813640996233417e-06, "loss": 1.0436224, "memory(GiB)": 141.16, "step": 122420, "train_speed(iter/s)": 0.288877 }, { "acc": 0.75181131, "epoch": 1.3694849471581145, "grad_norm": 8.125, "learning_rate": 2.4797666261139016e-06, "loss": 0.99172411, "memory(GiB)": 141.16, "step": 122440, "train_speed(iter/s)": 0.288892 }, { "acc": 0.72274208, "epoch": 1.369708646104073, "grad_norm": 5.90625, "learning_rate": 2.4781694974084093e-06, "loss": 1.12373533, "memory(GiB)": 141.16, "step": 122460, "train_speed(iter/s)": 0.288907 }, { "acc": 0.73793745, "epoch": 1.3699323450500316, "grad_norm": 8.75, "learning_rate": 2.476572713725373e-06, "loss": 1.04972639, "memory(GiB)": 141.16, "step": 122480, "train_speed(iter/s)": 0.288922 }, { "acc": 0.72813234, "epoch": 1.3701560439959901, "grad_norm": 6.375, "learning_rate": 2.4749762752832597e-06, "loss": 1.09634552, "memory(GiB)": 141.16, "step": 122500, "train_speed(iter/s)": 0.288936 }, { "acc": 0.72789397, "epoch": 1.3703797429419486, "grad_norm": 6.65625, "learning_rate": 2.473380182300481e-06, "loss": 1.10791883, "memory(GiB)": 141.16, "step": 122520, "train_speed(iter/s)": 0.288951 }, { "acc": 0.73110971, "epoch": 1.3706034418879072, "grad_norm": 7.625, "learning_rate": 2.471784434995409e-06, "loss": 1.08012085, "memory(GiB)": 141.16, "step": 122540, "train_speed(iter/s)": 0.288967 }, { "acc": 0.73207235, "epoch": 1.3708271408338657, "grad_norm": 8.3125, "learning_rate": 2.470189033586363e-06, "loss": 1.06396074, "memory(GiB)": 141.16, "step": 122560, "train_speed(iter/s)": 0.288984 }, { "acc": 0.74737086, "epoch": 1.3710508397798242, "grad_norm": 6.28125, "learning_rate": 2.468593978291614e-06, "loss": 1.00497322, "memory(GiB)": 141.16, "step": 122580, "train_speed(iter/s)": 0.289 }, { "acc": 0.73366137, "epoch": 1.3712745387257828, "grad_norm": 7.78125, "learning_rate": 2.466999269329393e-06, "loss": 1.06162558, "memory(GiB)": 141.16, "step": 122600, "train_speed(iter/s)": 0.289016 }, { "acc": 0.73628278, "epoch": 1.3714982376717413, "grad_norm": 6.5, "learning_rate": 2.4654049069178753e-06, "loss": 1.05566416, "memory(GiB)": 141.16, "step": 122620, "train_speed(iter/s)": 0.289032 }, { "acc": 0.71886444, "epoch": 1.3717219366176998, "grad_norm": 6.40625, "learning_rate": 2.4638108912751958e-06, "loss": 1.15231304, "memory(GiB)": 141.16, "step": 122640, "train_speed(iter/s)": 0.289046 }, { "acc": 0.73139572, "epoch": 1.3719456355636583, "grad_norm": 8.9375, "learning_rate": 2.462217222619437e-06, "loss": 1.07483635, "memory(GiB)": 141.16, "step": 122660, "train_speed(iter/s)": 0.289063 }, { "acc": 0.7350564, "epoch": 1.3721693345096169, "grad_norm": 7.78125, "learning_rate": 2.460623901168633e-06, "loss": 1.07633257, "memory(GiB)": 141.16, "step": 122680, "train_speed(iter/s)": 0.289081 }, { "acc": 0.72441664, "epoch": 1.3723930334555754, "grad_norm": 6.59375, "learning_rate": 2.4590309271407774e-06, "loss": 1.10364141, "memory(GiB)": 141.16, "step": 122700, "train_speed(iter/s)": 0.289097 }, { "acc": 0.75852337, "epoch": 1.372616732401534, "grad_norm": 6.8125, "learning_rate": 2.4574383007538085e-06, "loss": 0.95090971, "memory(GiB)": 141.16, "step": 122720, "train_speed(iter/s)": 0.289113 }, { "acc": 0.74772282, "epoch": 1.3728404313474925, "grad_norm": 8.4375, "learning_rate": 2.455846022225623e-06, "loss": 1.01339893, "memory(GiB)": 141.16, "step": 122740, "train_speed(iter/s)": 0.289127 }, { "acc": 0.74347563, "epoch": 1.373064130293451, "grad_norm": 7.09375, "learning_rate": 2.454254091774066e-06, "loss": 1.01970644, "memory(GiB)": 141.16, "step": 122760, "train_speed(iter/s)": 0.289144 }, { "acc": 0.73508897, "epoch": 1.3732878292394095, "grad_norm": 7.125, "learning_rate": 2.4526625096169344e-06, "loss": 1.0644145, "memory(GiB)": 141.16, "step": 122780, "train_speed(iter/s)": 0.28916 }, { "acc": 0.72096701, "epoch": 1.373511528185368, "grad_norm": 8.875, "learning_rate": 2.4510712759719837e-06, "loss": 1.12821846, "memory(GiB)": 141.16, "step": 122800, "train_speed(iter/s)": 0.289175 }, { "acc": 0.73770671, "epoch": 1.3737352271313266, "grad_norm": 8.5625, "learning_rate": 2.4494803910569127e-06, "loss": 1.04741821, "memory(GiB)": 141.16, "step": 122820, "train_speed(iter/s)": 0.289192 }, { "acc": 0.7515337, "epoch": 1.373958926077285, "grad_norm": 9.375, "learning_rate": 2.4478898550893815e-06, "loss": 0.99041443, "memory(GiB)": 141.16, "step": 122840, "train_speed(iter/s)": 0.289207 }, { "acc": 0.73212643, "epoch": 1.3741826250232436, "grad_norm": 6.59375, "learning_rate": 2.446299668286996e-06, "loss": 1.08496647, "memory(GiB)": 141.16, "step": 122860, "train_speed(iter/s)": 0.289222 }, { "acc": 0.72777457, "epoch": 1.3744063239692021, "grad_norm": 7.6875, "learning_rate": 2.444709830867315e-06, "loss": 1.09220247, "memory(GiB)": 141.16, "step": 122880, "train_speed(iter/s)": 0.289237 }, { "acc": 0.72557197, "epoch": 1.3746300229151607, "grad_norm": 7.71875, "learning_rate": 2.443120343047855e-06, "loss": 1.10552959, "memory(GiB)": 141.16, "step": 122900, "train_speed(iter/s)": 0.289253 }, { "acc": 0.72693138, "epoch": 1.3748537218611192, "grad_norm": 6.71875, "learning_rate": 2.441531205046076e-06, "loss": 1.08652267, "memory(GiB)": 141.16, "step": 122920, "train_speed(iter/s)": 0.289268 }, { "acc": 0.73935766, "epoch": 1.3750774208070777, "grad_norm": 6.90625, "learning_rate": 2.439942417079399e-06, "loss": 1.04289742, "memory(GiB)": 141.16, "step": 122940, "train_speed(iter/s)": 0.289282 }, { "acc": 0.74446926, "epoch": 1.3753011197530363, "grad_norm": 7.40625, "learning_rate": 2.4383539793651905e-06, "loss": 1.02775822, "memory(GiB)": 141.16, "step": 122960, "train_speed(iter/s)": 0.289297 }, { "acc": 0.7292407, "epoch": 1.3755248186989948, "grad_norm": 8.375, "learning_rate": 2.436765892120771e-06, "loss": 1.08963804, "memory(GiB)": 141.16, "step": 122980, "train_speed(iter/s)": 0.289313 }, { "acc": 0.73369479, "epoch": 1.3757485176449533, "grad_norm": 7.625, "learning_rate": 2.435178155563416e-06, "loss": 1.06861324, "memory(GiB)": 141.16, "step": 123000, "train_speed(iter/s)": 0.289331 }, { "acc": 0.7293088, "epoch": 1.3759722165909118, "grad_norm": 6.75, "learning_rate": 2.4335907699103467e-06, "loss": 1.08922577, "memory(GiB)": 141.16, "step": 123020, "train_speed(iter/s)": 0.289348 }, { "acc": 0.73802633, "epoch": 1.3761959155368704, "grad_norm": 7.6875, "learning_rate": 2.432003735378745e-06, "loss": 1.04928446, "memory(GiB)": 141.16, "step": 123040, "train_speed(iter/s)": 0.289364 }, { "acc": 0.73600931, "epoch": 1.376419614482829, "grad_norm": 6.5625, "learning_rate": 2.4304170521857375e-06, "loss": 1.04087315, "memory(GiB)": 141.16, "step": 123060, "train_speed(iter/s)": 0.28938 }, { "acc": 0.73586721, "epoch": 1.3766433134287874, "grad_norm": 6.96875, "learning_rate": 2.4288307205484026e-06, "loss": 1.05456886, "memory(GiB)": 141.16, "step": 123080, "train_speed(iter/s)": 0.289395 }, { "acc": 0.74287081, "epoch": 1.376867012374746, "grad_norm": 8.375, "learning_rate": 2.427244740683778e-06, "loss": 1.04297018, "memory(GiB)": 141.16, "step": 123100, "train_speed(iter/s)": 0.289411 }, { "acc": 0.72492709, "epoch": 1.3770907113207045, "grad_norm": 9.25, "learning_rate": 2.425659112808846e-06, "loss": 1.10999432, "memory(GiB)": 141.16, "step": 123120, "train_speed(iter/s)": 0.289427 }, { "acc": 0.73039265, "epoch": 1.377314410266663, "grad_norm": 6.78125, "learning_rate": 2.4240738371405427e-06, "loss": 1.08425388, "memory(GiB)": 141.16, "step": 123140, "train_speed(iter/s)": 0.289441 }, { "acc": 0.72877483, "epoch": 1.3775381092126215, "grad_norm": 7.3125, "learning_rate": 2.422488913895755e-06, "loss": 1.09018612, "memory(GiB)": 141.16, "step": 123160, "train_speed(iter/s)": 0.289458 }, { "acc": 0.74673672, "epoch": 1.37776180815858, "grad_norm": 6.125, "learning_rate": 2.4209043432913274e-06, "loss": 1.00515079, "memory(GiB)": 141.16, "step": 123180, "train_speed(iter/s)": 0.289474 }, { "acc": 0.74211864, "epoch": 1.3779855071045386, "grad_norm": 8.3125, "learning_rate": 2.4193201255440496e-06, "loss": 1.03277607, "memory(GiB)": 141.16, "step": 123200, "train_speed(iter/s)": 0.28949 }, { "acc": 0.73013821, "epoch": 1.3782092060504971, "grad_norm": 6.21875, "learning_rate": 2.417736260870663e-06, "loss": 1.07507706, "memory(GiB)": 141.16, "step": 123220, "train_speed(iter/s)": 0.289506 }, { "acc": 0.74034157, "epoch": 1.3784329049964557, "grad_norm": 8.375, "learning_rate": 2.4161527494878663e-06, "loss": 1.02718945, "memory(GiB)": 141.16, "step": 123240, "train_speed(iter/s)": 0.289522 }, { "acc": 0.73652329, "epoch": 1.3786566039424142, "grad_norm": 5.9375, "learning_rate": 2.4145695916123037e-06, "loss": 1.04278364, "memory(GiB)": 141.16, "step": 123260, "train_speed(iter/s)": 0.289537 }, { "acc": 0.73703823, "epoch": 1.3788803028883727, "grad_norm": 8.1875, "learning_rate": 2.412986787460577e-06, "loss": 1.0849659, "memory(GiB)": 141.16, "step": 123280, "train_speed(iter/s)": 0.28955 }, { "acc": 0.72863445, "epoch": 1.3791040018343312, "grad_norm": 5.8125, "learning_rate": 2.411404337249235e-06, "loss": 1.09720268, "memory(GiB)": 141.16, "step": 123300, "train_speed(iter/s)": 0.289565 }, { "acc": 0.73121142, "epoch": 1.3793277007802898, "grad_norm": 7.4375, "learning_rate": 2.409822241194777e-06, "loss": 1.06768723, "memory(GiB)": 141.16, "step": 123320, "train_speed(iter/s)": 0.28958 }, { "acc": 0.73497205, "epoch": 1.3795513997262483, "grad_norm": 8.9375, "learning_rate": 2.408240499513661e-06, "loss": 1.0613308, "memory(GiB)": 141.16, "step": 123340, "train_speed(iter/s)": 0.289595 }, { "acc": 0.72928066, "epoch": 1.3797750986722068, "grad_norm": 6.375, "learning_rate": 2.406659112422287e-06, "loss": 1.09661999, "memory(GiB)": 141.16, "step": 123360, "train_speed(iter/s)": 0.28961 }, { "acc": 0.73554516, "epoch": 1.3799987976181654, "grad_norm": 8.875, "learning_rate": 2.4050780801370162e-06, "loss": 1.05216064, "memory(GiB)": 141.16, "step": 123380, "train_speed(iter/s)": 0.289626 }, { "acc": 0.739291, "epoch": 1.3802224965641239, "grad_norm": 7.59375, "learning_rate": 2.4034974028741533e-06, "loss": 1.0445796, "memory(GiB)": 141.16, "step": 123400, "train_speed(iter/s)": 0.289641 }, { "acc": 0.73587999, "epoch": 1.3804461955100824, "grad_norm": 7.0, "learning_rate": 2.401917080849957e-06, "loss": 1.04619961, "memory(GiB)": 141.16, "step": 123420, "train_speed(iter/s)": 0.289656 }, { "acc": 0.72201529, "epoch": 1.380669894456041, "grad_norm": 4.71875, "learning_rate": 2.400337114280641e-06, "loss": 1.12311697, "memory(GiB)": 141.16, "step": 123440, "train_speed(iter/s)": 0.289671 }, { "acc": 0.72941222, "epoch": 1.3808935934019995, "grad_norm": 6.4375, "learning_rate": 2.398757503382363e-06, "loss": 1.08896036, "memory(GiB)": 141.16, "step": 123460, "train_speed(iter/s)": 0.289685 }, { "acc": 0.73736711, "epoch": 1.381117292347958, "grad_norm": 6.96875, "learning_rate": 2.3971782483712414e-06, "loss": 1.07691565, "memory(GiB)": 141.16, "step": 123480, "train_speed(iter/s)": 0.289701 }, { "acc": 0.72957096, "epoch": 1.3813409912939165, "grad_norm": 8.5, "learning_rate": 2.3955993494633385e-06, "loss": 1.09736319, "memory(GiB)": 141.16, "step": 123500, "train_speed(iter/s)": 0.289717 }, { "acc": 0.72369604, "epoch": 1.381564690239875, "grad_norm": 7.875, "learning_rate": 2.394020806874667e-06, "loss": 1.12573376, "memory(GiB)": 141.16, "step": 123520, "train_speed(iter/s)": 0.28973 }, { "acc": 0.73138661, "epoch": 1.3817883891858336, "grad_norm": 7.8125, "learning_rate": 2.3924426208212003e-06, "loss": 1.07471294, "memory(GiB)": 141.16, "step": 123540, "train_speed(iter/s)": 0.289745 }, { "acc": 0.74445076, "epoch": 1.382012088131792, "grad_norm": 7.3125, "learning_rate": 2.3908647915188514e-06, "loss": 1.01457453, "memory(GiB)": 141.16, "step": 123560, "train_speed(iter/s)": 0.289759 }, { "acc": 0.74427624, "epoch": 1.3822357870777506, "grad_norm": 9.5, "learning_rate": 2.3892873191834936e-06, "loss": 1.0174593, "memory(GiB)": 141.16, "step": 123580, "train_speed(iter/s)": 0.289775 }, { "acc": 0.7410779, "epoch": 1.3824594860237092, "grad_norm": 7.84375, "learning_rate": 2.387710204030947e-06, "loss": 1.04729824, "memory(GiB)": 141.16, "step": 123600, "train_speed(iter/s)": 0.289789 }, { "acc": 0.75108891, "epoch": 1.3826831849696677, "grad_norm": 6.34375, "learning_rate": 2.38613344627698e-06, "loss": 1.00152454, "memory(GiB)": 141.16, "step": 123620, "train_speed(iter/s)": 0.289807 }, { "acc": 0.74232745, "epoch": 1.3829068839156262, "grad_norm": 6.9375, "learning_rate": 2.384557046137321e-06, "loss": 1.02689629, "memory(GiB)": 141.16, "step": 123640, "train_speed(iter/s)": 0.289823 }, { "acc": 0.73501978, "epoch": 1.3831305828615847, "grad_norm": 5.75, "learning_rate": 2.382981003827639e-06, "loss": 1.06238518, "memory(GiB)": 141.16, "step": 123660, "train_speed(iter/s)": 0.289837 }, { "acc": 0.73714294, "epoch": 1.3833542818075433, "grad_norm": 6.40625, "learning_rate": 2.3814053195635633e-06, "loss": 1.0420496, "memory(GiB)": 141.16, "step": 123680, "train_speed(iter/s)": 0.289853 }, { "acc": 0.73391833, "epoch": 1.3835779807535018, "grad_norm": 6.4375, "learning_rate": 2.3798299935606684e-06, "loss": 1.0600997, "memory(GiB)": 141.16, "step": 123700, "train_speed(iter/s)": 0.289867 }, { "acc": 0.7311676, "epoch": 1.3838016796994603, "grad_norm": 5.75, "learning_rate": 2.3782550260344796e-06, "loss": 1.07997522, "memory(GiB)": 141.16, "step": 123720, "train_speed(iter/s)": 0.289884 }, { "acc": 0.72859716, "epoch": 1.3840253786454189, "grad_norm": 6.6875, "learning_rate": 2.3766804172004784e-06, "loss": 1.09418221, "memory(GiB)": 141.16, "step": 123740, "train_speed(iter/s)": 0.289902 }, { "acc": 0.73524837, "epoch": 1.3842490775913774, "grad_norm": 9.4375, "learning_rate": 2.37510616727409e-06, "loss": 1.06161203, "memory(GiB)": 141.16, "step": 123760, "train_speed(iter/s)": 0.289917 }, { "acc": 0.71996565, "epoch": 1.3844727765373361, "grad_norm": 7.15625, "learning_rate": 2.373532276470698e-06, "loss": 1.1471283, "memory(GiB)": 141.16, "step": 123780, "train_speed(iter/s)": 0.289932 }, { "acc": 0.73194723, "epoch": 1.3846964754832947, "grad_norm": 7.75, "learning_rate": 2.3719587450056316e-06, "loss": 1.07659225, "memory(GiB)": 141.16, "step": 123800, "train_speed(iter/s)": 0.289948 }, { "acc": 0.73997936, "epoch": 1.3849201744292532, "grad_norm": 5.46875, "learning_rate": 2.3703855730941704e-06, "loss": 1.03854179, "memory(GiB)": 141.16, "step": 123820, "train_speed(iter/s)": 0.289965 }, { "acc": 0.7390625, "epoch": 1.3851438733752117, "grad_norm": 5.5, "learning_rate": 2.3688127609515502e-06, "loss": 1.05196266, "memory(GiB)": 141.16, "step": 123840, "train_speed(iter/s)": 0.289982 }, { "acc": 0.73431978, "epoch": 1.3853675723211702, "grad_norm": 8.1875, "learning_rate": 2.3672403087929512e-06, "loss": 1.06629515, "memory(GiB)": 141.16, "step": 123860, "train_speed(iter/s)": 0.289997 }, { "acc": 0.73005428, "epoch": 1.3855912712671288, "grad_norm": 9.75, "learning_rate": 2.3656682168335105e-06, "loss": 1.08237839, "memory(GiB)": 141.16, "step": 123880, "train_speed(iter/s)": 0.290013 }, { "acc": 0.72637911, "epoch": 1.3858149702130873, "grad_norm": 8.0625, "learning_rate": 2.3640964852883108e-06, "loss": 1.10185699, "memory(GiB)": 141.16, "step": 123900, "train_speed(iter/s)": 0.29003 }, { "acc": 0.73246036, "epoch": 1.3860386691590458, "grad_norm": 6.34375, "learning_rate": 2.362525114372386e-06, "loss": 1.0623888, "memory(GiB)": 141.16, "step": 123920, "train_speed(iter/s)": 0.290047 }, { "acc": 0.73375678, "epoch": 1.3862623681050044, "grad_norm": 7.3125, "learning_rate": 2.3609541043007254e-06, "loss": 1.06709156, "memory(GiB)": 141.16, "step": 123940, "train_speed(iter/s)": 0.290063 }, { "acc": 0.72769852, "epoch": 1.386486067050963, "grad_norm": 5.46875, "learning_rate": 2.3593834552882627e-06, "loss": 1.09747334, "memory(GiB)": 141.16, "step": 123960, "train_speed(iter/s)": 0.290079 }, { "acc": 0.73767443, "epoch": 1.3867097659969214, "grad_norm": 7.40625, "learning_rate": 2.3578131675498876e-06, "loss": 1.0366786, "memory(GiB)": 141.16, "step": 123980, "train_speed(iter/s)": 0.290094 }, { "acc": 0.71869884, "epoch": 1.38693346494288, "grad_norm": 8.1875, "learning_rate": 2.356243241300437e-06, "loss": 1.1256361, "memory(GiB)": 141.16, "step": 124000, "train_speed(iter/s)": 0.29011 }, { "epoch": 1.38693346494288, "eval_acc": 0.6901515931298604, "eval_loss": 1.0791257619857788, "eval_runtime": 2319.7168, "eval_samples_per_second": 32.454, "eval_steps_per_second": 16.227, "step": 124000 }, { "acc": 0.72480249, "epoch": 1.3871571638888385, "grad_norm": 7.0625, "learning_rate": 2.3546736767546974e-06, "loss": 1.10735302, "memory(GiB)": 141.16, "step": 124020, "train_speed(iter/s)": 0.288526 }, { "acc": 0.7299057, "epoch": 1.387380862834797, "grad_norm": 7.3125, "learning_rate": 2.353104474127411e-06, "loss": 1.0861599, "memory(GiB)": 141.16, "step": 124040, "train_speed(iter/s)": 0.28854 }, { "acc": 0.73309784, "epoch": 1.3876045617807555, "grad_norm": 7.84375, "learning_rate": 2.3515356336332633e-06, "loss": 1.0887599, "memory(GiB)": 141.16, "step": 124060, "train_speed(iter/s)": 0.288555 }, { "acc": 0.73234377, "epoch": 1.387828260726714, "grad_norm": 7.84375, "learning_rate": 2.3499671554868986e-06, "loss": 1.07377176, "memory(GiB)": 141.16, "step": 124080, "train_speed(iter/s)": 0.28857 }, { "acc": 0.72649326, "epoch": 1.3880519596726726, "grad_norm": 8.5625, "learning_rate": 2.348399039902904e-06, "loss": 1.09641638, "memory(GiB)": 141.16, "step": 124100, "train_speed(iter/s)": 0.288585 }, { "acc": 0.73506112, "epoch": 1.3882756586186311, "grad_norm": 6.1875, "learning_rate": 2.346831287095819e-06, "loss": 1.06256132, "memory(GiB)": 141.16, "step": 124120, "train_speed(iter/s)": 0.288602 }, { "acc": 0.73414536, "epoch": 1.3884993575645896, "grad_norm": 7.09375, "learning_rate": 2.345263897280139e-06, "loss": 1.07368393, "memory(GiB)": 141.16, "step": 124140, "train_speed(iter/s)": 0.288619 }, { "acc": 0.73589973, "epoch": 1.3887230565105482, "grad_norm": 6.5, "learning_rate": 2.3436968706703008e-06, "loss": 1.06287422, "memory(GiB)": 141.16, "step": 124160, "train_speed(iter/s)": 0.288636 }, { "acc": 0.73224235, "epoch": 1.3889467554565067, "grad_norm": 5.96875, "learning_rate": 2.342130207480699e-06, "loss": 1.07697678, "memory(GiB)": 141.16, "step": 124180, "train_speed(iter/s)": 0.288651 }, { "acc": 0.72576904, "epoch": 1.3891704544024652, "grad_norm": 7.8125, "learning_rate": 2.3405639079256754e-06, "loss": 1.09649668, "memory(GiB)": 141.16, "step": 124200, "train_speed(iter/s)": 0.288666 }, { "acc": 0.73102918, "epoch": 1.3893941533484238, "grad_norm": 7.65625, "learning_rate": 2.338997972219519e-06, "loss": 1.07908783, "memory(GiB)": 141.16, "step": 124220, "train_speed(iter/s)": 0.288682 }, { "acc": 0.74418926, "epoch": 1.3896178522943823, "grad_norm": 7.5, "learning_rate": 2.3374324005764763e-06, "loss": 1.00588074, "memory(GiB)": 141.16, "step": 124240, "train_speed(iter/s)": 0.288699 }, { "acc": 0.73673534, "epoch": 1.3898415512403408, "grad_norm": 6.78125, "learning_rate": 2.335867193210737e-06, "loss": 1.06411095, "memory(GiB)": 141.16, "step": 124260, "train_speed(iter/s)": 0.288716 }, { "acc": 0.7475914, "epoch": 1.3900652501862993, "grad_norm": 6.9375, "learning_rate": 2.334302350336446e-06, "loss": 0.99892569, "memory(GiB)": 141.16, "step": 124280, "train_speed(iter/s)": 0.288733 }, { "acc": 0.73077302, "epoch": 1.3902889491322579, "grad_norm": 5.8125, "learning_rate": 2.332737872167695e-06, "loss": 1.09000835, "memory(GiB)": 141.16, "step": 124300, "train_speed(iter/s)": 0.288749 }, { "acc": 0.73578358, "epoch": 1.3905126480782164, "grad_norm": 8.5, "learning_rate": 2.3311737589185273e-06, "loss": 1.06463737, "memory(GiB)": 141.16, "step": 124320, "train_speed(iter/s)": 0.288765 }, { "acc": 0.73160009, "epoch": 1.390736347024175, "grad_norm": 7.59375, "learning_rate": 2.329610010802934e-06, "loss": 1.07418833, "memory(GiB)": 141.16, "step": 124340, "train_speed(iter/s)": 0.288776 }, { "acc": 0.72703028, "epoch": 1.3909600459701335, "grad_norm": 8.25, "learning_rate": 2.328046628034861e-06, "loss": 1.09596405, "memory(GiB)": 141.16, "step": 124360, "train_speed(iter/s)": 0.288794 }, { "acc": 0.73483191, "epoch": 1.391183744916092, "grad_norm": 7.625, "learning_rate": 2.3264836108282014e-06, "loss": 1.07576046, "memory(GiB)": 141.16, "step": 124380, "train_speed(iter/s)": 0.288807 }, { "acc": 0.73606982, "epoch": 1.3914074438620505, "grad_norm": 7.46875, "learning_rate": 2.3249209593967946e-06, "loss": 1.05090446, "memory(GiB)": 141.16, "step": 124400, "train_speed(iter/s)": 0.288825 }, { "acc": 0.74787073, "epoch": 1.391631142808009, "grad_norm": 8.9375, "learning_rate": 2.3233586739544384e-06, "loss": 0.99409494, "memory(GiB)": 141.16, "step": 124420, "train_speed(iter/s)": 0.288841 }, { "acc": 0.73404112, "epoch": 1.3918548417539676, "grad_norm": 6.59375, "learning_rate": 2.321796754714872e-06, "loss": 1.0712986, "memory(GiB)": 141.16, "step": 124440, "train_speed(iter/s)": 0.288856 }, { "acc": 0.73628426, "epoch": 1.392078540699926, "grad_norm": 6.1875, "learning_rate": 2.3202352018917914e-06, "loss": 1.07720814, "memory(GiB)": 141.16, "step": 124460, "train_speed(iter/s)": 0.288872 }, { "acc": 0.74088087, "epoch": 1.3923022396458846, "grad_norm": 7.71875, "learning_rate": 2.3186740156988375e-06, "loss": 1.03815861, "memory(GiB)": 141.16, "step": 124480, "train_speed(iter/s)": 0.288889 }, { "acc": 0.73742704, "epoch": 1.3925259385918431, "grad_norm": 5.71875, "learning_rate": 2.3171131963496017e-06, "loss": 1.05085049, "memory(GiB)": 141.16, "step": 124500, "train_speed(iter/s)": 0.288904 }, { "acc": 0.74219265, "epoch": 1.3927496375378017, "grad_norm": 7.96875, "learning_rate": 2.3155527440576296e-06, "loss": 1.01476231, "memory(GiB)": 141.16, "step": 124520, "train_speed(iter/s)": 0.288919 }, { "acc": 0.73039827, "epoch": 1.3929733364837602, "grad_norm": 6.0, "learning_rate": 2.3139926590364105e-06, "loss": 1.07520857, "memory(GiB)": 141.16, "step": 124540, "train_speed(iter/s)": 0.288935 }, { "acc": 0.72824392, "epoch": 1.3931970354297187, "grad_norm": 5.28125, "learning_rate": 2.3124329414993886e-06, "loss": 1.08244085, "memory(GiB)": 141.16, "step": 124560, "train_speed(iter/s)": 0.288949 }, { "acc": 0.73649392, "epoch": 1.3934207343756773, "grad_norm": 7.9375, "learning_rate": 2.310873591659955e-06, "loss": 1.06902008, "memory(GiB)": 141.16, "step": 124580, "train_speed(iter/s)": 0.288964 }, { "acc": 0.73211412, "epoch": 1.3936444333216358, "grad_norm": 7.75, "learning_rate": 2.3093146097314485e-06, "loss": 1.06711369, "memory(GiB)": 141.16, "step": 124600, "train_speed(iter/s)": 0.288979 }, { "acc": 0.74966421, "epoch": 1.3938681322675943, "grad_norm": 9.0625, "learning_rate": 2.307755995927164e-06, "loss": 0.99383411, "memory(GiB)": 141.16, "step": 124620, "train_speed(iter/s)": 0.288992 }, { "acc": 0.7339303, "epoch": 1.3940918312135528, "grad_norm": 8.5, "learning_rate": 2.3061977504603384e-06, "loss": 1.082761, "memory(GiB)": 141.16, "step": 124640, "train_speed(iter/s)": 0.289008 }, { "acc": 0.72349806, "epoch": 1.3943155301595114, "grad_norm": 6.3125, "learning_rate": 2.304639873544166e-06, "loss": 1.12582722, "memory(GiB)": 141.16, "step": 124660, "train_speed(iter/s)": 0.289024 }, { "acc": 0.74493008, "epoch": 1.39453922910547, "grad_norm": 5.4375, "learning_rate": 2.303082365391784e-06, "loss": 1.01909475, "memory(GiB)": 141.16, "step": 124680, "train_speed(iter/s)": 0.28904 }, { "acc": 0.73870139, "epoch": 1.3947629280514284, "grad_norm": 7.78125, "learning_rate": 2.3015252262162807e-06, "loss": 1.03364735, "memory(GiB)": 141.16, "step": 124700, "train_speed(iter/s)": 0.289056 }, { "acc": 0.74074898, "epoch": 1.394986626997387, "grad_norm": 6.59375, "learning_rate": 2.2999684562306982e-06, "loss": 1.01874733, "memory(GiB)": 141.16, "step": 124720, "train_speed(iter/s)": 0.289071 }, { "acc": 0.76152024, "epoch": 1.3952103259433455, "grad_norm": 6.96875, "learning_rate": 2.298412055648022e-06, "loss": 0.93985367, "memory(GiB)": 141.16, "step": 124740, "train_speed(iter/s)": 0.289086 }, { "acc": 0.7391777, "epoch": 1.395434024889304, "grad_norm": 7.3125, "learning_rate": 2.296856024681192e-06, "loss": 1.03541088, "memory(GiB)": 141.16, "step": 124760, "train_speed(iter/s)": 0.2891 }, { "acc": 0.73612528, "epoch": 1.3956577238352625, "grad_norm": 8.4375, "learning_rate": 2.2953003635430955e-06, "loss": 1.06028862, "memory(GiB)": 141.16, "step": 124780, "train_speed(iter/s)": 0.289115 }, { "acc": 0.73645906, "epoch": 1.395881422781221, "grad_norm": 6.9375, "learning_rate": 2.293745072446566e-06, "loss": 1.04821224, "memory(GiB)": 141.16, "step": 124800, "train_speed(iter/s)": 0.289129 }, { "acc": 0.73539114, "epoch": 1.3961051217271796, "grad_norm": 6.5625, "learning_rate": 2.292190151604394e-06, "loss": 1.0644701, "memory(GiB)": 141.16, "step": 124820, "train_speed(iter/s)": 0.289145 }, { "acc": 0.73929477, "epoch": 1.3963288206731381, "grad_norm": 6.375, "learning_rate": 2.290635601229311e-06, "loss": 1.03279839, "memory(GiB)": 141.16, "step": 124840, "train_speed(iter/s)": 0.289161 }, { "acc": 0.73034434, "epoch": 1.3965525196190967, "grad_norm": 8.0625, "learning_rate": 2.2890814215340052e-06, "loss": 1.09027958, "memory(GiB)": 141.16, "step": 124860, "train_speed(iter/s)": 0.289176 }, { "acc": 0.73621807, "epoch": 1.3967762185650552, "grad_norm": 8.0625, "learning_rate": 2.2875276127311088e-06, "loss": 1.0547492, "memory(GiB)": 141.16, "step": 124880, "train_speed(iter/s)": 0.289193 }, { "acc": 0.7339097, "epoch": 1.3969999175110137, "grad_norm": 7.4375, "learning_rate": 2.285974175033203e-06, "loss": 1.0742857, "memory(GiB)": 141.16, "step": 124900, "train_speed(iter/s)": 0.28921 }, { "acc": 0.72969332, "epoch": 1.3972236164569722, "grad_norm": 7.65625, "learning_rate": 2.2844211086528244e-06, "loss": 1.07672501, "memory(GiB)": 141.16, "step": 124920, "train_speed(iter/s)": 0.289226 }, { "acc": 0.7289403, "epoch": 1.3974473154029308, "grad_norm": 7.21875, "learning_rate": 2.2828684138024513e-06, "loss": 1.10344181, "memory(GiB)": 141.16, "step": 124940, "train_speed(iter/s)": 0.28924 }, { "acc": 0.73630409, "epoch": 1.3976710143488893, "grad_norm": 6.71875, "learning_rate": 2.2813160906945177e-06, "loss": 1.05597229, "memory(GiB)": 141.16, "step": 124960, "train_speed(iter/s)": 0.289257 }, { "acc": 0.7248414, "epoch": 1.3978947132948478, "grad_norm": 7.28125, "learning_rate": 2.2797641395414017e-06, "loss": 1.10650883, "memory(GiB)": 141.16, "step": 124980, "train_speed(iter/s)": 0.28927 }, { "acc": 0.74328594, "epoch": 1.3981184122408064, "grad_norm": 8.5625, "learning_rate": 2.2782125605554307e-06, "loss": 1.04362488, "memory(GiB)": 141.16, "step": 125000, "train_speed(iter/s)": 0.289287 }, { "acc": 0.73381062, "epoch": 1.3983421111867649, "grad_norm": 6.375, "learning_rate": 2.276661353948886e-06, "loss": 1.06863136, "memory(GiB)": 141.16, "step": 125020, "train_speed(iter/s)": 0.289302 }, { "acc": 0.73841534, "epoch": 1.3985658101327234, "grad_norm": 7.59375, "learning_rate": 2.275110519933993e-06, "loss": 1.05089703, "memory(GiB)": 141.16, "step": 125040, "train_speed(iter/s)": 0.289315 }, { "acc": 0.72032251, "epoch": 1.398789509078682, "grad_norm": 8.4375, "learning_rate": 2.2735600587229294e-06, "loss": 1.13982544, "memory(GiB)": 141.16, "step": 125060, "train_speed(iter/s)": 0.289331 }, { "acc": 0.74538155, "epoch": 1.3990132080246405, "grad_norm": 7.75, "learning_rate": 2.2720099705278197e-06, "loss": 1.02870321, "memory(GiB)": 141.16, "step": 125080, "train_speed(iter/s)": 0.289345 }, { "acc": 0.74293203, "epoch": 1.399236906970599, "grad_norm": 7.3125, "learning_rate": 2.2704602555607363e-06, "loss": 1.02436314, "memory(GiB)": 141.16, "step": 125100, "train_speed(iter/s)": 0.289362 }, { "acc": 0.7285841, "epoch": 1.3994606059165575, "grad_norm": 6.5, "learning_rate": 2.2689109140337064e-06, "loss": 1.09616175, "memory(GiB)": 141.16, "step": 125120, "train_speed(iter/s)": 0.289376 }, { "acc": 0.74027071, "epoch": 1.399684304862516, "grad_norm": 6.9375, "learning_rate": 2.267361946158697e-06, "loss": 1.02286911, "memory(GiB)": 141.16, "step": 125140, "train_speed(iter/s)": 0.289393 }, { "acc": 0.7368762, "epoch": 1.3999080038084746, "grad_norm": 6.5625, "learning_rate": 2.2658133521476337e-06, "loss": 1.04737911, "memory(GiB)": 141.16, "step": 125160, "train_speed(iter/s)": 0.289406 }, { "acc": 0.73585634, "epoch": 1.400131702754433, "grad_norm": 6.25, "learning_rate": 2.264265132212385e-06, "loss": 1.05480671, "memory(GiB)": 141.16, "step": 125180, "train_speed(iter/s)": 0.289423 }, { "acc": 0.73022556, "epoch": 1.4003554017003916, "grad_norm": 7.71875, "learning_rate": 2.2627172865647666e-06, "loss": 1.07298546, "memory(GiB)": 141.16, "step": 125200, "train_speed(iter/s)": 0.289439 }, { "acc": 0.73327045, "epoch": 1.4005791006463502, "grad_norm": 7.5, "learning_rate": 2.26116981541655e-06, "loss": 1.06975651, "memory(GiB)": 141.16, "step": 125220, "train_speed(iter/s)": 0.289453 }, { "acc": 0.74069195, "epoch": 1.4008027995923087, "grad_norm": 6.84375, "learning_rate": 2.259622718979448e-06, "loss": 1.05866175, "memory(GiB)": 141.16, "step": 125240, "train_speed(iter/s)": 0.289469 }, { "acc": 0.73611326, "epoch": 1.4010264985382672, "grad_norm": 5.375, "learning_rate": 2.2580759974651283e-06, "loss": 1.04738846, "memory(GiB)": 141.16, "step": 125260, "train_speed(iter/s)": 0.289485 }, { "acc": 0.73916388, "epoch": 1.4012501974842257, "grad_norm": 7.09375, "learning_rate": 2.2565296510852035e-06, "loss": 1.0311141, "memory(GiB)": 141.16, "step": 125280, "train_speed(iter/s)": 0.289502 }, { "acc": 0.73455362, "epoch": 1.4014738964301843, "grad_norm": 8.5625, "learning_rate": 2.254983680051234e-06, "loss": 1.06831379, "memory(GiB)": 141.16, "step": 125300, "train_speed(iter/s)": 0.289519 }, { "acc": 0.73876996, "epoch": 1.4016975953761428, "grad_norm": 6.5, "learning_rate": 2.2534380845747343e-06, "loss": 1.06253386, "memory(GiB)": 141.16, "step": 125320, "train_speed(iter/s)": 0.289535 }, { "acc": 0.72607989, "epoch": 1.4019212943221013, "grad_norm": 6.625, "learning_rate": 2.25189286486716e-06, "loss": 1.1143652, "memory(GiB)": 141.16, "step": 125340, "train_speed(iter/s)": 0.289549 }, { "acc": 0.72886152, "epoch": 1.4021449932680599, "grad_norm": 5.34375, "learning_rate": 2.250348021139924e-06, "loss": 1.08744726, "memory(GiB)": 141.16, "step": 125360, "train_speed(iter/s)": 0.289565 }, { "acc": 0.73403831, "epoch": 1.4023686922140184, "grad_norm": 6.53125, "learning_rate": 2.248803553604379e-06, "loss": 1.05554571, "memory(GiB)": 141.16, "step": 125380, "train_speed(iter/s)": 0.28958 }, { "acc": 0.72946997, "epoch": 1.402592391159977, "grad_norm": 8.0, "learning_rate": 2.24725946247183e-06, "loss": 1.08492947, "memory(GiB)": 141.16, "step": 125400, "train_speed(iter/s)": 0.289594 }, { "acc": 0.73663988, "epoch": 1.4028160901059354, "grad_norm": 6.46875, "learning_rate": 2.2457157479535346e-06, "loss": 1.05489235, "memory(GiB)": 141.16, "step": 125420, "train_speed(iter/s)": 0.289609 }, { "acc": 0.74023571, "epoch": 1.403039789051894, "grad_norm": 9.0, "learning_rate": 2.2441724102606906e-06, "loss": 1.02133923, "memory(GiB)": 141.16, "step": 125440, "train_speed(iter/s)": 0.289626 }, { "acc": 0.74526119, "epoch": 1.4032634879978525, "grad_norm": 7.34375, "learning_rate": 2.242629449604453e-06, "loss": 1.01666412, "memory(GiB)": 141.16, "step": 125460, "train_speed(iter/s)": 0.289641 }, { "acc": 0.73472686, "epoch": 1.403487186943811, "grad_norm": 7.46875, "learning_rate": 2.241086866195918e-06, "loss": 1.06357975, "memory(GiB)": 141.16, "step": 125480, "train_speed(iter/s)": 0.289656 }, { "acc": 0.74842815, "epoch": 1.4037108858897696, "grad_norm": 6.15625, "learning_rate": 2.2395446602461335e-06, "loss": 1.00053186, "memory(GiB)": 141.16, "step": 125500, "train_speed(iter/s)": 0.289671 }, { "acc": 0.73716087, "epoch": 1.403934584835728, "grad_norm": 7.59375, "learning_rate": 2.2380028319660955e-06, "loss": 1.06302013, "memory(GiB)": 141.16, "step": 125520, "train_speed(iter/s)": 0.289687 }, { "acc": 0.74758453, "epoch": 1.4041582837816866, "grad_norm": 6.875, "learning_rate": 2.236461381566747e-06, "loss": 1.02718945, "memory(GiB)": 141.16, "step": 125540, "train_speed(iter/s)": 0.289703 }, { "acc": 0.74424553, "epoch": 1.4043819827276451, "grad_norm": 6.78125, "learning_rate": 2.2349203092589827e-06, "loss": 1.02266388, "memory(GiB)": 141.16, "step": 125560, "train_speed(iter/s)": 0.289717 }, { "acc": 0.73451653, "epoch": 1.4046056816736037, "grad_norm": 7.125, "learning_rate": 2.23337961525364e-06, "loss": 1.06071119, "memory(GiB)": 141.16, "step": 125580, "train_speed(iter/s)": 0.289732 }, { "acc": 0.74135466, "epoch": 1.4048293806195622, "grad_norm": 6.875, "learning_rate": 2.231839299761513e-06, "loss": 1.03813248, "memory(GiB)": 141.16, "step": 125600, "train_speed(iter/s)": 0.289746 }, { "acc": 0.7188571, "epoch": 1.4050530795655207, "grad_norm": 7.5625, "learning_rate": 2.2302993629933355e-06, "loss": 1.12931566, "memory(GiB)": 141.16, "step": 125620, "train_speed(iter/s)": 0.28976 }, { "acc": 0.74135246, "epoch": 1.4052767785114793, "grad_norm": 7.1875, "learning_rate": 2.2287598051597914e-06, "loss": 1.03436489, "memory(GiB)": 141.16, "step": 125640, "train_speed(iter/s)": 0.289776 }, { "acc": 0.74024229, "epoch": 1.4055004774574378, "grad_norm": 7.78125, "learning_rate": 2.227220626471518e-06, "loss": 1.00523548, "memory(GiB)": 141.16, "step": 125660, "train_speed(iter/s)": 0.289793 }, { "acc": 0.73738213, "epoch": 1.4057241764033963, "grad_norm": 7.8125, "learning_rate": 2.225681827139093e-06, "loss": 1.05298901, "memory(GiB)": 141.16, "step": 125680, "train_speed(iter/s)": 0.289808 }, { "acc": 0.7286047, "epoch": 1.4059478753493548, "grad_norm": 7.96875, "learning_rate": 2.22414340737305e-06, "loss": 1.10400982, "memory(GiB)": 141.16, "step": 125700, "train_speed(iter/s)": 0.289824 }, { "acc": 0.73331795, "epoch": 1.4061715742953134, "grad_norm": 6.15625, "learning_rate": 2.222605367383865e-06, "loss": 1.05714111, "memory(GiB)": 141.16, "step": 125720, "train_speed(iter/s)": 0.289841 }, { "acc": 0.74401474, "epoch": 1.406395273241272, "grad_norm": 6.875, "learning_rate": 2.2210677073819624e-06, "loss": 1.01460037, "memory(GiB)": 141.16, "step": 125740, "train_speed(iter/s)": 0.289857 }, { "acc": 0.75434952, "epoch": 1.4066189721872304, "grad_norm": 6.75, "learning_rate": 2.2195304275777193e-06, "loss": 0.99252615, "memory(GiB)": 141.16, "step": 125760, "train_speed(iter/s)": 0.289874 }, { "acc": 0.72224426, "epoch": 1.406842671133189, "grad_norm": 7.125, "learning_rate": 2.2179935281814535e-06, "loss": 1.12315006, "memory(GiB)": 141.16, "step": 125780, "train_speed(iter/s)": 0.289889 }, { "acc": 0.74505372, "epoch": 1.4070663700791475, "grad_norm": 6.0, "learning_rate": 2.2164570094034393e-06, "loss": 1.03061161, "memory(GiB)": 141.16, "step": 125800, "train_speed(iter/s)": 0.289905 }, { "acc": 0.73523192, "epoch": 1.407290069025106, "grad_norm": 7.90625, "learning_rate": 2.2149208714538917e-06, "loss": 1.04718008, "memory(GiB)": 141.16, "step": 125820, "train_speed(iter/s)": 0.289921 }, { "acc": 0.73701763, "epoch": 1.4075137679710645, "grad_norm": 7.46875, "learning_rate": 2.213385114542976e-06, "loss": 1.05527534, "memory(GiB)": 141.16, "step": 125840, "train_speed(iter/s)": 0.289937 }, { "acc": 0.72496452, "epoch": 1.407737466917023, "grad_norm": 7.875, "learning_rate": 2.2118497388808075e-06, "loss": 1.09966717, "memory(GiB)": 141.16, "step": 125860, "train_speed(iter/s)": 0.289954 }, { "acc": 0.73563447, "epoch": 1.4079611658629816, "grad_norm": 6.03125, "learning_rate": 2.2103147446774446e-06, "loss": 1.06361179, "memory(GiB)": 141.16, "step": 125880, "train_speed(iter/s)": 0.28997 }, { "acc": 0.72951536, "epoch": 1.4081848648089401, "grad_norm": 7.40625, "learning_rate": 2.208780132142901e-06, "loss": 1.09738903, "memory(GiB)": 141.16, "step": 125900, "train_speed(iter/s)": 0.289985 }, { "acc": 0.7452857, "epoch": 1.4084085637548986, "grad_norm": 6.53125, "learning_rate": 2.2072459014871305e-06, "loss": 1.02215919, "memory(GiB)": 141.16, "step": 125920, "train_speed(iter/s)": 0.290001 }, { "acc": 0.7317615, "epoch": 1.4086322627008572, "grad_norm": 6.46875, "learning_rate": 2.2057120529200366e-06, "loss": 1.07992058, "memory(GiB)": 141.16, "step": 125940, "train_speed(iter/s)": 0.290015 }, { "acc": 0.7423707, "epoch": 1.4088559616468157, "grad_norm": 5.46875, "learning_rate": 2.2041785866514755e-06, "loss": 1.03312111, "memory(GiB)": 141.16, "step": 125960, "train_speed(iter/s)": 0.29003 }, { "acc": 0.74114676, "epoch": 1.4090796605927742, "grad_norm": 6.90625, "learning_rate": 2.2026455028912434e-06, "loss": 1.04749823, "memory(GiB)": 141.16, "step": 125980, "train_speed(iter/s)": 0.290046 }, { "acc": 0.73377829, "epoch": 1.4093033595387328, "grad_norm": 5.4375, "learning_rate": 2.201112801849092e-06, "loss": 1.07983513, "memory(GiB)": 141.16, "step": 126000, "train_speed(iter/s)": 0.290061 }, { "epoch": 1.4093033595387328, "eval_acc": 0.6901504100101393, "eval_loss": 1.0791383981704712, "eval_runtime": 2320.3772, "eval_samples_per_second": 32.444, "eval_steps_per_second": 16.222, "step": 126000 }, { "acc": 0.73757553, "epoch": 1.4095270584846913, "grad_norm": 5.9375, "learning_rate": 2.199580483734714e-06, "loss": 1.05107307, "memory(GiB)": 141.16, "step": 126020, "train_speed(iter/s)": 0.288501 }, { "acc": 0.72913537, "epoch": 1.4097507574306498, "grad_norm": 8.4375, "learning_rate": 2.1980485487577513e-06, "loss": 1.09143047, "memory(GiB)": 141.16, "step": 126040, "train_speed(iter/s)": 0.288517 }, { "acc": 0.71606112, "epoch": 1.4099744563766083, "grad_norm": 6.15625, "learning_rate": 2.1965169971277984e-06, "loss": 1.15258837, "memory(GiB)": 141.16, "step": 126060, "train_speed(iter/s)": 0.288532 }, { "acc": 0.72719355, "epoch": 1.4101981553225669, "grad_norm": 5.25, "learning_rate": 2.194985829054389e-06, "loss": 1.10153408, "memory(GiB)": 141.16, "step": 126080, "train_speed(iter/s)": 0.288545 }, { "acc": 0.72850513, "epoch": 1.4104218542685254, "grad_norm": 7.875, "learning_rate": 2.1934550447470134e-06, "loss": 1.08586826, "memory(GiB)": 141.16, "step": 126100, "train_speed(iter/s)": 0.288563 }, { "acc": 0.72590289, "epoch": 1.410645553214484, "grad_norm": 6.46875, "learning_rate": 2.1919246444151022e-06, "loss": 1.10441875, "memory(GiB)": 141.16, "step": 126120, "train_speed(iter/s)": 0.288578 }, { "acc": 0.73392344, "epoch": 1.4108692521604425, "grad_norm": 9.375, "learning_rate": 2.1903946282680345e-06, "loss": 1.06697149, "memory(GiB)": 141.16, "step": 126140, "train_speed(iter/s)": 0.288594 }, { "acc": 0.73103189, "epoch": 1.411092951106401, "grad_norm": 8.125, "learning_rate": 2.188864996515142e-06, "loss": 1.08878555, "memory(GiB)": 141.16, "step": 126160, "train_speed(iter/s)": 0.28861 }, { "acc": 0.75003705, "epoch": 1.4113166500523595, "grad_norm": 6.53125, "learning_rate": 2.1873357493656965e-06, "loss": 0.99407616, "memory(GiB)": 141.16, "step": 126180, "train_speed(iter/s)": 0.288625 }, { "acc": 0.7349431, "epoch": 1.411540348998318, "grad_norm": 6.09375, "learning_rate": 2.1858068870289245e-06, "loss": 1.05737457, "memory(GiB)": 141.16, "step": 126200, "train_speed(iter/s)": 0.288643 }, { "acc": 0.7258142, "epoch": 1.4117640479442766, "grad_norm": 7.96875, "learning_rate": 2.1842784097139945e-06, "loss": 1.10459824, "memory(GiB)": 141.16, "step": 126220, "train_speed(iter/s)": 0.28866 }, { "acc": 0.7303328, "epoch": 1.411987746890235, "grad_norm": 7.65625, "learning_rate": 2.1827503176300224e-06, "loss": 1.07662163, "memory(GiB)": 141.16, "step": 126240, "train_speed(iter/s)": 0.288676 }, { "acc": 0.73689203, "epoch": 1.4122114458361936, "grad_norm": 6.40625, "learning_rate": 2.1812226109860764e-06, "loss": 1.0441782, "memory(GiB)": 141.16, "step": 126260, "train_speed(iter/s)": 0.28869 }, { "acc": 0.72082911, "epoch": 1.4124351447821522, "grad_norm": 7.03125, "learning_rate": 2.1796952899911643e-06, "loss": 1.12460155, "memory(GiB)": 141.16, "step": 126280, "train_speed(iter/s)": 0.288705 }, { "acc": 0.74311533, "epoch": 1.4126588437281107, "grad_norm": 6.90625, "learning_rate": 2.1781683548542504e-06, "loss": 1.02848434, "memory(GiB)": 141.16, "step": 126300, "train_speed(iter/s)": 0.28872 }, { "acc": 0.73980441, "epoch": 1.4128825426740692, "grad_norm": 8.5, "learning_rate": 2.1766418057842386e-06, "loss": 1.04638214, "memory(GiB)": 141.16, "step": 126320, "train_speed(iter/s)": 0.288736 }, { "acc": 0.73176289, "epoch": 1.4131062416200277, "grad_norm": 7.59375, "learning_rate": 2.1751156429899815e-06, "loss": 1.06988716, "memory(GiB)": 141.16, "step": 126340, "train_speed(iter/s)": 0.288753 }, { "acc": 0.72020226, "epoch": 1.4133299405659863, "grad_norm": 7.03125, "learning_rate": 2.1735898666802828e-06, "loss": 1.15259533, "memory(GiB)": 141.16, "step": 126360, "train_speed(iter/s)": 0.288768 }, { "acc": 0.73562269, "epoch": 1.4135536395119448, "grad_norm": 6.875, "learning_rate": 2.172064477063887e-06, "loss": 1.04816818, "memory(GiB)": 141.16, "step": 126380, "train_speed(iter/s)": 0.288784 }, { "acc": 0.73675203, "epoch": 1.4137773384579033, "grad_norm": 6.875, "learning_rate": 2.1705394743494935e-06, "loss": 1.06472883, "memory(GiB)": 141.16, "step": 126400, "train_speed(iter/s)": 0.288799 }, { "acc": 0.73546529, "epoch": 1.4140010374038618, "grad_norm": 8.5625, "learning_rate": 2.169014858745742e-06, "loss": 1.05388899, "memory(GiB)": 141.16, "step": 126420, "train_speed(iter/s)": 0.288816 }, { "acc": 0.7339642, "epoch": 1.4142247363498204, "grad_norm": 7.75, "learning_rate": 2.16749063046122e-06, "loss": 1.0679327, "memory(GiB)": 141.16, "step": 126440, "train_speed(iter/s)": 0.288832 }, { "acc": 0.73949108, "epoch": 1.414448435295779, "grad_norm": 6.4375, "learning_rate": 2.1659667897044678e-06, "loss": 1.06322126, "memory(GiB)": 141.16, "step": 126460, "train_speed(iter/s)": 0.288848 }, { "acc": 0.74393034, "epoch": 1.4146721342417374, "grad_norm": 6.9375, "learning_rate": 2.1644433366839648e-06, "loss": 1.03669376, "memory(GiB)": 141.16, "step": 126480, "train_speed(iter/s)": 0.288862 }, { "acc": 0.72033272, "epoch": 1.414895833187696, "grad_norm": 8.0, "learning_rate": 2.1629202716081443e-06, "loss": 1.12097416, "memory(GiB)": 141.16, "step": 126500, "train_speed(iter/s)": 0.288876 }, { "acc": 0.75375156, "epoch": 1.4151195321336545, "grad_norm": 8.5625, "learning_rate": 2.1613975946853815e-06, "loss": 0.9842679, "memory(GiB)": 141.16, "step": 126520, "train_speed(iter/s)": 0.288892 }, { "acc": 0.73688793, "epoch": 1.415343231079613, "grad_norm": 8.625, "learning_rate": 2.159875306123999e-06, "loss": 1.07408037, "memory(GiB)": 141.16, "step": 126540, "train_speed(iter/s)": 0.288907 }, { "acc": 0.72879839, "epoch": 1.4155669300255715, "grad_norm": 6.21875, "learning_rate": 2.158353406132272e-06, "loss": 1.10031595, "memory(GiB)": 141.16, "step": 126560, "train_speed(iter/s)": 0.288924 }, { "acc": 0.73293409, "epoch": 1.41579062897153, "grad_norm": 6.125, "learning_rate": 2.156831894918413e-06, "loss": 1.06443701, "memory(GiB)": 141.16, "step": 126580, "train_speed(iter/s)": 0.288938 }, { "acc": 0.7324338, "epoch": 1.4160143279174886, "grad_norm": 8.625, "learning_rate": 2.1553107726905907e-06, "loss": 1.07574701, "memory(GiB)": 141.16, "step": 126600, "train_speed(iter/s)": 0.288955 }, { "acc": 0.74072738, "epoch": 1.4162380268634471, "grad_norm": 6.53125, "learning_rate": 2.153790039656915e-06, "loss": 1.03579006, "memory(GiB)": 141.16, "step": 126620, "train_speed(iter/s)": 0.28897 }, { "acc": 0.72138109, "epoch": 1.4164617258094057, "grad_norm": 6.5625, "learning_rate": 2.152269696025442e-06, "loss": 1.12299585, "memory(GiB)": 141.16, "step": 126640, "train_speed(iter/s)": 0.288987 }, { "acc": 0.72600923, "epoch": 1.4166854247553642, "grad_norm": 7.75, "learning_rate": 2.150749742004179e-06, "loss": 1.10975533, "memory(GiB)": 141.16, "step": 126660, "train_speed(iter/s)": 0.289002 }, { "acc": 0.72724328, "epoch": 1.4169091237013227, "grad_norm": 8.25, "learning_rate": 2.149230177801077e-06, "loss": 1.10277052, "memory(GiB)": 141.16, "step": 126680, "train_speed(iter/s)": 0.289018 }, { "acc": 0.72629566, "epoch": 1.4171328226472812, "grad_norm": 7.1875, "learning_rate": 2.147711003624034e-06, "loss": 1.0974596, "memory(GiB)": 141.16, "step": 126700, "train_speed(iter/s)": 0.289034 }, { "acc": 0.73893538, "epoch": 1.4173565215932398, "grad_norm": 7.90625, "learning_rate": 2.1461922196808914e-06, "loss": 1.05735655, "memory(GiB)": 141.16, "step": 126720, "train_speed(iter/s)": 0.28905 }, { "acc": 0.73384037, "epoch": 1.4175802205391983, "grad_norm": 6.0625, "learning_rate": 2.1446738261794466e-06, "loss": 1.05223665, "memory(GiB)": 141.16, "step": 126740, "train_speed(iter/s)": 0.289067 }, { "acc": 0.73813057, "epoch": 1.4178039194851568, "grad_norm": 8.125, "learning_rate": 2.1431558233274337e-06, "loss": 1.03493757, "memory(GiB)": 141.16, "step": 126760, "train_speed(iter/s)": 0.289085 }, { "acc": 0.74020691, "epoch": 1.4180276184311154, "grad_norm": 6.5625, "learning_rate": 2.1416382113325356e-06, "loss": 1.04726276, "memory(GiB)": 141.16, "step": 126780, "train_speed(iter/s)": 0.289099 }, { "acc": 0.74015732, "epoch": 1.4182513173770739, "grad_norm": 6.28125, "learning_rate": 2.140120990402388e-06, "loss": 1.03372288, "memory(GiB)": 141.16, "step": 126800, "train_speed(iter/s)": 0.289116 }, { "acc": 0.74232626, "epoch": 1.4184750163230324, "grad_norm": 7.125, "learning_rate": 2.138604160744564e-06, "loss": 1.02827721, "memory(GiB)": 141.16, "step": 126820, "train_speed(iter/s)": 0.289132 }, { "acc": 0.72917967, "epoch": 1.418698715268991, "grad_norm": 7.3125, "learning_rate": 2.1370877225665913e-06, "loss": 1.08969555, "memory(GiB)": 141.16, "step": 126840, "train_speed(iter/s)": 0.289147 }, { "acc": 0.72402401, "epoch": 1.4189224142149495, "grad_norm": 7.1875, "learning_rate": 2.135571676075939e-06, "loss": 1.11462326, "memory(GiB)": 141.16, "step": 126860, "train_speed(iter/s)": 0.289161 }, { "acc": 0.73370686, "epoch": 1.419146113160908, "grad_norm": 8.875, "learning_rate": 2.1340560214800217e-06, "loss": 1.06327267, "memory(GiB)": 141.16, "step": 126880, "train_speed(iter/s)": 0.289177 }, { "acc": 0.74621658, "epoch": 1.4193698121068665, "grad_norm": 6.90625, "learning_rate": 2.1325407589862057e-06, "loss": 1.01550503, "memory(GiB)": 141.16, "step": 126900, "train_speed(iter/s)": 0.289191 }, { "acc": 0.74253821, "epoch": 1.419593511052825, "grad_norm": 5.9375, "learning_rate": 2.1310258888017983e-06, "loss": 1.03363314, "memory(GiB)": 141.16, "step": 126920, "train_speed(iter/s)": 0.289208 }, { "acc": 0.73284817, "epoch": 1.4198172099987836, "grad_norm": 6.5, "learning_rate": 2.1295114111340575e-06, "loss": 1.07789688, "memory(GiB)": 141.16, "step": 126940, "train_speed(iter/s)": 0.289224 }, { "acc": 0.72135911, "epoch": 1.420040908944742, "grad_norm": 9.3125, "learning_rate": 2.1279973261901848e-06, "loss": 1.13027687, "memory(GiB)": 141.16, "step": 126960, "train_speed(iter/s)": 0.289239 }, { "acc": 0.73022366, "epoch": 1.4202646078907006, "grad_norm": 7.40625, "learning_rate": 2.126483634177326e-06, "loss": 1.0736412, "memory(GiB)": 141.16, "step": 126980, "train_speed(iter/s)": 0.289254 }, { "acc": 0.72851005, "epoch": 1.4204883068366592, "grad_norm": 6.375, "learning_rate": 2.12497033530258e-06, "loss": 1.07197895, "memory(GiB)": 141.16, "step": 127000, "train_speed(iter/s)": 0.289272 }, { "acc": 0.73714504, "epoch": 1.4207120057826177, "grad_norm": 6.71875, "learning_rate": 2.123457429772984e-06, "loss": 1.05918484, "memory(GiB)": 141.16, "step": 127020, "train_speed(iter/s)": 0.289288 }, { "acc": 0.74067054, "epoch": 1.4209357047285762, "grad_norm": 6.5625, "learning_rate": 2.1219449177955293e-06, "loss": 1.04431858, "memory(GiB)": 141.16, "step": 127040, "train_speed(iter/s)": 0.289304 }, { "acc": 0.73205881, "epoch": 1.4211594036745347, "grad_norm": 7.6875, "learning_rate": 2.1204327995771464e-06, "loss": 1.07134819, "memory(GiB)": 141.16, "step": 127060, "train_speed(iter/s)": 0.289319 }, { "acc": 0.73553858, "epoch": 1.4213831026204933, "grad_norm": 8.25, "learning_rate": 2.1189210753247127e-06, "loss": 1.06662483, "memory(GiB)": 141.16, "step": 127080, "train_speed(iter/s)": 0.289336 }, { "acc": 0.74240189, "epoch": 1.4216068015664518, "grad_norm": 6.90625, "learning_rate": 2.117409745245058e-06, "loss": 1.02323503, "memory(GiB)": 141.16, "step": 127100, "train_speed(iter/s)": 0.289351 }, { "acc": 0.7254745, "epoch": 1.4218305005124103, "grad_norm": 6.3125, "learning_rate": 2.1158988095449502e-06, "loss": 1.09514465, "memory(GiB)": 141.16, "step": 127120, "train_speed(iter/s)": 0.289364 }, { "acc": 0.73134708, "epoch": 1.4220541994583689, "grad_norm": 7.375, "learning_rate": 2.114388268431111e-06, "loss": 1.0879283, "memory(GiB)": 141.16, "step": 127140, "train_speed(iter/s)": 0.289378 }, { "acc": 0.73129396, "epoch": 1.4222778984043274, "grad_norm": 7.125, "learning_rate": 2.1128781221102e-06, "loss": 1.05699158, "memory(GiB)": 141.16, "step": 127160, "train_speed(iter/s)": 0.289396 }, { "acc": 0.73156605, "epoch": 1.422501597350286, "grad_norm": 7.65625, "learning_rate": 2.111368370788828e-06, "loss": 1.07876768, "memory(GiB)": 141.16, "step": 127180, "train_speed(iter/s)": 0.289413 }, { "acc": 0.74612422, "epoch": 1.4227252962962444, "grad_norm": 6.0625, "learning_rate": 2.1098590146735522e-06, "loss": 1.01294308, "memory(GiB)": 141.16, "step": 127200, "train_speed(iter/s)": 0.289428 }, { "acc": 0.72559862, "epoch": 1.422948995242203, "grad_norm": 7.5, "learning_rate": 2.108350053970871e-06, "loss": 1.09907951, "memory(GiB)": 141.16, "step": 127220, "train_speed(iter/s)": 0.289445 }, { "acc": 0.7306179, "epoch": 1.4231726941881615, "grad_norm": 6.15625, "learning_rate": 2.1068414888872353e-06, "loss": 1.07967854, "memory(GiB)": 141.16, "step": 127240, "train_speed(iter/s)": 0.289462 }, { "acc": 0.72269635, "epoch": 1.42339639313412, "grad_norm": 7.1875, "learning_rate": 2.105333319629037e-06, "loss": 1.11166611, "memory(GiB)": 141.16, "step": 127260, "train_speed(iter/s)": 0.289477 }, { "acc": 0.71121483, "epoch": 1.4236200920800786, "grad_norm": 5.8125, "learning_rate": 2.103825546402613e-06, "loss": 1.16435814, "memory(GiB)": 141.16, "step": 127280, "train_speed(iter/s)": 0.289493 }, { "acc": 0.73156352, "epoch": 1.423843791026037, "grad_norm": 6.9375, "learning_rate": 2.102318169414252e-06, "loss": 1.09617825, "memory(GiB)": 141.16, "step": 127300, "train_speed(iter/s)": 0.28951 }, { "acc": 0.73931146, "epoch": 1.4240674899719956, "grad_norm": 6.5, "learning_rate": 2.100811188870181e-06, "loss": 1.04598866, "memory(GiB)": 141.16, "step": 127320, "train_speed(iter/s)": 0.289527 }, { "acc": 0.73634157, "epoch": 1.4242911889179541, "grad_norm": 7.9375, "learning_rate": 2.0993046049765796e-06, "loss": 1.04962215, "memory(GiB)": 141.16, "step": 127340, "train_speed(iter/s)": 0.289541 }, { "acc": 0.74752293, "epoch": 1.4245148878639127, "grad_norm": 7.09375, "learning_rate": 2.0977984179395693e-06, "loss": 1.00323467, "memory(GiB)": 141.16, "step": 127360, "train_speed(iter/s)": 0.289558 }, { "acc": 0.74910464, "epoch": 1.4247385868098712, "grad_norm": 7.34375, "learning_rate": 2.096292627965216e-06, "loss": 0.99534283, "memory(GiB)": 141.16, "step": 127380, "train_speed(iter/s)": 0.289573 }, { "acc": 0.74072227, "epoch": 1.4249622857558297, "grad_norm": 7.28125, "learning_rate": 2.0947872352595353e-06, "loss": 1.02685566, "memory(GiB)": 141.16, "step": 127400, "train_speed(iter/s)": 0.289587 }, { "acc": 0.74374619, "epoch": 1.4251859847017883, "grad_norm": 9.5, "learning_rate": 2.093282240028485e-06, "loss": 1.02606506, "memory(GiB)": 141.16, "step": 127420, "train_speed(iter/s)": 0.289601 }, { "acc": 0.72644777, "epoch": 1.4254096836477468, "grad_norm": 8.4375, "learning_rate": 2.0917776424779727e-06, "loss": 1.08146572, "memory(GiB)": 141.16, "step": 127440, "train_speed(iter/s)": 0.289616 }, { "acc": 0.74770765, "epoch": 1.4256333825937053, "grad_norm": 7.25, "learning_rate": 2.0902734428138468e-06, "loss": 1.00364552, "memory(GiB)": 141.16, "step": 127460, "train_speed(iter/s)": 0.289631 }, { "acc": 0.72721977, "epoch": 1.4258570815396638, "grad_norm": 7.25, "learning_rate": 2.0887696412419017e-06, "loss": 1.08361969, "memory(GiB)": 141.16, "step": 127480, "train_speed(iter/s)": 0.289647 }, { "acc": 0.74124956, "epoch": 1.4260807804856224, "grad_norm": 6.15625, "learning_rate": 2.0872662379678822e-06, "loss": 1.03884935, "memory(GiB)": 141.16, "step": 127500, "train_speed(iter/s)": 0.289661 }, { "acc": 0.73504839, "epoch": 1.426304479431581, "grad_norm": 8.0625, "learning_rate": 2.0857632331974725e-06, "loss": 1.06130085, "memory(GiB)": 141.16, "step": 127520, "train_speed(iter/s)": 0.289676 }, { "acc": 0.73470478, "epoch": 1.4265281783775394, "grad_norm": 7.25, "learning_rate": 2.084260627136308e-06, "loss": 1.07046766, "memory(GiB)": 141.16, "step": 127540, "train_speed(iter/s)": 0.28969 }, { "acc": 0.73306904, "epoch": 1.4267518773234982, "grad_norm": 5.71875, "learning_rate": 2.0827584199899658e-06, "loss": 1.07806721, "memory(GiB)": 141.16, "step": 127560, "train_speed(iter/s)": 0.289703 }, { "acc": 0.74332976, "epoch": 1.4269755762694567, "grad_norm": 5.5, "learning_rate": 2.0812566119639664e-06, "loss": 1.02677898, "memory(GiB)": 141.16, "step": 127580, "train_speed(iter/s)": 0.289719 }, { "acc": 0.73024888, "epoch": 1.4271992752154152, "grad_norm": 7.03125, "learning_rate": 2.0797552032637828e-06, "loss": 1.07716026, "memory(GiB)": 141.16, "step": 127600, "train_speed(iter/s)": 0.289735 }, { "acc": 0.7370182, "epoch": 1.4274229741613738, "grad_norm": 8.125, "learning_rate": 2.078254194094826e-06, "loss": 1.06773252, "memory(GiB)": 141.16, "step": 127620, "train_speed(iter/s)": 0.289751 }, { "acc": 0.73001995, "epoch": 1.4276466731073323, "grad_norm": 7.84375, "learning_rate": 2.076753584662458e-06, "loss": 1.09344959, "memory(GiB)": 141.16, "step": 127640, "train_speed(iter/s)": 0.289767 }, { "acc": 0.73515139, "epoch": 1.4278703720532908, "grad_norm": 6.21875, "learning_rate": 2.0752533751719826e-06, "loss": 1.0633584, "memory(GiB)": 141.16, "step": 127660, "train_speed(iter/s)": 0.289784 }, { "acc": 0.73918991, "epoch": 1.4280940709992493, "grad_norm": 7.78125, "learning_rate": 2.0737535658286485e-06, "loss": 1.03479719, "memory(GiB)": 141.16, "step": 127680, "train_speed(iter/s)": 0.2898 }, { "acc": 0.72873535, "epoch": 1.4283177699452079, "grad_norm": 6.09375, "learning_rate": 2.0722541568376535e-06, "loss": 1.11928692, "memory(GiB)": 141.16, "step": 127700, "train_speed(iter/s)": 0.289815 }, { "acc": 0.73932333, "epoch": 1.4285414688911664, "grad_norm": 6.1875, "learning_rate": 2.0707551484041347e-06, "loss": 1.06042957, "memory(GiB)": 141.16, "step": 127720, "train_speed(iter/s)": 0.289831 }, { "acc": 0.72934275, "epoch": 1.428765167837125, "grad_norm": 8.9375, "learning_rate": 2.0692565407331834e-06, "loss": 1.08458996, "memory(GiB)": 141.16, "step": 127740, "train_speed(iter/s)": 0.289846 }, { "acc": 0.74538045, "epoch": 1.4289888667830835, "grad_norm": 7.5, "learning_rate": 2.0677583340298263e-06, "loss": 1.01576653, "memory(GiB)": 141.16, "step": 127760, "train_speed(iter/s)": 0.289862 }, { "acc": 0.72270231, "epoch": 1.429212565729042, "grad_norm": 7.34375, "learning_rate": 2.0662605284990388e-06, "loss": 1.12811422, "memory(GiB)": 141.16, "step": 127780, "train_speed(iter/s)": 0.289878 }, { "acc": 0.72831326, "epoch": 1.4294362646750005, "grad_norm": 6.875, "learning_rate": 2.0647631243457455e-06, "loss": 1.08877449, "memory(GiB)": 141.16, "step": 127800, "train_speed(iter/s)": 0.289892 }, { "acc": 0.73798561, "epoch": 1.429659963620959, "grad_norm": 7.9375, "learning_rate": 2.0632661217748094e-06, "loss": 1.04211473, "memory(GiB)": 141.16, "step": 127820, "train_speed(iter/s)": 0.289907 }, { "acc": 0.74276161, "epoch": 1.4298836625669176, "grad_norm": 5.5, "learning_rate": 2.0617695209910454e-06, "loss": 1.03235703, "memory(GiB)": 141.16, "step": 127840, "train_speed(iter/s)": 0.289923 }, { "acc": 0.73272228, "epoch": 1.430107361512876, "grad_norm": 8.5625, "learning_rate": 2.0602733221992077e-06, "loss": 1.0664938, "memory(GiB)": 141.16, "step": 127860, "train_speed(iter/s)": 0.289939 }, { "acc": 0.74282913, "epoch": 1.4303310604588346, "grad_norm": 6.53125, "learning_rate": 2.058777525603998e-06, "loss": 1.0316082, "memory(GiB)": 141.16, "step": 127880, "train_speed(iter/s)": 0.289955 }, { "acc": 0.71746917, "epoch": 1.4305547594047932, "grad_norm": 6.46875, "learning_rate": 2.057282131410062e-06, "loss": 1.15155182, "memory(GiB)": 141.16, "step": 127900, "train_speed(iter/s)": 0.289969 }, { "acc": 0.74104586, "epoch": 1.4307784583507517, "grad_norm": 4.28125, "learning_rate": 2.0557871398219903e-06, "loss": 1.03216438, "memory(GiB)": 141.16, "step": 127920, "train_speed(iter/s)": 0.289985 }, { "acc": 0.73505921, "epoch": 1.4310021572967102, "grad_norm": 8.1875, "learning_rate": 2.0542925510443224e-06, "loss": 1.0814353, "memory(GiB)": 141.16, "step": 127940, "train_speed(iter/s)": 0.29 }, { "acc": 0.73345909, "epoch": 1.4312258562426687, "grad_norm": 7.0625, "learning_rate": 2.0527983652815347e-06, "loss": 1.07834148, "memory(GiB)": 141.16, "step": 127960, "train_speed(iter/s)": 0.290015 }, { "acc": 0.74134369, "epoch": 1.4314495551886273, "grad_norm": 8.6875, "learning_rate": 2.0513045827380584e-06, "loss": 1.04817057, "memory(GiB)": 141.16, "step": 127980, "train_speed(iter/s)": 0.290031 }, { "acc": 0.73948741, "epoch": 1.4316732541345858, "grad_norm": 7.6875, "learning_rate": 2.0498112036182616e-06, "loss": 1.03862267, "memory(GiB)": 141.16, "step": 128000, "train_speed(iter/s)": 0.290047 }, { "epoch": 1.4316732541345858, "eval_acc": 0.690131233611327, "eval_loss": 1.0791271924972534, "eval_runtime": 2316.36, "eval_samples_per_second": 32.501, "eval_steps_per_second": 16.25, "step": 128000 }, { "acc": 0.71967931, "epoch": 1.4318969530805443, "grad_norm": 6.75, "learning_rate": 2.0483182281264586e-06, "loss": 1.13442421, "memory(GiB)": 141.16, "step": 128020, "train_speed(iter/s)": 0.288515 }, { "acc": 0.7340353, "epoch": 1.4321206520265028, "grad_norm": 7.9375, "learning_rate": 2.0468256564669124e-06, "loss": 1.07158022, "memory(GiB)": 141.16, "step": 128040, "train_speed(iter/s)": 0.28853 }, { "acc": 0.74191694, "epoch": 1.4323443509724614, "grad_norm": 6.5625, "learning_rate": 2.0453334888438253e-06, "loss": 1.03543549, "memory(GiB)": 141.16, "step": 128060, "train_speed(iter/s)": 0.288545 }, { "acc": 0.73448706, "epoch": 1.43256804991842, "grad_norm": 8.125, "learning_rate": 2.0438417254613508e-06, "loss": 1.06575956, "memory(GiB)": 141.16, "step": 128080, "train_speed(iter/s)": 0.288561 }, { "acc": 0.74660835, "epoch": 1.4327917488643784, "grad_norm": 6.90625, "learning_rate": 2.042350366523582e-06, "loss": 1.02976084, "memory(GiB)": 141.16, "step": 128100, "train_speed(iter/s)": 0.288578 }, { "acc": 0.74181614, "epoch": 1.433015447810337, "grad_norm": 6.09375, "learning_rate": 2.040859412234555e-06, "loss": 1.04521914, "memory(GiB)": 141.16, "step": 128120, "train_speed(iter/s)": 0.288594 }, { "acc": 0.74124002, "epoch": 1.4332391467562955, "grad_norm": 6.3125, "learning_rate": 2.0393688627982585e-06, "loss": 1.05363007, "memory(GiB)": 141.16, "step": 128140, "train_speed(iter/s)": 0.288609 }, { "acc": 0.73277721, "epoch": 1.433462845702254, "grad_norm": 7.21875, "learning_rate": 2.0378787184186165e-06, "loss": 1.07787752, "memory(GiB)": 141.16, "step": 128160, "train_speed(iter/s)": 0.288624 }, { "acc": 0.71796961, "epoch": 1.4336865446482125, "grad_norm": 7.28125, "learning_rate": 2.0363889792995067e-06, "loss": 1.13441944, "memory(GiB)": 141.16, "step": 128180, "train_speed(iter/s)": 0.288641 }, { "acc": 0.73141766, "epoch": 1.433910243594171, "grad_norm": 5.25, "learning_rate": 2.0348996456447438e-06, "loss": 1.10108509, "memory(GiB)": 141.16, "step": 128200, "train_speed(iter/s)": 0.288657 }, { "acc": 0.73122501, "epoch": 1.4341339425401296, "grad_norm": 6.40625, "learning_rate": 2.033410717658089e-06, "loss": 1.07968388, "memory(GiB)": 141.16, "step": 128220, "train_speed(iter/s)": 0.288673 }, { "acc": 0.7403502, "epoch": 1.4343576414860881, "grad_norm": 6.78125, "learning_rate": 2.0319221955432515e-06, "loss": 1.04716091, "memory(GiB)": 141.16, "step": 128240, "train_speed(iter/s)": 0.288689 }, { "acc": 0.71608973, "epoch": 1.4345813404320467, "grad_norm": 7.15625, "learning_rate": 2.03043407950388e-06, "loss": 1.14326477, "memory(GiB)": 141.16, "step": 128260, "train_speed(iter/s)": 0.288703 }, { "acc": 0.7339282, "epoch": 1.4348050393780052, "grad_norm": 7.46875, "learning_rate": 2.028946369743573e-06, "loss": 1.07207527, "memory(GiB)": 141.16, "step": 128280, "train_speed(iter/s)": 0.288718 }, { "acc": 0.73672419, "epoch": 1.4350287383239637, "grad_norm": 7.125, "learning_rate": 2.027459066465869e-06, "loss": 1.05320606, "memory(GiB)": 141.16, "step": 128300, "train_speed(iter/s)": 0.288734 }, { "acc": 0.73942962, "epoch": 1.4352524372699222, "grad_norm": 6.34375, "learning_rate": 2.02597216987425e-06, "loss": 1.03966312, "memory(GiB)": 141.16, "step": 128320, "train_speed(iter/s)": 0.288749 }, { "acc": 0.73252754, "epoch": 1.4354761362158808, "grad_norm": 7.46875, "learning_rate": 2.0244856801721484e-06, "loss": 1.07527695, "memory(GiB)": 141.16, "step": 128340, "train_speed(iter/s)": 0.288764 }, { "acc": 0.73174467, "epoch": 1.4356998351618393, "grad_norm": 6.625, "learning_rate": 2.0229995975629348e-06, "loss": 1.06403503, "memory(GiB)": 141.16, "step": 128360, "train_speed(iter/s)": 0.288782 }, { "acc": 0.7396512, "epoch": 1.4359235341077978, "grad_norm": 6.53125, "learning_rate": 2.021513922249928e-06, "loss": 1.05637608, "memory(GiB)": 141.16, "step": 128380, "train_speed(iter/s)": 0.288798 }, { "acc": 0.72160492, "epoch": 1.4361472330537564, "grad_norm": 7.21875, "learning_rate": 2.0200286544363902e-06, "loss": 1.12793245, "memory(GiB)": 141.16, "step": 128400, "train_speed(iter/s)": 0.288811 }, { "acc": 0.74209857, "epoch": 1.4363709319997149, "grad_norm": 7.1875, "learning_rate": 2.0185437943255233e-06, "loss": 1.0366272, "memory(GiB)": 141.16, "step": 128420, "train_speed(iter/s)": 0.288828 }, { "acc": 0.74771471, "epoch": 1.4365946309456734, "grad_norm": 7.625, "learning_rate": 2.017059342120482e-06, "loss": 0.9994978, "memory(GiB)": 141.16, "step": 128440, "train_speed(iter/s)": 0.288844 }, { "acc": 0.73838425, "epoch": 1.436818329891632, "grad_norm": 6.71875, "learning_rate": 2.0155752980243575e-06, "loss": 1.04091949, "memory(GiB)": 141.16, "step": 128460, "train_speed(iter/s)": 0.288859 }, { "acc": 0.73897676, "epoch": 1.4370420288375905, "grad_norm": 7.03125, "learning_rate": 2.0140916622401914e-06, "loss": 1.04191504, "memory(GiB)": 141.16, "step": 128480, "train_speed(iter/s)": 0.288873 }, { "acc": 0.72702646, "epoch": 1.437265727783549, "grad_norm": 8.4375, "learning_rate": 2.0126084349709635e-06, "loss": 1.10076427, "memory(GiB)": 141.16, "step": 128500, "train_speed(iter/s)": 0.288888 }, { "acc": 0.7237711, "epoch": 1.4374894267295075, "grad_norm": 6.375, "learning_rate": 2.0111256164196e-06, "loss": 1.11065626, "memory(GiB)": 141.16, "step": 128520, "train_speed(iter/s)": 0.288903 }, { "acc": 0.74024048, "epoch": 1.437713125675466, "grad_norm": 7.8125, "learning_rate": 2.0096432067889752e-06, "loss": 1.04135237, "memory(GiB)": 141.16, "step": 128540, "train_speed(iter/s)": 0.288917 }, { "acc": 0.73868728, "epoch": 1.4379368246214246, "grad_norm": 8.75, "learning_rate": 2.0081612062818995e-06, "loss": 1.05051441, "memory(GiB)": 141.16, "step": 128560, "train_speed(iter/s)": 0.288934 }, { "acc": 0.74181519, "epoch": 1.438160523567383, "grad_norm": 8.0, "learning_rate": 2.0066796151011358e-06, "loss": 1.03036423, "memory(GiB)": 141.16, "step": 128580, "train_speed(iter/s)": 0.28895 }, { "acc": 0.73319912, "epoch": 1.4383842225133416, "grad_norm": 8.0625, "learning_rate": 2.0051984334493857e-06, "loss": 1.06955023, "memory(GiB)": 141.16, "step": 128600, "train_speed(iter/s)": 0.288965 }, { "acc": 0.74040227, "epoch": 1.4386079214593002, "grad_norm": 7.625, "learning_rate": 2.003717661529293e-06, "loss": 1.05083075, "memory(GiB)": 141.16, "step": 128620, "train_speed(iter/s)": 0.288981 }, { "acc": 0.7371388, "epoch": 1.4388316204052587, "grad_norm": 7.09375, "learning_rate": 2.002237299543453e-06, "loss": 1.05196915, "memory(GiB)": 141.16, "step": 128640, "train_speed(iter/s)": 0.288997 }, { "acc": 0.72749395, "epoch": 1.4390553193512172, "grad_norm": 9.1875, "learning_rate": 2.000757347694397e-06, "loss": 1.0938859, "memory(GiB)": 141.16, "step": 128660, "train_speed(iter/s)": 0.289012 }, { "acc": 0.72123661, "epoch": 1.4392790182971757, "grad_norm": 9.5, "learning_rate": 1.9992778061846064e-06, "loss": 1.13315449, "memory(GiB)": 141.16, "step": 128680, "train_speed(iter/s)": 0.289027 }, { "acc": 0.72878132, "epoch": 1.4395027172431343, "grad_norm": 7.21875, "learning_rate": 1.9977986752165017e-06, "loss": 1.09408302, "memory(GiB)": 141.16, "step": 128700, "train_speed(iter/s)": 0.289044 }, { "acc": 0.74031372, "epoch": 1.4397264161890928, "grad_norm": 5.96875, "learning_rate": 1.996319954992448e-06, "loss": 1.05891542, "memory(GiB)": 141.16, "step": 128720, "train_speed(iter/s)": 0.289059 }, { "acc": 0.73462467, "epoch": 1.4399501151350513, "grad_norm": 7.09375, "learning_rate": 1.994841645714759e-06, "loss": 1.08278379, "memory(GiB)": 141.16, "step": 128740, "train_speed(iter/s)": 0.289075 }, { "acc": 0.73321524, "epoch": 1.4401738140810099, "grad_norm": 7.46875, "learning_rate": 1.9933637475856845e-06, "loss": 1.07043648, "memory(GiB)": 141.16, "step": 128760, "train_speed(iter/s)": 0.289091 }, { "acc": 0.73451996, "epoch": 1.4403975130269684, "grad_norm": 6.0, "learning_rate": 1.9918862608074258e-06, "loss": 1.07530508, "memory(GiB)": 141.16, "step": 128780, "train_speed(iter/s)": 0.289107 }, { "acc": 0.72702136, "epoch": 1.440621211972927, "grad_norm": 5.875, "learning_rate": 1.9904091855821223e-06, "loss": 1.10405159, "memory(GiB)": 141.16, "step": 128800, "train_speed(iter/s)": 0.28912 }, { "acc": 0.73305225, "epoch": 1.4408449109188854, "grad_norm": 7.375, "learning_rate": 1.9889325221118576e-06, "loss": 1.06218138, "memory(GiB)": 141.16, "step": 128820, "train_speed(iter/s)": 0.289134 }, { "acc": 0.72132769, "epoch": 1.441068609864844, "grad_norm": 6.59375, "learning_rate": 1.987456270598664e-06, "loss": 1.11978226, "memory(GiB)": 141.16, "step": 128840, "train_speed(iter/s)": 0.289149 }, { "acc": 0.74766617, "epoch": 1.4412923088108025, "grad_norm": 8.625, "learning_rate": 1.9859804312445096e-06, "loss": 1.00083542, "memory(GiB)": 141.16, "step": 128860, "train_speed(iter/s)": 0.289163 }, { "acc": 0.74454231, "epoch": 1.441516007756761, "grad_norm": 7.875, "learning_rate": 1.984505004251314e-06, "loss": 1.00009518, "memory(GiB)": 141.16, "step": 128880, "train_speed(iter/s)": 0.289179 }, { "acc": 0.72343397, "epoch": 1.4417397067027196, "grad_norm": 6.8125, "learning_rate": 1.983029989820936e-06, "loss": 1.11411514, "memory(GiB)": 141.16, "step": 128900, "train_speed(iter/s)": 0.289195 }, { "acc": 0.74963503, "epoch": 1.441963405648678, "grad_norm": 5.34375, "learning_rate": 1.9815553881551753e-06, "loss": 1.00996904, "memory(GiB)": 141.16, "step": 128920, "train_speed(iter/s)": 0.289211 }, { "acc": 0.73565583, "epoch": 1.4421871045946366, "grad_norm": 6.5625, "learning_rate": 1.9800811994557833e-06, "loss": 1.06099758, "memory(GiB)": 141.16, "step": 128940, "train_speed(iter/s)": 0.289225 }, { "acc": 0.72983999, "epoch": 1.4424108035405951, "grad_norm": 8.125, "learning_rate": 1.9786074239244458e-06, "loss": 1.0807766, "memory(GiB)": 141.16, "step": 128960, "train_speed(iter/s)": 0.289241 }, { "acc": 0.73403196, "epoch": 1.4426345024865537, "grad_norm": 7.21875, "learning_rate": 1.9771340617628e-06, "loss": 1.05759544, "memory(GiB)": 141.16, "step": 128980, "train_speed(iter/s)": 0.289257 }, { "acc": 0.74450636, "epoch": 1.4428582014325122, "grad_norm": 5.75, "learning_rate": 1.9756611131724215e-06, "loss": 1.02920361, "memory(GiB)": 141.16, "step": 129000, "train_speed(iter/s)": 0.289271 }, { "acc": 0.74656277, "epoch": 1.4430819003784707, "grad_norm": 8.3125, "learning_rate": 1.974188578354829e-06, "loss": 1.0192564, "memory(GiB)": 141.16, "step": 129020, "train_speed(iter/s)": 0.289287 }, { "acc": 0.72691612, "epoch": 1.4433055993244293, "grad_norm": 7.8125, "learning_rate": 1.972716457511489e-06, "loss": 1.09564915, "memory(GiB)": 141.16, "step": 129040, "train_speed(iter/s)": 0.289302 }, { "acc": 0.72507348, "epoch": 1.4435292982703878, "grad_norm": 8.4375, "learning_rate": 1.9712447508438072e-06, "loss": 1.10288258, "memory(GiB)": 141.16, "step": 129060, "train_speed(iter/s)": 0.289318 }, { "acc": 0.74058843, "epoch": 1.4437529972163463, "grad_norm": 8.5625, "learning_rate": 1.9697734585531348e-06, "loss": 1.03996849, "memory(GiB)": 141.16, "step": 129080, "train_speed(iter/s)": 0.289334 }, { "acc": 0.72992592, "epoch": 1.4439766961623048, "grad_norm": 6.125, "learning_rate": 1.9683025808407635e-06, "loss": 1.08432026, "memory(GiB)": 141.16, "step": 129100, "train_speed(iter/s)": 0.28935 }, { "acc": 0.72991028, "epoch": 1.4442003951082634, "grad_norm": 6.84375, "learning_rate": 1.9668321179079337e-06, "loss": 1.08809109, "memory(GiB)": 141.16, "step": 129120, "train_speed(iter/s)": 0.289365 }, { "acc": 0.73738046, "epoch": 1.444424094054222, "grad_norm": 6.5625, "learning_rate": 1.965362069955824e-06, "loss": 1.04367676, "memory(GiB)": 141.16, "step": 129140, "train_speed(iter/s)": 0.289381 }, { "acc": 0.74102678, "epoch": 1.4446477930001804, "grad_norm": 6.84375, "learning_rate": 1.9638924371855565e-06, "loss": 1.02322598, "memory(GiB)": 141.16, "step": 129160, "train_speed(iter/s)": 0.289395 }, { "acc": 0.72309723, "epoch": 1.444871491946139, "grad_norm": 8.6875, "learning_rate": 1.962423219798202e-06, "loss": 1.10406046, "memory(GiB)": 141.16, "step": 129180, "train_speed(iter/s)": 0.28941 }, { "acc": 0.7259409, "epoch": 1.4450951908920975, "grad_norm": 8.5, "learning_rate": 1.9609544179947653e-06, "loss": 1.09628954, "memory(GiB)": 141.16, "step": 129200, "train_speed(iter/s)": 0.289425 }, { "acc": 0.7298707, "epoch": 1.445318889838056, "grad_norm": 7.0625, "learning_rate": 1.9594860319762045e-06, "loss": 1.0910635, "memory(GiB)": 141.16, "step": 129220, "train_speed(iter/s)": 0.289439 }, { "acc": 0.74866199, "epoch": 1.4455425887840145, "grad_norm": 7.84375, "learning_rate": 1.958018061943413e-06, "loss": 1.01040764, "memory(GiB)": 141.16, "step": 129240, "train_speed(iter/s)": 0.289455 }, { "acc": 0.72308731, "epoch": 1.445766287729973, "grad_norm": 8.5625, "learning_rate": 1.9565505080972293e-06, "loss": 1.10410614, "memory(GiB)": 141.16, "step": 129260, "train_speed(iter/s)": 0.28947 }, { "acc": 0.72609854, "epoch": 1.4459899866759316, "grad_norm": 5.71875, "learning_rate": 1.955083370638438e-06, "loss": 1.09650078, "memory(GiB)": 141.16, "step": 129280, "train_speed(iter/s)": 0.289486 }, { "acc": 0.71756487, "epoch": 1.4462136856218901, "grad_norm": 7.25, "learning_rate": 1.953616649767762e-06, "loss": 1.1374589, "memory(GiB)": 141.16, "step": 129300, "train_speed(iter/s)": 0.2895 }, { "acc": 0.740837, "epoch": 1.4464373845678486, "grad_norm": 6.9375, "learning_rate": 1.952150345685874e-06, "loss": 1.04747353, "memory(GiB)": 141.16, "step": 129320, "train_speed(iter/s)": 0.289516 }, { "acc": 0.73793015, "epoch": 1.4466610835138072, "grad_norm": 5.78125, "learning_rate": 1.9506844585933817e-06, "loss": 1.05852242, "memory(GiB)": 141.16, "step": 129340, "train_speed(iter/s)": 0.289531 }, { "acc": 0.7325695, "epoch": 1.4468847824597657, "grad_norm": 5.5625, "learning_rate": 1.949218988690838e-06, "loss": 1.07262268, "memory(GiB)": 141.16, "step": 129360, "train_speed(iter/s)": 0.289547 }, { "acc": 0.73100152, "epoch": 1.4471084814057242, "grad_norm": 8.8125, "learning_rate": 1.9477539361787447e-06, "loss": 1.07682171, "memory(GiB)": 141.16, "step": 129380, "train_speed(iter/s)": 0.289561 }, { "acc": 0.72866158, "epoch": 1.4473321803516828, "grad_norm": 8.6875, "learning_rate": 1.9462893012575373e-06, "loss": 1.09501562, "memory(GiB)": 141.16, "step": 129400, "train_speed(iter/s)": 0.289577 }, { "acc": 0.74586096, "epoch": 1.4475558792976413, "grad_norm": 7.03125, "learning_rate": 1.9448250841276033e-06, "loss": 1.01240101, "memory(GiB)": 141.16, "step": 129420, "train_speed(iter/s)": 0.289592 }, { "acc": 0.73237524, "epoch": 1.4477795782435998, "grad_norm": 7.0, "learning_rate": 1.9433612849892664e-06, "loss": 1.0817915, "memory(GiB)": 141.16, "step": 129440, "train_speed(iter/s)": 0.289607 }, { "acc": 0.75144806, "epoch": 1.4480032771895583, "grad_norm": 8.875, "learning_rate": 1.9418979040427934e-06, "loss": 0.98973427, "memory(GiB)": 141.16, "step": 129460, "train_speed(iter/s)": 0.289622 }, { "acc": 0.73768597, "epoch": 1.4482269761355169, "grad_norm": 6.3125, "learning_rate": 1.940434941488399e-06, "loss": 1.05084305, "memory(GiB)": 141.16, "step": 129480, "train_speed(iter/s)": 0.289637 }, { "acc": 0.7330678, "epoch": 1.4484506750814754, "grad_norm": 6.03125, "learning_rate": 1.9389723975262337e-06, "loss": 1.06640129, "memory(GiB)": 141.16, "step": 129500, "train_speed(iter/s)": 0.28965 }, { "acc": 0.74508429, "epoch": 1.448674374027434, "grad_norm": 6.28125, "learning_rate": 1.937510272356399e-06, "loss": 1.02392311, "memory(GiB)": 141.16, "step": 129520, "train_speed(iter/s)": 0.289666 }, { "acc": 0.73458643, "epoch": 1.4488980729733925, "grad_norm": 9.5625, "learning_rate": 1.936048566178932e-06, "loss": 1.07160892, "memory(GiB)": 141.16, "step": 129540, "train_speed(iter/s)": 0.289681 }, { "acc": 0.74071503, "epoch": 1.449121771919351, "grad_norm": 6.84375, "learning_rate": 1.934587279193813e-06, "loss": 1.03792734, "memory(GiB)": 141.16, "step": 129560, "train_speed(iter/s)": 0.289696 }, { "acc": 0.74228115, "epoch": 1.4493454708653095, "grad_norm": 6.0, "learning_rate": 1.933126411600971e-06, "loss": 1.02556524, "memory(GiB)": 141.16, "step": 129580, "train_speed(iter/s)": 0.289711 }, { "acc": 0.72613983, "epoch": 1.449569169811268, "grad_norm": 9.5, "learning_rate": 1.9316659636002698e-06, "loss": 1.09761591, "memory(GiB)": 141.16, "step": 129600, "train_speed(iter/s)": 0.289726 }, { "acc": 0.75338635, "epoch": 1.4497928687572266, "grad_norm": 6.96875, "learning_rate": 1.930205935391524e-06, "loss": 0.98229628, "memory(GiB)": 141.16, "step": 129620, "train_speed(iter/s)": 0.289739 }, { "acc": 0.72407703, "epoch": 1.450016567703185, "grad_norm": 8.3125, "learning_rate": 1.9287463271744827e-06, "loss": 1.12737427, "memory(GiB)": 141.16, "step": 129640, "train_speed(iter/s)": 0.289753 }, { "acc": 0.74343381, "epoch": 1.4502402666491436, "grad_norm": 6.96875, "learning_rate": 1.927287139148841e-06, "loss": 1.01577053, "memory(GiB)": 141.16, "step": 129660, "train_speed(iter/s)": 0.289769 }, { "acc": 0.73206201, "epoch": 1.4504639655951022, "grad_norm": 8.5625, "learning_rate": 1.925828371514239e-06, "loss": 1.08704815, "memory(GiB)": 141.16, "step": 129680, "train_speed(iter/s)": 0.289782 }, { "acc": 0.73929605, "epoch": 1.4506876645410607, "grad_norm": 7.4375, "learning_rate": 1.924370024470254e-06, "loss": 1.03949986, "memory(GiB)": 141.16, "step": 129700, "train_speed(iter/s)": 0.289798 }, { "acc": 0.72759337, "epoch": 1.4509113634870192, "grad_norm": 5.5, "learning_rate": 1.922912098216413e-06, "loss": 1.0966362, "memory(GiB)": 141.16, "step": 129720, "train_speed(iter/s)": 0.289812 }, { "acc": 0.75067859, "epoch": 1.4511350624329777, "grad_norm": 9.1875, "learning_rate": 1.921454592952178e-06, "loss": 0.98968334, "memory(GiB)": 141.16, "step": 129740, "train_speed(iter/s)": 0.289828 }, { "acc": 0.7307375, "epoch": 1.4513587613789363, "grad_norm": 6.96875, "learning_rate": 1.9199975088769558e-06, "loss": 1.08551588, "memory(GiB)": 141.16, "step": 129760, "train_speed(iter/s)": 0.289841 }, { "acc": 0.7361001, "epoch": 1.4515824603248948, "grad_norm": 6.53125, "learning_rate": 1.9185408461900997e-06, "loss": 1.06110649, "memory(GiB)": 141.16, "step": 129780, "train_speed(iter/s)": 0.289857 }, { "acc": 0.73392148, "epoch": 1.4518061592708533, "grad_norm": 7.71875, "learning_rate": 1.9170846050908983e-06, "loss": 1.09446249, "memory(GiB)": 141.16, "step": 129800, "train_speed(iter/s)": 0.289872 }, { "acc": 0.73732586, "epoch": 1.4520298582168119, "grad_norm": 7.5, "learning_rate": 1.915628785778589e-06, "loss": 1.04453678, "memory(GiB)": 141.16, "step": 129820, "train_speed(iter/s)": 0.289887 }, { "acc": 0.74218855, "epoch": 1.4522535571627704, "grad_norm": 6.5625, "learning_rate": 1.9141733884523485e-06, "loss": 1.04860592, "memory(GiB)": 141.16, "step": 129840, "train_speed(iter/s)": 0.289901 }, { "acc": 0.74128213, "epoch": 1.452477256108729, "grad_norm": 8.0625, "learning_rate": 1.9127184133112923e-06, "loss": 1.02695789, "memory(GiB)": 141.16, "step": 129860, "train_speed(iter/s)": 0.289917 }, { "acc": 0.73682013, "epoch": 1.4527009550546874, "grad_norm": 7.78125, "learning_rate": 1.911263860554487e-06, "loss": 1.04907093, "memory(GiB)": 141.16, "step": 129880, "train_speed(iter/s)": 0.289932 }, { "acc": 0.73410454, "epoch": 1.452924654000646, "grad_norm": 6.28125, "learning_rate": 1.909809730380932e-06, "loss": 1.06456432, "memory(GiB)": 141.16, "step": 129900, "train_speed(iter/s)": 0.289947 }, { "acc": 0.73915157, "epoch": 1.4531483529466045, "grad_norm": 7.125, "learning_rate": 1.908356022989577e-06, "loss": 1.03466816, "memory(GiB)": 141.16, "step": 129920, "train_speed(iter/s)": 0.28996 }, { "acc": 0.7135994, "epoch": 1.453372051892563, "grad_norm": 8.25, "learning_rate": 1.906902738579307e-06, "loss": 1.15958614, "memory(GiB)": 141.16, "step": 129940, "train_speed(iter/s)": 0.289976 }, { "acc": 0.73114901, "epoch": 1.4535957508385215, "grad_norm": 6.46875, "learning_rate": 1.9054498773489521e-06, "loss": 1.07947092, "memory(GiB)": 141.16, "step": 129960, "train_speed(iter/s)": 0.28999 }, { "acc": 0.72472048, "epoch": 1.45381944978448, "grad_norm": 6.9375, "learning_rate": 1.9039974394972865e-06, "loss": 1.1259161, "memory(GiB)": 141.16, "step": 129980, "train_speed(iter/s)": 0.290004 }, { "acc": 0.73933687, "epoch": 1.4540431487304386, "grad_norm": 8.0625, "learning_rate": 1.9025454252230214e-06, "loss": 1.04688454, "memory(GiB)": 141.16, "step": 130000, "train_speed(iter/s)": 0.29002 }, { "epoch": 1.4540431487304386, "eval_acc": 0.6901460226078404, "eval_loss": 1.0791447162628174, "eval_runtime": 2322.9905, "eval_samples_per_second": 32.408, "eval_steps_per_second": 16.204, "step": 130000 }, { "acc": 0.72817545, "epoch": 1.4542668476763971, "grad_norm": 5.875, "learning_rate": 1.901093834724817e-06, "loss": 1.08360472, "memory(GiB)": 141.16, "step": 130020, "train_speed(iter/s)": 0.288508 }, { "acc": 0.7331677, "epoch": 1.4544905466223557, "grad_norm": 7.34375, "learning_rate": 1.8996426682012675e-06, "loss": 1.06707249, "memory(GiB)": 141.16, "step": 130040, "train_speed(iter/s)": 0.288525 }, { "acc": 0.74122429, "epoch": 1.4547142455683142, "grad_norm": 6.84375, "learning_rate": 1.8981919258509174e-06, "loss": 1.05133018, "memory(GiB)": 141.16, "step": 130060, "train_speed(iter/s)": 0.28854 }, { "acc": 0.7318851, "epoch": 1.4549379445142727, "grad_norm": 6.09375, "learning_rate": 1.8967416078722466e-06, "loss": 1.07351208, "memory(GiB)": 141.16, "step": 130080, "train_speed(iter/s)": 0.288555 }, { "acc": 0.73933802, "epoch": 1.4551616434602312, "grad_norm": 7.09375, "learning_rate": 1.8952917144636784e-06, "loss": 1.05058575, "memory(GiB)": 141.16, "step": 130100, "train_speed(iter/s)": 0.28857 }, { "acc": 0.72401857, "epoch": 1.4553853424061898, "grad_norm": 7.5625, "learning_rate": 1.8938422458235816e-06, "loss": 1.11426411, "memory(GiB)": 141.16, "step": 130120, "train_speed(iter/s)": 0.288585 }, { "acc": 0.73131256, "epoch": 1.4556090413521483, "grad_norm": 7.40625, "learning_rate": 1.892393202150261e-06, "loss": 1.08886538, "memory(GiB)": 141.16, "step": 130140, "train_speed(iter/s)": 0.288599 }, { "acc": 0.74055319, "epoch": 1.4558327402981068, "grad_norm": 5.875, "learning_rate": 1.8909445836419699e-06, "loss": 1.02575493, "memory(GiB)": 141.16, "step": 130160, "train_speed(iter/s)": 0.288614 }, { "acc": 0.73447628, "epoch": 1.4560564392440654, "grad_norm": 4.5625, "learning_rate": 1.8894963904968982e-06, "loss": 1.06299534, "memory(GiB)": 141.16, "step": 130180, "train_speed(iter/s)": 0.288628 }, { "acc": 0.71683998, "epoch": 1.4562801381900239, "grad_norm": 6.375, "learning_rate": 1.8880486229131783e-06, "loss": 1.15031261, "memory(GiB)": 141.16, "step": 130200, "train_speed(iter/s)": 0.288642 }, { "acc": 0.72650185, "epoch": 1.4565038371359824, "grad_norm": 7.25, "learning_rate": 1.8866012810888889e-06, "loss": 1.10023746, "memory(GiB)": 141.16, "step": 130220, "train_speed(iter/s)": 0.288658 }, { "acc": 0.73027382, "epoch": 1.456727536081941, "grad_norm": 7.75, "learning_rate": 1.8851543652220445e-06, "loss": 1.0888751, "memory(GiB)": 141.16, "step": 130240, "train_speed(iter/s)": 0.288673 }, { "acc": 0.74675026, "epoch": 1.4569512350278995, "grad_norm": 6.96875, "learning_rate": 1.883707875510604e-06, "loss": 1.01792517, "memory(GiB)": 141.16, "step": 130260, "train_speed(iter/s)": 0.28869 }, { "acc": 0.7392859, "epoch": 1.457174933973858, "grad_norm": 4.84375, "learning_rate": 1.8822618121524671e-06, "loss": 1.03750534, "memory(GiB)": 141.16, "step": 130280, "train_speed(iter/s)": 0.288706 }, { "acc": 0.72895393, "epoch": 1.4573986329198165, "grad_norm": 6.21875, "learning_rate": 1.8808161753454785e-06, "loss": 1.09266968, "memory(GiB)": 141.16, "step": 130300, "train_speed(iter/s)": 0.288722 }, { "acc": 0.72921557, "epoch": 1.457622331865775, "grad_norm": 6.8125, "learning_rate": 1.8793709652874203e-06, "loss": 1.07810822, "memory(GiB)": 141.16, "step": 130320, "train_speed(iter/s)": 0.288737 }, { "acc": 0.72823305, "epoch": 1.4578460308117336, "grad_norm": 8.3125, "learning_rate": 1.877926182176017e-06, "loss": 1.0978322, "memory(GiB)": 141.16, "step": 130340, "train_speed(iter/s)": 0.288752 }, { "acc": 0.73714457, "epoch": 1.4580697297576921, "grad_norm": 7.3125, "learning_rate": 1.876481826208938e-06, "loss": 1.06488457, "memory(GiB)": 141.16, "step": 130360, "train_speed(iter/s)": 0.288766 }, { "acc": 0.73221936, "epoch": 1.4582934287036506, "grad_norm": 9.125, "learning_rate": 1.8750378975837884e-06, "loss": 1.06821747, "memory(GiB)": 141.16, "step": 130380, "train_speed(iter/s)": 0.288782 }, { "acc": 0.74241943, "epoch": 1.4585171276496092, "grad_norm": 6.6875, "learning_rate": 1.8735943964981229e-06, "loss": 1.01455793, "memory(GiB)": 141.16, "step": 130400, "train_speed(iter/s)": 0.288798 }, { "acc": 0.72090454, "epoch": 1.4587408265955677, "grad_norm": 7.25, "learning_rate": 1.8721513231494304e-06, "loss": 1.13473644, "memory(GiB)": 141.16, "step": 130420, "train_speed(iter/s)": 0.288812 }, { "acc": 0.7342083, "epoch": 1.4589645255415262, "grad_norm": 6.6875, "learning_rate": 1.8707086777351424e-06, "loss": 1.057757, "memory(GiB)": 141.16, "step": 130440, "train_speed(iter/s)": 0.288828 }, { "acc": 0.74056602, "epoch": 1.4591882244874848, "grad_norm": 5.875, "learning_rate": 1.8692664604526368e-06, "loss": 1.04274216, "memory(GiB)": 141.16, "step": 130460, "train_speed(iter/s)": 0.288842 }, { "acc": 0.7388845, "epoch": 1.4594119234334433, "grad_norm": 7.53125, "learning_rate": 1.867824671499226e-06, "loss": 1.05092888, "memory(GiB)": 141.16, "step": 130480, "train_speed(iter/s)": 0.288856 }, { "acc": 0.7373714, "epoch": 1.4596356223794018, "grad_norm": 6.90625, "learning_rate": 1.8663833110721714e-06, "loss": 1.0469286, "memory(GiB)": 141.16, "step": 130500, "train_speed(iter/s)": 0.288872 }, { "acc": 0.7237577, "epoch": 1.4598593213253603, "grad_norm": 5.53125, "learning_rate": 1.8649423793686694e-06, "loss": 1.10715771, "memory(GiB)": 141.16, "step": 130520, "train_speed(iter/s)": 0.288885 }, { "acc": 0.73592315, "epoch": 1.4600830202713189, "grad_norm": 7.25, "learning_rate": 1.8635018765858582e-06, "loss": 1.06394644, "memory(GiB)": 141.16, "step": 130540, "train_speed(iter/s)": 0.288901 }, { "acc": 0.72747002, "epoch": 1.4603067192172774, "grad_norm": 5.71875, "learning_rate": 1.8620618029208231e-06, "loss": 1.09437313, "memory(GiB)": 141.16, "step": 130560, "train_speed(iter/s)": 0.288917 }, { "acc": 0.74185057, "epoch": 1.460530418163236, "grad_norm": 9.125, "learning_rate": 1.8606221585705831e-06, "loss": 1.00503464, "memory(GiB)": 141.16, "step": 130580, "train_speed(iter/s)": 0.288931 }, { "acc": 0.73511724, "epoch": 1.4607541171091944, "grad_norm": 7.84375, "learning_rate": 1.8591829437321058e-06, "loss": 1.07486916, "memory(GiB)": 141.16, "step": 130600, "train_speed(iter/s)": 0.288947 }, { "acc": 0.73628516, "epoch": 1.460977816055153, "grad_norm": 8.625, "learning_rate": 1.8577441586022937e-06, "loss": 1.03840199, "memory(GiB)": 141.16, "step": 130620, "train_speed(iter/s)": 0.288963 }, { "acc": 0.72846994, "epoch": 1.4612015150011115, "grad_norm": 6.40625, "learning_rate": 1.856305803377993e-06, "loss": 1.09505138, "memory(GiB)": 141.16, "step": 130640, "train_speed(iter/s)": 0.288976 }, { "acc": 0.72738619, "epoch": 1.46142521394707, "grad_norm": 7.3125, "learning_rate": 1.8548678782559932e-06, "loss": 1.08576164, "memory(GiB)": 141.16, "step": 130660, "train_speed(iter/s)": 0.288992 }, { "acc": 0.73065858, "epoch": 1.4616489128930286, "grad_norm": 7.75, "learning_rate": 1.853430383433021e-06, "loss": 1.07433233, "memory(GiB)": 141.16, "step": 130680, "train_speed(iter/s)": 0.289006 }, { "acc": 0.73668699, "epoch": 1.461872611838987, "grad_norm": 7.09375, "learning_rate": 1.8519933191057483e-06, "loss": 1.05622349, "memory(GiB)": 141.16, "step": 130700, "train_speed(iter/s)": 0.289022 }, { "acc": 0.7368731, "epoch": 1.4620963107849456, "grad_norm": 6.5625, "learning_rate": 1.8505566854707845e-06, "loss": 1.05934525, "memory(GiB)": 141.16, "step": 130720, "train_speed(iter/s)": 0.289038 }, { "acc": 0.73223686, "epoch": 1.4623200097309041, "grad_norm": 6.96875, "learning_rate": 1.8491204827246811e-06, "loss": 1.08128881, "memory(GiB)": 141.16, "step": 130740, "train_speed(iter/s)": 0.289052 }, { "acc": 0.73454113, "epoch": 1.4625437086768627, "grad_norm": 8.0625, "learning_rate": 1.847684711063934e-06, "loss": 1.06058617, "memory(GiB)": 141.16, "step": 130760, "train_speed(iter/s)": 0.289067 }, { "acc": 0.73447113, "epoch": 1.4627674076228212, "grad_norm": 8.9375, "learning_rate": 1.8462493706849733e-06, "loss": 1.0712944, "memory(GiB)": 141.16, "step": 130780, "train_speed(iter/s)": 0.289083 }, { "acc": 0.75263634, "epoch": 1.4629911065687797, "grad_norm": 7.96875, "learning_rate": 1.844814461784178e-06, "loss": 0.97270498, "memory(GiB)": 141.16, "step": 130800, "train_speed(iter/s)": 0.289099 }, { "acc": 0.73282261, "epoch": 1.4632148055147383, "grad_norm": 7.71875, "learning_rate": 1.843379984557862e-06, "loss": 1.06988811, "memory(GiB)": 141.16, "step": 130820, "train_speed(iter/s)": 0.289113 }, { "acc": 0.74188948, "epoch": 1.4634385044606968, "grad_norm": 8.5, "learning_rate": 1.841945939202281e-06, "loss": 1.02220888, "memory(GiB)": 141.16, "step": 130840, "train_speed(iter/s)": 0.289129 }, { "acc": 0.73610659, "epoch": 1.4636622034066553, "grad_norm": 8.9375, "learning_rate": 1.8405123259136365e-06, "loss": 1.05859871, "memory(GiB)": 141.16, "step": 130860, "train_speed(iter/s)": 0.289143 }, { "acc": 0.73902006, "epoch": 1.4638859023526138, "grad_norm": 6.9375, "learning_rate": 1.8390791448880635e-06, "loss": 1.05235939, "memory(GiB)": 141.16, "step": 130880, "train_speed(iter/s)": 0.289158 }, { "acc": 0.73075132, "epoch": 1.4641096012985724, "grad_norm": 6.0, "learning_rate": 1.837646396321645e-06, "loss": 1.07526875, "memory(GiB)": 141.16, "step": 130900, "train_speed(iter/s)": 0.289174 }, { "acc": 0.72779036, "epoch": 1.464333300244531, "grad_norm": 6.90625, "learning_rate": 1.8362140804104e-06, "loss": 1.09263563, "memory(GiB)": 141.16, "step": 130920, "train_speed(iter/s)": 0.28919 }, { "acc": 0.7360404, "epoch": 1.4645569991904894, "grad_norm": 8.0625, "learning_rate": 1.8347821973502878e-06, "loss": 1.06728764, "memory(GiB)": 141.16, "step": 130940, "train_speed(iter/s)": 0.289204 }, { "acc": 0.75514131, "epoch": 1.464780698136448, "grad_norm": 6.875, "learning_rate": 1.8333507473372142e-06, "loss": 0.98523054, "memory(GiB)": 141.16, "step": 130960, "train_speed(iter/s)": 0.289218 }, { "acc": 0.74746351, "epoch": 1.4650043970824065, "grad_norm": 7.21875, "learning_rate": 1.8319197305670189e-06, "loss": 1.00223999, "memory(GiB)": 141.16, "step": 130980, "train_speed(iter/s)": 0.289233 }, { "acc": 0.73397098, "epoch": 1.465228096028365, "grad_norm": 5.75, "learning_rate": 1.830489147235488e-06, "loss": 1.07373123, "memory(GiB)": 141.16, "step": 131000, "train_speed(iter/s)": 0.289247 }, { "acc": 0.73060379, "epoch": 1.4654517949743235, "grad_norm": 6.53125, "learning_rate": 1.829058997538345e-06, "loss": 1.07817364, "memory(GiB)": 141.16, "step": 131020, "train_speed(iter/s)": 0.289262 }, { "acc": 0.73050795, "epoch": 1.465675493920282, "grad_norm": 7.71875, "learning_rate": 1.8276292816712521e-06, "loss": 1.08012524, "memory(GiB)": 141.16, "step": 131040, "train_speed(iter/s)": 0.289277 }, { "acc": 0.74287186, "epoch": 1.4658991928662406, "grad_norm": 7.71875, "learning_rate": 1.8261999998298192e-06, "loss": 1.01644707, "memory(GiB)": 141.16, "step": 131060, "train_speed(iter/s)": 0.289293 }, { "acc": 0.74209528, "epoch": 1.4661228918121991, "grad_norm": 8.1875, "learning_rate": 1.8247711522095884e-06, "loss": 1.04259367, "memory(GiB)": 141.16, "step": 131080, "train_speed(iter/s)": 0.28931 }, { "acc": 0.73085976, "epoch": 1.4663465907581577, "grad_norm": 6.625, "learning_rate": 1.8233427390060505e-06, "loss": 1.09162235, "memory(GiB)": 141.16, "step": 131100, "train_speed(iter/s)": 0.289326 }, { "acc": 0.73593988, "epoch": 1.4665702897041162, "grad_norm": 7.375, "learning_rate": 1.8219147604146303e-06, "loss": 1.06294079, "memory(GiB)": 141.16, "step": 131120, "train_speed(iter/s)": 0.289342 }, { "acc": 0.73569717, "epoch": 1.4667939886500747, "grad_norm": 6.9375, "learning_rate": 1.8204872166306948e-06, "loss": 1.06199818, "memory(GiB)": 141.16, "step": 131140, "train_speed(iter/s)": 0.289356 }, { "acc": 0.74733109, "epoch": 1.4670176875960332, "grad_norm": 7.125, "learning_rate": 1.819060107849555e-06, "loss": 0.99735241, "memory(GiB)": 141.16, "step": 131160, "train_speed(iter/s)": 0.28937 }, { "acc": 0.75149312, "epoch": 1.4672413865419918, "grad_norm": 7.84375, "learning_rate": 1.8176334342664576e-06, "loss": 0.99517536, "memory(GiB)": 141.16, "step": 131180, "train_speed(iter/s)": 0.289385 }, { "acc": 0.74360542, "epoch": 1.4674650854879503, "grad_norm": 6.53125, "learning_rate": 1.8162071960765941e-06, "loss": 1.0082407, "memory(GiB)": 141.16, "step": 131200, "train_speed(iter/s)": 0.289399 }, { "acc": 0.73273234, "epoch": 1.4676887844339088, "grad_norm": 5.5625, "learning_rate": 1.8147813934750935e-06, "loss": 1.06567202, "memory(GiB)": 141.16, "step": 131220, "train_speed(iter/s)": 0.289415 }, { "acc": 0.74786673, "epoch": 1.4679124833798673, "grad_norm": 7.34375, "learning_rate": 1.8133560266570234e-06, "loss": 1.00936499, "memory(GiB)": 141.16, "step": 131240, "train_speed(iter/s)": 0.289429 }, { "acc": 0.75028958, "epoch": 1.4681361823258259, "grad_norm": 7.53125, "learning_rate": 1.811931095817398e-06, "loss": 0.99323235, "memory(GiB)": 141.16, "step": 131260, "train_speed(iter/s)": 0.289445 }, { "acc": 0.74946012, "epoch": 1.4683598812717844, "grad_norm": 7.375, "learning_rate": 1.8105066011511657e-06, "loss": 0.99623823, "memory(GiB)": 141.16, "step": 131280, "train_speed(iter/s)": 0.28946 }, { "acc": 0.731282, "epoch": 1.468583580217743, "grad_norm": 6.21875, "learning_rate": 1.8090825428532198e-06, "loss": 1.09288931, "memory(GiB)": 141.16, "step": 131300, "train_speed(iter/s)": 0.289472 }, { "acc": 0.73379908, "epoch": 1.4688072791637015, "grad_norm": 8.375, "learning_rate": 1.8076589211183909e-06, "loss": 1.08240967, "memory(GiB)": 141.16, "step": 131320, "train_speed(iter/s)": 0.289487 }, { "acc": 0.72703571, "epoch": 1.46903097810966, "grad_norm": 8.375, "learning_rate": 1.8062357361414496e-06, "loss": 1.09803228, "memory(GiB)": 141.16, "step": 131340, "train_speed(iter/s)": 0.289502 }, { "acc": 0.74312968, "epoch": 1.4692546770556185, "grad_norm": 7.625, "learning_rate": 1.80481298811711e-06, "loss": 1.02691822, "memory(GiB)": 141.16, "step": 131360, "train_speed(iter/s)": 0.289515 }, { "acc": 0.73453417, "epoch": 1.469478376001577, "grad_norm": 5.53125, "learning_rate": 1.8033906772400217e-06, "loss": 1.06683483, "memory(GiB)": 141.16, "step": 131380, "train_speed(iter/s)": 0.289528 }, { "acc": 0.74132223, "epoch": 1.4697020749475356, "grad_norm": 8.6875, "learning_rate": 1.8019688037047806e-06, "loss": 1.03374128, "memory(GiB)": 141.16, "step": 131400, "train_speed(iter/s)": 0.289542 }, { "acc": 0.71671219, "epoch": 1.469925773893494, "grad_norm": 6.34375, "learning_rate": 1.8005473677059176e-06, "loss": 1.14046478, "memory(GiB)": 141.16, "step": 131420, "train_speed(iter/s)": 0.289557 }, { "acc": 0.72621984, "epoch": 1.4701494728394526, "grad_norm": 6.21875, "learning_rate": 1.7991263694379058e-06, "loss": 1.09894123, "memory(GiB)": 141.16, "step": 131440, "train_speed(iter/s)": 0.289572 }, { "acc": 0.73233948, "epoch": 1.4703731717854112, "grad_norm": 6.34375, "learning_rate": 1.7977058090951571e-06, "loss": 1.09254847, "memory(GiB)": 141.16, "step": 131460, "train_speed(iter/s)": 0.289587 }, { "acc": 0.74769936, "epoch": 1.4705968707313697, "grad_norm": 7.28125, "learning_rate": 1.7962856868720236e-06, "loss": 1.01002083, "memory(GiB)": 141.16, "step": 131480, "train_speed(iter/s)": 0.289602 }, { "acc": 0.74550061, "epoch": 1.4708205696773282, "grad_norm": 6.4375, "learning_rate": 1.7948660029628013e-06, "loss": 1.01288338, "memory(GiB)": 141.16, "step": 131500, "train_speed(iter/s)": 0.289618 }, { "acc": 0.72097621, "epoch": 1.4710442686232867, "grad_norm": 5.875, "learning_rate": 1.7934467575617204e-06, "loss": 1.11689682, "memory(GiB)": 141.16, "step": 131520, "train_speed(iter/s)": 0.289635 }, { "acc": 0.7379909, "epoch": 1.4712679675692453, "grad_norm": 7.5625, "learning_rate": 1.7920279508629569e-06, "loss": 1.05258121, "memory(GiB)": 141.16, "step": 131540, "train_speed(iter/s)": 0.289652 }, { "acc": 0.71681423, "epoch": 1.4714916665152038, "grad_norm": 8.0, "learning_rate": 1.790609583060622e-06, "loss": 1.14015427, "memory(GiB)": 141.16, "step": 131560, "train_speed(iter/s)": 0.289666 }, { "acc": 0.73901396, "epoch": 1.4717153654611623, "grad_norm": 8.375, "learning_rate": 1.789191654348767e-06, "loss": 1.05700779, "memory(GiB)": 141.16, "step": 131580, "train_speed(iter/s)": 0.289681 }, { "acc": 0.75409732, "epoch": 1.4719390644071209, "grad_norm": 6.25, "learning_rate": 1.7877741649213886e-06, "loss": 0.99089146, "memory(GiB)": 141.16, "step": 131600, "train_speed(iter/s)": 0.289696 }, { "acc": 0.73145266, "epoch": 1.4721627633530794, "grad_norm": 9.1875, "learning_rate": 1.7863571149724163e-06, "loss": 1.07243567, "memory(GiB)": 141.16, "step": 131620, "train_speed(iter/s)": 0.289712 }, { "acc": 0.73799067, "epoch": 1.472386462299038, "grad_norm": 6.5, "learning_rate": 1.7849405046957251e-06, "loss": 1.03626709, "memory(GiB)": 141.16, "step": 131640, "train_speed(iter/s)": 0.289728 }, { "acc": 0.7326623, "epoch": 1.4726101612449964, "grad_norm": 9.625, "learning_rate": 1.7835243342851277e-06, "loss": 1.06792488, "memory(GiB)": 141.16, "step": 131660, "train_speed(iter/s)": 0.289742 }, { "acc": 0.72903156, "epoch": 1.472833860190955, "grad_norm": 9.3125, "learning_rate": 1.7821086039343733e-06, "loss": 1.0806675, "memory(GiB)": 141.16, "step": 131680, "train_speed(iter/s)": 0.289757 }, { "acc": 0.7430624, "epoch": 1.4730575591369135, "grad_norm": 7.5625, "learning_rate": 1.7806933138371573e-06, "loss": 1.03388958, "memory(GiB)": 141.16, "step": 131700, "train_speed(iter/s)": 0.289773 }, { "acc": 0.73545403, "epoch": 1.473281258082872, "grad_norm": 7.5625, "learning_rate": 1.7792784641871097e-06, "loss": 1.06176472, "memory(GiB)": 141.16, "step": 131720, "train_speed(iter/s)": 0.289789 }, { "acc": 0.72540722, "epoch": 1.4735049570288306, "grad_norm": 6.90625, "learning_rate": 1.7778640551778038e-06, "loss": 1.1105998, "memory(GiB)": 141.16, "step": 131740, "train_speed(iter/s)": 0.289804 }, { "acc": 0.74048519, "epoch": 1.473728655974789, "grad_norm": 6.03125, "learning_rate": 1.7764500870027507e-06, "loss": 1.0327961, "memory(GiB)": 141.16, "step": 131760, "train_speed(iter/s)": 0.289819 }, { "acc": 0.73724842, "epoch": 1.4739523549207476, "grad_norm": 7.0, "learning_rate": 1.7750365598553988e-06, "loss": 1.05600967, "memory(GiB)": 141.16, "step": 131780, "train_speed(iter/s)": 0.289834 }, { "acc": 0.72881308, "epoch": 1.4741760538667061, "grad_norm": 8.625, "learning_rate": 1.7736234739291424e-06, "loss": 1.08740549, "memory(GiB)": 141.16, "step": 131800, "train_speed(iter/s)": 0.289851 }, { "acc": 0.72989197, "epoch": 1.4743997528126647, "grad_norm": 5.59375, "learning_rate": 1.772210829417309e-06, "loss": 1.0798748, "memory(GiB)": 141.16, "step": 131820, "train_speed(iter/s)": 0.289867 }, { "acc": 0.73653297, "epoch": 1.4746234517586232, "grad_norm": 8.5, "learning_rate": 1.7707986265131717e-06, "loss": 1.06270876, "memory(GiB)": 141.16, "step": 131840, "train_speed(iter/s)": 0.289878 }, { "acc": 0.73615403, "epoch": 1.4748471507045817, "grad_norm": 6.6875, "learning_rate": 1.7693868654099377e-06, "loss": 1.05766935, "memory(GiB)": 141.16, "step": 131860, "train_speed(iter/s)": 0.289894 }, { "acc": 0.7325633, "epoch": 1.4750708496505403, "grad_norm": 5.65625, "learning_rate": 1.7679755463007552e-06, "loss": 1.08717537, "memory(GiB)": 141.16, "step": 131880, "train_speed(iter/s)": 0.28991 }, { "acc": 0.74865446, "epoch": 1.4752945485964988, "grad_norm": 8.375, "learning_rate": 1.7665646693787158e-06, "loss": 0.98684978, "memory(GiB)": 141.16, "step": 131900, "train_speed(iter/s)": 0.289924 }, { "acc": 0.72165289, "epoch": 1.4755182475424573, "grad_norm": 8.1875, "learning_rate": 1.7651542348368445e-06, "loss": 1.13193703, "memory(GiB)": 141.16, "step": 131920, "train_speed(iter/s)": 0.289939 }, { "acc": 0.72687693, "epoch": 1.4757419464884158, "grad_norm": 9.0625, "learning_rate": 1.7637442428681123e-06, "loss": 1.09521675, "memory(GiB)": 141.16, "step": 131940, "train_speed(iter/s)": 0.289954 }, { "acc": 0.72745895, "epoch": 1.4759656454343744, "grad_norm": 7.3125, "learning_rate": 1.762334693665424e-06, "loss": 1.09756222, "memory(GiB)": 141.16, "step": 131960, "train_speed(iter/s)": 0.289969 }, { "acc": 0.7418663, "epoch": 1.476189344380333, "grad_norm": 8.75, "learning_rate": 1.7609255874216252e-06, "loss": 1.03647509, "memory(GiB)": 141.16, "step": 131980, "train_speed(iter/s)": 0.289984 }, { "acc": 0.75125976, "epoch": 1.4764130433262914, "grad_norm": 7.90625, "learning_rate": 1.7595169243295045e-06, "loss": 0.98258972, "memory(GiB)": 141.16, "step": 132000, "train_speed(iter/s)": 0.29 }, { "epoch": 1.4764130433262914, "eval_acc": 0.69016781172937, "eval_loss": 1.0791600942611694, "eval_runtime": 2325.5001, "eval_samples_per_second": 32.373, "eval_steps_per_second": 16.187, "step": 132000 }, { "acc": 0.75468044, "epoch": 1.47663674227225, "grad_norm": 6.25, "learning_rate": 1.7581087045817841e-06, "loss": 0.98474121, "memory(GiB)": 141.16, "step": 132020, "train_speed(iter/s)": 0.288508 }, { "acc": 0.73508492, "epoch": 1.4768604412182085, "grad_norm": 7.40625, "learning_rate": 1.7567009283711322e-06, "loss": 1.04889288, "memory(GiB)": 141.16, "step": 132040, "train_speed(iter/s)": 0.288522 }, { "acc": 0.73832006, "epoch": 1.477084140164167, "grad_norm": 6.84375, "learning_rate": 1.7552935958901506e-06, "loss": 1.03300018, "memory(GiB)": 141.16, "step": 132060, "train_speed(iter/s)": 0.288537 }, { "acc": 0.73175411, "epoch": 1.4773078391101255, "grad_norm": 7.6875, "learning_rate": 1.753886707331381e-06, "loss": 1.07801037, "memory(GiB)": 141.16, "step": 132080, "train_speed(iter/s)": 0.288551 }, { "acc": 0.72907166, "epoch": 1.477531538056084, "grad_norm": 7.0, "learning_rate": 1.7524802628873089e-06, "loss": 1.10002251, "memory(GiB)": 141.16, "step": 132100, "train_speed(iter/s)": 0.288565 }, { "acc": 0.73590465, "epoch": 1.4777552370020426, "grad_norm": 7.46875, "learning_rate": 1.751074262750353e-06, "loss": 1.06786528, "memory(GiB)": 141.16, "step": 132120, "train_speed(iter/s)": 0.288581 }, { "acc": 0.7285625, "epoch": 1.4779789359480011, "grad_norm": 6.25, "learning_rate": 1.7496687071128776e-06, "loss": 1.09802227, "memory(GiB)": 141.16, "step": 132140, "train_speed(iter/s)": 0.288595 }, { "acc": 0.72999506, "epoch": 1.4782026348939596, "grad_norm": 7.1875, "learning_rate": 1.7482635961671807e-06, "loss": 1.07181473, "memory(GiB)": 141.16, "step": 132160, "train_speed(iter/s)": 0.288608 }, { "acc": 0.72115393, "epoch": 1.4784263338399182, "grad_norm": 6.0, "learning_rate": 1.7468589301055005e-06, "loss": 1.11426907, "memory(GiB)": 141.16, "step": 132180, "train_speed(iter/s)": 0.288622 }, { "acc": 0.73153067, "epoch": 1.4786500327858767, "grad_norm": 6.03125, "learning_rate": 1.7454547091200186e-06, "loss": 1.07558899, "memory(GiB)": 141.16, "step": 132200, "train_speed(iter/s)": 0.288637 }, { "acc": 0.72376537, "epoch": 1.4788737317318352, "grad_norm": 5.21875, "learning_rate": 1.7440509334028482e-06, "loss": 1.11502266, "memory(GiB)": 141.16, "step": 132220, "train_speed(iter/s)": 0.288652 }, { "acc": 0.72412004, "epoch": 1.4790974306777938, "grad_norm": 7.75, "learning_rate": 1.742647603146051e-06, "loss": 1.11896534, "memory(GiB)": 141.16, "step": 132240, "train_speed(iter/s)": 0.288666 }, { "acc": 0.74982824, "epoch": 1.4793211296237523, "grad_norm": 8.25, "learning_rate": 1.7412447185416193e-06, "loss": 0.99192343, "memory(GiB)": 141.16, "step": 132260, "train_speed(iter/s)": 0.28868 }, { "acc": 0.74041262, "epoch": 1.4795448285697108, "grad_norm": 7.8125, "learning_rate": 1.7398422797814868e-06, "loss": 1.04181671, "memory(GiB)": 141.16, "step": 132280, "train_speed(iter/s)": 0.288696 }, { "acc": 0.74164419, "epoch": 1.4797685275156693, "grad_norm": 8.125, "learning_rate": 1.7384402870575312e-06, "loss": 1.03994408, "memory(GiB)": 141.16, "step": 132300, "train_speed(iter/s)": 0.288712 }, { "acc": 0.7398037, "epoch": 1.4799922264616279, "grad_norm": 7.8125, "learning_rate": 1.7370387405615602e-06, "loss": 1.0449439, "memory(GiB)": 141.16, "step": 132320, "train_speed(iter/s)": 0.288726 }, { "acc": 0.73526497, "epoch": 1.4802159254075864, "grad_norm": 5.8125, "learning_rate": 1.7356376404853303e-06, "loss": 1.05374794, "memory(GiB)": 141.16, "step": 132340, "train_speed(iter/s)": 0.288742 }, { "acc": 0.73575554, "epoch": 1.480439624353545, "grad_norm": 5.375, "learning_rate": 1.7342369870205294e-06, "loss": 1.06005001, "memory(GiB)": 141.16, "step": 132360, "train_speed(iter/s)": 0.288758 }, { "acc": 0.74368749, "epoch": 1.4806633232995035, "grad_norm": 6.3125, "learning_rate": 1.7328367803587853e-06, "loss": 1.01936855, "memory(GiB)": 141.16, "step": 132380, "train_speed(iter/s)": 0.288771 }, { "acc": 0.73840623, "epoch": 1.480887022245462, "grad_norm": 6.78125, "learning_rate": 1.7314370206916703e-06, "loss": 1.03829365, "memory(GiB)": 141.16, "step": 132400, "train_speed(iter/s)": 0.288786 }, { "acc": 0.72653837, "epoch": 1.4811107211914205, "grad_norm": 6.90625, "learning_rate": 1.7300377082106873e-06, "loss": 1.11518402, "memory(GiB)": 141.16, "step": 132420, "train_speed(iter/s)": 0.288802 }, { "acc": 0.73543653, "epoch": 1.481334420137379, "grad_norm": 7.3125, "learning_rate": 1.7286388431072859e-06, "loss": 1.0578002, "memory(GiB)": 141.16, "step": 132440, "train_speed(iter/s)": 0.288815 }, { "acc": 0.72759209, "epoch": 1.4815581190833376, "grad_norm": 8.125, "learning_rate": 1.7272404255728498e-06, "loss": 1.09410009, "memory(GiB)": 141.16, "step": 132460, "train_speed(iter/s)": 0.288831 }, { "acc": 0.74664431, "epoch": 1.481781818029296, "grad_norm": 9.4375, "learning_rate": 1.7258424557987002e-06, "loss": 1.01602955, "memory(GiB)": 141.16, "step": 132480, "train_speed(iter/s)": 0.288845 }, { "acc": 0.73598933, "epoch": 1.4820055169752546, "grad_norm": 5.625, "learning_rate": 1.7244449339761028e-06, "loss": 1.06805706, "memory(GiB)": 141.16, "step": 132500, "train_speed(iter/s)": 0.28886 }, { "acc": 0.7320272, "epoch": 1.4822292159212132, "grad_norm": 8.0625, "learning_rate": 1.7230478602962553e-06, "loss": 1.07550907, "memory(GiB)": 141.16, "step": 132520, "train_speed(iter/s)": 0.288872 }, { "acc": 0.7264081, "epoch": 1.4824529148671717, "grad_norm": 7.5625, "learning_rate": 1.7216512349503001e-06, "loss": 1.11603451, "memory(GiB)": 141.16, "step": 132540, "train_speed(iter/s)": 0.288886 }, { "acc": 0.73639145, "epoch": 1.4826766138131302, "grad_norm": 7.9375, "learning_rate": 1.7202550581293147e-06, "loss": 1.05900898, "memory(GiB)": 141.16, "step": 132560, "train_speed(iter/s)": 0.288902 }, { "acc": 0.73885803, "epoch": 1.4829003127590887, "grad_norm": 8.625, "learning_rate": 1.718859330024314e-06, "loss": 1.04127445, "memory(GiB)": 141.16, "step": 132580, "train_speed(iter/s)": 0.288917 }, { "acc": 0.73841171, "epoch": 1.4831240117050473, "grad_norm": 4.96875, "learning_rate": 1.7174640508262585e-06, "loss": 1.06935101, "memory(GiB)": 141.16, "step": 132600, "train_speed(iter/s)": 0.288932 }, { "acc": 0.74004488, "epoch": 1.4833477106510058, "grad_norm": 8.5625, "learning_rate": 1.716069220726036e-06, "loss": 1.05411797, "memory(GiB)": 141.16, "step": 132620, "train_speed(iter/s)": 0.288947 }, { "acc": 0.73400898, "epoch": 1.4835714095969643, "grad_norm": 6.0, "learning_rate": 1.714674839914484e-06, "loss": 1.06669369, "memory(GiB)": 141.16, "step": 132640, "train_speed(iter/s)": 0.288961 }, { "acc": 0.73535714, "epoch": 1.4837951085429228, "grad_norm": 8.5625, "learning_rate": 1.7132809085823705e-06, "loss": 1.06041069, "memory(GiB)": 141.16, "step": 132660, "train_speed(iter/s)": 0.288975 }, { "acc": 0.73550267, "epoch": 1.4840188074888814, "grad_norm": 9.0, "learning_rate": 1.7118874269204078e-06, "loss": 1.06799488, "memory(GiB)": 141.16, "step": 132680, "train_speed(iter/s)": 0.28899 }, { "acc": 0.72192335, "epoch": 1.48424250643484, "grad_norm": 8.8125, "learning_rate": 1.7104943951192437e-06, "loss": 1.12035847, "memory(GiB)": 141.16, "step": 132700, "train_speed(iter/s)": 0.289005 }, { "acc": 0.73131852, "epoch": 1.4844662053807984, "grad_norm": 7.125, "learning_rate": 1.709101813369462e-06, "loss": 1.08776026, "memory(GiB)": 141.16, "step": 132720, "train_speed(iter/s)": 0.289022 }, { "acc": 0.73345737, "epoch": 1.484689904326757, "grad_norm": 7.5625, "learning_rate": 1.7077096818615918e-06, "loss": 1.09275799, "memory(GiB)": 141.16, "step": 132740, "train_speed(iter/s)": 0.289037 }, { "acc": 0.7376894, "epoch": 1.4849136032727155, "grad_norm": 7.1875, "learning_rate": 1.7063180007860935e-06, "loss": 1.05499992, "memory(GiB)": 141.16, "step": 132760, "train_speed(iter/s)": 0.28905 }, { "acc": 0.72868099, "epoch": 1.485137302218674, "grad_norm": 7.4375, "learning_rate": 1.7049267703333715e-06, "loss": 1.08870201, "memory(GiB)": 141.16, "step": 132780, "train_speed(iter/s)": 0.289064 }, { "acc": 0.72863111, "epoch": 1.4853610011646325, "grad_norm": 7.09375, "learning_rate": 1.7035359906937649e-06, "loss": 1.08856688, "memory(GiB)": 141.16, "step": 132800, "train_speed(iter/s)": 0.289079 }, { "acc": 0.74209557, "epoch": 1.485584700110591, "grad_norm": 5.90625, "learning_rate": 1.7021456620575504e-06, "loss": 1.04698887, "memory(GiB)": 141.16, "step": 132820, "train_speed(iter/s)": 0.289096 }, { "acc": 0.73591595, "epoch": 1.4858083990565496, "grad_norm": 6.375, "learning_rate": 1.7007557846149487e-06, "loss": 1.06799164, "memory(GiB)": 141.16, "step": 132840, "train_speed(iter/s)": 0.289111 }, { "acc": 0.74368901, "epoch": 1.4860320980025081, "grad_norm": 6.75, "learning_rate": 1.6993663585561105e-06, "loss": 1.01578722, "memory(GiB)": 141.16, "step": 132860, "train_speed(iter/s)": 0.289127 }, { "acc": 0.72703333, "epoch": 1.4862557969484667, "grad_norm": 6.59375, "learning_rate": 1.6979773840711328e-06, "loss": 1.09911728, "memory(GiB)": 141.16, "step": 132880, "train_speed(iter/s)": 0.289144 }, { "acc": 0.73311119, "epoch": 1.4864794958944252, "grad_norm": 6.15625, "learning_rate": 1.6965888613500464e-06, "loss": 1.07892208, "memory(GiB)": 141.16, "step": 132900, "train_speed(iter/s)": 0.289158 }, { "acc": 0.73114882, "epoch": 1.4867031948403837, "grad_norm": 6.21875, "learning_rate": 1.6952007905828184e-06, "loss": 1.06889067, "memory(GiB)": 141.16, "step": 132920, "train_speed(iter/s)": 0.289173 }, { "acc": 0.71979151, "epoch": 1.4869268937863422, "grad_norm": 9.0, "learning_rate": 1.6938131719593603e-06, "loss": 1.13719635, "memory(GiB)": 141.16, "step": 132940, "train_speed(iter/s)": 0.289189 }, { "acc": 0.73037977, "epoch": 1.4871505927323008, "grad_norm": 8.3125, "learning_rate": 1.6924260056695153e-06, "loss": 1.0685524, "memory(GiB)": 141.16, "step": 132960, "train_speed(iter/s)": 0.289202 }, { "acc": 0.72924461, "epoch": 1.4873742916782593, "grad_norm": 6.0, "learning_rate": 1.69103929190307e-06, "loss": 1.07694988, "memory(GiB)": 141.16, "step": 132980, "train_speed(iter/s)": 0.289217 }, { "acc": 0.72414761, "epoch": 1.4875979906242178, "grad_norm": 7.0625, "learning_rate": 1.6896530308497455e-06, "loss": 1.09785824, "memory(GiB)": 141.16, "step": 133000, "train_speed(iter/s)": 0.289232 }, { "acc": 0.7500905, "epoch": 1.4878216895701764, "grad_norm": 7.6875, "learning_rate": 1.6882672226992008e-06, "loss": 0.98523035, "memory(GiB)": 141.16, "step": 133020, "train_speed(iter/s)": 0.289247 }, { "acc": 0.72147694, "epoch": 1.4880453885161349, "grad_norm": 7.46875, "learning_rate": 1.6868818676410376e-06, "loss": 1.1369936, "memory(GiB)": 141.16, "step": 133040, "train_speed(iter/s)": 0.289261 }, { "acc": 0.73160992, "epoch": 1.4882690874620934, "grad_norm": 8.0, "learning_rate": 1.685496965864788e-06, "loss": 1.07331505, "memory(GiB)": 141.16, "step": 133060, "train_speed(iter/s)": 0.289274 }, { "acc": 0.74384546, "epoch": 1.488492786408052, "grad_norm": 5.9375, "learning_rate": 1.6841125175599304e-06, "loss": 0.99878216, "memory(GiB)": 141.16, "step": 133080, "train_speed(iter/s)": 0.289288 }, { "acc": 0.74079094, "epoch": 1.4887164853540105, "grad_norm": 7.25, "learning_rate": 1.6827285229158753e-06, "loss": 1.01705189, "memory(GiB)": 141.16, "step": 133100, "train_speed(iter/s)": 0.289303 }, { "acc": 0.73778038, "epoch": 1.488940184299969, "grad_norm": 7.40625, "learning_rate": 1.681344982121972e-06, "loss": 1.0493206, "memory(GiB)": 141.16, "step": 133120, "train_speed(iter/s)": 0.289319 }, { "acc": 0.72894526, "epoch": 1.4891638832459275, "grad_norm": 8.3125, "learning_rate": 1.6799618953675106e-06, "loss": 1.07409992, "memory(GiB)": 141.16, "step": 133140, "train_speed(iter/s)": 0.289332 }, { "acc": 0.73372669, "epoch": 1.489387582191886, "grad_norm": 8.4375, "learning_rate": 1.6785792628417147e-06, "loss": 1.07364655, "memory(GiB)": 141.16, "step": 133160, "train_speed(iter/s)": 0.289348 }, { "acc": 0.7374567, "epoch": 1.4896112811378446, "grad_norm": 5.625, "learning_rate": 1.677197084733751e-06, "loss": 1.04590893, "memory(GiB)": 141.16, "step": 133180, "train_speed(iter/s)": 0.289364 }, { "acc": 0.73808651, "epoch": 1.489834980083803, "grad_norm": 6.09375, "learning_rate": 1.6758153612327204e-06, "loss": 1.0460125, "memory(GiB)": 141.16, "step": 133200, "train_speed(iter/s)": 0.289379 }, { "acc": 0.72815409, "epoch": 1.4900586790297616, "grad_norm": 8.3125, "learning_rate": 1.6744340925276602e-06, "loss": 1.10861502, "memory(GiB)": 141.16, "step": 133220, "train_speed(iter/s)": 0.289394 }, { "acc": 0.7180635, "epoch": 1.4902823779757202, "grad_norm": 6.125, "learning_rate": 1.6730532788075509e-06, "loss": 1.14998426, "memory(GiB)": 141.16, "step": 133240, "train_speed(iter/s)": 0.28941 }, { "acc": 0.73328557, "epoch": 1.4905060769216787, "grad_norm": 7.40625, "learning_rate": 1.6716729202613046e-06, "loss": 1.06541719, "memory(GiB)": 141.16, "step": 133260, "train_speed(iter/s)": 0.289425 }, { "acc": 0.73367529, "epoch": 1.4907297758676372, "grad_norm": 6.03125, "learning_rate": 1.6702930170777776e-06, "loss": 1.06149235, "memory(GiB)": 141.16, "step": 133280, "train_speed(iter/s)": 0.28944 }, { "acc": 0.73328896, "epoch": 1.4909534748135957, "grad_norm": 5.875, "learning_rate": 1.6689135694457575e-06, "loss": 1.0841423, "memory(GiB)": 141.16, "step": 133300, "train_speed(iter/s)": 0.289455 }, { "acc": 0.74777956, "epoch": 1.4911771737595543, "grad_norm": 7.71875, "learning_rate": 1.667534577553972e-06, "loss": 1.00495024, "memory(GiB)": 141.16, "step": 133320, "train_speed(iter/s)": 0.28947 }, { "acc": 0.74215136, "epoch": 1.4914008727055128, "grad_norm": 7.09375, "learning_rate": 1.66615604159109e-06, "loss": 1.03843098, "memory(GiB)": 141.16, "step": 133340, "train_speed(iter/s)": 0.289485 }, { "acc": 0.74618635, "epoch": 1.4916245716514713, "grad_norm": 6.875, "learning_rate": 1.6647779617457116e-06, "loss": 0.99952831, "memory(GiB)": 141.16, "step": 133360, "train_speed(iter/s)": 0.289498 }, { "acc": 0.73541884, "epoch": 1.4918482705974299, "grad_norm": 6.4375, "learning_rate": 1.6634003382063806e-06, "loss": 1.05973969, "memory(GiB)": 141.16, "step": 133380, "train_speed(iter/s)": 0.289513 }, { "acc": 0.71533918, "epoch": 1.4920719695433884, "grad_norm": 6.75, "learning_rate": 1.6620231711615747e-06, "loss": 1.13351507, "memory(GiB)": 141.16, "step": 133400, "train_speed(iter/s)": 0.289527 }, { "acc": 0.7457057, "epoch": 1.492295668489347, "grad_norm": 8.5625, "learning_rate": 1.6606464607997075e-06, "loss": 1.02153568, "memory(GiB)": 141.16, "step": 133420, "train_speed(iter/s)": 0.28954 }, { "acc": 0.7368516, "epoch": 1.4925193674353054, "grad_norm": 7.25, "learning_rate": 1.6592702073091371e-06, "loss": 1.06001549, "memory(GiB)": 141.16, "step": 133440, "train_speed(iter/s)": 0.289554 }, { "acc": 0.72819343, "epoch": 1.492743066381264, "grad_norm": 5.875, "learning_rate": 1.6578944108781503e-06, "loss": 1.09149446, "memory(GiB)": 141.16, "step": 133460, "train_speed(iter/s)": 0.289568 }, { "acc": 0.72548552, "epoch": 1.4929667653272225, "grad_norm": 6.65625, "learning_rate": 1.6565190716949797e-06, "loss": 1.08731213, "memory(GiB)": 141.16, "step": 133480, "train_speed(iter/s)": 0.289582 }, { "acc": 0.73732653, "epoch": 1.493190464273181, "grad_norm": 9.75, "learning_rate": 1.6551441899477894e-06, "loss": 1.06321011, "memory(GiB)": 141.16, "step": 133500, "train_speed(iter/s)": 0.289597 }, { "acc": 0.729702, "epoch": 1.4934141632191396, "grad_norm": 7.0, "learning_rate": 1.6537697658246809e-06, "loss": 1.07228642, "memory(GiB)": 141.16, "step": 133520, "train_speed(iter/s)": 0.289612 }, { "acc": 0.73889246, "epoch": 1.493637862165098, "grad_norm": 8.0, "learning_rate": 1.6523957995136992e-06, "loss": 1.04324274, "memory(GiB)": 141.16, "step": 133540, "train_speed(iter/s)": 0.289627 }, { "acc": 0.73920345, "epoch": 1.4938615611110566, "grad_norm": 7.28125, "learning_rate": 1.6510222912028185e-06, "loss": 1.045788, "memory(GiB)": 141.16, "step": 133560, "train_speed(iter/s)": 0.289641 }, { "acc": 0.73687062, "epoch": 1.4940852600570151, "grad_norm": 5.75, "learning_rate": 1.649649241079958e-06, "loss": 1.05011826, "memory(GiB)": 141.16, "step": 133580, "train_speed(iter/s)": 0.289656 }, { "acc": 0.73698492, "epoch": 1.4943089590029737, "grad_norm": 7.6875, "learning_rate": 1.648276649332969e-06, "loss": 1.05563812, "memory(GiB)": 141.16, "step": 133600, "train_speed(iter/s)": 0.289671 }, { "acc": 0.72365751, "epoch": 1.4945326579489322, "grad_norm": 9.1875, "learning_rate": 1.6469045161496395e-06, "loss": 1.11044064, "memory(GiB)": 141.16, "step": 133620, "train_speed(iter/s)": 0.289687 }, { "acc": 0.73857718, "epoch": 1.4947563568948907, "grad_norm": 7.0625, "learning_rate": 1.6455328417177007e-06, "loss": 1.03981323, "memory(GiB)": 141.16, "step": 133640, "train_speed(iter/s)": 0.289701 }, { "acc": 0.7400743, "epoch": 1.4949800558408493, "grad_norm": 7.1875, "learning_rate": 1.6441616262248144e-06, "loss": 1.0517271, "memory(GiB)": 141.16, "step": 133660, "train_speed(iter/s)": 0.289715 }, { "acc": 0.73823271, "epoch": 1.4952037547868078, "grad_norm": 5.8125, "learning_rate": 1.6427908698585848e-06, "loss": 1.04672031, "memory(GiB)": 141.16, "step": 133680, "train_speed(iter/s)": 0.289727 }, { "acc": 0.74540186, "epoch": 1.4954274537327663, "grad_norm": 6.4375, "learning_rate": 1.6414205728065502e-06, "loss": 1.01897678, "memory(GiB)": 141.16, "step": 133700, "train_speed(iter/s)": 0.28974 }, { "acc": 0.74110141, "epoch": 1.4956511526787248, "grad_norm": 8.3125, "learning_rate": 1.6400507352561846e-06, "loss": 1.03889818, "memory(GiB)": 141.16, "step": 133720, "train_speed(iter/s)": 0.289754 }, { "acc": 0.73945665, "epoch": 1.4958748516246834, "grad_norm": 7.75, "learning_rate": 1.6386813573949044e-06, "loss": 1.05423012, "memory(GiB)": 141.16, "step": 133740, "train_speed(iter/s)": 0.289769 }, { "acc": 0.73645644, "epoch": 1.496098550570642, "grad_norm": 8.875, "learning_rate": 1.6373124394100576e-06, "loss": 1.0621603, "memory(GiB)": 141.16, "step": 133760, "train_speed(iter/s)": 0.289783 }, { "acc": 0.72461672, "epoch": 1.4963222495166004, "grad_norm": 6.375, "learning_rate": 1.6359439814889344e-06, "loss": 1.12278986, "memory(GiB)": 141.16, "step": 133780, "train_speed(iter/s)": 0.289798 }, { "acc": 0.73124561, "epoch": 1.496545948462559, "grad_norm": 6.71875, "learning_rate": 1.6345759838187581e-06, "loss": 1.08648834, "memory(GiB)": 141.16, "step": 133800, "train_speed(iter/s)": 0.289813 }, { "acc": 0.73654366, "epoch": 1.4967696474085175, "grad_norm": 7.0625, "learning_rate": 1.6332084465866898e-06, "loss": 1.03685169, "memory(GiB)": 141.16, "step": 133820, "train_speed(iter/s)": 0.289827 }, { "acc": 0.72367311, "epoch": 1.4969933463544762, "grad_norm": 6.96875, "learning_rate": 1.631841369979829e-06, "loss": 1.11697521, "memory(GiB)": 141.16, "step": 133840, "train_speed(iter/s)": 0.28984 }, { "acc": 0.73259726, "epoch": 1.4972170453004348, "grad_norm": 7.71875, "learning_rate": 1.630474754185209e-06, "loss": 1.07719383, "memory(GiB)": 141.16, "step": 133860, "train_speed(iter/s)": 0.289854 }, { "acc": 0.74023418, "epoch": 1.4974407442463933, "grad_norm": 8.75, "learning_rate": 1.629108599389806e-06, "loss": 1.05952911, "memory(GiB)": 141.16, "step": 133880, "train_speed(iter/s)": 0.289868 }, { "acc": 0.74156837, "epoch": 1.4976644431923518, "grad_norm": 7.84375, "learning_rate": 1.627742905780526e-06, "loss": 1.01998615, "memory(GiB)": 141.16, "step": 133900, "train_speed(iter/s)": 0.289883 }, { "acc": 0.73807716, "epoch": 1.4978881421383103, "grad_norm": 8.25, "learning_rate": 1.6263776735442189e-06, "loss": 1.03763618, "memory(GiB)": 141.16, "step": 133920, "train_speed(iter/s)": 0.2899 }, { "acc": 0.73168187, "epoch": 1.4981118410842689, "grad_norm": 7.53125, "learning_rate": 1.6250129028676664e-06, "loss": 1.06672182, "memory(GiB)": 141.16, "step": 133940, "train_speed(iter/s)": 0.289915 }, { "acc": 0.74283352, "epoch": 1.4983355400302274, "grad_norm": 7.3125, "learning_rate": 1.6236485939375867e-06, "loss": 1.02295103, "memory(GiB)": 141.16, "step": 133960, "train_speed(iter/s)": 0.289929 }, { "acc": 0.72718558, "epoch": 1.498559238976186, "grad_norm": 7.09375, "learning_rate": 1.6222847469406399e-06, "loss": 1.11319866, "memory(GiB)": 141.16, "step": 133980, "train_speed(iter/s)": 0.289946 }, { "acc": 0.72956357, "epoch": 1.4987829379221445, "grad_norm": 7.09375, "learning_rate": 1.6209213620634174e-06, "loss": 1.08229504, "memory(GiB)": 141.16, "step": 134000, "train_speed(iter/s)": 0.289961 }, { "epoch": 1.4987829379221445, "eval_acc": 0.6901913755304813, "eval_loss": 1.0791161060333252, "eval_runtime": 2323.8312, "eval_samples_per_second": 32.396, "eval_steps_per_second": 16.198, "step": 134000 }, { "acc": 0.73048954, "epoch": 1.499006636868103, "grad_norm": 7.15625, "learning_rate": 1.6195584394924519e-06, "loss": 1.09097672, "memory(GiB)": 141.16, "step": 134020, "train_speed(iter/s)": 0.288493 }, { "acc": 0.73319392, "epoch": 1.4992303358140615, "grad_norm": 8.75, "learning_rate": 1.6181959794142094e-06, "loss": 1.07239227, "memory(GiB)": 141.16, "step": 134040, "train_speed(iter/s)": 0.288506 }, { "acc": 0.72373505, "epoch": 1.49945403476002, "grad_norm": 6.03125, "learning_rate": 1.6168339820150924e-06, "loss": 1.12047558, "memory(GiB)": 141.16, "step": 134060, "train_speed(iter/s)": 0.288521 }, { "acc": 0.72490921, "epoch": 1.4996777337059786, "grad_norm": 7.40625, "learning_rate": 1.6154724474814454e-06, "loss": 1.12835598, "memory(GiB)": 141.16, "step": 134080, "train_speed(iter/s)": 0.288534 }, { "acc": 0.73626909, "epoch": 1.499901432651937, "grad_norm": 7.75, "learning_rate": 1.6141113759995414e-06, "loss": 1.05603943, "memory(GiB)": 141.16, "step": 134100, "train_speed(iter/s)": 0.288547 }, { "acc": 0.73246641, "epoch": 1.5001251315978954, "grad_norm": 8.125, "learning_rate": 1.6127507677555988e-06, "loss": 1.07399921, "memory(GiB)": 141.16, "step": 134120, "train_speed(iter/s)": 0.28856 }, { "acc": 0.73492193, "epoch": 1.500348830543854, "grad_norm": 6.75, "learning_rate": 1.6113906229357656e-06, "loss": 1.0760931, "memory(GiB)": 141.16, "step": 134140, "train_speed(iter/s)": 0.288576 }, { "acc": 0.73661771, "epoch": 1.5005725294898125, "grad_norm": 6.65625, "learning_rate": 1.6100309417261278e-06, "loss": 1.04282627, "memory(GiB)": 141.16, "step": 134160, "train_speed(iter/s)": 0.288588 }, { "acc": 0.73126097, "epoch": 1.500796228435771, "grad_norm": 8.75, "learning_rate": 1.6086717243127132e-06, "loss": 1.09598656, "memory(GiB)": 141.16, "step": 134180, "train_speed(iter/s)": 0.288601 }, { "acc": 0.72629056, "epoch": 1.5010199273817295, "grad_norm": 7.15625, "learning_rate": 1.6073129708814783e-06, "loss": 1.09732056, "memory(GiB)": 141.16, "step": 134200, "train_speed(iter/s)": 0.288615 }, { "acc": 0.72870932, "epoch": 1.501243626327688, "grad_norm": 7.25, "learning_rate": 1.605954681618323e-06, "loss": 1.10096273, "memory(GiB)": 141.16, "step": 134220, "train_speed(iter/s)": 0.28863 }, { "acc": 0.73165159, "epoch": 1.5014673252736466, "grad_norm": 9.375, "learning_rate": 1.6045968567090797e-06, "loss": 1.06772375, "memory(GiB)": 141.16, "step": 134240, "train_speed(iter/s)": 0.288644 }, { "acc": 0.73434505, "epoch": 1.501691024219605, "grad_norm": 6.28125, "learning_rate": 1.6032394963395159e-06, "loss": 1.07630043, "memory(GiB)": 141.16, "step": 134260, "train_speed(iter/s)": 0.288659 }, { "acc": 0.72878551, "epoch": 1.5019147231655636, "grad_norm": 6.96875, "learning_rate": 1.6018826006953415e-06, "loss": 1.08612041, "memory(GiB)": 141.16, "step": 134280, "train_speed(iter/s)": 0.288673 }, { "acc": 0.72214251, "epoch": 1.5021384221115222, "grad_norm": 5.28125, "learning_rate": 1.600526169962196e-06, "loss": 1.11064787, "memory(GiB)": 141.16, "step": 134300, "train_speed(iter/s)": 0.288687 }, { "acc": 0.74778509, "epoch": 1.5023621210574807, "grad_norm": 6.4375, "learning_rate": 1.5991702043256623e-06, "loss": 0.99192848, "memory(GiB)": 141.16, "step": 134320, "train_speed(iter/s)": 0.288703 }, { "acc": 0.73737721, "epoch": 1.5025858200034392, "grad_norm": 6.84375, "learning_rate": 1.5978147039712533e-06, "loss": 1.0654747, "memory(GiB)": 141.16, "step": 134340, "train_speed(iter/s)": 0.288716 }, { "acc": 0.74420919, "epoch": 1.5028095189493977, "grad_norm": 8.5, "learning_rate": 1.5964596690844198e-06, "loss": 1.00968208, "memory(GiB)": 141.16, "step": 134360, "train_speed(iter/s)": 0.28873 }, { "acc": 0.74051447, "epoch": 1.5030332178953563, "grad_norm": 6.59375, "learning_rate": 1.5951050998505523e-06, "loss": 1.06741304, "memory(GiB)": 141.16, "step": 134380, "train_speed(iter/s)": 0.288745 }, { "acc": 0.74147491, "epoch": 1.5032569168413148, "grad_norm": 6.53125, "learning_rate": 1.5937509964549736e-06, "loss": 1.04203701, "memory(GiB)": 141.16, "step": 134400, "train_speed(iter/s)": 0.288761 }, { "acc": 0.73920765, "epoch": 1.5034806157872733, "grad_norm": 9.0625, "learning_rate": 1.5923973590829462e-06, "loss": 1.04216785, "memory(GiB)": 141.16, "step": 134420, "train_speed(iter/s)": 0.288776 }, { "acc": 0.722999, "epoch": 1.5037043147332319, "grad_norm": 5.75, "learning_rate": 1.5910441879196658e-06, "loss": 1.12255783, "memory(GiB)": 141.16, "step": 134440, "train_speed(iter/s)": 0.288791 }, { "acc": 0.73272552, "epoch": 1.5039280136791906, "grad_norm": 7.59375, "learning_rate": 1.5896914831502646e-06, "loss": 1.06699781, "memory(GiB)": 141.16, "step": 134460, "train_speed(iter/s)": 0.288804 }, { "acc": 0.74383688, "epoch": 1.5041517126251491, "grad_norm": 6.15625, "learning_rate": 1.5883392449598139e-06, "loss": 1.02650948, "memory(GiB)": 141.16, "step": 134480, "train_speed(iter/s)": 0.288818 }, { "acc": 0.74599762, "epoch": 1.5043754115711077, "grad_norm": 8.3125, "learning_rate": 1.5869874735333173e-06, "loss": 1.01908226, "memory(GiB)": 141.16, "step": 134500, "train_speed(iter/s)": 0.288834 }, { "acc": 0.75011439, "epoch": 1.5045991105170662, "grad_norm": 8.875, "learning_rate": 1.5856361690557192e-06, "loss": 0.97992544, "memory(GiB)": 141.16, "step": 134520, "train_speed(iter/s)": 0.288849 }, { "acc": 0.72125916, "epoch": 1.5048228094630247, "grad_norm": 8.0, "learning_rate": 1.5842853317118957e-06, "loss": 1.13202038, "memory(GiB)": 141.16, "step": 134540, "train_speed(iter/s)": 0.288864 }, { "acc": 0.73272171, "epoch": 1.5050465084089832, "grad_norm": 6.0625, "learning_rate": 1.5829349616866591e-06, "loss": 1.07759113, "memory(GiB)": 141.16, "step": 134560, "train_speed(iter/s)": 0.288879 }, { "acc": 0.73475647, "epoch": 1.5052702073549418, "grad_norm": 6.6875, "learning_rate": 1.5815850591647618e-06, "loss": 1.07181797, "memory(GiB)": 141.16, "step": 134580, "train_speed(iter/s)": 0.288894 }, { "acc": 0.73941169, "epoch": 1.5054939063009003, "grad_norm": 8.5625, "learning_rate": 1.5802356243308875e-06, "loss": 1.03830948, "memory(GiB)": 141.16, "step": 134600, "train_speed(iter/s)": 0.28891 }, { "acc": 0.74171581, "epoch": 1.5057176052468588, "grad_norm": 7.375, "learning_rate": 1.5788866573696615e-06, "loss": 1.03042202, "memory(GiB)": 141.16, "step": 134620, "train_speed(iter/s)": 0.288924 }, { "acc": 0.73971319, "epoch": 1.5059413041928174, "grad_norm": 7.09375, "learning_rate": 1.5775381584656397e-06, "loss": 1.04677773, "memory(GiB)": 141.16, "step": 134640, "train_speed(iter/s)": 0.28894 }, { "acc": 0.73283935, "epoch": 1.5061650031387759, "grad_norm": 6.15625, "learning_rate": 1.5761901278033137e-06, "loss": 1.08430033, "memory(GiB)": 141.16, "step": 134660, "train_speed(iter/s)": 0.288955 }, { "acc": 0.73784046, "epoch": 1.5063887020847344, "grad_norm": 6.4375, "learning_rate": 1.5748425655671179e-06, "loss": 1.04424763, "memory(GiB)": 141.16, "step": 134680, "train_speed(iter/s)": 0.288971 }, { "acc": 0.72324414, "epoch": 1.506612401030693, "grad_norm": 5.875, "learning_rate": 1.5734954719414147e-06, "loss": 1.11928253, "memory(GiB)": 141.16, "step": 134700, "train_speed(iter/s)": 0.288985 }, { "acc": 0.73536005, "epoch": 1.5068360999766515, "grad_norm": 6.375, "learning_rate": 1.572148847110508e-06, "loss": 1.06576843, "memory(GiB)": 141.16, "step": 134720, "train_speed(iter/s)": 0.288999 }, { "acc": 0.73082628, "epoch": 1.50705979892261, "grad_norm": 7.84375, "learning_rate": 1.5708026912586343e-06, "loss": 1.06960487, "memory(GiB)": 141.16, "step": 134740, "train_speed(iter/s)": 0.289015 }, { "acc": 0.72490892, "epoch": 1.5072834978685685, "grad_norm": 7.84375, "learning_rate": 1.5694570045699658e-06, "loss": 1.11266403, "memory(GiB)": 141.16, "step": 134760, "train_speed(iter/s)": 0.289026 }, { "acc": 0.73223085, "epoch": 1.507507196814527, "grad_norm": 6.21875, "learning_rate": 1.568111787228614e-06, "loss": 1.06783962, "memory(GiB)": 141.16, "step": 134780, "train_speed(iter/s)": 0.289042 }, { "acc": 0.7333045, "epoch": 1.5077308957604856, "grad_norm": 6.5, "learning_rate": 1.5667670394186212e-06, "loss": 1.05779963, "memory(GiB)": 141.16, "step": 134800, "train_speed(iter/s)": 0.289056 }, { "acc": 0.72460241, "epoch": 1.507954594706444, "grad_norm": 6.46875, "learning_rate": 1.5654227613239714e-06, "loss": 1.11746902, "memory(GiB)": 141.16, "step": 134820, "train_speed(iter/s)": 0.289071 }, { "acc": 0.73829222, "epoch": 1.5081782936524026, "grad_norm": 8.375, "learning_rate": 1.5640789531285787e-06, "loss": 1.04622154, "memory(GiB)": 141.16, "step": 134840, "train_speed(iter/s)": 0.289086 }, { "acc": 0.7361063, "epoch": 1.5084019925983612, "grad_norm": 5.8125, "learning_rate": 1.5627356150162948e-06, "loss": 1.05074024, "memory(GiB)": 141.16, "step": 134860, "train_speed(iter/s)": 0.289101 }, { "acc": 0.74318318, "epoch": 1.5086256915443197, "grad_norm": 6.8125, "learning_rate": 1.5613927471709101e-06, "loss": 1.03470316, "memory(GiB)": 141.16, "step": 134880, "train_speed(iter/s)": 0.289116 }, { "acc": 0.74086599, "epoch": 1.5088493904902782, "grad_norm": 6.4375, "learning_rate": 1.5600503497761449e-06, "loss": 1.04260826, "memory(GiB)": 141.16, "step": 134900, "train_speed(iter/s)": 0.289131 }, { "acc": 0.7262291, "epoch": 1.5090730894362367, "grad_norm": 7.34375, "learning_rate": 1.5587084230156618e-06, "loss": 1.09288511, "memory(GiB)": 141.16, "step": 134920, "train_speed(iter/s)": 0.289146 }, { "acc": 0.74176702, "epoch": 1.5092967883821953, "grad_norm": 7.40625, "learning_rate": 1.557366967073054e-06, "loss": 1.0126545, "memory(GiB)": 141.16, "step": 134940, "train_speed(iter/s)": 0.289159 }, { "acc": 0.73399277, "epoch": 1.5095204873281538, "grad_norm": 7.25, "learning_rate": 1.5560259821318496e-06, "loss": 1.08946743, "memory(GiB)": 141.16, "step": 134960, "train_speed(iter/s)": 0.289175 }, { "acc": 0.73293743, "epoch": 1.5097441862741123, "grad_norm": 6.34375, "learning_rate": 1.5546854683755203e-06, "loss": 1.07471886, "memory(GiB)": 141.16, "step": 134980, "train_speed(iter/s)": 0.28919 }, { "acc": 0.73568525, "epoch": 1.5099678852200709, "grad_norm": 6.59375, "learning_rate": 1.5533454259874597e-06, "loss": 1.0481514, "memory(GiB)": 141.16, "step": 135000, "train_speed(iter/s)": 0.289202 }, { "acc": 0.73416476, "epoch": 1.5101915841660294, "grad_norm": 7.21875, "learning_rate": 1.5520058551510115e-06, "loss": 1.08179321, "memory(GiB)": 141.16, "step": 135020, "train_speed(iter/s)": 0.289216 }, { "acc": 0.74066238, "epoch": 1.510415283111988, "grad_norm": 6.75, "learning_rate": 1.5506667560494432e-06, "loss": 1.0444128, "memory(GiB)": 141.16, "step": 135040, "train_speed(iter/s)": 0.289231 }, { "acc": 0.745294, "epoch": 1.5106389820579464, "grad_norm": 8.125, "learning_rate": 1.5493281288659672e-06, "loss": 1.03428478, "memory(GiB)": 141.16, "step": 135060, "train_speed(iter/s)": 0.289246 }, { "acc": 0.73287535, "epoch": 1.510862681003905, "grad_norm": 8.3125, "learning_rate": 1.547989973783724e-06, "loss": 1.06375875, "memory(GiB)": 141.16, "step": 135080, "train_speed(iter/s)": 0.289263 }, { "acc": 0.72692118, "epoch": 1.5110863799498635, "grad_norm": 6.0625, "learning_rate": 1.5466522909857917e-06, "loss": 1.10905781, "memory(GiB)": 141.16, "step": 135100, "train_speed(iter/s)": 0.289277 }, { "acc": 0.73000045, "epoch": 1.511310078895822, "grad_norm": 5.75, "learning_rate": 1.5453150806551875e-06, "loss": 1.07227249, "memory(GiB)": 141.16, "step": 135120, "train_speed(iter/s)": 0.289292 }, { "acc": 0.7409184, "epoch": 1.5115337778417806, "grad_norm": 8.25, "learning_rate": 1.5439783429748574e-06, "loss": 1.01838226, "memory(GiB)": 141.16, "step": 135140, "train_speed(iter/s)": 0.289307 }, { "acc": 0.73114948, "epoch": 1.511757476787739, "grad_norm": 6.9375, "learning_rate": 1.542642078127689e-06, "loss": 1.07332802, "memory(GiB)": 141.16, "step": 135160, "train_speed(iter/s)": 0.289321 }, { "acc": 0.72926846, "epoch": 1.5119811757336976, "grad_norm": 6.6875, "learning_rate": 1.5413062862965023e-06, "loss": 1.09780025, "memory(GiB)": 141.16, "step": 135180, "train_speed(iter/s)": 0.289335 }, { "acc": 0.74900112, "epoch": 1.5122048746796561, "grad_norm": 6.0, "learning_rate": 1.5399709676640496e-06, "loss": 1.00957689, "memory(GiB)": 141.16, "step": 135200, "train_speed(iter/s)": 0.289347 }, { "acc": 0.74326377, "epoch": 1.5124285736256147, "grad_norm": 7.125, "learning_rate": 1.5386361224130253e-06, "loss": 1.02586336, "memory(GiB)": 141.16, "step": 135220, "train_speed(iter/s)": 0.289359 }, { "acc": 0.74481769, "epoch": 1.5126522725715732, "grad_norm": 7.21875, "learning_rate": 1.5373017507260517e-06, "loss": 1.02036457, "memory(GiB)": 141.16, "step": 135240, "train_speed(iter/s)": 0.289373 }, { "acc": 0.72913809, "epoch": 1.5128759715175317, "grad_norm": 7.65625, "learning_rate": 1.5359678527856943e-06, "loss": 1.08197632, "memory(GiB)": 141.16, "step": 135260, "train_speed(iter/s)": 0.289389 }, { "acc": 0.73188138, "epoch": 1.5130996704634903, "grad_norm": 8.75, "learning_rate": 1.5346344287744452e-06, "loss": 1.07668133, "memory(GiB)": 141.16, "step": 135280, "train_speed(iter/s)": 0.289404 }, { "acc": 0.73754001, "epoch": 1.5133233694094488, "grad_norm": 8.25, "learning_rate": 1.5333014788747397e-06, "loss": 1.06190987, "memory(GiB)": 141.16, "step": 135300, "train_speed(iter/s)": 0.289419 }, { "acc": 0.73014832, "epoch": 1.5135470683554073, "grad_norm": 7.5, "learning_rate": 1.5319690032689417e-06, "loss": 1.07263718, "memory(GiB)": 141.16, "step": 135320, "train_speed(iter/s)": 0.289434 }, { "acc": 0.74299536, "epoch": 1.5137707673013658, "grad_norm": 6.46875, "learning_rate": 1.5306370021393524e-06, "loss": 1.03675842, "memory(GiB)": 141.16, "step": 135340, "train_speed(iter/s)": 0.289448 }, { "acc": 0.73254623, "epoch": 1.5139944662473244, "grad_norm": 7.28125, "learning_rate": 1.5293054756682113e-06, "loss": 1.08175545, "memory(GiB)": 141.16, "step": 135360, "train_speed(iter/s)": 0.289461 }, { "acc": 0.73982534, "epoch": 1.514218165193283, "grad_norm": 8.0625, "learning_rate": 1.5279744240376877e-06, "loss": 1.02978439, "memory(GiB)": 141.16, "step": 135380, "train_speed(iter/s)": 0.289475 }, { "acc": 0.73363409, "epoch": 1.5144418641392414, "grad_norm": 5.46875, "learning_rate": 1.5266438474298907e-06, "loss": 1.06112404, "memory(GiB)": 141.16, "step": 135400, "train_speed(iter/s)": 0.28949 }, { "acc": 0.74620247, "epoch": 1.5146655630852, "grad_norm": 6.9375, "learning_rate": 1.5253137460268612e-06, "loss": 1.01281643, "memory(GiB)": 141.16, "step": 135420, "train_speed(iter/s)": 0.289504 }, { "acc": 0.74372787, "epoch": 1.5148892620311585, "grad_norm": 7.375, "learning_rate": 1.5239841200105743e-06, "loss": 1.02646103, "memory(GiB)": 141.16, "step": 135440, "train_speed(iter/s)": 0.289518 }, { "acc": 0.73916855, "epoch": 1.515112960977117, "grad_norm": 7.8125, "learning_rate": 1.522654969562945e-06, "loss": 1.04486933, "memory(GiB)": 141.16, "step": 135460, "train_speed(iter/s)": 0.289531 }, { "acc": 0.73411837, "epoch": 1.5153366599230755, "grad_norm": 8.4375, "learning_rate": 1.521326294865817e-06, "loss": 1.07151661, "memory(GiB)": 141.16, "step": 135480, "train_speed(iter/s)": 0.289546 }, { "acc": 0.71489697, "epoch": 1.515560358869034, "grad_norm": 6.78125, "learning_rate": 1.5199980961009754e-06, "loss": 1.17195501, "memory(GiB)": 141.16, "step": 135500, "train_speed(iter/s)": 0.289561 }, { "acc": 0.733846, "epoch": 1.5157840578149926, "grad_norm": 8.0, "learning_rate": 1.518670373450135e-06, "loss": 1.08264589, "memory(GiB)": 141.16, "step": 135520, "train_speed(iter/s)": 0.289574 }, { "acc": 0.74069219, "epoch": 1.5160077567609511, "grad_norm": 6.625, "learning_rate": 1.5173431270949451e-06, "loss": 1.05677738, "memory(GiB)": 141.16, "step": 135540, "train_speed(iter/s)": 0.289589 }, { "acc": 0.73680096, "epoch": 1.5162314557069096, "grad_norm": 8.0, "learning_rate": 1.5160163572169962e-06, "loss": 1.05576191, "memory(GiB)": 141.16, "step": 135560, "train_speed(iter/s)": 0.289601 }, { "acc": 0.74423895, "epoch": 1.5164551546528682, "grad_norm": 6.5, "learning_rate": 1.5146900639978052e-06, "loss": 1.01260281, "memory(GiB)": 141.16, "step": 135580, "train_speed(iter/s)": 0.289616 }, { "acc": 0.72352276, "epoch": 1.5166788535988267, "grad_norm": 6.65625, "learning_rate": 1.513364247618832e-06, "loss": 1.12144279, "memory(GiB)": 141.16, "step": 135600, "train_speed(iter/s)": 0.28963 }, { "acc": 0.73224826, "epoch": 1.5169025525447852, "grad_norm": 7.84375, "learning_rate": 1.512038908261465e-06, "loss": 1.08382683, "memory(GiB)": 141.16, "step": 135620, "train_speed(iter/s)": 0.289645 }, { "acc": 0.73692527, "epoch": 1.5171262514907438, "grad_norm": 8.0625, "learning_rate": 1.510714046107028e-06, "loss": 1.05029478, "memory(GiB)": 141.16, "step": 135640, "train_speed(iter/s)": 0.289659 }, { "acc": 0.73223214, "epoch": 1.5173499504367023, "grad_norm": 8.125, "learning_rate": 1.5093896613367847e-06, "loss": 1.07346478, "memory(GiB)": 141.16, "step": 135660, "train_speed(iter/s)": 0.289674 }, { "acc": 0.73917046, "epoch": 1.5175736493826608, "grad_norm": 7.28125, "learning_rate": 1.5080657541319265e-06, "loss": 1.04672632, "memory(GiB)": 141.16, "step": 135680, "train_speed(iter/s)": 0.289689 }, { "acc": 0.74534693, "epoch": 1.5177973483286193, "grad_norm": 6.59375, "learning_rate": 1.5067423246735857e-06, "loss": 1.02261276, "memory(GiB)": 141.16, "step": 135700, "train_speed(iter/s)": 0.289704 }, { "acc": 0.72813482, "epoch": 1.5180210472745779, "grad_norm": 7.5, "learning_rate": 1.5054193731428257e-06, "loss": 1.08271351, "memory(GiB)": 141.16, "step": 135720, "train_speed(iter/s)": 0.289717 }, { "acc": 0.73486805, "epoch": 1.5182447462205364, "grad_norm": 8.6875, "learning_rate": 1.5040968997206423e-06, "loss": 1.06204338, "memory(GiB)": 141.16, "step": 135740, "train_speed(iter/s)": 0.28973 }, { "acc": 0.72967448, "epoch": 1.518468445166495, "grad_norm": 7.65625, "learning_rate": 1.5027749045879724e-06, "loss": 1.07700329, "memory(GiB)": 141.16, "step": 135760, "train_speed(iter/s)": 0.289743 }, { "acc": 0.74627571, "epoch": 1.5186921441124535, "grad_norm": 7.21875, "learning_rate": 1.5014533879256816e-06, "loss": 1.02219381, "memory(GiB)": 141.16, "step": 135780, "train_speed(iter/s)": 0.289757 }, { "acc": 0.73782997, "epoch": 1.518915843058412, "grad_norm": 6.625, "learning_rate": 1.5001323499145743e-06, "loss": 1.05480785, "memory(GiB)": 141.16, "step": 135800, "train_speed(iter/s)": 0.289772 }, { "acc": 0.74189544, "epoch": 1.5191395420043705, "grad_norm": 7.1875, "learning_rate": 1.4988117907353861e-06, "loss": 1.03148689, "memory(GiB)": 141.16, "step": 135820, "train_speed(iter/s)": 0.289786 }, { "acc": 0.73581362, "epoch": 1.519363240950329, "grad_norm": 6.65625, "learning_rate": 1.497491710568787e-06, "loss": 1.06821785, "memory(GiB)": 141.16, "step": 135840, "train_speed(iter/s)": 0.289798 }, { "acc": 0.7265574, "epoch": 1.5195869398962876, "grad_norm": 8.375, "learning_rate": 1.496172109595385e-06, "loss": 1.11754656, "memory(GiB)": 141.16, "step": 135860, "train_speed(iter/s)": 0.289813 }, { "acc": 0.73041039, "epoch": 1.519810638842246, "grad_norm": 7.78125, "learning_rate": 1.494852987995719e-06, "loss": 1.09266109, "memory(GiB)": 141.16, "step": 135880, "train_speed(iter/s)": 0.289827 }, { "acc": 0.73185959, "epoch": 1.5200343377882046, "grad_norm": 6.40625, "learning_rate": 1.493534345950266e-06, "loss": 1.07149267, "memory(GiB)": 141.16, "step": 135900, "train_speed(iter/s)": 0.289841 }, { "acc": 0.72245169, "epoch": 1.5202580367341632, "grad_norm": 7.3125, "learning_rate": 1.4922161836394332e-06, "loss": 1.1245244, "memory(GiB)": 141.16, "step": 135920, "train_speed(iter/s)": 0.289856 }, { "acc": 0.73618307, "epoch": 1.5204817356801217, "grad_norm": 7.25, "learning_rate": 1.4908985012435624e-06, "loss": 1.05968647, "memory(GiB)": 141.16, "step": 135940, "train_speed(iter/s)": 0.289869 }, { "acc": 0.74421749, "epoch": 1.5207054346260802, "grad_norm": 7.6875, "learning_rate": 1.4895812989429353e-06, "loss": 1.02112331, "memory(GiB)": 141.16, "step": 135960, "train_speed(iter/s)": 0.289884 }, { "acc": 0.72690635, "epoch": 1.5209291335720387, "grad_norm": 7.4375, "learning_rate": 1.48826457691776e-06, "loss": 1.09034443, "memory(GiB)": 141.16, "step": 135980, "train_speed(iter/s)": 0.289896 }, { "acc": 0.73808823, "epoch": 1.5211528325179973, "grad_norm": 6.9375, "learning_rate": 1.4869483353481867e-06, "loss": 1.04027843, "memory(GiB)": 141.16, "step": 136000, "train_speed(iter/s)": 0.28991 }, { "epoch": 1.5211528325179973, "eval_acc": 0.6901361139801764, "eval_loss": 1.0791542530059814, "eval_runtime": 2326.1295, "eval_samples_per_second": 32.364, "eval_steps_per_second": 16.182, "step": 136000 }, { "acc": 0.72807941, "epoch": 1.5213765314639558, "grad_norm": 5.75, "learning_rate": 1.4856325744142936e-06, "loss": 1.09953957, "memory(GiB)": 141.16, "step": 136020, "train_speed(iter/s)": 0.288464 }, { "acc": 0.75088387, "epoch": 1.5216002304099143, "grad_norm": 6.03125, "learning_rate": 1.4843172942960954e-06, "loss": 0.99804277, "memory(GiB)": 141.16, "step": 136040, "train_speed(iter/s)": 0.28848 }, { "acc": 0.73552856, "epoch": 1.5218239293558729, "grad_norm": 8.3125, "learning_rate": 1.4830024951735434e-06, "loss": 1.06567535, "memory(GiB)": 141.16, "step": 136060, "train_speed(iter/s)": 0.288494 }, { "acc": 0.72709122, "epoch": 1.5220476283018314, "grad_norm": 7.25, "learning_rate": 1.4816881772265173e-06, "loss": 1.10239534, "memory(GiB)": 141.16, "step": 136080, "train_speed(iter/s)": 0.288507 }, { "acc": 0.73230596, "epoch": 1.52227132724779, "grad_norm": 6.78125, "learning_rate": 1.4803743406348393e-06, "loss": 1.06701746, "memory(GiB)": 141.16, "step": 136100, "train_speed(iter/s)": 0.288521 }, { "acc": 0.72580781, "epoch": 1.5224950261937484, "grad_norm": 8.6875, "learning_rate": 1.4790609855782577e-06, "loss": 1.12124023, "memory(GiB)": 141.16, "step": 136120, "train_speed(iter/s)": 0.288535 }, { "acc": 0.72466898, "epoch": 1.522718725139707, "grad_norm": 6.625, "learning_rate": 1.4777481122364584e-06, "loss": 1.11275616, "memory(GiB)": 141.16, "step": 136140, "train_speed(iter/s)": 0.288549 }, { "acc": 0.73232374, "epoch": 1.5229424240856655, "grad_norm": 8.625, "learning_rate": 1.4764357207890645e-06, "loss": 1.06912899, "memory(GiB)": 141.16, "step": 136160, "train_speed(iter/s)": 0.288566 }, { "acc": 0.73639278, "epoch": 1.523166123031624, "grad_norm": 6.90625, "learning_rate": 1.4751238114156242e-06, "loss": 1.05515709, "memory(GiB)": 141.16, "step": 136180, "train_speed(iter/s)": 0.288581 }, { "acc": 0.72325735, "epoch": 1.5233898219775825, "grad_norm": 6.40625, "learning_rate": 1.4738123842956304e-06, "loss": 1.11244221, "memory(GiB)": 141.16, "step": 136200, "train_speed(iter/s)": 0.288596 }, { "acc": 0.72740507, "epoch": 1.523613520923541, "grad_norm": 6.3125, "learning_rate": 1.4725014396085014e-06, "loss": 1.08819103, "memory(GiB)": 141.16, "step": 136220, "train_speed(iter/s)": 0.28861 }, { "acc": 0.74034953, "epoch": 1.5238372198694996, "grad_norm": 7.5625, "learning_rate": 1.471190977533597e-06, "loss": 1.04125729, "memory(GiB)": 141.16, "step": 136240, "train_speed(iter/s)": 0.288627 }, { "acc": 0.73550992, "epoch": 1.5240609188154581, "grad_norm": 7.625, "learning_rate": 1.4698809982502048e-06, "loss": 1.0642765, "memory(GiB)": 141.16, "step": 136260, "train_speed(iter/s)": 0.288642 }, { "acc": 0.73847132, "epoch": 1.5242846177614167, "grad_norm": 5.3125, "learning_rate": 1.468571501937548e-06, "loss": 1.03845921, "memory(GiB)": 141.16, "step": 136280, "train_speed(iter/s)": 0.288657 }, { "acc": 0.74420891, "epoch": 1.5245083167073752, "grad_norm": 8.375, "learning_rate": 1.4672624887747865e-06, "loss": 1.03390856, "memory(GiB)": 141.16, "step": 136300, "train_speed(iter/s)": 0.288671 }, { "acc": 0.7405283, "epoch": 1.5247320156533337, "grad_norm": 7.90625, "learning_rate": 1.4659539589410099e-06, "loss": 1.02222195, "memory(GiB)": 141.16, "step": 136320, "train_speed(iter/s)": 0.288686 }, { "acc": 0.73990479, "epoch": 1.5249557145992922, "grad_norm": 7.5, "learning_rate": 1.4646459126152458e-06, "loss": 1.04107895, "memory(GiB)": 141.16, "step": 136340, "train_speed(iter/s)": 0.2887 }, { "acc": 0.73433194, "epoch": 1.5251794135452508, "grad_norm": 6.4375, "learning_rate": 1.4633383499764531e-06, "loss": 1.06293058, "memory(GiB)": 141.16, "step": 136360, "train_speed(iter/s)": 0.288714 }, { "acc": 0.73351564, "epoch": 1.5254031124912093, "grad_norm": 7.34375, "learning_rate": 1.4620312712035234e-06, "loss": 1.07472553, "memory(GiB)": 141.16, "step": 136380, "train_speed(iter/s)": 0.288727 }, { "acc": 0.74004946, "epoch": 1.5256268114371678, "grad_norm": 7.3125, "learning_rate": 1.4607246764752858e-06, "loss": 1.04829693, "memory(GiB)": 141.16, "step": 136400, "train_speed(iter/s)": 0.288741 }, { "acc": 0.74722462, "epoch": 1.5258505103831264, "grad_norm": 8.75, "learning_rate": 1.4594185659704995e-06, "loss": 1.01684999, "memory(GiB)": 141.16, "step": 136420, "train_speed(iter/s)": 0.288755 }, { "acc": 0.74352551, "epoch": 1.5260742093290849, "grad_norm": 7.21875, "learning_rate": 1.4581129398678612e-06, "loss": 1.03483963, "memory(GiB)": 141.16, "step": 136440, "train_speed(iter/s)": 0.288767 }, { "acc": 0.73390818, "epoch": 1.5262979082750434, "grad_norm": 7.6875, "learning_rate": 1.4568077983459982e-06, "loss": 1.06376534, "memory(GiB)": 141.16, "step": 136460, "train_speed(iter/s)": 0.288781 }, { "acc": 0.74371347, "epoch": 1.526521607221002, "grad_norm": 6.84375, "learning_rate": 1.4555031415834703e-06, "loss": 1.0419322, "memory(GiB)": 141.16, "step": 136480, "train_speed(iter/s)": 0.288798 }, { "acc": 0.71715755, "epoch": 1.5267453061669605, "grad_norm": 5.53125, "learning_rate": 1.4541989697587771e-06, "loss": 1.15212984, "memory(GiB)": 141.16, "step": 136500, "train_speed(iter/s)": 0.288813 }, { "acc": 0.74445238, "epoch": 1.526969005112919, "grad_norm": 7.3125, "learning_rate": 1.4528952830503445e-06, "loss": 1.03323517, "memory(GiB)": 141.16, "step": 136520, "train_speed(iter/s)": 0.288829 }, { "acc": 0.72376299, "epoch": 1.5271927040588775, "grad_norm": 6.28125, "learning_rate": 1.451592081636538e-06, "loss": 1.10365515, "memory(GiB)": 141.16, "step": 136540, "train_speed(iter/s)": 0.288845 }, { "acc": 0.73232617, "epoch": 1.527416403004836, "grad_norm": 5.625, "learning_rate": 1.4502893656956535e-06, "loss": 1.07308826, "memory(GiB)": 141.16, "step": 136560, "train_speed(iter/s)": 0.288859 }, { "acc": 0.72885103, "epoch": 1.5276401019507946, "grad_norm": 5.8125, "learning_rate": 1.4489871354059192e-06, "loss": 1.08111029, "memory(GiB)": 141.16, "step": 136580, "train_speed(iter/s)": 0.288874 }, { "acc": 0.73977957, "epoch": 1.527863800896753, "grad_norm": 7.125, "learning_rate": 1.4476853909455025e-06, "loss": 1.0382431, "memory(GiB)": 141.16, "step": 136600, "train_speed(iter/s)": 0.288889 }, { "acc": 0.73483133, "epoch": 1.5280874998427116, "grad_norm": 9.375, "learning_rate": 1.4463841324924966e-06, "loss": 1.06716251, "memory(GiB)": 141.16, "step": 136620, "train_speed(iter/s)": 0.288903 }, { "acc": 0.74936724, "epoch": 1.5283111987886702, "grad_norm": 5.84375, "learning_rate": 1.4450833602249359e-06, "loss": 1.01455765, "memory(GiB)": 141.16, "step": 136640, "train_speed(iter/s)": 0.288919 }, { "acc": 0.73257151, "epoch": 1.5285348977346287, "grad_norm": 6.21875, "learning_rate": 1.4437830743207827e-06, "loss": 1.08095207, "memory(GiB)": 141.16, "step": 136660, "train_speed(iter/s)": 0.288934 }, { "acc": 0.73744974, "epoch": 1.5287585966805872, "grad_norm": 6.375, "learning_rate": 1.4424832749579338e-06, "loss": 1.06100664, "memory(GiB)": 141.16, "step": 136680, "train_speed(iter/s)": 0.288948 }, { "acc": 0.73787732, "epoch": 1.5289822956265458, "grad_norm": 6.5625, "learning_rate": 1.4411839623142227e-06, "loss": 1.03237772, "memory(GiB)": 141.16, "step": 136700, "train_speed(iter/s)": 0.288962 }, { "acc": 0.73473711, "epoch": 1.5292059945725043, "grad_norm": 6.625, "learning_rate": 1.4398851365674115e-06, "loss": 1.06483593, "memory(GiB)": 141.16, "step": 136720, "train_speed(iter/s)": 0.288975 }, { "acc": 0.73672748, "epoch": 1.5294296935184628, "grad_norm": 8.875, "learning_rate": 1.4385867978952011e-06, "loss": 1.0663023, "memory(GiB)": 141.16, "step": 136740, "train_speed(iter/s)": 0.288989 }, { "acc": 0.73535752, "epoch": 1.5296533924644213, "grad_norm": 7.8125, "learning_rate": 1.4372889464752203e-06, "loss": 1.05298986, "memory(GiB)": 141.16, "step": 136760, "train_speed(iter/s)": 0.289001 }, { "acc": 0.74504156, "epoch": 1.5298770914103799, "grad_norm": 7.625, "learning_rate": 1.435991582485034e-06, "loss": 1.0133894, "memory(GiB)": 141.16, "step": 136780, "train_speed(iter/s)": 0.289015 }, { "acc": 0.75191226, "epoch": 1.5301007903563384, "grad_norm": 9.0625, "learning_rate": 1.4346947061021417e-06, "loss": 0.98607693, "memory(GiB)": 141.16, "step": 136800, "train_speed(iter/s)": 0.289028 }, { "acc": 0.7401248, "epoch": 1.530324489302297, "grad_norm": 9.1875, "learning_rate": 1.4333983175039717e-06, "loss": 1.03533077, "memory(GiB)": 141.16, "step": 136820, "train_speed(iter/s)": 0.289042 }, { "acc": 0.74663777, "epoch": 1.5305481882482554, "grad_norm": 6.6875, "learning_rate": 1.432102416867892e-06, "loss": 0.9997242, "memory(GiB)": 141.16, "step": 136840, "train_speed(iter/s)": 0.289058 }, { "acc": 0.73632894, "epoch": 1.530771887194214, "grad_norm": 6.6875, "learning_rate": 1.4308070043711992e-06, "loss": 1.04677525, "memory(GiB)": 141.16, "step": 136860, "train_speed(iter/s)": 0.289072 }, { "acc": 0.72507992, "epoch": 1.5309955861401725, "grad_norm": 8.3125, "learning_rate": 1.4295120801911216e-06, "loss": 1.10901222, "memory(GiB)": 141.16, "step": 136880, "train_speed(iter/s)": 0.289086 }, { "acc": 0.72617712, "epoch": 1.531219285086131, "grad_norm": 6.40625, "learning_rate": 1.4282176445048274e-06, "loss": 1.11684685, "memory(GiB)": 141.16, "step": 136900, "train_speed(iter/s)": 0.2891 }, { "acc": 0.73879929, "epoch": 1.5314429840320896, "grad_norm": 7.5, "learning_rate": 1.4269236974894103e-06, "loss": 1.03857307, "memory(GiB)": 141.16, "step": 136920, "train_speed(iter/s)": 0.289115 }, { "acc": 0.75147262, "epoch": 1.531666682978048, "grad_norm": 7.25, "learning_rate": 1.4256302393219041e-06, "loss": 0.98932514, "memory(GiB)": 141.16, "step": 136940, "train_speed(iter/s)": 0.289127 }, { "acc": 0.73582067, "epoch": 1.5318903819240066, "grad_norm": 8.8125, "learning_rate": 1.4243372701792702e-06, "loss": 1.06101093, "memory(GiB)": 141.16, "step": 136960, "train_speed(iter/s)": 0.289141 }, { "acc": 0.73890038, "epoch": 1.5321140808699654, "grad_norm": 6.0625, "learning_rate": 1.4230447902384049e-06, "loss": 1.0447731, "memory(GiB)": 141.16, "step": 136980, "train_speed(iter/s)": 0.289154 }, { "acc": 0.73290205, "epoch": 1.532337779815924, "grad_norm": 6.6875, "learning_rate": 1.4217527996761399e-06, "loss": 1.07449846, "memory(GiB)": 141.16, "step": 137000, "train_speed(iter/s)": 0.289168 }, { "acc": 0.73873253, "epoch": 1.5325614787618824, "grad_norm": 6.46875, "learning_rate": 1.420461298669235e-06, "loss": 1.03878193, "memory(GiB)": 141.16, "step": 137020, "train_speed(iter/s)": 0.289182 }, { "acc": 0.74465561, "epoch": 1.532785177707841, "grad_norm": 6.21875, "learning_rate": 1.4191702873943898e-06, "loss": 1.01953773, "memory(GiB)": 141.16, "step": 137040, "train_speed(iter/s)": 0.289195 }, { "acc": 0.74349918, "epoch": 1.5330088766537995, "grad_norm": 7.65625, "learning_rate": 1.4178797660282313e-06, "loss": 1.03074341, "memory(GiB)": 141.16, "step": 137060, "train_speed(iter/s)": 0.28921 }, { "acc": 0.7392952, "epoch": 1.533232575599758, "grad_norm": 6.1875, "learning_rate": 1.416589734747319e-06, "loss": 1.05652485, "memory(GiB)": 141.16, "step": 137080, "train_speed(iter/s)": 0.289226 }, { "acc": 0.73982582, "epoch": 1.5334562745457165, "grad_norm": 6.5, "learning_rate": 1.4153001937281512e-06, "loss": 1.04831524, "memory(GiB)": 141.16, "step": 137100, "train_speed(iter/s)": 0.289239 }, { "acc": 0.73165064, "epoch": 1.533679973491675, "grad_norm": 7.625, "learning_rate": 1.4140111431471531e-06, "loss": 1.07161446, "memory(GiB)": 141.16, "step": 137120, "train_speed(iter/s)": 0.289253 }, { "acc": 0.7391737, "epoch": 1.5339036724376336, "grad_norm": 7.0, "learning_rate": 1.4127225831806873e-06, "loss": 1.03604279, "memory(GiB)": 141.16, "step": 137140, "train_speed(iter/s)": 0.289267 }, { "acc": 0.72924681, "epoch": 1.5341273713835921, "grad_norm": 7.5, "learning_rate": 1.4114345140050466e-06, "loss": 1.09108648, "memory(GiB)": 141.16, "step": 137160, "train_speed(iter/s)": 0.289282 }, { "acc": 0.71817379, "epoch": 1.5343510703295506, "grad_norm": 6.875, "learning_rate": 1.4101469357964549e-06, "loss": 1.1448885, "memory(GiB)": 141.16, "step": 137180, "train_speed(iter/s)": 0.289296 }, { "acc": 0.73297653, "epoch": 1.5345747692755092, "grad_norm": 6.3125, "learning_rate": 1.408859848731075e-06, "loss": 1.07614307, "memory(GiB)": 141.16, "step": 137200, "train_speed(iter/s)": 0.28931 }, { "acc": 0.74294024, "epoch": 1.5347984682214677, "grad_norm": 8.0, "learning_rate": 1.407573252984995e-06, "loss": 1.03331738, "memory(GiB)": 141.16, "step": 137220, "train_speed(iter/s)": 0.289325 }, { "acc": 0.73729315, "epoch": 1.5350221671674262, "grad_norm": 7.53125, "learning_rate": 1.406287148734244e-06, "loss": 1.05863762, "memory(GiB)": 141.16, "step": 137240, "train_speed(iter/s)": 0.289338 }, { "acc": 0.73167477, "epoch": 1.5352458661133848, "grad_norm": 6.375, "learning_rate": 1.4050015361547764e-06, "loss": 1.07273426, "memory(GiB)": 141.16, "step": 137260, "train_speed(iter/s)": 0.289352 }, { "acc": 0.72284536, "epoch": 1.5354695650593433, "grad_norm": 7.125, "learning_rate": 1.4037164154224813e-06, "loss": 1.12694817, "memory(GiB)": 141.16, "step": 137280, "train_speed(iter/s)": 0.289366 }, { "acc": 0.74997797, "epoch": 1.5356932640053018, "grad_norm": 6.21875, "learning_rate": 1.4024317867131854e-06, "loss": 0.99144821, "memory(GiB)": 141.16, "step": 137300, "train_speed(iter/s)": 0.289381 }, { "acc": 0.73417578, "epoch": 1.5359169629512603, "grad_norm": 7.21875, "learning_rate": 1.40114765020264e-06, "loss": 1.05977612, "memory(GiB)": 141.16, "step": 137320, "train_speed(iter/s)": 0.289396 }, { "acc": 0.73895063, "epoch": 1.5361406618972189, "grad_norm": 5.46875, "learning_rate": 1.3998640060665391e-06, "loss": 1.03630295, "memory(GiB)": 141.16, "step": 137340, "train_speed(iter/s)": 0.289411 }, { "acc": 0.7345623, "epoch": 1.5363643608431774, "grad_norm": 6.03125, "learning_rate": 1.3985808544804969e-06, "loss": 1.05005846, "memory(GiB)": 141.16, "step": 137360, "train_speed(iter/s)": 0.289425 }, { "acc": 0.73605833, "epoch": 1.536588059789136, "grad_norm": 7.3125, "learning_rate": 1.397298195620071e-06, "loss": 1.04908504, "memory(GiB)": 141.16, "step": 137380, "train_speed(iter/s)": 0.289438 }, { "acc": 0.73593597, "epoch": 1.5368117587350945, "grad_norm": 5.96875, "learning_rate": 1.3960160296607468e-06, "loss": 1.04817677, "memory(GiB)": 141.16, "step": 137400, "train_speed(iter/s)": 0.28945 }, { "acc": 0.72799416, "epoch": 1.537035457681053, "grad_norm": 6.375, "learning_rate": 1.394734356777941e-06, "loss": 1.09071598, "memory(GiB)": 141.16, "step": 137420, "train_speed(iter/s)": 0.289464 }, { "acc": 0.74742351, "epoch": 1.5372591566270115, "grad_norm": 6.65625, "learning_rate": 1.3934531771470078e-06, "loss": 0.98882561, "memory(GiB)": 141.16, "step": 137440, "train_speed(iter/s)": 0.289478 }, { "acc": 0.72757421, "epoch": 1.53748285557297, "grad_norm": 6.78125, "learning_rate": 1.3921724909432277e-06, "loss": 1.10352736, "memory(GiB)": 141.16, "step": 137460, "train_speed(iter/s)": 0.289489 }, { "acc": 0.73571587, "epoch": 1.5377065545189286, "grad_norm": 7.09375, "learning_rate": 1.3908922983418205e-06, "loss": 1.07694244, "memory(GiB)": 141.16, "step": 137480, "train_speed(iter/s)": 0.289503 }, { "acc": 0.74048681, "epoch": 1.537930253464887, "grad_norm": 7.5, "learning_rate": 1.3896125995179328e-06, "loss": 1.02590103, "memory(GiB)": 141.16, "step": 137500, "train_speed(iter/s)": 0.289518 }, { "acc": 0.73859615, "epoch": 1.5381539524108456, "grad_norm": 7.25, "learning_rate": 1.3883333946466443e-06, "loss": 1.06438046, "memory(GiB)": 141.16, "step": 137520, "train_speed(iter/s)": 0.289532 }, { "acc": 0.73937702, "epoch": 1.5383776513568042, "grad_norm": 7.21875, "learning_rate": 1.3870546839029713e-06, "loss": 1.03664188, "memory(GiB)": 141.16, "step": 137540, "train_speed(iter/s)": 0.289549 }, { "acc": 0.73510561, "epoch": 1.5386013503027627, "grad_norm": 7.9375, "learning_rate": 1.3857764674618568e-06, "loss": 1.06518364, "memory(GiB)": 141.16, "step": 137560, "train_speed(iter/s)": 0.289562 }, { "acc": 0.73936243, "epoch": 1.5388250492487212, "grad_norm": 6.75, "learning_rate": 1.3844987454981823e-06, "loss": 1.03775053, "memory(GiB)": 141.16, "step": 137580, "train_speed(iter/s)": 0.289576 }, { "acc": 0.73022089, "epoch": 1.5390487481946797, "grad_norm": 8.75, "learning_rate": 1.3832215181867575e-06, "loss": 1.07182178, "memory(GiB)": 141.16, "step": 137600, "train_speed(iter/s)": 0.289587 }, { "acc": 0.72963505, "epoch": 1.5392724471406383, "grad_norm": 7.59375, "learning_rate": 1.3819447857023222e-06, "loss": 1.07506332, "memory(GiB)": 141.16, "step": 137620, "train_speed(iter/s)": 0.289599 }, { "acc": 0.74246488, "epoch": 1.5394961460865968, "grad_norm": 7.375, "learning_rate": 1.3806685482195565e-06, "loss": 1.02742414, "memory(GiB)": 141.16, "step": 137640, "train_speed(iter/s)": 0.289611 }, { "acc": 0.7378674, "epoch": 1.5397198450325553, "grad_norm": 7.21875, "learning_rate": 1.3793928059130635e-06, "loss": 1.05067711, "memory(GiB)": 141.16, "step": 137660, "train_speed(iter/s)": 0.289625 }, { "acc": 0.72159071, "epoch": 1.5399435439785139, "grad_norm": 7.375, "learning_rate": 1.3781175589573869e-06, "loss": 1.11506529, "memory(GiB)": 141.16, "step": 137680, "train_speed(iter/s)": 0.289639 }, { "acc": 0.7357121, "epoch": 1.5401672429244724, "grad_norm": 6.71875, "learning_rate": 1.3768428075269969e-06, "loss": 1.06612349, "memory(GiB)": 141.16, "step": 137700, "train_speed(iter/s)": 0.289653 }, { "acc": 0.7422893, "epoch": 1.540390941870431, "grad_norm": 7.53125, "learning_rate": 1.3755685517962958e-06, "loss": 1.04231987, "memory(GiB)": 141.16, "step": 137720, "train_speed(iter/s)": 0.289664 }, { "acc": 0.7356781, "epoch": 1.5406146408163894, "grad_norm": 7.21875, "learning_rate": 1.3742947919396231e-06, "loss": 1.04284077, "memory(GiB)": 141.16, "step": 137740, "train_speed(iter/s)": 0.289675 }, { "acc": 0.735146, "epoch": 1.540838339762348, "grad_norm": 7.71875, "learning_rate": 1.3730215281312454e-06, "loss": 1.07303753, "memory(GiB)": 141.16, "step": 137760, "train_speed(iter/s)": 0.28969 }, { "acc": 0.73115911, "epoch": 1.5410620387083065, "grad_norm": 4.9375, "learning_rate": 1.3717487605453655e-06, "loss": 1.08158417, "memory(GiB)": 141.16, "step": 137780, "train_speed(iter/s)": 0.289702 }, { "acc": 0.7361948, "epoch": 1.541285737654265, "grad_norm": 8.25, "learning_rate": 1.3704764893561145e-06, "loss": 1.05398912, "memory(GiB)": 141.16, "step": 137800, "train_speed(iter/s)": 0.289716 }, { "acc": 0.73579164, "epoch": 1.5415094366002235, "grad_norm": 8.375, "learning_rate": 1.369204714737556e-06, "loss": 1.05745373, "memory(GiB)": 141.16, "step": 137820, "train_speed(iter/s)": 0.28973 }, { "acc": 0.73633699, "epoch": 1.541733135546182, "grad_norm": 6.5, "learning_rate": 1.3679334368636905e-06, "loss": 1.0649889, "memory(GiB)": 141.16, "step": 137840, "train_speed(iter/s)": 0.289744 }, { "acc": 0.74380136, "epoch": 1.5419568344921406, "grad_norm": 9.375, "learning_rate": 1.3666626559084434e-06, "loss": 1.01486378, "memory(GiB)": 141.16, "step": 137860, "train_speed(iter/s)": 0.289759 }, { "acc": 0.72548099, "epoch": 1.5421805334380991, "grad_norm": 6.8125, "learning_rate": 1.3653923720456785e-06, "loss": 1.08895054, "memory(GiB)": 141.16, "step": 137880, "train_speed(iter/s)": 0.289774 }, { "acc": 0.73510041, "epoch": 1.5424042323840577, "grad_norm": 7.25, "learning_rate": 1.364122585449188e-06, "loss": 1.05737705, "memory(GiB)": 141.16, "step": 137900, "train_speed(iter/s)": 0.289788 }, { "acc": 0.72009406, "epoch": 1.5426279313300162, "grad_norm": 8.3125, "learning_rate": 1.3628532962926949e-06, "loss": 1.14673901, "memory(GiB)": 141.16, "step": 137920, "train_speed(iter/s)": 0.289802 }, { "acc": 0.73171883, "epoch": 1.5428516302759747, "grad_norm": 7.21875, "learning_rate": 1.361584504749859e-06, "loss": 1.06532402, "memory(GiB)": 141.16, "step": 137940, "train_speed(iter/s)": 0.289815 }, { "acc": 0.73347168, "epoch": 1.5430753292219332, "grad_norm": 6.53125, "learning_rate": 1.3603162109942664e-06, "loss": 1.0581234, "memory(GiB)": 141.16, "step": 137960, "train_speed(iter/s)": 0.289829 }, { "acc": 0.72812805, "epoch": 1.5432990281678918, "grad_norm": 5.0625, "learning_rate": 1.3590484151994405e-06, "loss": 1.07233791, "memory(GiB)": 141.16, "step": 137980, "train_speed(iter/s)": 0.289844 }, { "acc": 0.74347234, "epoch": 1.5435227271138503, "grad_norm": 7.96875, "learning_rate": 1.3577811175388328e-06, "loss": 1.02079, "memory(GiB)": 141.16, "step": 138000, "train_speed(iter/s)": 0.289857 }, { "epoch": 1.5435227271138503, "eval_acc": 0.6901782133235844, "eval_loss": 1.0791373252868652, "eval_runtime": 2323.6322, "eval_samples_per_second": 32.399, "eval_steps_per_second": 16.2, "step": 138000 }, { "acc": 0.74215598, "epoch": 1.5437464260598088, "grad_norm": 7.21875, "learning_rate": 1.3565143181858258e-06, "loss": 1.01552048, "memory(GiB)": 141.16, "step": 138020, "train_speed(iter/s)": 0.288432 }, { "acc": 0.72214918, "epoch": 1.5439701250057674, "grad_norm": 7.375, "learning_rate": 1.3552480173137395e-06, "loss": 1.1303751, "memory(GiB)": 141.16, "step": 138040, "train_speed(iter/s)": 0.288445 }, { "acc": 0.725529, "epoch": 1.5441938239517259, "grad_norm": 8.875, "learning_rate": 1.3539822150958181e-06, "loss": 1.09666672, "memory(GiB)": 141.16, "step": 138060, "train_speed(iter/s)": 0.288459 }, { "acc": 0.73392057, "epoch": 1.5444175228976844, "grad_norm": 7.40625, "learning_rate": 1.3527169117052447e-06, "loss": 1.07255058, "memory(GiB)": 141.16, "step": 138080, "train_speed(iter/s)": 0.288472 }, { "acc": 0.74300184, "epoch": 1.544641221843643, "grad_norm": 8.375, "learning_rate": 1.3514521073151298e-06, "loss": 1.02611294, "memory(GiB)": 141.16, "step": 138100, "train_speed(iter/s)": 0.288485 }, { "acc": 0.73339243, "epoch": 1.5448649207896015, "grad_norm": 7.1875, "learning_rate": 1.3501878020985144e-06, "loss": 1.05817461, "memory(GiB)": 141.16, "step": 138120, "train_speed(iter/s)": 0.288499 }, { "acc": 0.72785425, "epoch": 1.54508861973556, "grad_norm": 7.46875, "learning_rate": 1.3489239962283774e-06, "loss": 1.10741634, "memory(GiB)": 141.16, "step": 138140, "train_speed(iter/s)": 0.288512 }, { "acc": 0.73092179, "epoch": 1.5453123186815185, "grad_norm": 7.875, "learning_rate": 1.3476606898776217e-06, "loss": 1.08484812, "memory(GiB)": 141.16, "step": 138160, "train_speed(iter/s)": 0.288526 }, { "acc": 0.74228349, "epoch": 1.545536017627477, "grad_norm": 6.71875, "learning_rate": 1.3463978832190893e-06, "loss": 1.04264174, "memory(GiB)": 141.16, "step": 138180, "train_speed(iter/s)": 0.28854 }, { "acc": 0.73462305, "epoch": 1.5457597165734356, "grad_norm": 7.125, "learning_rate": 1.3451355764255486e-06, "loss": 1.0585969, "memory(GiB)": 141.16, "step": 138200, "train_speed(iter/s)": 0.288555 }, { "acc": 0.75459456, "epoch": 1.545983415519394, "grad_norm": 7.40625, "learning_rate": 1.3438737696696996e-06, "loss": 0.97101212, "memory(GiB)": 141.16, "step": 138220, "train_speed(iter/s)": 0.288567 }, { "acc": 0.7325141, "epoch": 1.5462071144653526, "grad_norm": 6.4375, "learning_rate": 1.342612463124179e-06, "loss": 1.06777115, "memory(GiB)": 141.16, "step": 138240, "train_speed(iter/s)": 0.28858 }, { "acc": 0.72665224, "epoch": 1.5464308134113112, "grad_norm": 5.875, "learning_rate": 1.341351656961547e-06, "loss": 1.11123896, "memory(GiB)": 141.16, "step": 138260, "train_speed(iter/s)": 0.288593 }, { "acc": 0.73509121, "epoch": 1.5466545123572697, "grad_norm": 5.65625, "learning_rate": 1.3400913513543045e-06, "loss": 1.05803881, "memory(GiB)": 141.16, "step": 138280, "train_speed(iter/s)": 0.288606 }, { "acc": 0.74476733, "epoch": 1.5468782113032282, "grad_norm": 7.15625, "learning_rate": 1.3388315464748775e-06, "loss": 1.01861582, "memory(GiB)": 141.16, "step": 138300, "train_speed(iter/s)": 0.28862 }, { "acc": 0.73377085, "epoch": 1.5471019102491868, "grad_norm": 6.3125, "learning_rate": 1.3375722424956233e-06, "loss": 1.06316414, "memory(GiB)": 141.16, "step": 138320, "train_speed(iter/s)": 0.288634 }, { "acc": 0.73666792, "epoch": 1.5473256091951453, "grad_norm": 6.8125, "learning_rate": 1.336313439588836e-06, "loss": 1.0476965, "memory(GiB)": 141.16, "step": 138340, "train_speed(iter/s)": 0.288648 }, { "acc": 0.74010992, "epoch": 1.5475493081411038, "grad_norm": 8.125, "learning_rate": 1.3350551379267347e-06, "loss": 1.04022951, "memory(GiB)": 141.16, "step": 138360, "train_speed(iter/s)": 0.288662 }, { "acc": 0.7290554, "epoch": 1.5477730070870623, "grad_norm": 7.25, "learning_rate": 1.3337973376814761e-06, "loss": 1.08993874, "memory(GiB)": 141.16, "step": 138380, "train_speed(iter/s)": 0.288676 }, { "acc": 0.73157482, "epoch": 1.5479967060330209, "grad_norm": 7.46875, "learning_rate": 1.3325400390251442e-06, "loss": 1.06305676, "memory(GiB)": 141.16, "step": 138400, "train_speed(iter/s)": 0.28869 }, { "acc": 0.73673334, "epoch": 1.5482204049789794, "grad_norm": 8.125, "learning_rate": 1.3312832421297534e-06, "loss": 1.0472147, "memory(GiB)": 141.16, "step": 138420, "train_speed(iter/s)": 0.288703 }, { "acc": 0.72521029, "epoch": 1.548444103924938, "grad_norm": 6.53125, "learning_rate": 1.3300269471672545e-06, "loss": 1.09508362, "memory(GiB)": 141.16, "step": 138440, "train_speed(iter/s)": 0.288716 }, { "acc": 0.73178539, "epoch": 1.5486678028708964, "grad_norm": 9.0, "learning_rate": 1.328771154309524e-06, "loss": 1.07434502, "memory(GiB)": 141.16, "step": 138460, "train_speed(iter/s)": 0.288728 }, { "acc": 0.7415956, "epoch": 1.548891501816855, "grad_norm": 5.875, "learning_rate": 1.3275158637283747e-06, "loss": 1.02043123, "memory(GiB)": 141.16, "step": 138480, "train_speed(iter/s)": 0.288742 }, { "acc": 0.73049874, "epoch": 1.5491152007628135, "grad_norm": 6.4375, "learning_rate": 1.3262610755955468e-06, "loss": 1.08786869, "memory(GiB)": 141.16, "step": 138500, "train_speed(iter/s)": 0.288757 }, { "acc": 0.74139791, "epoch": 1.549338899708772, "grad_norm": 7.34375, "learning_rate": 1.3250067900827129e-06, "loss": 1.02566814, "memory(GiB)": 141.16, "step": 138520, "train_speed(iter/s)": 0.288772 }, { "acc": 0.73505774, "epoch": 1.5495625986547306, "grad_norm": 7.8125, "learning_rate": 1.3237530073614807e-06, "loss": 1.06526031, "memory(GiB)": 141.16, "step": 138540, "train_speed(iter/s)": 0.288787 }, { "acc": 0.72832408, "epoch": 1.549786297600689, "grad_norm": 6.15625, "learning_rate": 1.3224997276033797e-06, "loss": 1.09914379, "memory(GiB)": 141.16, "step": 138560, "train_speed(iter/s)": 0.288802 }, { "acc": 0.74181566, "epoch": 1.5500099965466476, "grad_norm": 6.65625, "learning_rate": 1.321246950979881e-06, "loss": 1.01891823, "memory(GiB)": 141.16, "step": 138580, "train_speed(iter/s)": 0.288818 }, { "acc": 0.73576555, "epoch": 1.5502336954926061, "grad_norm": 7.71875, "learning_rate": 1.319994677662379e-06, "loss": 1.0523243, "memory(GiB)": 141.16, "step": 138600, "train_speed(iter/s)": 0.288831 }, { "acc": 0.73576431, "epoch": 1.5504573944385647, "grad_norm": 7.15625, "learning_rate": 1.3187429078222063e-06, "loss": 1.05082073, "memory(GiB)": 141.16, "step": 138620, "train_speed(iter/s)": 0.288845 }, { "acc": 0.73846922, "epoch": 1.5506810933845232, "grad_norm": 7.09375, "learning_rate": 1.317491641630621e-06, "loss": 1.04531488, "memory(GiB)": 141.16, "step": 138640, "train_speed(iter/s)": 0.288858 }, { "acc": 0.74639316, "epoch": 1.5509047923304817, "grad_norm": 5.78125, "learning_rate": 1.3162408792588132e-06, "loss": 1.03098965, "memory(GiB)": 141.16, "step": 138660, "train_speed(iter/s)": 0.288873 }, { "acc": 0.7311913, "epoch": 1.5511284912764403, "grad_norm": 9.3125, "learning_rate": 1.3149906208779073e-06, "loss": 1.07818737, "memory(GiB)": 141.16, "step": 138680, "train_speed(iter/s)": 0.288887 }, { "acc": 0.74270306, "epoch": 1.5513521902223988, "grad_norm": 4.875, "learning_rate": 1.313740866658954e-06, "loss": 1.02806816, "memory(GiB)": 141.16, "step": 138700, "train_speed(iter/s)": 0.288901 }, { "acc": 0.72301569, "epoch": 1.5515758891683573, "grad_norm": 6.5, "learning_rate": 1.3124916167729407e-06, "loss": 1.12033558, "memory(GiB)": 141.16, "step": 138720, "train_speed(iter/s)": 0.288915 }, { "acc": 0.72986565, "epoch": 1.5517995881143158, "grad_norm": 6.34375, "learning_rate": 1.3112428713907804e-06, "loss": 1.10006466, "memory(GiB)": 141.16, "step": 138740, "train_speed(iter/s)": 0.288929 }, { "acc": 0.73546801, "epoch": 1.5520232870602744, "grad_norm": 8.5625, "learning_rate": 1.3099946306833184e-06, "loss": 1.06392527, "memory(GiB)": 141.16, "step": 138760, "train_speed(iter/s)": 0.288942 }, { "acc": 0.744345, "epoch": 1.552246986006233, "grad_norm": 7.0625, "learning_rate": 1.308746894821335e-06, "loss": 1.02113791, "memory(GiB)": 141.16, "step": 138780, "train_speed(iter/s)": 0.288957 }, { "acc": 0.72815542, "epoch": 1.5524706849521914, "grad_norm": 7.0, "learning_rate": 1.307499663975535e-06, "loss": 1.09340363, "memory(GiB)": 141.16, "step": 138800, "train_speed(iter/s)": 0.288973 }, { "acc": 0.73623114, "epoch": 1.55269438389815, "grad_norm": 8.625, "learning_rate": 1.30625293831656e-06, "loss": 1.06410656, "memory(GiB)": 141.16, "step": 138820, "train_speed(iter/s)": 0.288988 }, { "acc": 0.74238548, "epoch": 1.5529180828441085, "grad_norm": 7.4375, "learning_rate": 1.3050067180149794e-06, "loss": 1.03524284, "memory(GiB)": 141.16, "step": 138840, "train_speed(iter/s)": 0.289002 }, { "acc": 0.72660055, "epoch": 1.553141781790067, "grad_norm": 7.1875, "learning_rate": 1.3037610032412917e-06, "loss": 1.10523748, "memory(GiB)": 141.16, "step": 138860, "train_speed(iter/s)": 0.289016 }, { "acc": 0.71951141, "epoch": 1.5533654807360255, "grad_norm": 6.0, "learning_rate": 1.3025157941659316e-06, "loss": 1.14390354, "memory(GiB)": 141.16, "step": 138880, "train_speed(iter/s)": 0.28903 }, { "acc": 0.72870622, "epoch": 1.553589179681984, "grad_norm": 7.28125, "learning_rate": 1.3012710909592586e-06, "loss": 1.08732338, "memory(GiB)": 141.16, "step": 138900, "train_speed(iter/s)": 0.289045 }, { "acc": 0.73005433, "epoch": 1.5538128786279426, "grad_norm": 6.5, "learning_rate": 1.3000268937915689e-06, "loss": 1.09000931, "memory(GiB)": 141.16, "step": 138920, "train_speed(iter/s)": 0.289059 }, { "acc": 0.74123468, "epoch": 1.5540365775739011, "grad_norm": 7.59375, "learning_rate": 1.2987832028330849e-06, "loss": 1.03541327, "memory(GiB)": 141.16, "step": 138940, "train_speed(iter/s)": 0.289074 }, { "acc": 0.72960424, "epoch": 1.5542602765198597, "grad_norm": 7.15625, "learning_rate": 1.29754001825396e-06, "loss": 1.08877764, "memory(GiB)": 141.16, "step": 138960, "train_speed(iter/s)": 0.289087 }, { "acc": 0.72797661, "epoch": 1.5544839754658182, "grad_norm": 7.3125, "learning_rate": 1.2962973402242823e-06, "loss": 1.09701843, "memory(GiB)": 141.16, "step": 138980, "train_speed(iter/s)": 0.289101 }, { "acc": 0.73576269, "epoch": 1.5547076744117767, "grad_norm": 7.125, "learning_rate": 1.2950551689140651e-06, "loss": 1.05292072, "memory(GiB)": 141.16, "step": 139000, "train_speed(iter/s)": 0.289115 }, { "acc": 0.7361237, "epoch": 1.5549313733577352, "grad_norm": 8.0625, "learning_rate": 1.293813504493258e-06, "loss": 1.04967203, "memory(GiB)": 141.16, "step": 139020, "train_speed(iter/s)": 0.28913 }, { "acc": 0.73852606, "epoch": 1.5551550723036938, "grad_norm": 7.59375, "learning_rate": 1.2925723471317374e-06, "loss": 1.04962759, "memory(GiB)": 141.16, "step": 139040, "train_speed(iter/s)": 0.289143 }, { "acc": 0.72455745, "epoch": 1.5553787712496523, "grad_norm": 6.09375, "learning_rate": 1.2913316969993096e-06, "loss": 1.11036434, "memory(GiB)": 141.16, "step": 139060, "train_speed(iter/s)": 0.289157 }, { "acc": 0.73235588, "epoch": 1.5556024701956108, "grad_norm": 6.21875, "learning_rate": 1.2900915542657155e-06, "loss": 1.06265306, "memory(GiB)": 141.16, "step": 139080, "train_speed(iter/s)": 0.28917 }, { "acc": 0.73684916, "epoch": 1.5558261691415693, "grad_norm": 6.28125, "learning_rate": 1.2888519191006227e-06, "loss": 1.0684269, "memory(GiB)": 141.16, "step": 139100, "train_speed(iter/s)": 0.289183 }, { "acc": 0.73273792, "epoch": 1.5560498680875279, "grad_norm": 7.25, "learning_rate": 1.2876127916736335e-06, "loss": 1.0667223, "memory(GiB)": 141.16, "step": 139120, "train_speed(iter/s)": 0.289197 }, { "acc": 0.72237844, "epoch": 1.5562735670334864, "grad_norm": 6.90625, "learning_rate": 1.2863741721542767e-06, "loss": 1.12057095, "memory(GiB)": 141.16, "step": 139140, "train_speed(iter/s)": 0.289208 }, { "acc": 0.74178553, "epoch": 1.556497265979445, "grad_norm": 7.90625, "learning_rate": 1.2851360607120112e-06, "loss": 1.04355221, "memory(GiB)": 141.16, "step": 139160, "train_speed(iter/s)": 0.289224 }, { "acc": 0.72813706, "epoch": 1.5567209649254035, "grad_norm": 6.40625, "learning_rate": 1.2838984575162316e-06, "loss": 1.10050049, "memory(GiB)": 141.16, "step": 139180, "train_speed(iter/s)": 0.289237 }, { "acc": 0.73762026, "epoch": 1.556944663871362, "grad_norm": 7.75, "learning_rate": 1.2826613627362572e-06, "loss": 1.04579506, "memory(GiB)": 141.16, "step": 139200, "train_speed(iter/s)": 0.289251 }, { "acc": 0.73052821, "epoch": 1.5571683628173205, "grad_norm": 6.78125, "learning_rate": 1.281424776541343e-06, "loss": 1.0918272, "memory(GiB)": 141.16, "step": 139220, "train_speed(iter/s)": 0.289264 }, { "acc": 0.73997841, "epoch": 1.557392061763279, "grad_norm": 7.6875, "learning_rate": 1.2801886991006695e-06, "loss": 1.04692774, "memory(GiB)": 141.16, "step": 139240, "train_speed(iter/s)": 0.289278 }, { "acc": 0.73099389, "epoch": 1.5576157607092376, "grad_norm": 6.875, "learning_rate": 1.2789531305833497e-06, "loss": 1.0815527, "memory(GiB)": 141.16, "step": 139260, "train_speed(iter/s)": 0.289291 }, { "acc": 0.735221, "epoch": 1.557839459655196, "grad_norm": 8.1875, "learning_rate": 1.2777180711584287e-06, "loss": 1.05317268, "memory(GiB)": 141.16, "step": 139280, "train_speed(iter/s)": 0.289305 }, { "acc": 0.74270787, "epoch": 1.5580631586011546, "grad_norm": 8.9375, "learning_rate": 1.2764835209948772e-06, "loss": 1.01992664, "memory(GiB)": 141.16, "step": 139300, "train_speed(iter/s)": 0.289319 }, { "acc": 0.73263822, "epoch": 1.5582868575471132, "grad_norm": 6.0, "learning_rate": 1.2752494802616034e-06, "loss": 1.06108866, "memory(GiB)": 141.16, "step": 139320, "train_speed(iter/s)": 0.289332 }, { "acc": 0.73786135, "epoch": 1.5585105564930717, "grad_norm": 9.5625, "learning_rate": 1.2740159491274394e-06, "loss": 1.06185551, "memory(GiB)": 141.16, "step": 139340, "train_speed(iter/s)": 0.289346 }, { "acc": 0.73382292, "epoch": 1.5587342554390302, "grad_norm": 6.3125, "learning_rate": 1.2727829277611492e-06, "loss": 1.06228065, "memory(GiB)": 141.16, "step": 139360, "train_speed(iter/s)": 0.289361 }, { "acc": 0.74711037, "epoch": 1.5589579543849887, "grad_norm": 7.84375, "learning_rate": 1.2715504163314295e-06, "loss": 0.99291191, "memory(GiB)": 141.16, "step": 139380, "train_speed(iter/s)": 0.289375 }, { "acc": 0.72338333, "epoch": 1.5591816533309473, "grad_norm": 6.84375, "learning_rate": 1.2703184150069037e-06, "loss": 1.11517782, "memory(GiB)": 141.16, "step": 139400, "train_speed(iter/s)": 0.289389 }, { "acc": 0.730093, "epoch": 1.5594053522769058, "grad_norm": 8.5, "learning_rate": 1.2690869239561293e-06, "loss": 1.09073582, "memory(GiB)": 141.16, "step": 139420, "train_speed(iter/s)": 0.289403 }, { "acc": 0.72575121, "epoch": 1.5596290512228643, "grad_norm": 6.65625, "learning_rate": 1.2678559433475911e-06, "loss": 1.11432247, "memory(GiB)": 141.16, "step": 139440, "train_speed(iter/s)": 0.289417 }, { "acc": 0.72855191, "epoch": 1.5598527501688229, "grad_norm": 8.625, "learning_rate": 1.266625473349703e-06, "loss": 1.09716549, "memory(GiB)": 141.16, "step": 139460, "train_speed(iter/s)": 0.289431 }, { "acc": 0.7258791, "epoch": 1.5600764491147814, "grad_norm": 5.96875, "learning_rate": 1.2653955141308132e-06, "loss": 1.1034421, "memory(GiB)": 141.16, "step": 139480, "train_speed(iter/s)": 0.289446 }, { "acc": 0.74340224, "epoch": 1.56030014806074, "grad_norm": 7.8125, "learning_rate": 1.2641660658591959e-06, "loss": 1.02692585, "memory(GiB)": 141.16, "step": 139500, "train_speed(iter/s)": 0.289458 }, { "acc": 0.73436046, "epoch": 1.5605238470066984, "grad_norm": 5.96875, "learning_rate": 1.2629371287030596e-06, "loss": 1.07965355, "memory(GiB)": 141.16, "step": 139520, "train_speed(iter/s)": 0.289472 }, { "acc": 0.74401169, "epoch": 1.560747545952657, "grad_norm": 7.84375, "learning_rate": 1.2617087028305392e-06, "loss": 1.02358189, "memory(GiB)": 141.16, "step": 139540, "train_speed(iter/s)": 0.289485 }, { "acc": 0.72427616, "epoch": 1.5609712448986155, "grad_norm": 7.71875, "learning_rate": 1.2604807884096986e-06, "loss": 1.11463661, "memory(GiB)": 141.16, "step": 139560, "train_speed(iter/s)": 0.289499 }, { "acc": 0.7283535, "epoch": 1.561194943844574, "grad_norm": 7.96875, "learning_rate": 1.259253385608538e-06, "loss": 1.08811398, "memory(GiB)": 141.16, "step": 139580, "train_speed(iter/s)": 0.289514 }, { "acc": 0.74180927, "epoch": 1.5614186427905326, "grad_norm": 6.46875, "learning_rate": 1.2580264945949805e-06, "loss": 1.02799778, "memory(GiB)": 141.16, "step": 139600, "train_speed(iter/s)": 0.289528 }, { "acc": 0.72902384, "epoch": 1.561642341736491, "grad_norm": 7.4375, "learning_rate": 1.2568001155368853e-06, "loss": 1.09956741, "memory(GiB)": 141.16, "step": 139620, "train_speed(iter/s)": 0.289542 }, { "acc": 0.7386776, "epoch": 1.5618660406824496, "grad_norm": 6.84375, "learning_rate": 1.2555742486020368e-06, "loss": 1.04943905, "memory(GiB)": 141.16, "step": 139640, "train_speed(iter/s)": 0.289557 }, { "acc": 0.73059864, "epoch": 1.5620897396284081, "grad_norm": 7.21875, "learning_rate": 1.25434889395815e-06, "loss": 1.09369678, "memory(GiB)": 141.16, "step": 139660, "train_speed(iter/s)": 0.289571 }, { "acc": 0.72605934, "epoch": 1.5623134385743667, "grad_norm": 5.40625, "learning_rate": 1.2531240517728731e-06, "loss": 1.09882278, "memory(GiB)": 141.16, "step": 139680, "train_speed(iter/s)": 0.289586 }, { "acc": 0.73829441, "epoch": 1.5625371375203252, "grad_norm": 8.25, "learning_rate": 1.2518997222137802e-06, "loss": 1.04328556, "memory(GiB)": 141.16, "step": 139700, "train_speed(iter/s)": 0.2896 }, { "acc": 0.73608766, "epoch": 1.5627608364662837, "grad_norm": 6.25, "learning_rate": 1.2506759054483802e-06, "loss": 1.0631382, "memory(GiB)": 141.16, "step": 139720, "train_speed(iter/s)": 0.289614 }, { "acc": 0.72995958, "epoch": 1.5629845354122422, "grad_norm": 6.09375, "learning_rate": 1.2494526016441044e-06, "loss": 1.07874928, "memory(GiB)": 141.16, "step": 139740, "train_speed(iter/s)": 0.289628 }, { "acc": 0.73825569, "epoch": 1.5632082343582008, "grad_norm": 7.40625, "learning_rate": 1.2482298109683216e-06, "loss": 1.04122877, "memory(GiB)": 141.16, "step": 139760, "train_speed(iter/s)": 0.289643 }, { "acc": 0.73650141, "epoch": 1.5634319333041593, "grad_norm": 6.4375, "learning_rate": 1.2470075335883258e-06, "loss": 1.05955305, "memory(GiB)": 141.16, "step": 139780, "train_speed(iter/s)": 0.289656 }, { "acc": 0.7281292, "epoch": 1.5636556322501178, "grad_norm": 8.375, "learning_rate": 1.2457857696713405e-06, "loss": 1.09906597, "memory(GiB)": 141.16, "step": 139800, "train_speed(iter/s)": 0.289669 }, { "acc": 0.73712826, "epoch": 1.5638793311960764, "grad_norm": 7.59375, "learning_rate": 1.2445645193845236e-06, "loss": 1.06150379, "memory(GiB)": 141.16, "step": 139820, "train_speed(iter/s)": 0.289684 }, { "acc": 0.73217711, "epoch": 1.564103030142035, "grad_norm": 8.9375, "learning_rate": 1.2433437828949562e-06, "loss": 1.09095812, "memory(GiB)": 141.16, "step": 139840, "train_speed(iter/s)": 0.289699 }, { "acc": 0.74131889, "epoch": 1.5643267290879934, "grad_norm": 7.28125, "learning_rate": 1.2421235603696558e-06, "loss": 1.02713833, "memory(GiB)": 141.16, "step": 139860, "train_speed(iter/s)": 0.289715 }, { "acc": 0.73405209, "epoch": 1.564550428033952, "grad_norm": 7.90625, "learning_rate": 1.240903851975565e-06, "loss": 1.05821409, "memory(GiB)": 141.16, "step": 139880, "train_speed(iter/s)": 0.289729 }, { "acc": 0.74547381, "epoch": 1.5647741269799105, "grad_norm": 7.125, "learning_rate": 1.239684657879555e-06, "loss": 1.00388689, "memory(GiB)": 141.16, "step": 139900, "train_speed(iter/s)": 0.289742 }, { "acc": 0.73749123, "epoch": 1.564997825925869, "grad_norm": 5.6875, "learning_rate": 1.2384659782484338e-06, "loss": 1.05275822, "memory(GiB)": 141.16, "step": 139920, "train_speed(iter/s)": 0.289754 }, { "acc": 0.7410924, "epoch": 1.5652215248718275, "grad_norm": 6.46875, "learning_rate": 1.2372478132489291e-06, "loss": 1.04670515, "memory(GiB)": 141.16, "step": 139940, "train_speed(iter/s)": 0.289769 }, { "acc": 0.7297123, "epoch": 1.565445223817786, "grad_norm": 5.8125, "learning_rate": 1.2360301630477074e-06, "loss": 1.08891945, "memory(GiB)": 141.16, "step": 139960, "train_speed(iter/s)": 0.289783 }, { "acc": 0.72405729, "epoch": 1.5656689227637446, "grad_norm": 6.84375, "learning_rate": 1.234813027811359e-06, "loss": 1.11402121, "memory(GiB)": 141.16, "step": 139980, "train_speed(iter/s)": 0.289798 }, { "acc": 0.73373518, "epoch": 1.5658926217097031, "grad_norm": 8.1875, "learning_rate": 1.2335964077064034e-06, "loss": 1.08412209, "memory(GiB)": 141.16, "step": 140000, "train_speed(iter/s)": 0.289811 }, { "epoch": 1.5658926217097031, "eval_acc": 0.690172248428324, "eval_loss": 1.0791089534759521, "eval_runtime": 2322.3321, "eval_samples_per_second": 32.417, "eval_steps_per_second": 16.209, "step": 140000 }, { "acc": 0.74390516, "epoch": 1.5661163206556616, "grad_norm": 6.6875, "learning_rate": 1.2323803028992953e-06, "loss": 1.02571888, "memory(GiB)": 141.16, "step": 140020, "train_speed(iter/s)": 0.288408 }, { "acc": 0.73821464, "epoch": 1.5663400196016202, "grad_norm": 8.1875, "learning_rate": 1.2311647135564119e-06, "loss": 1.0654665, "memory(GiB)": 141.16, "step": 140040, "train_speed(iter/s)": 0.288423 }, { "acc": 0.74756546, "epoch": 1.5665637185475787, "grad_norm": 7.8125, "learning_rate": 1.2299496398440669e-06, "loss": 1.00664358, "memory(GiB)": 141.16, "step": 140060, "train_speed(iter/s)": 0.288438 }, { "acc": 0.73949404, "epoch": 1.5667874174935372, "grad_norm": 5.9375, "learning_rate": 1.2287350819284966e-06, "loss": 1.03795261, "memory(GiB)": 141.16, "step": 140080, "train_speed(iter/s)": 0.288451 }, { "acc": 0.72749372, "epoch": 1.5670111164394958, "grad_norm": 5.9375, "learning_rate": 1.2275210399758703e-06, "loss": 1.10497799, "memory(GiB)": 141.16, "step": 140100, "train_speed(iter/s)": 0.288465 }, { "acc": 0.72732286, "epoch": 1.5672348153854543, "grad_norm": 8.0625, "learning_rate": 1.2263075141522878e-06, "loss": 1.09791069, "memory(GiB)": 141.16, "step": 140120, "train_speed(iter/s)": 0.288478 }, { "acc": 0.74634118, "epoch": 1.5674585143314128, "grad_norm": 8.6875, "learning_rate": 1.2250945046237744e-06, "loss": 1.02196083, "memory(GiB)": 141.16, "step": 140140, "train_speed(iter/s)": 0.288493 }, { "acc": 0.73362327, "epoch": 1.5676822132773713, "grad_norm": 5.75, "learning_rate": 1.2238820115562899e-06, "loss": 1.09039917, "memory(GiB)": 141.16, "step": 140160, "train_speed(iter/s)": 0.288507 }, { "acc": 0.73010082, "epoch": 1.5679059122233299, "grad_norm": 6.625, "learning_rate": 1.22267003511572e-06, "loss": 1.09134712, "memory(GiB)": 141.16, "step": 140180, "train_speed(iter/s)": 0.288521 }, { "acc": 0.72536035, "epoch": 1.5681296111692884, "grad_norm": 7.09375, "learning_rate": 1.2214585754678782e-06, "loss": 1.11195059, "memory(GiB)": 141.16, "step": 140200, "train_speed(iter/s)": 0.288534 }, { "acc": 0.72576985, "epoch": 1.568353310115247, "grad_norm": 8.0625, "learning_rate": 1.2202476327785118e-06, "loss": 1.11932621, "memory(GiB)": 141.16, "step": 140220, "train_speed(iter/s)": 0.288548 }, { "acc": 0.72222056, "epoch": 1.5685770090612055, "grad_norm": 8.1875, "learning_rate": 1.219037207213294e-06, "loss": 1.11351566, "memory(GiB)": 141.16, "step": 140240, "train_speed(iter/s)": 0.288563 }, { "acc": 0.7371294, "epoch": 1.568800708007164, "grad_norm": 7.25, "learning_rate": 1.2178272989378293e-06, "loss": 1.03705521, "memory(GiB)": 141.16, "step": 140260, "train_speed(iter/s)": 0.288576 }, { "acc": 0.726056, "epoch": 1.5690244069531225, "grad_norm": 4.84375, "learning_rate": 1.21661790811765e-06, "loss": 1.09968395, "memory(GiB)": 141.16, "step": 140280, "train_speed(iter/s)": 0.288589 }, { "acc": 0.74556713, "epoch": 1.569248105899081, "grad_norm": 9.4375, "learning_rate": 1.2154090349182163e-06, "loss": 1.01627789, "memory(GiB)": 141.16, "step": 140300, "train_speed(iter/s)": 0.288603 }, { "acc": 0.7345562, "epoch": 1.5694718048450396, "grad_norm": 7.90625, "learning_rate": 1.2142006795049227e-06, "loss": 1.07359123, "memory(GiB)": 141.16, "step": 140320, "train_speed(iter/s)": 0.288618 }, { "acc": 0.71894646, "epoch": 1.569695503790998, "grad_norm": 6.71875, "learning_rate": 1.212992842043086e-06, "loss": 1.13214722, "memory(GiB)": 141.16, "step": 140340, "train_speed(iter/s)": 0.288631 }, { "acc": 0.74560533, "epoch": 1.5699192027369566, "grad_norm": 6.65625, "learning_rate": 1.2117855226979585e-06, "loss": 1.02257366, "memory(GiB)": 141.16, "step": 140360, "train_speed(iter/s)": 0.288645 }, { "acc": 0.74301596, "epoch": 1.5701429016829151, "grad_norm": 7.34375, "learning_rate": 1.210578721634718e-06, "loss": 1.01929474, "memory(GiB)": 141.16, "step": 140380, "train_speed(iter/s)": 0.288656 }, { "acc": 0.73731489, "epoch": 1.5703666006288737, "grad_norm": 7.0, "learning_rate": 1.2093724390184703e-06, "loss": 1.05051193, "memory(GiB)": 141.16, "step": 140400, "train_speed(iter/s)": 0.28867 }, { "acc": 0.73607583, "epoch": 1.5705902995748322, "grad_norm": 7.0, "learning_rate": 1.2081666750142546e-06, "loss": 1.07072582, "memory(GiB)": 141.16, "step": 140420, "train_speed(iter/s)": 0.288683 }, { "acc": 0.73327179, "epoch": 1.5708139985207907, "grad_norm": 5.375, "learning_rate": 1.2069614297870342e-06, "loss": 1.07314606, "memory(GiB)": 141.16, "step": 140440, "train_speed(iter/s)": 0.288695 }, { "acc": 0.72653828, "epoch": 1.5710376974667493, "grad_norm": 5.625, "learning_rate": 1.2057567035017064e-06, "loss": 1.11742611, "memory(GiB)": 141.16, "step": 140460, "train_speed(iter/s)": 0.288709 }, { "acc": 0.7323297, "epoch": 1.5712613964127078, "grad_norm": 6.9375, "learning_rate": 1.2045524963230943e-06, "loss": 1.09189072, "memory(GiB)": 141.16, "step": 140480, "train_speed(iter/s)": 0.288724 }, { "acc": 0.74249935, "epoch": 1.5714850953586663, "grad_norm": 11.25, "learning_rate": 1.2033488084159484e-06, "loss": 1.05030031, "memory(GiB)": 141.16, "step": 140500, "train_speed(iter/s)": 0.288739 }, { "acc": 0.73865957, "epoch": 1.5717087943046248, "grad_norm": 7.875, "learning_rate": 1.2021456399449537e-06, "loss": 1.06376209, "memory(GiB)": 141.16, "step": 140520, "train_speed(iter/s)": 0.288752 }, { "acc": 0.71961918, "epoch": 1.5719324932505834, "grad_norm": 9.0, "learning_rate": 1.2009429910747178e-06, "loss": 1.12135553, "memory(GiB)": 141.16, "step": 140540, "train_speed(iter/s)": 0.288766 }, { "acc": 0.74937429, "epoch": 1.572156192196542, "grad_norm": 6.875, "learning_rate": 1.199740861969783e-06, "loss": 1.01181908, "memory(GiB)": 141.16, "step": 140560, "train_speed(iter/s)": 0.28878 }, { "acc": 0.73486414, "epoch": 1.5723798911425004, "grad_norm": 7.5, "learning_rate": 1.1985392527946172e-06, "loss": 1.06671371, "memory(GiB)": 141.16, "step": 140580, "train_speed(iter/s)": 0.288794 }, { "acc": 0.73629332, "epoch": 1.572603590088459, "grad_norm": 9.125, "learning_rate": 1.197338163713615e-06, "loss": 1.05779362, "memory(GiB)": 141.16, "step": 140600, "train_speed(iter/s)": 0.288806 }, { "acc": 0.74005961, "epoch": 1.5728272890344175, "grad_norm": 5.40625, "learning_rate": 1.1961375948911058e-06, "loss": 1.03931322, "memory(GiB)": 141.16, "step": 140620, "train_speed(iter/s)": 0.288821 }, { "acc": 0.72232947, "epoch": 1.573050987980376, "grad_norm": 5.71875, "learning_rate": 1.1949375464913427e-06, "loss": 1.10934162, "memory(GiB)": 141.16, "step": 140640, "train_speed(iter/s)": 0.288837 }, { "acc": 0.73082018, "epoch": 1.5732746869263345, "grad_norm": 7.1875, "learning_rate": 1.1937380186785108e-06, "loss": 1.0805603, "memory(GiB)": 141.16, "step": 140660, "train_speed(iter/s)": 0.288852 }, { "acc": 0.7414053, "epoch": 1.573498385872293, "grad_norm": 6.21875, "learning_rate": 1.1925390116167223e-06, "loss": 1.04366417, "memory(GiB)": 141.16, "step": 140680, "train_speed(iter/s)": 0.288867 }, { "acc": 0.73583994, "epoch": 1.5737220848182516, "grad_norm": 5.96875, "learning_rate": 1.1913405254700168e-06, "loss": 1.05755539, "memory(GiB)": 141.16, "step": 140700, "train_speed(iter/s)": 0.288878 }, { "acc": 0.73308716, "epoch": 1.5739457837642101, "grad_norm": 7.28125, "learning_rate": 1.190142560402367e-06, "loss": 1.08768978, "memory(GiB)": 141.16, "step": 140720, "train_speed(iter/s)": 0.288891 }, { "acc": 0.74222021, "epoch": 1.5741694827101687, "grad_norm": 8.4375, "learning_rate": 1.1889451165776688e-06, "loss": 1.02977629, "memory(GiB)": 141.16, "step": 140740, "train_speed(iter/s)": 0.288905 }, { "acc": 0.72917995, "epoch": 1.5743931816561272, "grad_norm": 7.3125, "learning_rate": 1.1877481941597523e-06, "loss": 1.07313194, "memory(GiB)": 141.16, "step": 140760, "train_speed(iter/s)": 0.288919 }, { "acc": 0.74543843, "epoch": 1.5746168806020857, "grad_norm": 7.3125, "learning_rate": 1.1865517933123732e-06, "loss": 1.00818672, "memory(GiB)": 141.16, "step": 140780, "train_speed(iter/s)": 0.288933 }, { "acc": 0.73176217, "epoch": 1.5748405795480442, "grad_norm": 6.90625, "learning_rate": 1.1853559141992138e-06, "loss": 1.08386583, "memory(GiB)": 141.16, "step": 140800, "train_speed(iter/s)": 0.288946 }, { "acc": 0.73098593, "epoch": 1.5750642784940028, "grad_norm": 7.1875, "learning_rate": 1.1841605569838905e-06, "loss": 1.08955412, "memory(GiB)": 141.16, "step": 140820, "train_speed(iter/s)": 0.28896 }, { "acc": 0.71983314, "epoch": 1.5752879774399613, "grad_norm": 7.78125, "learning_rate": 1.1829657218299428e-06, "loss": 1.13079424, "memory(GiB)": 141.16, "step": 140840, "train_speed(iter/s)": 0.288974 }, { "acc": 0.74521408, "epoch": 1.5755116763859198, "grad_norm": 7.71875, "learning_rate": 1.1817714089008436e-06, "loss": 1.03158598, "memory(GiB)": 141.16, "step": 140860, "train_speed(iter/s)": 0.288988 }, { "acc": 0.72515583, "epoch": 1.5757353753318784, "grad_norm": 6.5625, "learning_rate": 1.1805776183599904e-06, "loss": 1.09209194, "memory(GiB)": 141.16, "step": 140880, "train_speed(iter/s)": 0.289003 }, { "acc": 0.73764668, "epoch": 1.5759590742778369, "grad_norm": 6.59375, "learning_rate": 1.1793843503707115e-06, "loss": 1.04669132, "memory(GiB)": 141.16, "step": 140900, "train_speed(iter/s)": 0.289015 }, { "acc": 0.7333149, "epoch": 1.5761827732237954, "grad_norm": 6.625, "learning_rate": 1.178191605096261e-06, "loss": 1.07679081, "memory(GiB)": 141.16, "step": 140920, "train_speed(iter/s)": 0.289028 }, { "acc": 0.73827829, "epoch": 1.576406472169754, "grad_norm": 7.09375, "learning_rate": 1.1769993826998267e-06, "loss": 1.05012217, "memory(GiB)": 141.16, "step": 140940, "train_speed(iter/s)": 0.289041 }, { "acc": 0.7359602, "epoch": 1.5766301711157125, "grad_norm": 7.46875, "learning_rate": 1.1758076833445203e-06, "loss": 1.06316071, "memory(GiB)": 141.16, "step": 140960, "train_speed(iter/s)": 0.289054 }, { "acc": 0.7319644, "epoch": 1.576853870061671, "grad_norm": 7.9375, "learning_rate": 1.1746165071933812e-06, "loss": 1.04344921, "memory(GiB)": 141.16, "step": 140980, "train_speed(iter/s)": 0.289069 }, { "acc": 0.73987341, "epoch": 1.5770775690076295, "grad_norm": 9.25, "learning_rate": 1.173425854409383e-06, "loss": 1.0498642, "memory(GiB)": 141.16, "step": 141000, "train_speed(iter/s)": 0.289082 }, { "acc": 0.72977762, "epoch": 1.577301267953588, "grad_norm": 6.9375, "learning_rate": 1.172235725155421e-06, "loss": 1.08068609, "memory(GiB)": 141.16, "step": 141020, "train_speed(iter/s)": 0.289095 }, { "acc": 0.73752451, "epoch": 1.5775249668995466, "grad_norm": 8.1875, "learning_rate": 1.1710461195943245e-06, "loss": 1.05526056, "memory(GiB)": 141.16, "step": 141040, "train_speed(iter/s)": 0.289109 }, { "acc": 0.72577152, "epoch": 1.577748665845505, "grad_norm": 7.28125, "learning_rate": 1.1698570378888469e-06, "loss": 1.10859413, "memory(GiB)": 141.16, "step": 141060, "train_speed(iter/s)": 0.289122 }, { "acc": 0.72347174, "epoch": 1.5779723647914636, "grad_norm": 7.3125, "learning_rate": 1.1686684802016706e-06, "loss": 1.1148448, "memory(GiB)": 141.16, "step": 141080, "train_speed(iter/s)": 0.289137 }, { "acc": 0.72865467, "epoch": 1.5781960637374222, "grad_norm": 6.34375, "learning_rate": 1.1674804466954099e-06, "loss": 1.09176645, "memory(GiB)": 141.16, "step": 141100, "train_speed(iter/s)": 0.28915 }, { "acc": 0.74792728, "epoch": 1.5784197626833807, "grad_norm": 6.375, "learning_rate": 1.166292937532602e-06, "loss": 1.01605701, "memory(GiB)": 141.16, "step": 141120, "train_speed(iter/s)": 0.289165 }, { "acc": 0.73011408, "epoch": 1.5786434616293392, "grad_norm": 8.25, "learning_rate": 1.1651059528757186e-06, "loss": 1.10194073, "memory(GiB)": 141.16, "step": 141140, "train_speed(iter/s)": 0.289178 }, { "acc": 0.73355999, "epoch": 1.5788671605752977, "grad_norm": 7.5625, "learning_rate": 1.1639194928871533e-06, "loss": 1.06368465, "memory(GiB)": 141.16, "step": 141160, "train_speed(iter/s)": 0.289192 }, { "acc": 0.73491464, "epoch": 1.5790908595212563, "grad_norm": 6.96875, "learning_rate": 1.1627335577292303e-06, "loss": 1.06418142, "memory(GiB)": 141.16, "step": 141180, "train_speed(iter/s)": 0.289205 }, { "acc": 0.7205802, "epoch": 1.5793145584672148, "grad_norm": 7.71875, "learning_rate": 1.1615481475642053e-06, "loss": 1.11353855, "memory(GiB)": 141.16, "step": 141200, "train_speed(iter/s)": 0.289219 }, { "acc": 0.73519764, "epoch": 1.5795382574131733, "grad_norm": 7.9375, "learning_rate": 1.1603632625542565e-06, "loss": 1.0595499, "memory(GiB)": 141.16, "step": 141220, "train_speed(iter/s)": 0.289233 }, { "acc": 0.73678007, "epoch": 1.5797619563591319, "grad_norm": 6.5, "learning_rate": 1.1591789028614964e-06, "loss": 1.05341167, "memory(GiB)": 141.16, "step": 141240, "train_speed(iter/s)": 0.289249 }, { "acc": 0.72961602, "epoch": 1.5799856553050904, "grad_norm": 6.78125, "learning_rate": 1.1579950686479595e-06, "loss": 1.07881823, "memory(GiB)": 141.16, "step": 141260, "train_speed(iter/s)": 0.289264 }, { "acc": 0.73574591, "epoch": 1.580209354251049, "grad_norm": 5.875, "learning_rate": 1.1568117600756112e-06, "loss": 1.06973753, "memory(GiB)": 141.16, "step": 141280, "train_speed(iter/s)": 0.289279 }, { "acc": 0.72995024, "epoch": 1.5804330531970074, "grad_norm": 6.78125, "learning_rate": 1.1556289773063468e-06, "loss": 1.09559917, "memory(GiB)": 141.16, "step": 141300, "train_speed(iter/s)": 0.289293 }, { "acc": 0.72948499, "epoch": 1.580656752142966, "grad_norm": 6.84375, "learning_rate": 1.154446720501986e-06, "loss": 1.07562332, "memory(GiB)": 141.16, "step": 141320, "train_speed(iter/s)": 0.289307 }, { "acc": 0.72611151, "epoch": 1.5808804510889245, "grad_norm": 7.96875, "learning_rate": 1.153264989824281e-06, "loss": 1.10733194, "memory(GiB)": 141.16, "step": 141340, "train_speed(iter/s)": 0.289321 }, { "acc": 0.73163157, "epoch": 1.581104150034883, "grad_norm": 8.25, "learning_rate": 1.1520837854349077e-06, "loss": 1.06857338, "memory(GiB)": 141.16, "step": 141360, "train_speed(iter/s)": 0.289336 }, { "acc": 0.71550112, "epoch": 1.5813278489808416, "grad_norm": 7.34375, "learning_rate": 1.1509031074954707e-06, "loss": 1.14964352, "memory(GiB)": 141.16, "step": 141380, "train_speed(iter/s)": 0.289347 }, { "acc": 0.74342337, "epoch": 1.5815515479268, "grad_norm": 6.0625, "learning_rate": 1.149722956167506e-06, "loss": 1.03365345, "memory(GiB)": 141.16, "step": 141400, "train_speed(iter/s)": 0.289361 }, { "acc": 0.74094305, "epoch": 1.5817752468727586, "grad_norm": 5.90625, "learning_rate": 1.1485433316124728e-06, "loss": 1.045998, "memory(GiB)": 141.16, "step": 141420, "train_speed(iter/s)": 0.289373 }, { "acc": 0.73537359, "epoch": 1.5819989458187171, "grad_norm": 7.03125, "learning_rate": 1.1473642339917635e-06, "loss": 1.06561127, "memory(GiB)": 141.16, "step": 141440, "train_speed(iter/s)": 0.289386 }, { "acc": 0.7233161, "epoch": 1.5822226447646757, "grad_norm": 6.15625, "learning_rate": 1.1461856634666935e-06, "loss": 1.1137784, "memory(GiB)": 141.16, "step": 141460, "train_speed(iter/s)": 0.289401 }, { "acc": 0.735639, "epoch": 1.5824463437106342, "grad_norm": 6.71875, "learning_rate": 1.1450076201985072e-06, "loss": 1.04854813, "memory(GiB)": 141.16, "step": 141480, "train_speed(iter/s)": 0.289414 }, { "acc": 0.73823156, "epoch": 1.5826700426565927, "grad_norm": 6.25, "learning_rate": 1.14383010434838e-06, "loss": 1.04453936, "memory(GiB)": 141.16, "step": 141500, "train_speed(iter/s)": 0.289429 }, { "acc": 0.74407501, "epoch": 1.5828937416025513, "grad_norm": 6.375, "learning_rate": 1.1426531160774106e-06, "loss": 1.0323451, "memory(GiB)": 141.16, "step": 141520, "train_speed(iter/s)": 0.289441 }, { "acc": 0.73507662, "epoch": 1.5831174405485098, "grad_norm": 6.3125, "learning_rate": 1.1414766555466311e-06, "loss": 1.05399122, "memory(GiB)": 141.16, "step": 141540, "train_speed(iter/s)": 0.289455 }, { "acc": 0.73441625, "epoch": 1.5833411394944683, "grad_norm": 12.3125, "learning_rate": 1.1403007229169955e-06, "loss": 1.06443176, "memory(GiB)": 141.16, "step": 141560, "train_speed(iter/s)": 0.289468 }, { "acc": 0.7295373, "epoch": 1.5835648384404268, "grad_norm": 7.03125, "learning_rate": 1.1391253183493877e-06, "loss": 1.09086723, "memory(GiB)": 141.16, "step": 141580, "train_speed(iter/s)": 0.289482 }, { "acc": 0.73255119, "epoch": 1.5837885373863854, "grad_norm": 5.9375, "learning_rate": 1.1379504420046222e-06, "loss": 1.08684559, "memory(GiB)": 141.16, "step": 141600, "train_speed(iter/s)": 0.289495 }, { "acc": 0.72844172, "epoch": 1.584012236332344, "grad_norm": 6.5, "learning_rate": 1.1367760940434364e-06, "loss": 1.10433426, "memory(GiB)": 141.16, "step": 141620, "train_speed(iter/s)": 0.289506 }, { "acc": 0.73803453, "epoch": 1.5842359352783024, "grad_norm": 7.125, "learning_rate": 1.1356022746265005e-06, "loss": 1.06114216, "memory(GiB)": 141.16, "step": 141640, "train_speed(iter/s)": 0.28952 }, { "acc": 0.73873363, "epoch": 1.584459634224261, "grad_norm": 6.21875, "learning_rate": 1.1344289839144084e-06, "loss": 1.0585762, "memory(GiB)": 141.16, "step": 141660, "train_speed(iter/s)": 0.289534 }, { "acc": 0.73470397, "epoch": 1.5846833331702195, "grad_norm": 7.5, "learning_rate": 1.1332562220676818e-06, "loss": 1.06116848, "memory(GiB)": 141.16, "step": 141680, "train_speed(iter/s)": 0.289549 }, { "acc": 0.72863426, "epoch": 1.584907032116178, "grad_norm": 6.40625, "learning_rate": 1.132083989246774e-06, "loss": 1.0920536, "memory(GiB)": 141.16, "step": 141700, "train_speed(iter/s)": 0.289562 }, { "acc": 0.72228899, "epoch": 1.5851307310621365, "grad_norm": 7.8125, "learning_rate": 1.1309122856120597e-06, "loss": 1.12136307, "memory(GiB)": 141.16, "step": 141720, "train_speed(iter/s)": 0.289576 }, { "acc": 0.73337288, "epoch": 1.585354430008095, "grad_norm": 7.625, "learning_rate": 1.1297411113238488e-06, "loss": 1.06461048, "memory(GiB)": 141.16, "step": 141740, "train_speed(iter/s)": 0.28959 }, { "acc": 0.73422241, "epoch": 1.5855781289540536, "grad_norm": 10.8125, "learning_rate": 1.1285704665423718e-06, "loss": 1.07903824, "memory(GiB)": 141.16, "step": 141760, "train_speed(iter/s)": 0.289604 }, { "acc": 0.72893353, "epoch": 1.5858018279000121, "grad_norm": 9.3125, "learning_rate": 1.1274003514277899e-06, "loss": 1.09348755, "memory(GiB)": 141.16, "step": 141780, "train_speed(iter/s)": 0.289618 }, { "acc": 0.71622567, "epoch": 1.5860255268459706, "grad_norm": 7.875, "learning_rate": 1.1262307661401934e-06, "loss": 1.14350815, "memory(GiB)": 141.16, "step": 141800, "train_speed(iter/s)": 0.289632 }, { "acc": 0.72667308, "epoch": 1.5862492257919292, "grad_norm": 6.0625, "learning_rate": 1.125061710839595e-06, "loss": 1.10247593, "memory(GiB)": 141.16, "step": 141820, "train_speed(iter/s)": 0.289645 }, { "acc": 0.72323322, "epoch": 1.5864729247378877, "grad_norm": 8.4375, "learning_rate": 1.123893185685942e-06, "loss": 1.12582483, "memory(GiB)": 141.16, "step": 141840, "train_speed(iter/s)": 0.289659 }, { "acc": 0.73124008, "epoch": 1.5866966236838462, "grad_norm": 8.75, "learning_rate": 1.1227251908391034e-06, "loss": 1.07957001, "memory(GiB)": 141.16, "step": 141860, "train_speed(iter/s)": 0.289671 }, { "acc": 0.73306522, "epoch": 1.5869203226298048, "grad_norm": 7.46875, "learning_rate": 1.1215577264588767e-06, "loss": 1.070117, "memory(GiB)": 141.16, "step": 141880, "train_speed(iter/s)": 0.289685 }, { "acc": 0.73746653, "epoch": 1.5871440215757633, "grad_norm": 6.90625, "learning_rate": 1.1203907927049901e-06, "loss": 1.05452328, "memory(GiB)": 141.16, "step": 141900, "train_speed(iter/s)": 0.289698 }, { "acc": 0.73994179, "epoch": 1.5873677205217218, "grad_norm": 6.34375, "learning_rate": 1.1192243897370937e-06, "loss": 1.04583569, "memory(GiB)": 141.16, "step": 141920, "train_speed(iter/s)": 0.289712 }, { "acc": 0.73193445, "epoch": 1.5875914194676803, "grad_norm": 7.1875, "learning_rate": 1.1180585177147712e-06, "loss": 1.05711918, "memory(GiB)": 141.16, "step": 141940, "train_speed(iter/s)": 0.289725 }, { "acc": 0.74521265, "epoch": 1.5878151184136389, "grad_norm": 7.46875, "learning_rate": 1.1168931767975295e-06, "loss": 1.01017513, "memory(GiB)": 141.16, "step": 141960, "train_speed(iter/s)": 0.28974 }, { "acc": 0.72522035, "epoch": 1.5880388173595974, "grad_norm": 8.875, "learning_rate": 1.1157283671448022e-06, "loss": 1.10592384, "memory(GiB)": 141.16, "step": 141980, "train_speed(iter/s)": 0.289752 }, { "acc": 0.73453045, "epoch": 1.588262516305556, "grad_norm": 6.59375, "learning_rate": 1.1145640889159548e-06, "loss": 1.06333256, "memory(GiB)": 141.16, "step": 142000, "train_speed(iter/s)": 0.289765 }, { "epoch": 1.588262516305556, "eval_acc": 0.690173332954735, "eval_loss": 1.0791003704071045, "eval_runtime": 2329.5116, "eval_samples_per_second": 32.317, "eval_steps_per_second": 16.159, "step": 142000 }, { "acc": 0.75420084, "epoch": 1.5884862152515145, "grad_norm": 8.125, "learning_rate": 1.1134003422702738e-06, "loss": 0.95711641, "memory(GiB)": 141.16, "step": 142020, "train_speed(iter/s)": 0.288378 }, { "acc": 0.73752155, "epoch": 1.588709914197473, "grad_norm": 6.25, "learning_rate": 1.1122371273669802e-06, "loss": 1.0402689, "memory(GiB)": 141.16, "step": 142040, "train_speed(iter/s)": 0.288389 }, { "acc": 0.736092, "epoch": 1.5889336131434315, "grad_norm": 6.15625, "learning_rate": 1.1110744443652161e-06, "loss": 1.054564, "memory(GiB)": 141.16, "step": 142060, "train_speed(iter/s)": 0.288403 }, { "acc": 0.73559194, "epoch": 1.58915731208939, "grad_norm": 7.4375, "learning_rate": 1.109912293424054e-06, "loss": 1.06565704, "memory(GiB)": 141.16, "step": 142080, "train_speed(iter/s)": 0.288416 }, { "acc": 0.73947635, "epoch": 1.5893810110353486, "grad_norm": 9.5, "learning_rate": 1.1087506747024924e-06, "loss": 1.04564686, "memory(GiB)": 141.16, "step": 142100, "train_speed(iter/s)": 0.28843 }, { "acc": 0.73449202, "epoch": 1.589604709981307, "grad_norm": 7.8125, "learning_rate": 1.107589588359455e-06, "loss": 1.06789942, "memory(GiB)": 141.16, "step": 142120, "train_speed(iter/s)": 0.288443 }, { "acc": 0.72629638, "epoch": 1.5898284089272656, "grad_norm": 6.8125, "learning_rate": 1.1064290345537992e-06, "loss": 1.10261688, "memory(GiB)": 141.16, "step": 142140, "train_speed(iter/s)": 0.288456 }, { "acc": 0.73439403, "epoch": 1.5900521078732242, "grad_norm": 5.84375, "learning_rate": 1.1052690134443022e-06, "loss": 1.06254539, "memory(GiB)": 141.16, "step": 142160, "train_speed(iter/s)": 0.288469 }, { "acc": 0.72675285, "epoch": 1.5902758068191827, "grad_norm": 6.78125, "learning_rate": 1.1041095251896738e-06, "loss": 1.1062809, "memory(GiB)": 141.16, "step": 142180, "train_speed(iter/s)": 0.288481 }, { "acc": 0.74194384, "epoch": 1.5904995057651412, "grad_norm": 6.75, "learning_rate": 1.1029505699485482e-06, "loss": 1.03413734, "memory(GiB)": 141.16, "step": 142200, "train_speed(iter/s)": 0.288495 }, { "acc": 0.72879791, "epoch": 1.5907232047110997, "grad_norm": 6.28125, "learning_rate": 1.101792147879484e-06, "loss": 1.0931282, "memory(GiB)": 141.16, "step": 142220, "train_speed(iter/s)": 0.288509 }, { "acc": 0.73095942, "epoch": 1.5909469036570583, "grad_norm": 6.46875, "learning_rate": 1.1006342591409742e-06, "loss": 1.06999006, "memory(GiB)": 141.16, "step": 142240, "train_speed(iter/s)": 0.288522 }, { "acc": 0.72429705, "epoch": 1.5911706026030168, "grad_norm": 6.0625, "learning_rate": 1.0994769038914304e-06, "loss": 1.12379723, "memory(GiB)": 141.16, "step": 142260, "train_speed(iter/s)": 0.288534 }, { "acc": 0.73075795, "epoch": 1.5913943015489753, "grad_norm": 6.8125, "learning_rate": 1.0983200822891998e-06, "loss": 1.0908989, "memory(GiB)": 141.16, "step": 142280, "train_speed(iter/s)": 0.288547 }, { "acc": 0.73889098, "epoch": 1.5916180004949338, "grad_norm": 6.875, "learning_rate": 1.097163794492549e-06, "loss": 1.04127598, "memory(GiB)": 141.16, "step": 142300, "train_speed(iter/s)": 0.288561 }, { "acc": 0.735783, "epoch": 1.5918416994408924, "grad_norm": 7.96875, "learning_rate": 1.0960080406596747e-06, "loss": 1.06066608, "memory(GiB)": 141.16, "step": 142320, "train_speed(iter/s)": 0.288575 }, { "acc": 0.74331007, "epoch": 1.592065398386851, "grad_norm": 7.75, "learning_rate": 1.0948528209487026e-06, "loss": 1.03629627, "memory(GiB)": 141.16, "step": 142340, "train_speed(iter/s)": 0.288588 }, { "acc": 0.74717717, "epoch": 1.5922890973328094, "grad_norm": 5.40625, "learning_rate": 1.0936981355176802e-06, "loss": 1.00802555, "memory(GiB)": 141.16, "step": 142360, "train_speed(iter/s)": 0.288602 }, { "acc": 0.72090497, "epoch": 1.592512796278768, "grad_norm": 6.5, "learning_rate": 1.0925439845245883e-06, "loss": 1.11412334, "memory(GiB)": 141.16, "step": 142380, "train_speed(iter/s)": 0.288616 }, { "acc": 0.74676023, "epoch": 1.5927364952247265, "grad_norm": 5.65625, "learning_rate": 1.0913903681273303e-06, "loss": 0.98991241, "memory(GiB)": 141.16, "step": 142400, "train_speed(iter/s)": 0.28863 }, { "acc": 0.74427242, "epoch": 1.592960194170685, "grad_norm": 6.84375, "learning_rate": 1.0902372864837347e-06, "loss": 1.0026247, "memory(GiB)": 141.16, "step": 142420, "train_speed(iter/s)": 0.288638 }, { "acc": 0.74036045, "epoch": 1.5931838931166435, "grad_norm": 8.0, "learning_rate": 1.0890847397515635e-06, "loss": 1.05323734, "memory(GiB)": 141.16, "step": 142440, "train_speed(iter/s)": 0.28865 }, { "acc": 0.72893243, "epoch": 1.593407592062602, "grad_norm": 7.15625, "learning_rate": 1.0879327280884983e-06, "loss": 1.07632751, "memory(GiB)": 141.16, "step": 142460, "train_speed(iter/s)": 0.288662 }, { "acc": 0.7359561, "epoch": 1.5936312910085606, "grad_norm": 7.21875, "learning_rate": 1.0867812516521537e-06, "loss": 1.04682016, "memory(GiB)": 141.16, "step": 142480, "train_speed(iter/s)": 0.288676 }, { "acc": 0.73983974, "epoch": 1.5938549899545191, "grad_norm": 7.0625, "learning_rate": 1.0856303106000666e-06, "loss": 1.04607334, "memory(GiB)": 141.16, "step": 142500, "train_speed(iter/s)": 0.288691 }, { "acc": 0.73178167, "epoch": 1.5940786889004777, "grad_norm": 6.84375, "learning_rate": 1.0844799050897004e-06, "loss": 1.09030113, "memory(GiB)": 141.16, "step": 142520, "train_speed(iter/s)": 0.288707 }, { "acc": 0.73624954, "epoch": 1.5943023878464362, "grad_norm": 7.34375, "learning_rate": 1.083330035278451e-06, "loss": 1.05344, "memory(GiB)": 141.16, "step": 142540, "train_speed(iter/s)": 0.288719 }, { "acc": 0.72537103, "epoch": 1.5945260867923947, "grad_norm": 5.8125, "learning_rate": 1.082180701323633e-06, "loss": 1.11405764, "memory(GiB)": 141.16, "step": 142560, "train_speed(iter/s)": 0.288733 }, { "acc": 0.72713985, "epoch": 1.5947497857383532, "grad_norm": 7.28125, "learning_rate": 1.081031903382495e-06, "loss": 1.10685806, "memory(GiB)": 141.16, "step": 142580, "train_speed(iter/s)": 0.288746 }, { "acc": 0.73600688, "epoch": 1.5949734846843118, "grad_norm": 8.375, "learning_rate": 1.0798836416122078e-06, "loss": 1.05389795, "memory(GiB)": 141.16, "step": 142600, "train_speed(iter/s)": 0.288759 }, { "acc": 0.73270783, "epoch": 1.5951971836302703, "grad_norm": 7.53125, "learning_rate": 1.0787359161698684e-06, "loss": 1.07937431, "memory(GiB)": 141.16, "step": 142620, "train_speed(iter/s)": 0.288773 }, { "acc": 0.7360754, "epoch": 1.5954208825762288, "grad_norm": 7.78125, "learning_rate": 1.0775887272125046e-06, "loss": 1.06073608, "memory(GiB)": 141.16, "step": 142640, "train_speed(iter/s)": 0.288787 }, { "acc": 0.73174124, "epoch": 1.5956445815221874, "grad_norm": 6.375, "learning_rate": 1.076442074897066e-06, "loss": 1.06101398, "memory(GiB)": 141.16, "step": 142660, "train_speed(iter/s)": 0.288798 }, { "acc": 0.73291821, "epoch": 1.5958682804681459, "grad_norm": 7.78125, "learning_rate": 1.0752959593804336e-06, "loss": 1.09438229, "memory(GiB)": 141.16, "step": 142680, "train_speed(iter/s)": 0.288811 }, { "acc": 0.74003091, "epoch": 1.5960919794141044, "grad_norm": 7.875, "learning_rate": 1.0741503808194104e-06, "loss": 1.04259233, "memory(GiB)": 141.16, "step": 142700, "train_speed(iter/s)": 0.288826 }, { "acc": 0.74056149, "epoch": 1.596315678360063, "grad_norm": 8.25, "learning_rate": 1.0730053393707274e-06, "loss": 1.045578, "memory(GiB)": 141.16, "step": 142720, "train_speed(iter/s)": 0.288838 }, { "acc": 0.73900747, "epoch": 1.5965393773060215, "grad_norm": 7.84375, "learning_rate": 1.0718608351910453e-06, "loss": 1.04077616, "memory(GiB)": 141.16, "step": 142740, "train_speed(iter/s)": 0.288853 }, { "acc": 0.71815538, "epoch": 1.59676307625198, "grad_norm": 7.125, "learning_rate": 1.070716868436945e-06, "loss": 1.12415142, "memory(GiB)": 141.16, "step": 142760, "train_speed(iter/s)": 0.288866 }, { "acc": 0.72640715, "epoch": 1.5969867751979385, "grad_norm": 5.84375, "learning_rate": 1.0695734392649415e-06, "loss": 1.11415205, "memory(GiB)": 141.16, "step": 142780, "train_speed(iter/s)": 0.28888 }, { "acc": 0.73906541, "epoch": 1.597210474143897, "grad_norm": 8.0, "learning_rate": 1.0684305478314693e-06, "loss": 1.04932508, "memory(GiB)": 141.16, "step": 142800, "train_speed(iter/s)": 0.288893 }, { "acc": 0.7224195, "epoch": 1.5974341730898556, "grad_norm": 8.75, "learning_rate": 1.0672881942928926e-06, "loss": 1.11296349, "memory(GiB)": 141.16, "step": 142820, "train_speed(iter/s)": 0.288907 }, { "acc": 0.73652668, "epoch": 1.597657872035814, "grad_norm": 5.34375, "learning_rate": 1.0661463788055037e-06, "loss": 1.07591982, "memory(GiB)": 141.16, "step": 142840, "train_speed(iter/s)": 0.288921 }, { "acc": 0.74469357, "epoch": 1.5978815709817726, "grad_norm": 9.5, "learning_rate": 1.0650051015255163e-06, "loss": 1.02626143, "memory(GiB)": 141.16, "step": 142860, "train_speed(iter/s)": 0.288933 }, { "acc": 0.74563713, "epoch": 1.5981052699277312, "grad_norm": 7.1875, "learning_rate": 1.0638643626090766e-06, "loss": 1.02032394, "memory(GiB)": 141.16, "step": 142880, "train_speed(iter/s)": 0.288947 }, { "acc": 0.72115145, "epoch": 1.5983289688736897, "grad_norm": 7.25, "learning_rate": 1.0627241622122525e-06, "loss": 1.140695, "memory(GiB)": 141.16, "step": 142900, "train_speed(iter/s)": 0.288959 }, { "acc": 0.73584137, "epoch": 1.5985526678196482, "grad_norm": 7.5, "learning_rate": 1.061584500491038e-06, "loss": 1.06212521, "memory(GiB)": 141.16, "step": 142920, "train_speed(iter/s)": 0.288974 }, { "acc": 0.73252063, "epoch": 1.5987763667656067, "grad_norm": 7.21875, "learning_rate": 1.0604453776013585e-06, "loss": 1.06902752, "memory(GiB)": 141.16, "step": 142940, "train_speed(iter/s)": 0.288988 }, { "acc": 0.73598471, "epoch": 1.5990000657115653, "grad_norm": 8.875, "learning_rate": 1.0593067936990586e-06, "loss": 1.05986814, "memory(GiB)": 141.16, "step": 142960, "train_speed(iter/s)": 0.289002 }, { "acc": 0.74535327, "epoch": 1.5992237646575238, "grad_norm": 7.03125, "learning_rate": 1.0581687489399167e-06, "loss": 1.00802231, "memory(GiB)": 141.16, "step": 142980, "train_speed(iter/s)": 0.289016 }, { "acc": 0.74250245, "epoch": 1.5994474636034823, "grad_norm": 7.9375, "learning_rate": 1.0570312434796315e-06, "loss": 1.03822203, "memory(GiB)": 141.16, "step": 143000, "train_speed(iter/s)": 0.289029 }, { "acc": 0.72912273, "epoch": 1.5996711625494409, "grad_norm": 6.40625, "learning_rate": 1.055894277473829e-06, "loss": 1.09697876, "memory(GiB)": 141.16, "step": 143020, "train_speed(iter/s)": 0.289042 }, { "acc": 0.74006133, "epoch": 1.5998948614953994, "grad_norm": 6.28125, "learning_rate": 1.0547578510780648e-06, "loss": 1.02454948, "memory(GiB)": 141.16, "step": 143040, "train_speed(iter/s)": 0.289056 }, { "acc": 0.74139204, "epoch": 1.600118560441358, "grad_norm": 7.71875, "learning_rate": 1.0536219644478157e-06, "loss": 1.02681293, "memory(GiB)": 141.16, "step": 143060, "train_speed(iter/s)": 0.289069 }, { "acc": 0.72594414, "epoch": 1.6003422593873164, "grad_norm": 6.40625, "learning_rate": 1.0524866177384896e-06, "loss": 1.10877304, "memory(GiB)": 141.16, "step": 143080, "train_speed(iter/s)": 0.289083 }, { "acc": 0.72681246, "epoch": 1.600565958333275, "grad_norm": 6.6875, "learning_rate": 1.0513518111054177e-06, "loss": 1.10458899, "memory(GiB)": 141.16, "step": 143100, "train_speed(iter/s)": 0.289095 }, { "acc": 0.73371086, "epoch": 1.6007896572792335, "grad_norm": 7.5, "learning_rate": 1.050217544703856e-06, "loss": 1.07116613, "memory(GiB)": 141.16, "step": 143120, "train_speed(iter/s)": 0.289109 }, { "acc": 0.72693486, "epoch": 1.601013356225192, "grad_norm": 7.34375, "learning_rate": 1.0490838186889906e-06, "loss": 1.1000803, "memory(GiB)": 141.16, "step": 143140, "train_speed(iter/s)": 0.289121 }, { "acc": 0.74040828, "epoch": 1.6012370551711506, "grad_norm": 6.84375, "learning_rate": 1.047950633215929e-06, "loss": 1.0475071, "memory(GiB)": 141.16, "step": 143160, "train_speed(iter/s)": 0.289134 }, { "acc": 0.73622904, "epoch": 1.601460754117109, "grad_norm": 6.1875, "learning_rate": 1.04681798843971e-06, "loss": 1.06629982, "memory(GiB)": 141.16, "step": 143180, "train_speed(iter/s)": 0.289146 }, { "acc": 0.72780981, "epoch": 1.6016844530630676, "grad_norm": 7.9375, "learning_rate": 1.045685884515294e-06, "loss": 1.10403519, "memory(GiB)": 141.16, "step": 143200, "train_speed(iter/s)": 0.28916 }, { "acc": 0.72861691, "epoch": 1.6019081520090261, "grad_norm": 5.4375, "learning_rate": 1.0445543215975683e-06, "loss": 1.10853348, "memory(GiB)": 141.16, "step": 143220, "train_speed(iter/s)": 0.289174 }, { "acc": 0.72881474, "epoch": 1.6021318509549847, "grad_norm": 6.78125, "learning_rate": 1.043423299841349e-06, "loss": 1.09386826, "memory(GiB)": 141.16, "step": 143240, "train_speed(iter/s)": 0.289188 }, { "acc": 0.73078661, "epoch": 1.6023555499009432, "grad_norm": 7.21875, "learning_rate": 1.0422928194013732e-06, "loss": 1.07922916, "memory(GiB)": 141.16, "step": 143260, "train_speed(iter/s)": 0.289201 }, { "acc": 0.73211308, "epoch": 1.6025792488469017, "grad_norm": 5.28125, "learning_rate": 1.041162880432311e-06, "loss": 1.05946846, "memory(GiB)": 141.16, "step": 143280, "train_speed(iter/s)": 0.289214 }, { "acc": 0.73228502, "epoch": 1.6028029477928603, "grad_norm": 9.0, "learning_rate": 1.0400334830887494e-06, "loss": 1.06681805, "memory(GiB)": 141.16, "step": 143300, "train_speed(iter/s)": 0.289227 }, { "acc": 0.73745556, "epoch": 1.6030266467388188, "grad_norm": 6.09375, "learning_rate": 1.038904627525209e-06, "loss": 1.0646081, "memory(GiB)": 141.16, "step": 143320, "train_speed(iter/s)": 0.289241 }, { "acc": 0.72814741, "epoch": 1.6032503456847773, "grad_norm": 9.0625, "learning_rate": 1.0377763138961327e-06, "loss": 1.10846882, "memory(GiB)": 141.16, "step": 143340, "train_speed(iter/s)": 0.289255 }, { "acc": 0.72576919, "epoch": 1.6034740446307358, "grad_norm": 5.375, "learning_rate": 1.0366485423558886e-06, "loss": 1.12030296, "memory(GiB)": 141.16, "step": 143360, "train_speed(iter/s)": 0.289269 }, { "acc": 0.72164507, "epoch": 1.6036977435766944, "grad_norm": 6.6875, "learning_rate": 1.035521313058775e-06, "loss": 1.10892391, "memory(GiB)": 141.16, "step": 143380, "train_speed(iter/s)": 0.289282 }, { "acc": 0.72871304, "epoch": 1.603921442522653, "grad_norm": 7.6875, "learning_rate": 1.0343946261590099e-06, "loss": 1.11672096, "memory(GiB)": 141.16, "step": 143400, "train_speed(iter/s)": 0.289294 }, { "acc": 0.73503962, "epoch": 1.6041451414686114, "grad_norm": 7.6875, "learning_rate": 1.0332684818107425e-06, "loss": 1.05781307, "memory(GiB)": 141.16, "step": 143420, "train_speed(iter/s)": 0.289309 }, { "acc": 0.73617816, "epoch": 1.60436884041457, "grad_norm": 6.125, "learning_rate": 1.0321428801680445e-06, "loss": 1.05711603, "memory(GiB)": 141.16, "step": 143440, "train_speed(iter/s)": 0.289322 }, { "acc": 0.749681, "epoch": 1.6045925393605285, "grad_norm": 4.5625, "learning_rate": 1.0310178213849126e-06, "loss": 1.01549463, "memory(GiB)": 141.16, "step": 143460, "train_speed(iter/s)": 0.289333 }, { "acc": 0.74039783, "epoch": 1.604816238306487, "grad_norm": 6.65625, "learning_rate": 1.0298933056152744e-06, "loss": 1.042309, "memory(GiB)": 141.16, "step": 143480, "train_speed(iter/s)": 0.289347 }, { "acc": 0.74271631, "epoch": 1.6050399372524455, "grad_norm": 8.6875, "learning_rate": 1.0287693330129762e-06, "loss": 1.03293266, "memory(GiB)": 141.16, "step": 143500, "train_speed(iter/s)": 0.289361 }, { "acc": 0.73016729, "epoch": 1.605263636198404, "grad_norm": 7.34375, "learning_rate": 1.0276459037317972e-06, "loss": 1.06015873, "memory(GiB)": 141.16, "step": 143520, "train_speed(iter/s)": 0.289375 }, { "acc": 0.72325797, "epoch": 1.6054873351443626, "grad_norm": 6.59375, "learning_rate": 1.026523017925436e-06, "loss": 1.12511997, "memory(GiB)": 141.16, "step": 143540, "train_speed(iter/s)": 0.289389 }, { "acc": 0.74055381, "epoch": 1.6057110340903211, "grad_norm": 5.78125, "learning_rate": 1.0254006757475188e-06, "loss": 1.03705635, "memory(GiB)": 141.16, "step": 143560, "train_speed(iter/s)": 0.289402 }, { "acc": 0.72532129, "epoch": 1.6059347330362796, "grad_norm": 7.0625, "learning_rate": 1.0242788773516004e-06, "loss": 1.11552229, "memory(GiB)": 141.16, "step": 143580, "train_speed(iter/s)": 0.289416 }, { "acc": 0.74141836, "epoch": 1.6061584319822382, "grad_norm": 11.4375, "learning_rate": 1.0231576228911566e-06, "loss": 1.0408823, "memory(GiB)": 141.16, "step": 143600, "train_speed(iter/s)": 0.289429 }, { "acc": 0.74651766, "epoch": 1.6063821309281967, "grad_norm": 8.0, "learning_rate": 1.0220369125195933e-06, "loss": 0.99927158, "memory(GiB)": 141.16, "step": 143620, "train_speed(iter/s)": 0.289443 }, { "acc": 0.73286262, "epoch": 1.6066058298741552, "grad_norm": 5.96875, "learning_rate": 1.020916746390239e-06, "loss": 1.07302322, "memory(GiB)": 141.16, "step": 143640, "train_speed(iter/s)": 0.289456 }, { "acc": 0.72427063, "epoch": 1.6068295288201138, "grad_norm": 8.8125, "learning_rate": 1.0197971246563465e-06, "loss": 1.11767559, "memory(GiB)": 141.16, "step": 143660, "train_speed(iter/s)": 0.289471 }, { "acc": 0.73876276, "epoch": 1.6070532277660723, "grad_norm": 10.875, "learning_rate": 1.018678047471099e-06, "loss": 1.05302382, "memory(GiB)": 141.16, "step": 143680, "train_speed(iter/s)": 0.289486 }, { "acc": 0.73191605, "epoch": 1.6072769267120308, "grad_norm": 9.875, "learning_rate": 1.0175595149875988e-06, "loss": 1.07835636, "memory(GiB)": 141.16, "step": 143700, "train_speed(iter/s)": 0.289499 }, { "acc": 0.74236231, "epoch": 1.6075006256579893, "grad_norm": 10.25, "learning_rate": 1.0164415273588812e-06, "loss": 1.03092651, "memory(GiB)": 141.16, "step": 143720, "train_speed(iter/s)": 0.28951 }, { "acc": 0.73284445, "epoch": 1.6077243246039479, "grad_norm": 10.0625, "learning_rate": 1.0153240847379003e-06, "loss": 1.06547318, "memory(GiB)": 141.16, "step": 143740, "train_speed(iter/s)": 0.289524 }, { "acc": 0.74199114, "epoch": 1.6079480235499064, "grad_norm": 6.6875, "learning_rate": 1.0142071872775378e-06, "loss": 1.03469181, "memory(GiB)": 141.16, "step": 143760, "train_speed(iter/s)": 0.289539 }, { "acc": 0.74112973, "epoch": 1.608171722495865, "grad_norm": 6.84375, "learning_rate": 1.0130908351306036e-06, "loss": 1.03612156, "memory(GiB)": 141.16, "step": 143780, "train_speed(iter/s)": 0.289551 }, { "acc": 0.7329627, "epoch": 1.6083954214418235, "grad_norm": 7.3125, "learning_rate": 1.0119750284498275e-06, "loss": 1.08713932, "memory(GiB)": 141.16, "step": 143800, "train_speed(iter/s)": 0.289564 }, { "acc": 0.75450306, "epoch": 1.608619120387782, "grad_norm": 6.53125, "learning_rate": 1.0108597673878712e-06, "loss": 0.98154869, "memory(GiB)": 141.16, "step": 143820, "train_speed(iter/s)": 0.289578 }, { "acc": 0.71816583, "epoch": 1.6088428193337405, "grad_norm": 6.09375, "learning_rate": 1.0097450520973162e-06, "loss": 1.13685932, "memory(GiB)": 141.16, "step": 143840, "train_speed(iter/s)": 0.28959 }, { "acc": 0.74439678, "epoch": 1.609066518279699, "grad_norm": 6.59375, "learning_rate": 1.0086308827306711e-06, "loss": 1.03166504, "memory(GiB)": 141.16, "step": 143860, "train_speed(iter/s)": 0.289604 }, { "acc": 0.73463221, "epoch": 1.6092902172256576, "grad_norm": 6.28125, "learning_rate": 1.0075172594403726e-06, "loss": 1.06083393, "memory(GiB)": 141.16, "step": 143880, "train_speed(iter/s)": 0.289617 }, { "acc": 0.72423396, "epoch": 1.609513916171616, "grad_norm": 7.03125, "learning_rate": 1.0064041823787768e-06, "loss": 1.12485304, "memory(GiB)": 141.16, "step": 143900, "train_speed(iter/s)": 0.289631 }, { "acc": 0.74201488, "epoch": 1.6097376151175746, "grad_norm": 6.6875, "learning_rate": 1.005291651698172e-06, "loss": 1.02695236, "memory(GiB)": 141.16, "step": 143920, "train_speed(iter/s)": 0.289643 }, { "acc": 0.73542318, "epoch": 1.6099613140635332, "grad_norm": 6.75, "learning_rate": 1.004179667550767e-06, "loss": 1.05913811, "memory(GiB)": 141.16, "step": 143940, "train_speed(iter/s)": 0.289656 }, { "acc": 0.73423519, "epoch": 1.6101850130094917, "grad_norm": 8.125, "learning_rate": 1.003068230088695e-06, "loss": 1.05391273, "memory(GiB)": 141.16, "step": 143960, "train_speed(iter/s)": 0.28967 }, { "acc": 0.73545094, "epoch": 1.6104087119554502, "grad_norm": 6.125, "learning_rate": 1.0019573394640204e-06, "loss": 1.06271076, "memory(GiB)": 141.16, "step": 143980, "train_speed(iter/s)": 0.289683 }, { "acc": 0.74892845, "epoch": 1.6106324109014087, "grad_norm": 6.1875, "learning_rate": 1.0008469958287253e-06, "loss": 0.99300537, "memory(GiB)": 141.16, "step": 144000, "train_speed(iter/s)": 0.289696 }, { "epoch": 1.6106324109014087, "eval_acc": 0.690153417106097, "eval_loss": 1.079132080078125, "eval_runtime": 2328.8627, "eval_samples_per_second": 32.326, "eval_steps_per_second": 16.163, "step": 144000 } ], "logging_steps": 20, "max_steps": 178810, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "total_flos": 2.644076374384653e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }