{ "best_metric": 0.4071576, "best_model_checkpoint": "/home/adithya/workspace/Nayana/data/hindi_got_model_full_ft/got-ocr2/v2-20241103-190944/checkpoint-10500", "epoch": 1.4141414141414141, "eval_steps": 300, "global_step": 10500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.51908016, "epoch": 0.00013468013468013467, "grad_norm": 23.125, "learning_rate": 2.6917900403768507e-08, "loss": 1.52505195, "memory(GiB)": 5.54, "step": 1, "train_speed(iter/s)": 0.240681 }, { "acc": 0.63399017, "epoch": 0.0006734006734006734, "grad_norm": 12.4375, "learning_rate": 1.3458950201884255e-07, "loss": 1.57149458, "memory(GiB)": 9.53, "step": 5, "train_speed(iter/s)": 0.3205 }, { "acc": 0.62002406, "epoch": 0.0013468013468013469, "grad_norm": 27.25, "learning_rate": 2.691790040376851e-07, "loss": 1.54218817, "memory(GiB)": 9.53, "step": 10, "train_speed(iter/s)": 0.366563 }, { "acc": 0.55254116, "epoch": 0.00202020202020202, "grad_norm": 18.375, "learning_rate": 4.037685060565276e-07, "loss": 1.88158722, "memory(GiB)": 9.53, "step": 15, "train_speed(iter/s)": 0.384698 }, { "acc": 0.57307739, "epoch": 0.0026936026936026937, "grad_norm": 22.25, "learning_rate": 5.383580080753702e-07, "loss": 1.70565033, "memory(GiB)": 9.53, "step": 20, "train_speed(iter/s)": 0.396148 }, { "acc": 0.58070536, "epoch": 0.003367003367003367, "grad_norm": 23.125, "learning_rate": 6.729475100942127e-07, "loss": 1.41031113, "memory(GiB)": 9.53, "step": 25, "train_speed(iter/s)": 0.392144 }, { "acc": 0.55927949, "epoch": 0.00404040404040404, "grad_norm": 13.8125, "learning_rate": 8.075370121130552e-07, "loss": 1.77414036, "memory(GiB)": 12.13, "step": 30, "train_speed(iter/s)": 0.378235 }, { "acc": 0.55564704, "epoch": 0.0047138047138047135, "grad_norm": 12.0625, "learning_rate": 9.421265141318977e-07, "loss": 1.64451008, "memory(GiB)": 14.36, "step": 35, "train_speed(iter/s)": 0.359814 }, { "acc": 0.51643333, "epoch": 0.0053872053872053875, "grad_norm": 18.625, "learning_rate": 1.0767160161507404e-06, "loss": 1.69634914, "memory(GiB)": 8.14, "step": 40, "train_speed(iter/s)": 0.366719 }, { "acc": 0.59141669, "epoch": 0.006060606060606061, "grad_norm": 30.75, "learning_rate": 1.2113055181695828e-06, "loss": 1.78964634, "memory(GiB)": 8.14, "step": 45, "train_speed(iter/s)": 0.371 }, { "acc": 0.51927662, "epoch": 0.006734006734006734, "grad_norm": 27.25, "learning_rate": 1.3458950201884255e-06, "loss": 2.00049534, "memory(GiB)": 8.14, "step": 50, "train_speed(iter/s)": 0.372852 }, { "acc": 0.557618, "epoch": 0.007407407407407408, "grad_norm": 20.75, "learning_rate": 1.4804845222072681e-06, "loss": 1.64312649, "memory(GiB)": 8.14, "step": 55, "train_speed(iter/s)": 0.373569 }, { "acc": 0.53778725, "epoch": 0.00808080808080808, "grad_norm": 13.4375, "learning_rate": 1.6150740242261104e-06, "loss": 1.7603075, "memory(GiB)": 11.57, "step": 60, "train_speed(iter/s)": 0.365224 }, { "acc": 0.58266153, "epoch": 0.008754208754208754, "grad_norm": 17.625, "learning_rate": 1.749663526244953e-06, "loss": 1.49343386, "memory(GiB)": 11.57, "step": 65, "train_speed(iter/s)": 0.363502 }, { "acc": 0.57580504, "epoch": 0.009427609427609427, "grad_norm": 17.875, "learning_rate": 1.8842530282637955e-06, "loss": 1.66268654, "memory(GiB)": 11.57, "step": 70, "train_speed(iter/s)": 0.36812 }, { "acc": 0.61554484, "epoch": 0.010101010101010102, "grad_norm": 18.5, "learning_rate": 2.018842530282638e-06, "loss": 1.58854914, "memory(GiB)": 11.57, "step": 75, "train_speed(iter/s)": 0.371307 }, { "acc": 0.54652371, "epoch": 0.010774410774410775, "grad_norm": 17.125, "learning_rate": 2.1534320323014808e-06, "loss": 1.61172218, "memory(GiB)": 11.57, "step": 80, "train_speed(iter/s)": 0.37045 }, { "acc": 0.62052369, "epoch": 0.011447811447811448, "grad_norm": 17.625, "learning_rate": 2.2880215343203232e-06, "loss": 1.61053829, "memory(GiB)": 11.57, "step": 85, "train_speed(iter/s)": 0.371281 }, { "acc": 0.57062507, "epoch": 0.012121212121212121, "grad_norm": 24.0, "learning_rate": 2.4226110363391657e-06, "loss": 1.63032074, "memory(GiB)": 11.57, "step": 90, "train_speed(iter/s)": 0.371793 }, { "acc": 0.62314548, "epoch": 0.012794612794612794, "grad_norm": 19.75, "learning_rate": 2.5572005383580085e-06, "loss": 1.42481365, "memory(GiB)": 11.57, "step": 95, "train_speed(iter/s)": 0.366657 }, { "acc": 0.56463518, "epoch": 0.013468013468013467, "grad_norm": 14.625, "learning_rate": 2.691790040376851e-06, "loss": 1.66354046, "memory(GiB)": 11.57, "step": 100, "train_speed(iter/s)": 0.365204 }, { "acc": 0.67074585, "epoch": 0.014141414141414142, "grad_norm": 15.0, "learning_rate": 2.8263795423956934e-06, "loss": 1.34491758, "memory(GiB)": 11.57, "step": 105, "train_speed(iter/s)": 0.367968 }, { "acc": 0.54115891, "epoch": 0.014814814814814815, "grad_norm": 18.25, "learning_rate": 2.9609690444145363e-06, "loss": 1.61188202, "memory(GiB)": 11.57, "step": 110, "train_speed(iter/s)": 0.368511 }, { "acc": 0.55807858, "epoch": 0.015488215488215488, "grad_norm": 17.25, "learning_rate": 3.0955585464333787e-06, "loss": 1.61766186, "memory(GiB)": 11.57, "step": 115, "train_speed(iter/s)": 0.369026 }, { "acc": 0.56790361, "epoch": 0.01616161616161616, "grad_norm": 25.875, "learning_rate": 3.2301480484522207e-06, "loss": 1.44421263, "memory(GiB)": 11.57, "step": 120, "train_speed(iter/s)": 0.369682 }, { "acc": 0.62042007, "epoch": 0.016835016835016835, "grad_norm": 26.5, "learning_rate": 3.364737550471063e-06, "loss": 1.53551636, "memory(GiB)": 11.57, "step": 125, "train_speed(iter/s)": 0.372003 }, { "acc": 0.53192263, "epoch": 0.017508417508417508, "grad_norm": 33.75, "learning_rate": 3.499327052489906e-06, "loss": 1.86681862, "memory(GiB)": 11.57, "step": 130, "train_speed(iter/s)": 0.372911 }, { "acc": 0.62459035, "epoch": 0.01818181818181818, "grad_norm": 14.6875, "learning_rate": 3.6339165545087485e-06, "loss": 1.41363401, "memory(GiB)": 11.57, "step": 135, "train_speed(iter/s)": 0.373871 }, { "acc": 0.61042018, "epoch": 0.018855218855218854, "grad_norm": 14.75, "learning_rate": 3.768506056527591e-06, "loss": 1.44343262, "memory(GiB)": 11.57, "step": 140, "train_speed(iter/s)": 0.375919 }, { "acc": 0.63943481, "epoch": 0.019528619528619527, "grad_norm": 25.75, "learning_rate": 3.903095558546434e-06, "loss": 1.44404135, "memory(GiB)": 11.57, "step": 145, "train_speed(iter/s)": 0.377064 }, { "acc": 0.66924458, "epoch": 0.020202020202020204, "grad_norm": 18.875, "learning_rate": 4.037685060565276e-06, "loss": 1.43287277, "memory(GiB)": 11.57, "step": 150, "train_speed(iter/s)": 0.376558 }, { "acc": 0.64015527, "epoch": 0.020875420875420877, "grad_norm": 15.75, "learning_rate": 4.172274562584119e-06, "loss": 1.31725788, "memory(GiB)": 11.57, "step": 155, "train_speed(iter/s)": 0.377151 }, { "acc": 0.56795197, "epoch": 0.02154882154882155, "grad_norm": 28.5, "learning_rate": 4.3068640646029616e-06, "loss": 1.58099527, "memory(GiB)": 11.57, "step": 160, "train_speed(iter/s)": 0.37853 }, { "acc": 0.56977224, "epoch": 0.022222222222222223, "grad_norm": 18.625, "learning_rate": 4.4414535666218036e-06, "loss": 1.74765873, "memory(GiB)": 11.57, "step": 165, "train_speed(iter/s)": 0.37979 }, { "acc": 0.57543573, "epoch": 0.022895622895622896, "grad_norm": 14.3125, "learning_rate": 4.5760430686406464e-06, "loss": 1.48664017, "memory(GiB)": 11.57, "step": 170, "train_speed(iter/s)": 0.379818 }, { "acc": 0.62976146, "epoch": 0.02356902356902357, "grad_norm": 13.375, "learning_rate": 4.710632570659489e-06, "loss": 1.4615386, "memory(GiB)": 11.57, "step": 175, "train_speed(iter/s)": 0.379344 }, { "acc": 0.61564579, "epoch": 0.024242424242424242, "grad_norm": 11.6875, "learning_rate": 4.845222072678331e-06, "loss": 1.33001451, "memory(GiB)": 11.57, "step": 180, "train_speed(iter/s)": 0.37618 }, { "acc": 0.59344044, "epoch": 0.024915824915824916, "grad_norm": 12.6875, "learning_rate": 4.979811574697174e-06, "loss": 1.36530914, "memory(GiB)": 11.57, "step": 185, "train_speed(iter/s)": 0.374182 }, { "acc": 0.60937552, "epoch": 0.02558922558922559, "grad_norm": 17.75, "learning_rate": 5.114401076716017e-06, "loss": 1.42839565, "memory(GiB)": 11.57, "step": 190, "train_speed(iter/s)": 0.375383 }, { "acc": 0.56700568, "epoch": 0.026262626262626262, "grad_norm": 23.125, "learning_rate": 5.248990578734859e-06, "loss": 1.72391968, "memory(GiB)": 11.57, "step": 195, "train_speed(iter/s)": 0.376799 }, { "acc": 0.64125962, "epoch": 0.026936026936026935, "grad_norm": 12.875, "learning_rate": 5.383580080753702e-06, "loss": 1.45175018, "memory(GiB)": 11.57, "step": 200, "train_speed(iter/s)": 0.377518 }, { "acc": 0.63175621, "epoch": 0.027609427609427608, "grad_norm": 18.75, "learning_rate": 5.518169582772545e-06, "loss": 1.22912884, "memory(GiB)": 11.57, "step": 205, "train_speed(iter/s)": 0.378115 }, { "acc": 0.54751568, "epoch": 0.028282828282828285, "grad_norm": 13.5, "learning_rate": 5.652759084791387e-06, "loss": 1.51581717, "memory(GiB)": 11.57, "step": 210, "train_speed(iter/s)": 0.37873 }, { "acc": 0.59878306, "epoch": 0.028956228956228958, "grad_norm": 25.25, "learning_rate": 5.78734858681023e-06, "loss": 1.66876678, "memory(GiB)": 11.57, "step": 215, "train_speed(iter/s)": 0.379176 }, { "acc": 0.68903089, "epoch": 0.02962962962962963, "grad_norm": 15.75, "learning_rate": 5.9219380888290726e-06, "loss": 1.21776981, "memory(GiB)": 11.57, "step": 220, "train_speed(iter/s)": 0.380325 }, { "acc": 0.65183606, "epoch": 0.030303030303030304, "grad_norm": 16.5, "learning_rate": 6.056527590847915e-06, "loss": 1.33960438, "memory(GiB)": 11.57, "step": 225, "train_speed(iter/s)": 0.378646 }, { "acc": 0.56697273, "epoch": 0.030976430976430977, "grad_norm": 20.25, "learning_rate": 6.1911170928667574e-06, "loss": 1.61164932, "memory(GiB)": 11.57, "step": 230, "train_speed(iter/s)": 0.379002 }, { "acc": 0.60521441, "epoch": 0.03164983164983165, "grad_norm": 14.125, "learning_rate": 6.325706594885599e-06, "loss": 1.62042446, "memory(GiB)": 11.57, "step": 235, "train_speed(iter/s)": 0.379663 }, { "acc": 0.57916322, "epoch": 0.03232323232323232, "grad_norm": 20.0, "learning_rate": 6.4602960969044415e-06, "loss": 1.60195808, "memory(GiB)": 11.57, "step": 240, "train_speed(iter/s)": 0.380356 }, { "acc": 0.63395228, "epoch": 0.032996632996632996, "grad_norm": 29.75, "learning_rate": 6.594885598923284e-06, "loss": 1.40564528, "memory(GiB)": 11.57, "step": 245, "train_speed(iter/s)": 0.380995 }, { "acc": 0.58923273, "epoch": 0.03367003367003367, "grad_norm": 25.125, "learning_rate": 6.729475100942126e-06, "loss": 1.4849618, "memory(GiB)": 11.57, "step": 250, "train_speed(iter/s)": 0.381146 }, { "acc": 0.57267365, "epoch": 0.03434343434343434, "grad_norm": 20.25, "learning_rate": 6.864064602960969e-06, "loss": 1.61652508, "memory(GiB)": 11.57, "step": 255, "train_speed(iter/s)": 0.382063 }, { "acc": 0.54358578, "epoch": 0.035016835016835016, "grad_norm": 13.875, "learning_rate": 6.998654104979812e-06, "loss": 1.63632908, "memory(GiB)": 11.57, "step": 260, "train_speed(iter/s)": 0.38277 }, { "acc": 0.60778151, "epoch": 0.03569023569023569, "grad_norm": 21.875, "learning_rate": 7.133243606998654e-06, "loss": 1.2964942, "memory(GiB)": 11.57, "step": 265, "train_speed(iter/s)": 0.383463 }, { "acc": 0.6651093, "epoch": 0.03636363636363636, "grad_norm": 11.9375, "learning_rate": 7.267833109017497e-06, "loss": 1.15631075, "memory(GiB)": 11.57, "step": 270, "train_speed(iter/s)": 0.381222 }, { "acc": 0.61537771, "epoch": 0.037037037037037035, "grad_norm": 19.625, "learning_rate": 7.40242261103634e-06, "loss": 1.52807426, "memory(GiB)": 11.57, "step": 275, "train_speed(iter/s)": 0.381404 }, { "acc": 0.69333591, "epoch": 0.03771043771043771, "grad_norm": 16.0, "learning_rate": 7.537012113055182e-06, "loss": 1.10553503, "memory(GiB)": 11.57, "step": 280, "train_speed(iter/s)": 0.381658 }, { "acc": 0.65817041, "epoch": 0.03838383838383838, "grad_norm": 21.125, "learning_rate": 7.671601615074024e-06, "loss": 1.39679871, "memory(GiB)": 11.57, "step": 285, "train_speed(iter/s)": 0.381858 }, { "acc": 0.65443778, "epoch": 0.039057239057239054, "grad_norm": 15.6875, "learning_rate": 7.806191117092868e-06, "loss": 1.19904938, "memory(GiB)": 11.57, "step": 290, "train_speed(iter/s)": 0.382372 }, { "acc": 0.5628746, "epoch": 0.03973063973063973, "grad_norm": 10.5, "learning_rate": 7.94078061911171e-06, "loss": 1.59957199, "memory(GiB)": 11.57, "step": 295, "train_speed(iter/s)": 0.382493 }, { "acc": 0.6009872, "epoch": 0.04040404040404041, "grad_norm": 17.0, "learning_rate": 8.075370121130552e-06, "loss": 1.64843807, "memory(GiB)": 11.57, "step": 300, "train_speed(iter/s)": 0.383049 }, { "epoch": 0.04040404040404041, "eval_acc": 0.6508429803434077, "eval_loss": 1.4724615812301636, "eval_runtime": 109.8308, "eval_samples_per_second": 1.366, "eval_steps_per_second": 1.366, "step": 300 }, { "acc": 0.63098454, "epoch": 0.04107744107744108, "grad_norm": 14.3125, "learning_rate": 8.209959623149395e-06, "loss": 1.40613441, "memory(GiB)": 11.57, "step": 305, "train_speed(iter/s)": 0.337115 }, { "acc": 0.663204, "epoch": 0.041750841750841754, "grad_norm": 8.25, "learning_rate": 8.344549125168237e-06, "loss": 1.29444733, "memory(GiB)": 11.57, "step": 310, "train_speed(iter/s)": 0.337523 }, { "acc": 0.56645985, "epoch": 0.04242424242424243, "grad_norm": 12.875, "learning_rate": 8.47913862718708e-06, "loss": 1.68605423, "memory(GiB)": 11.57, "step": 315, "train_speed(iter/s)": 0.338738 }, { "acc": 0.67048483, "epoch": 0.0430976430976431, "grad_norm": 11.9375, "learning_rate": 8.613728129205923e-06, "loss": 1.39615965, "memory(GiB)": 11.57, "step": 320, "train_speed(iter/s)": 0.338694 }, { "acc": 0.72679968, "epoch": 0.04377104377104377, "grad_norm": 23.375, "learning_rate": 8.748317631224765e-06, "loss": 1.10950098, "memory(GiB)": 11.57, "step": 325, "train_speed(iter/s)": 0.339094 }, { "acc": 0.62103057, "epoch": 0.044444444444444446, "grad_norm": 18.375, "learning_rate": 8.882907133243607e-06, "loss": 1.33663902, "memory(GiB)": 11.57, "step": 330, "train_speed(iter/s)": 0.339679 }, { "acc": 0.71397562, "epoch": 0.04511784511784512, "grad_norm": 22.25, "learning_rate": 9.017496635262451e-06, "loss": 1.09442234, "memory(GiB)": 11.57, "step": 335, "train_speed(iter/s)": 0.340369 }, { "acc": 0.67727013, "epoch": 0.04579124579124579, "grad_norm": 16.125, "learning_rate": 9.152086137281293e-06, "loss": 1.37714205, "memory(GiB)": 11.57, "step": 340, "train_speed(iter/s)": 0.340722 }, { "acc": 0.56682105, "epoch": 0.046464646464646465, "grad_norm": 23.25, "learning_rate": 9.286675639300135e-06, "loss": 1.47191801, "memory(GiB)": 11.57, "step": 345, "train_speed(iter/s)": 0.341435 }, { "acc": 0.64976182, "epoch": 0.04713804713804714, "grad_norm": 19.625, "learning_rate": 9.421265141318979e-06, "loss": 1.43252811, "memory(GiB)": 11.57, "step": 350, "train_speed(iter/s)": 0.342317 }, { "acc": 0.73611102, "epoch": 0.04781144781144781, "grad_norm": 9.125, "learning_rate": 9.55585464333782e-06, "loss": 1.09405451, "memory(GiB)": 11.57, "step": 355, "train_speed(iter/s)": 0.342304 }, { "acc": 0.62947536, "epoch": 0.048484848484848485, "grad_norm": 17.0, "learning_rate": 9.690444145356663e-06, "loss": 1.33388605, "memory(GiB)": 11.57, "step": 360, "train_speed(iter/s)": 0.343008 }, { "acc": 0.61660504, "epoch": 0.04915824915824916, "grad_norm": 14.0, "learning_rate": 9.825033647375506e-06, "loss": 1.26432142, "memory(GiB)": 11.57, "step": 365, "train_speed(iter/s)": 0.343933 }, { "acc": 0.6861485, "epoch": 0.04983164983164983, "grad_norm": 13.125, "learning_rate": 9.959623149394348e-06, "loss": 1.17055111, "memory(GiB)": 11.57, "step": 370, "train_speed(iter/s)": 0.344701 }, { "acc": 0.68641739, "epoch": 0.050505050505050504, "grad_norm": 10.75, "learning_rate": 1.009421265141319e-05, "loss": 1.19751577, "memory(GiB)": 11.57, "step": 375, "train_speed(iter/s)": 0.344801 }, { "acc": 0.58911633, "epoch": 0.05117845117845118, "grad_norm": 12.875, "learning_rate": 1.0228802153432034e-05, "loss": 1.57844391, "memory(GiB)": 11.57, "step": 380, "train_speed(iter/s)": 0.343816 }, { "acc": 0.66787591, "epoch": 0.05185185185185185, "grad_norm": 9.375, "learning_rate": 1.0363391655450876e-05, "loss": 1.22666464, "memory(GiB)": 11.57, "step": 385, "train_speed(iter/s)": 0.344265 }, { "acc": 0.72761278, "epoch": 0.052525252525252523, "grad_norm": 13.5, "learning_rate": 1.0497981157469718e-05, "loss": 1.11251936, "memory(GiB)": 11.57, "step": 390, "train_speed(iter/s)": 0.344006 }, { "acc": 0.63036947, "epoch": 0.0531986531986532, "grad_norm": 14.75, "learning_rate": 1.0632570659488562e-05, "loss": 1.37007179, "memory(GiB)": 11.57, "step": 395, "train_speed(iter/s)": 0.344776 }, { "acc": 0.7391459, "epoch": 0.05387205387205387, "grad_norm": 9.0, "learning_rate": 1.0767160161507404e-05, "loss": 1.07246151, "memory(GiB)": 11.57, "step": 400, "train_speed(iter/s)": 0.345077 }, { "acc": 0.69003735, "epoch": 0.05454545454545454, "grad_norm": 20.125, "learning_rate": 1.0901749663526246e-05, "loss": 1.18787746, "memory(GiB)": 11.57, "step": 405, "train_speed(iter/s)": 0.34616 }, { "acc": 0.75034552, "epoch": 0.055218855218855216, "grad_norm": 37.75, "learning_rate": 1.103633916554509e-05, "loss": 0.96831808, "memory(GiB)": 11.57, "step": 410, "train_speed(iter/s)": 0.346841 }, { "acc": 0.57640429, "epoch": 0.05589225589225589, "grad_norm": 39.0, "learning_rate": 1.1170928667563932e-05, "loss": 1.33788805, "memory(GiB)": 11.57, "step": 415, "train_speed(iter/s)": 0.347735 }, { "acc": 0.73999271, "epoch": 0.05656565656565657, "grad_norm": 9.375, "learning_rate": 1.1305518169582774e-05, "loss": 1.05237989, "memory(GiB)": 11.57, "step": 420, "train_speed(iter/s)": 0.347691 }, { "acc": 0.66486025, "epoch": 0.05723905723905724, "grad_norm": 13.8125, "learning_rate": 1.1440107671601617e-05, "loss": 1.10302191, "memory(GiB)": 11.57, "step": 425, "train_speed(iter/s)": 0.3484 }, { "acc": 0.6567131, "epoch": 0.057912457912457915, "grad_norm": 20.5, "learning_rate": 1.157469717362046e-05, "loss": 1.13996429, "memory(GiB)": 11.57, "step": 430, "train_speed(iter/s)": 0.349331 }, { "acc": 0.70993357, "epoch": 0.05858585858585859, "grad_norm": 12.5625, "learning_rate": 1.1709286675639301e-05, "loss": 1.02943163, "memory(GiB)": 11.57, "step": 435, "train_speed(iter/s)": 0.349709 }, { "acc": 0.75071635, "epoch": 0.05925925925925926, "grad_norm": 12.25, "learning_rate": 1.1843876177658145e-05, "loss": 0.89214468, "memory(GiB)": 11.57, "step": 440, "train_speed(iter/s)": 0.349855 }, { "acc": 0.67376766, "epoch": 0.059932659932659935, "grad_norm": 9.9375, "learning_rate": 1.1978465679676987e-05, "loss": 1.3653923, "memory(GiB)": 11.57, "step": 445, "train_speed(iter/s)": 0.3505 }, { "acc": 0.64020987, "epoch": 0.06060606060606061, "grad_norm": 18.25, "learning_rate": 1.211305518169583e-05, "loss": 1.10264673, "memory(GiB)": 11.57, "step": 450, "train_speed(iter/s)": 0.351128 }, { "acc": 0.70698514, "epoch": 0.06127946127946128, "grad_norm": 16.625, "learning_rate": 1.2247644683714673e-05, "loss": 0.94584637, "memory(GiB)": 11.57, "step": 455, "train_speed(iter/s)": 0.351868 }, { "acc": 0.70906076, "epoch": 0.061952861952861954, "grad_norm": 11.5625, "learning_rate": 1.2382234185733515e-05, "loss": 1.19026566, "memory(GiB)": 11.57, "step": 460, "train_speed(iter/s)": 0.351803 }, { "acc": 0.71455946, "epoch": 0.06262626262626263, "grad_norm": 13.9375, "learning_rate": 1.2516823687752355e-05, "loss": 1.10838032, "memory(GiB)": 11.57, "step": 465, "train_speed(iter/s)": 0.352101 }, { "acc": 0.65413985, "epoch": 0.0632996632996633, "grad_norm": 9.25, "learning_rate": 1.2651413189771197e-05, "loss": 1.20070848, "memory(GiB)": 11.57, "step": 470, "train_speed(iter/s)": 0.352223 }, { "acc": 0.66868377, "epoch": 0.06397306397306397, "grad_norm": 12.6875, "learning_rate": 1.2786002691790041e-05, "loss": 1.40500479, "memory(GiB)": 11.57, "step": 475, "train_speed(iter/s)": 0.352577 }, { "acc": 0.71377177, "epoch": 0.06464646464646465, "grad_norm": 9.75, "learning_rate": 1.2920592193808883e-05, "loss": 0.95710163, "memory(GiB)": 11.57, "step": 480, "train_speed(iter/s)": 0.35326 }, { "acc": 0.69024115, "epoch": 0.06531986531986532, "grad_norm": 9.625, "learning_rate": 1.3055181695827725e-05, "loss": 1.21541462, "memory(GiB)": 11.57, "step": 485, "train_speed(iter/s)": 0.353899 }, { "acc": 0.67221255, "epoch": 0.06599326599326599, "grad_norm": 15.1875, "learning_rate": 1.3189771197846569e-05, "loss": 1.12308636, "memory(GiB)": 11.57, "step": 490, "train_speed(iter/s)": 0.354344 }, { "acc": 0.72153354, "epoch": 0.06666666666666667, "grad_norm": 7.1875, "learning_rate": 1.332436069986541e-05, "loss": 1.1415411, "memory(GiB)": 11.57, "step": 495, "train_speed(iter/s)": 0.354829 }, { "acc": 0.65405703, "epoch": 0.06734006734006734, "grad_norm": 10.125, "learning_rate": 1.3458950201884253e-05, "loss": 1.10353146, "memory(GiB)": 11.57, "step": 500, "train_speed(iter/s)": 0.355372 }, { "acc": 0.71678686, "epoch": 0.06801346801346801, "grad_norm": 11.3125, "learning_rate": 1.3593539703903096e-05, "loss": 0.81731205, "memory(GiB)": 11.57, "step": 505, "train_speed(iter/s)": 0.354671 }, { "acc": 0.75929475, "epoch": 0.06868686868686869, "grad_norm": 10.75, "learning_rate": 1.3728129205921938e-05, "loss": 0.86307383, "memory(GiB)": 11.57, "step": 510, "train_speed(iter/s)": 0.355125 }, { "acc": 0.71500969, "epoch": 0.06936026936026936, "grad_norm": 14.9375, "learning_rate": 1.386271870794078e-05, "loss": 1.25699072, "memory(GiB)": 11.57, "step": 515, "train_speed(iter/s)": 0.355201 }, { "acc": 0.70714483, "epoch": 0.07003367003367003, "grad_norm": 10.0, "learning_rate": 1.3997308209959624e-05, "loss": 1.0081563, "memory(GiB)": 11.57, "step": 520, "train_speed(iter/s)": 0.35544 }, { "acc": 0.69690123, "epoch": 0.0707070707070707, "grad_norm": 8.5, "learning_rate": 1.4131897711978466e-05, "loss": 1.17593975, "memory(GiB)": 11.57, "step": 525, "train_speed(iter/s)": 0.354865 }, { "acc": 0.65076323, "epoch": 0.07138047138047138, "grad_norm": 12.625, "learning_rate": 1.4266487213997308e-05, "loss": 0.90211763, "memory(GiB)": 11.57, "step": 530, "train_speed(iter/s)": 0.355576 }, { "acc": 0.75683184, "epoch": 0.07205387205387205, "grad_norm": 10.9375, "learning_rate": 1.4401076716016152e-05, "loss": 0.78305464, "memory(GiB)": 11.57, "step": 535, "train_speed(iter/s)": 0.355643 }, { "acc": 0.63366609, "epoch": 0.07272727272727272, "grad_norm": 24.5, "learning_rate": 1.4535666218034994e-05, "loss": 1.13021212, "memory(GiB)": 11.57, "step": 540, "train_speed(iter/s)": 0.355557 }, { "acc": 0.79359851, "epoch": 0.0734006734006734, "grad_norm": 13.4375, "learning_rate": 1.4670255720053836e-05, "loss": 0.66878433, "memory(GiB)": 11.57, "step": 545, "train_speed(iter/s)": 0.355904 }, { "acc": 0.73218951, "epoch": 0.07407407407407407, "grad_norm": 13.1875, "learning_rate": 1.480484522207268e-05, "loss": 0.8534152, "memory(GiB)": 11.57, "step": 550, "train_speed(iter/s)": 0.356506 }, { "acc": 0.74032345, "epoch": 0.07474747474747474, "grad_norm": 18.125, "learning_rate": 1.4939434724091522e-05, "loss": 1.04831533, "memory(GiB)": 11.57, "step": 555, "train_speed(iter/s)": 0.356691 }, { "acc": 0.79713354, "epoch": 0.07542087542087542, "grad_norm": 9.8125, "learning_rate": 1.5074024226110364e-05, "loss": 0.72469769, "memory(GiB)": 11.57, "step": 560, "train_speed(iter/s)": 0.357149 }, { "acc": 0.73963046, "epoch": 0.07609427609427609, "grad_norm": 13.0625, "learning_rate": 1.5208613728129207e-05, "loss": 0.92076893, "memory(GiB)": 11.57, "step": 565, "train_speed(iter/s)": 0.35763 }, { "acc": 0.76453247, "epoch": 0.07676767676767676, "grad_norm": 17.75, "learning_rate": 1.5343203230148048e-05, "loss": 0.82259836, "memory(GiB)": 11.57, "step": 570, "train_speed(iter/s)": 0.358102 }, { "acc": 0.72729082, "epoch": 0.07744107744107744, "grad_norm": 10.375, "learning_rate": 1.547779273216689e-05, "loss": 0.81022062, "memory(GiB)": 11.57, "step": 575, "train_speed(iter/s)": 0.358299 }, { "acc": 0.67112513, "epoch": 0.07811447811447811, "grad_norm": 11.5625, "learning_rate": 1.5612382234185735e-05, "loss": 1.24032078, "memory(GiB)": 11.57, "step": 580, "train_speed(iter/s)": 0.358747 }, { "acc": 0.74445891, "epoch": 0.07878787878787878, "grad_norm": 9.25, "learning_rate": 1.5746971736204576e-05, "loss": 0.91865206, "memory(GiB)": 11.57, "step": 585, "train_speed(iter/s)": 0.358349 }, { "acc": 0.67735329, "epoch": 0.07946127946127945, "grad_norm": 15.75, "learning_rate": 1.588156123822342e-05, "loss": 1.07022667, "memory(GiB)": 11.57, "step": 590, "train_speed(iter/s)": 0.358313 }, { "acc": 0.65999546, "epoch": 0.08013468013468013, "grad_norm": 19.25, "learning_rate": 1.6016150740242263e-05, "loss": 1.37132463, "memory(GiB)": 11.57, "step": 595, "train_speed(iter/s)": 0.358626 }, { "acc": 0.72108564, "epoch": 0.08080808080808081, "grad_norm": 9.5, "learning_rate": 1.6150740242261103e-05, "loss": 0.84458551, "memory(GiB)": 11.57, "step": 600, "train_speed(iter/s)": 0.358946 }, { "epoch": 0.08080808080808081, "eval_acc": 0.7462481867334828, "eval_loss": 1.052156686782837, "eval_runtime": 109.4723, "eval_samples_per_second": 1.37, "eval_steps_per_second": 1.37, "step": 600 }, { "acc": 0.81282282, "epoch": 0.08148148148148149, "grad_norm": 16.75, "learning_rate": 1.6285329744279947e-05, "loss": 0.7542716, "memory(GiB)": 11.57, "step": 605, "train_speed(iter/s)": 0.336852 }, { "acc": 0.70444484, "epoch": 0.08215488215488216, "grad_norm": 10.375, "learning_rate": 1.641991924629879e-05, "loss": 1.11807833, "memory(GiB)": 11.57, "step": 610, "train_speed(iter/s)": 0.337448 }, { "acc": 0.73636794, "epoch": 0.08282828282828283, "grad_norm": 20.125, "learning_rate": 1.655450874831763e-05, "loss": 0.9746563, "memory(GiB)": 11.57, "step": 615, "train_speed(iter/s)": 0.33771 }, { "acc": 0.69418383, "epoch": 0.08350168350168351, "grad_norm": 9.5625, "learning_rate": 1.6689098250336475e-05, "loss": 1.19150572, "memory(GiB)": 11.57, "step": 620, "train_speed(iter/s)": 0.338338 }, { "acc": 0.77662058, "epoch": 0.08417508417508418, "grad_norm": 15.4375, "learning_rate": 1.682368775235532e-05, "loss": 0.85510263, "memory(GiB)": 11.57, "step": 625, "train_speed(iter/s)": 0.338681 }, { "acc": 0.66876321, "epoch": 0.08484848484848485, "grad_norm": 10.75, "learning_rate": 1.695827725437416e-05, "loss": 1.03631792, "memory(GiB)": 11.57, "step": 630, "train_speed(iter/s)": 0.339105 }, { "acc": 0.72490911, "epoch": 0.08552188552188553, "grad_norm": 12.4375, "learning_rate": 1.7092866756393003e-05, "loss": 0.90714159, "memory(GiB)": 11.57, "step": 635, "train_speed(iter/s)": 0.339573 }, { "acc": 0.715308, "epoch": 0.0861952861952862, "grad_norm": 16.75, "learning_rate": 1.7227456258411846e-05, "loss": 0.83676376, "memory(GiB)": 11.57, "step": 640, "train_speed(iter/s)": 0.339536 }, { "acc": 0.72004409, "epoch": 0.08686868686868687, "grad_norm": 11.125, "learning_rate": 1.7362045760430687e-05, "loss": 0.99052076, "memory(GiB)": 11.57, "step": 645, "train_speed(iter/s)": 0.340137 }, { "acc": 0.77303476, "epoch": 0.08754208754208755, "grad_norm": 13.625, "learning_rate": 1.749663526244953e-05, "loss": 0.89626684, "memory(GiB)": 11.57, "step": 650, "train_speed(iter/s)": 0.340687 }, { "acc": 0.73230481, "epoch": 0.08821548821548822, "grad_norm": 16.5, "learning_rate": 1.7631224764468374e-05, "loss": 0.84235859, "memory(GiB)": 11.57, "step": 655, "train_speed(iter/s)": 0.341039 }, { "acc": 0.78219714, "epoch": 0.08888888888888889, "grad_norm": 6.90625, "learning_rate": 1.7765814266487214e-05, "loss": 0.67909622, "memory(GiB)": 11.57, "step": 660, "train_speed(iter/s)": 0.341389 }, { "acc": 0.76528678, "epoch": 0.08956228956228957, "grad_norm": 13.125, "learning_rate": 1.7900403768506058e-05, "loss": 0.81274815, "memory(GiB)": 11.57, "step": 665, "train_speed(iter/s)": 0.341734 }, { "acc": 0.7723166, "epoch": 0.09023569023569024, "grad_norm": 11.4375, "learning_rate": 1.8034993270524902e-05, "loss": 0.86763144, "memory(GiB)": 11.57, "step": 670, "train_speed(iter/s)": 0.341873 }, { "acc": 0.75492234, "epoch": 0.09090909090909091, "grad_norm": 13.3125, "learning_rate": 1.8169582772543742e-05, "loss": 0.96772375, "memory(GiB)": 15.04, "step": 675, "train_speed(iter/s)": 0.341943 }, { "acc": 0.77179217, "epoch": 0.09158249158249158, "grad_norm": 8.5625, "learning_rate": 1.8304172274562586e-05, "loss": 0.804498, "memory(GiB)": 15.04, "step": 680, "train_speed(iter/s)": 0.342304 }, { "acc": 0.7159348, "epoch": 0.09225589225589226, "grad_norm": 9.6875, "learning_rate": 1.843876177658143e-05, "loss": 0.99472179, "memory(GiB)": 15.04, "step": 685, "train_speed(iter/s)": 0.342728 }, { "acc": 0.69876709, "epoch": 0.09292929292929293, "grad_norm": 8.0625, "learning_rate": 1.857335127860027e-05, "loss": 1.07095146, "memory(GiB)": 15.04, "step": 690, "train_speed(iter/s)": 0.343191 }, { "acc": 0.80019283, "epoch": 0.0936026936026936, "grad_norm": 22.5, "learning_rate": 1.8707940780619114e-05, "loss": 0.71981606, "memory(GiB)": 15.04, "step": 695, "train_speed(iter/s)": 0.343506 }, { "acc": 0.80901213, "epoch": 0.09427609427609428, "grad_norm": 12.375, "learning_rate": 1.8842530282637957e-05, "loss": 0.70246754, "memory(GiB)": 15.04, "step": 700, "train_speed(iter/s)": 0.343768 }, { "acc": 0.71540055, "epoch": 0.09494949494949495, "grad_norm": 12.9375, "learning_rate": 1.8977119784656798e-05, "loss": 1.19597988, "memory(GiB)": 15.04, "step": 705, "train_speed(iter/s)": 0.344148 }, { "acc": 0.65902338, "epoch": 0.09562289562289562, "grad_norm": 9.25, "learning_rate": 1.911170928667564e-05, "loss": 1.11431704, "memory(GiB)": 15.04, "step": 710, "train_speed(iter/s)": 0.34419 }, { "acc": 0.8256176, "epoch": 0.0962962962962963, "grad_norm": 8.875, "learning_rate": 1.9246298788694485e-05, "loss": 0.58568578, "memory(GiB)": 15.04, "step": 715, "train_speed(iter/s)": 0.344353 }, { "acc": 0.79482641, "epoch": 0.09696969696969697, "grad_norm": 9.75, "learning_rate": 1.9380888290713325e-05, "loss": 0.8713213, "memory(GiB)": 15.04, "step": 720, "train_speed(iter/s)": 0.344625 }, { "acc": 0.74385633, "epoch": 0.09764309764309764, "grad_norm": 8.375, "learning_rate": 1.951547779273217e-05, "loss": 0.83829308, "memory(GiB)": 15.04, "step": 725, "train_speed(iter/s)": 0.344891 }, { "acc": 0.79673042, "epoch": 0.09831649831649832, "grad_norm": 16.25, "learning_rate": 1.9650067294751013e-05, "loss": 0.73992858, "memory(GiB)": 15.04, "step": 730, "train_speed(iter/s)": 0.345234 }, { "acc": 0.80602369, "epoch": 0.09898989898989899, "grad_norm": 10.8125, "learning_rate": 1.9784656796769853e-05, "loss": 0.68064899, "memory(GiB)": 15.04, "step": 735, "train_speed(iter/s)": 0.34545 }, { "acc": 0.77511039, "epoch": 0.09966329966329966, "grad_norm": 7.4375, "learning_rate": 1.9919246298788697e-05, "loss": 0.77711539, "memory(GiB)": 15.04, "step": 740, "train_speed(iter/s)": 0.345826 }, { "acc": 0.76451511, "epoch": 0.10033670033670034, "grad_norm": 8.125, "learning_rate": 1.9999999008117105e-05, "loss": 0.83295755, "memory(GiB)": 15.04, "step": 745, "train_speed(iter/s)": 0.34599 }, { "acc": 0.78624372, "epoch": 0.10101010101010101, "grad_norm": 7.84375, "learning_rate": 1.999998784943679e-05, "loss": 0.7498014, "memory(GiB)": 15.04, "step": 750, "train_speed(iter/s)": 0.346026 }, { "acc": 0.77836795, "epoch": 0.10168350168350168, "grad_norm": 15.75, "learning_rate": 1.999996429223642e-05, "loss": 0.76124926, "memory(GiB)": 15.04, "step": 755, "train_speed(iter/s)": 0.346306 }, { "acc": 0.84560328, "epoch": 0.10235690235690235, "grad_norm": 7.5625, "learning_rate": 1.9999928336545205e-05, "loss": 0.63052053, "memory(GiB)": 15.04, "step": 760, "train_speed(iter/s)": 0.346406 }, { "acc": 0.77344265, "epoch": 0.10303030303030303, "grad_norm": 16.5, "learning_rate": 1.9999879982407722e-05, "loss": 0.75681195, "memory(GiB)": 15.04, "step": 765, "train_speed(iter/s)": 0.346668 }, { "acc": 0.78544083, "epoch": 0.1037037037037037, "grad_norm": 14.5, "learning_rate": 1.9999819229883925e-05, "loss": 0.7890686, "memory(GiB)": 15.04, "step": 770, "train_speed(iter/s)": 0.346842 }, { "acc": 0.80033808, "epoch": 0.10437710437710437, "grad_norm": 12.5625, "learning_rate": 1.9999746079049136e-05, "loss": 0.72480288, "memory(GiB)": 15.04, "step": 775, "train_speed(iter/s)": 0.346905 }, { "acc": 0.81095028, "epoch": 0.10505050505050505, "grad_norm": 7.78125, "learning_rate": 1.9999660529994056e-05, "loss": 0.647194, "memory(GiB)": 15.04, "step": 780, "train_speed(iter/s)": 0.347101 }, { "acc": 0.83096733, "epoch": 0.10572390572390572, "grad_norm": 8.75, "learning_rate": 1.9999562582824747e-05, "loss": 0.67273521, "memory(GiB)": 15.04, "step": 785, "train_speed(iter/s)": 0.347234 }, { "acc": 0.74170909, "epoch": 0.1063973063973064, "grad_norm": 12.125, "learning_rate": 1.9999452237662655e-05, "loss": 0.9027566, "memory(GiB)": 15.04, "step": 790, "train_speed(iter/s)": 0.34764 }, { "acc": 0.78647127, "epoch": 0.10707070707070707, "grad_norm": 9.875, "learning_rate": 1.9999329494644588e-05, "loss": 0.85048122, "memory(GiB)": 15.04, "step": 795, "train_speed(iter/s)": 0.347928 }, { "acc": 0.81664906, "epoch": 0.10774410774410774, "grad_norm": 25.125, "learning_rate": 1.9999194353922732e-05, "loss": 0.60916672, "memory(GiB)": 15.04, "step": 800, "train_speed(iter/s)": 0.348221 }, { "acc": 0.76929884, "epoch": 0.10841750841750841, "grad_norm": 15.0625, "learning_rate": 1.999904681566464e-05, "loss": 1.00046215, "memory(GiB)": 15.04, "step": 805, "train_speed(iter/s)": 0.348048 }, { "acc": 0.79135976, "epoch": 0.10909090909090909, "grad_norm": 11.8125, "learning_rate": 1.9998886880053233e-05, "loss": 0.7122694, "memory(GiB)": 15.04, "step": 810, "train_speed(iter/s)": 0.348227 }, { "acc": 0.79001317, "epoch": 0.10976430976430976, "grad_norm": 16.25, "learning_rate": 1.9998714547286816e-05, "loss": 0.7056983, "memory(GiB)": 15.04, "step": 815, "train_speed(iter/s)": 0.348282 }, { "acc": 0.73441644, "epoch": 0.11043771043771043, "grad_norm": 28.5, "learning_rate": 1.9998529817579055e-05, "loss": 1.05510197, "memory(GiB)": 15.04, "step": 820, "train_speed(iter/s)": 0.348166 }, { "acc": 0.79821463, "epoch": 0.1111111111111111, "grad_norm": 9.3125, "learning_rate": 1.9998332691158985e-05, "loss": 0.60838032, "memory(GiB)": 15.04, "step": 825, "train_speed(iter/s)": 0.348333 }, { "acc": 0.72554183, "epoch": 0.11178451178451178, "grad_norm": 10.1875, "learning_rate": 1.9998123168271017e-05, "loss": 0.8306942, "memory(GiB)": 15.04, "step": 830, "train_speed(iter/s)": 0.348743 }, { "acc": 0.86007366, "epoch": 0.11245791245791245, "grad_norm": 6.8125, "learning_rate": 1.9997901249174924e-05, "loss": 0.5353889, "memory(GiB)": 15.04, "step": 835, "train_speed(iter/s)": 0.348942 }, { "acc": 0.78542347, "epoch": 0.11313131313131314, "grad_norm": 11.5625, "learning_rate": 1.9997666934145858e-05, "loss": 0.6618031, "memory(GiB)": 15.04, "step": 840, "train_speed(iter/s)": 0.349304 }, { "acc": 0.76181016, "epoch": 0.11380471380471381, "grad_norm": 17.875, "learning_rate": 1.999742022347433e-05, "loss": 0.74508605, "memory(GiB)": 15.04, "step": 845, "train_speed(iter/s)": 0.349287 }, { "acc": 0.75039043, "epoch": 0.11447811447811448, "grad_norm": 7.4375, "learning_rate": 1.999716111746623e-05, "loss": 0.70882645, "memory(GiB)": 15.04, "step": 850, "train_speed(iter/s)": 0.349239 }, { "acc": 0.75623989, "epoch": 0.11515151515151516, "grad_norm": 19.25, "learning_rate": 1.9996889616442808e-05, "loss": 0.73483896, "memory(GiB)": 15.04, "step": 855, "train_speed(iter/s)": 0.349329 }, { "acc": 0.82510233, "epoch": 0.11582491582491583, "grad_norm": 18.75, "learning_rate": 1.999660572074069e-05, "loss": 0.57684283, "memory(GiB)": 15.04, "step": 860, "train_speed(iter/s)": 0.349558 }, { "acc": 0.76790061, "epoch": 0.1164983164983165, "grad_norm": 16.75, "learning_rate": 1.999630943071186e-05, "loss": 1.15222225, "memory(GiB)": 15.04, "step": 865, "train_speed(iter/s)": 0.34962 }, { "acc": 0.70848417, "epoch": 0.11717171717171718, "grad_norm": 15.625, "learning_rate": 1.9996000746723677e-05, "loss": 0.76727939, "memory(GiB)": 15.04, "step": 870, "train_speed(iter/s)": 0.349986 }, { "acc": 0.75316901, "epoch": 0.11784511784511785, "grad_norm": 14.5625, "learning_rate": 1.999567966915886e-05, "loss": 0.86907434, "memory(GiB)": 15.04, "step": 875, "train_speed(iter/s)": 0.34999 }, { "acc": 0.81012983, "epoch": 0.11851851851851852, "grad_norm": 8.5625, "learning_rate": 1.9995346198415507e-05, "loss": 0.69052753, "memory(GiB)": 15.04, "step": 880, "train_speed(iter/s)": 0.350055 }, { "acc": 0.72671976, "epoch": 0.1191919191919192, "grad_norm": 13.5625, "learning_rate": 1.9995000334907067e-05, "loss": 1.07237949, "memory(GiB)": 15.04, "step": 885, "train_speed(iter/s)": 0.350041 }, { "acc": 0.79563127, "epoch": 0.11986531986531987, "grad_norm": 6.34375, "learning_rate": 1.9994642079062355e-05, "loss": 0.71864276, "memory(GiB)": 15.04, "step": 890, "train_speed(iter/s)": 0.35027 }, { "acc": 0.80651369, "epoch": 0.12053872053872054, "grad_norm": 11.4375, "learning_rate": 1.999427143132557e-05, "loss": 0.55639653, "memory(GiB)": 15.04, "step": 895, "train_speed(iter/s)": 0.350582 }, { "acc": 0.80839796, "epoch": 0.12121212121212122, "grad_norm": 6.8125, "learning_rate": 1.9993888392156243e-05, "loss": 0.78113098, "memory(GiB)": 15.04, "step": 900, "train_speed(iter/s)": 0.35086 }, { "epoch": 0.12121212121212122, "eval_acc": 0.7964180042418371, "eval_loss": 0.8123638033866882, "eval_runtime": 109.7552, "eval_samples_per_second": 1.367, "eval_steps_per_second": 1.367, "step": 900 }, { "acc": 0.82671185, "epoch": 0.12188552188552189, "grad_norm": 8.9375, "learning_rate": 1.99934929620293e-05, "loss": 0.67872958, "memory(GiB)": 15.04, "step": 905, "train_speed(iter/s)": 0.336644 }, { "acc": 0.78594241, "epoch": 0.12255892255892256, "grad_norm": 13.0, "learning_rate": 1.9993085141435013e-05, "loss": 0.72096338, "memory(GiB)": 15.04, "step": 910, "train_speed(iter/s)": 0.336923 }, { "acc": 0.80444469, "epoch": 0.12323232323232323, "grad_norm": 8.4375, "learning_rate": 1.9992664930879018e-05, "loss": 0.80335388, "memory(GiB)": 15.04, "step": 915, "train_speed(iter/s)": 0.337332 }, { "acc": 0.87560616, "epoch": 0.12390572390572391, "grad_norm": 8.6875, "learning_rate": 1.9992232330882314e-05, "loss": 0.45894027, "memory(GiB)": 15.04, "step": 920, "train_speed(iter/s)": 0.337763 }, { "acc": 0.73397541, "epoch": 0.12457912457912458, "grad_norm": 16.75, "learning_rate": 1.9991787341981263e-05, "loss": 1.12627192, "memory(GiB)": 15.04, "step": 925, "train_speed(iter/s)": 0.338106 }, { "acc": 0.80420237, "epoch": 0.12525252525252525, "grad_norm": 17.875, "learning_rate": 1.9991329964727585e-05, "loss": 0.65375113, "memory(GiB)": 15.04, "step": 930, "train_speed(iter/s)": 0.338396 }, { "acc": 0.76427999, "epoch": 0.1259259259259259, "grad_norm": 12.375, "learning_rate": 1.9990860199688366e-05, "loss": 0.82494192, "memory(GiB)": 15.04, "step": 935, "train_speed(iter/s)": 0.33873 }, { "acc": 0.6646564, "epoch": 0.1265993265993266, "grad_norm": 9.6875, "learning_rate": 1.999037804744604e-05, "loss": 0.83276958, "memory(GiB)": 15.04, "step": 940, "train_speed(iter/s)": 0.338974 }, { "acc": 0.81569929, "epoch": 0.12727272727272726, "grad_norm": 9.4375, "learning_rate": 1.9989883508598406e-05, "loss": 0.73578134, "memory(GiB)": 15.04, "step": 945, "train_speed(iter/s)": 0.339056 }, { "acc": 0.7477592, "epoch": 0.12794612794612795, "grad_norm": 8.625, "learning_rate": 1.998937658375862e-05, "loss": 0.58547649, "memory(GiB)": 15.04, "step": 950, "train_speed(iter/s)": 0.339272 }, { "acc": 0.79679656, "epoch": 0.1286195286195286, "grad_norm": 6.9375, "learning_rate": 1.9988857273555196e-05, "loss": 0.64831743, "memory(GiB)": 15.04, "step": 955, "train_speed(iter/s)": 0.339339 }, { "acc": 0.83822346, "epoch": 0.1292929292929293, "grad_norm": 6.71875, "learning_rate": 1.9988325578632003e-05, "loss": 0.66460371, "memory(GiB)": 15.04, "step": 960, "train_speed(iter/s)": 0.339392 }, { "acc": 0.78960447, "epoch": 0.12996632996632998, "grad_norm": 8.0625, "learning_rate": 1.9987781499648262e-05, "loss": 0.60568805, "memory(GiB)": 15.04, "step": 965, "train_speed(iter/s)": 0.339771 }, { "acc": 0.79516697, "epoch": 0.13063973063973064, "grad_norm": 10.5625, "learning_rate": 1.9987225037278553e-05, "loss": 0.79415574, "memory(GiB)": 15.04, "step": 970, "train_speed(iter/s)": 0.33991 }, { "acc": 0.80595303, "epoch": 0.13131313131313133, "grad_norm": 11.25, "learning_rate": 1.9986656192212805e-05, "loss": 0.64286175, "memory(GiB)": 15.04, "step": 975, "train_speed(iter/s)": 0.339948 }, { "acc": 0.80993471, "epoch": 0.13198653198653199, "grad_norm": 7.1875, "learning_rate": 1.9986074965156307e-05, "loss": 0.70274277, "memory(GiB)": 15.04, "step": 980, "train_speed(iter/s)": 0.340129 }, { "acc": 0.77113862, "epoch": 0.13265993265993267, "grad_norm": 9.375, "learning_rate": 1.9985481356829693e-05, "loss": 0.59495711, "memory(GiB)": 15.04, "step": 985, "train_speed(iter/s)": 0.340343 }, { "acc": 0.84521904, "epoch": 0.13333333333333333, "grad_norm": 7.75, "learning_rate": 1.9984875367968955e-05, "loss": 0.57561235, "memory(GiB)": 15.04, "step": 990, "train_speed(iter/s)": 0.340386 }, { "acc": 0.80663528, "epoch": 0.13400673400673402, "grad_norm": 13.0625, "learning_rate": 1.9984256999325423e-05, "loss": 0.90646124, "memory(GiB)": 15.04, "step": 995, "train_speed(iter/s)": 0.340563 }, { "acc": 0.74123559, "epoch": 0.13468013468013468, "grad_norm": 11.25, "learning_rate": 1.9983626251665788e-05, "loss": 0.90148401, "memory(GiB)": 15.04, "step": 1000, "train_speed(iter/s)": 0.340293 }, { "acc": 0.85422077, "epoch": 0.13535353535353536, "grad_norm": 7.03125, "learning_rate": 1.9982983125772082e-05, "loss": 0.51592302, "memory(GiB)": 15.04, "step": 1005, "train_speed(iter/s)": 0.339951 }, { "acc": 0.84535894, "epoch": 0.13602693602693602, "grad_norm": 13.1875, "learning_rate": 1.9982327622441688e-05, "loss": 0.53820696, "memory(GiB)": 15.04, "step": 1010, "train_speed(iter/s)": 0.340195 }, { "acc": 0.83110466, "epoch": 0.1367003367003367, "grad_norm": 10.75, "learning_rate": 1.9981659742487337e-05, "loss": 0.56353827, "memory(GiB)": 15.04, "step": 1015, "train_speed(iter/s)": 0.340461 }, { "acc": 0.87078714, "epoch": 0.13737373737373737, "grad_norm": 7.6875, "learning_rate": 1.99809794867371e-05, "loss": 0.47190948, "memory(GiB)": 15.04, "step": 1020, "train_speed(iter/s)": 0.340756 }, { "acc": 0.86921206, "epoch": 0.13804713804713806, "grad_norm": 12.5, "learning_rate": 1.998028685603439e-05, "loss": 0.47288909, "memory(GiB)": 15.04, "step": 1025, "train_speed(iter/s)": 0.34102 }, { "acc": 0.74840388, "epoch": 0.13872053872053872, "grad_norm": 10.6875, "learning_rate": 1.9979581851237974e-05, "loss": 0.69720249, "memory(GiB)": 15.04, "step": 1030, "train_speed(iter/s)": 0.341312 }, { "acc": 0.81624384, "epoch": 0.1393939393939394, "grad_norm": 7.9375, "learning_rate": 1.9978864473221954e-05, "loss": 0.59940052, "memory(GiB)": 15.04, "step": 1035, "train_speed(iter/s)": 0.341474 }, { "acc": 0.84893894, "epoch": 0.14006734006734006, "grad_norm": 9.125, "learning_rate": 1.997813472287577e-05, "loss": 0.53334293, "memory(GiB)": 15.04, "step": 1040, "train_speed(iter/s)": 0.341755 }, { "acc": 0.81686001, "epoch": 0.14074074074074075, "grad_norm": 10.0, "learning_rate": 1.997739260110421e-05, "loss": 0.70683684, "memory(GiB)": 15.04, "step": 1045, "train_speed(iter/s)": 0.341467 }, { "acc": 0.73511386, "epoch": 0.1414141414141414, "grad_norm": 12.4375, "learning_rate": 1.9976638108827395e-05, "loss": 0.8299778, "memory(GiB)": 15.04, "step": 1050, "train_speed(iter/s)": 0.341592 }, { "acc": 0.84761677, "epoch": 0.1420875420875421, "grad_norm": 7.34375, "learning_rate": 1.997587124698078e-05, "loss": 0.52306743, "memory(GiB)": 15.04, "step": 1055, "train_speed(iter/s)": 0.341691 }, { "acc": 0.80669956, "epoch": 0.14276094276094276, "grad_norm": 5.75, "learning_rate": 1.997509201651517e-05, "loss": 0.57092929, "memory(GiB)": 15.04, "step": 1060, "train_speed(iter/s)": 0.341936 }, { "acc": 0.79653311, "epoch": 0.14343434343434344, "grad_norm": 9.125, "learning_rate": 1.9974300418396688e-05, "loss": 0.84826164, "memory(GiB)": 15.04, "step": 1065, "train_speed(iter/s)": 0.342245 }, { "acc": 0.84764261, "epoch": 0.1441077441077441, "grad_norm": 7.9375, "learning_rate": 1.9973496453606808e-05, "loss": 0.48173232, "memory(GiB)": 15.04, "step": 1070, "train_speed(iter/s)": 0.34231 }, { "acc": 0.76810174, "epoch": 0.1447811447811448, "grad_norm": 11.25, "learning_rate": 1.9972680123142322e-05, "loss": 0.87655268, "memory(GiB)": 15.04, "step": 1075, "train_speed(iter/s)": 0.342497 }, { "acc": 0.83370619, "epoch": 0.14545454545454545, "grad_norm": 7.25, "learning_rate": 1.997185142801536e-05, "loss": 0.69885473, "memory(GiB)": 15.04, "step": 1080, "train_speed(iter/s)": 0.342508 }, { "acc": 0.83789968, "epoch": 0.14612794612794613, "grad_norm": 7.3125, "learning_rate": 1.9971010369253388e-05, "loss": 0.6340858, "memory(GiB)": 15.04, "step": 1085, "train_speed(iter/s)": 0.342112 }, { "acc": 0.80213232, "epoch": 0.1468013468013468, "grad_norm": 23.5, "learning_rate": 1.997015694789919e-05, "loss": 0.65443606, "memory(GiB)": 15.04, "step": 1090, "train_speed(iter/s)": 0.342229 }, { "acc": 0.74166551, "epoch": 0.14747474747474748, "grad_norm": 9.4375, "learning_rate": 1.9969291165010886e-05, "loss": 1.01165895, "memory(GiB)": 15.04, "step": 1095, "train_speed(iter/s)": 0.342523 }, { "acc": 0.8297987, "epoch": 0.14814814814814814, "grad_norm": 8.9375, "learning_rate": 1.9968413021661925e-05, "loss": 0.64009571, "memory(GiB)": 15.04, "step": 1100, "train_speed(iter/s)": 0.342779 }, { "acc": 0.83918276, "epoch": 0.14882154882154883, "grad_norm": 10.5625, "learning_rate": 1.9967522518941066e-05, "loss": 0.45333061, "memory(GiB)": 15.04, "step": 1105, "train_speed(iter/s)": 0.343074 }, { "acc": 0.78889961, "epoch": 0.1494949494949495, "grad_norm": 12.25, "learning_rate": 1.996661965795241e-05, "loss": 0.70123796, "memory(GiB)": 15.04, "step": 1110, "train_speed(iter/s)": 0.343202 }, { "acc": 0.70950618, "epoch": 0.15016835016835017, "grad_norm": 15.8125, "learning_rate": 1.9965704439815368e-05, "loss": 1.0612565, "memory(GiB)": 15.04, "step": 1115, "train_speed(iter/s)": 0.343061 }, { "acc": 0.71501498, "epoch": 0.15084175084175083, "grad_norm": 6.875, "learning_rate": 1.996477686566468e-05, "loss": 1.3117177, "memory(GiB)": 15.04, "step": 1120, "train_speed(iter/s)": 0.343087 }, { "acc": 0.85039539, "epoch": 0.15151515151515152, "grad_norm": 10.1875, "learning_rate": 1.9963836936650397e-05, "loss": 0.52983537, "memory(GiB)": 15.04, "step": 1125, "train_speed(iter/s)": 0.34331 }, { "acc": 0.85561619, "epoch": 0.15218855218855218, "grad_norm": 15.375, "learning_rate": 1.9962884653937897e-05, "loss": 0.4804214, "memory(GiB)": 15.04, "step": 1130, "train_speed(iter/s)": 0.343552 }, { "acc": 0.780967, "epoch": 0.15286195286195287, "grad_norm": 7.75, "learning_rate": 1.996192001870787e-05, "loss": 0.67414761, "memory(GiB)": 15.04, "step": 1135, "train_speed(iter/s)": 0.343508 }, { "acc": 0.78523622, "epoch": 0.15353535353535352, "grad_norm": 10.875, "learning_rate": 1.9960943032156327e-05, "loss": 0.79191904, "memory(GiB)": 15.04, "step": 1140, "train_speed(iter/s)": 0.343405 }, { "acc": 0.81021652, "epoch": 0.1542087542087542, "grad_norm": 18.0, "learning_rate": 1.995995369549458e-05, "loss": 0.63394156, "memory(GiB)": 15.04, "step": 1145, "train_speed(iter/s)": 0.343739 }, { "acc": 0.84955807, "epoch": 0.15488215488215487, "grad_norm": 6.8125, "learning_rate": 1.9958952009949264e-05, "loss": 0.53475342, "memory(GiB)": 15.04, "step": 1150, "train_speed(iter/s)": 0.343725 }, { "acc": 0.78412313, "epoch": 0.15555555555555556, "grad_norm": 40.5, "learning_rate": 1.9957937976762327e-05, "loss": 0.79959445, "memory(GiB)": 15.04, "step": 1155, "train_speed(iter/s)": 0.344017 }, { "acc": 0.7680315, "epoch": 0.15622895622895622, "grad_norm": 10.1875, "learning_rate": 1.9956911597191017e-05, "loss": 0.60923586, "memory(GiB)": 15.04, "step": 1160, "train_speed(iter/s)": 0.344356 }, { "acc": 0.83539982, "epoch": 0.1569023569023569, "grad_norm": 7.9375, "learning_rate": 1.9955872872507897e-05, "loss": 0.63824601, "memory(GiB)": 15.04, "step": 1165, "train_speed(iter/s)": 0.34463 }, { "acc": 0.79414153, "epoch": 0.15757575757575756, "grad_norm": 6.21875, "learning_rate": 1.995482180400083e-05, "loss": 0.73920679, "memory(GiB)": 15.04, "step": 1170, "train_speed(iter/s)": 0.344532 }, { "acc": 0.87649174, "epoch": 0.15824915824915825, "grad_norm": 5.78125, "learning_rate": 1.9953758392972988e-05, "loss": 0.52685447, "memory(GiB)": 15.04, "step": 1175, "train_speed(iter/s)": 0.344496 }, { "acc": 0.74325509, "epoch": 0.1589225589225589, "grad_norm": 36.25, "learning_rate": 1.9952682640742847e-05, "loss": 0.96344852, "memory(GiB)": 15.04, "step": 1180, "train_speed(iter/s)": 0.344618 }, { "acc": 0.8145812, "epoch": 0.1595959595959596, "grad_norm": 7.4375, "learning_rate": 1.9951594548644183e-05, "loss": 0.48234911, "memory(GiB)": 15.04, "step": 1185, "train_speed(iter/s)": 0.344703 }, { "acc": 0.81938677, "epoch": 0.16026936026936026, "grad_norm": 7.59375, "learning_rate": 1.995049411802607e-05, "loss": 0.47550235, "memory(GiB)": 15.04, "step": 1190, "train_speed(iter/s)": 0.344847 }, { "acc": 0.79189601, "epoch": 0.16094276094276094, "grad_norm": 18.5, "learning_rate": 1.9949381350252878e-05, "loss": 0.77639685, "memory(GiB)": 15.04, "step": 1195, "train_speed(iter/s)": 0.345159 }, { "acc": 0.86258478, "epoch": 0.16161616161616163, "grad_norm": 8.0, "learning_rate": 1.9948256246704275e-05, "loss": 0.60039167, "memory(GiB)": 15.04, "step": 1200, "train_speed(iter/s)": 0.345081 }, { "epoch": 0.16161616161616163, "eval_acc": 0.8230703414659442, "eval_loss": 0.7040194869041443, "eval_runtime": 109.8343, "eval_samples_per_second": 1.366, "eval_steps_per_second": 1.366, "step": 1200 }, { "acc": 0.82389164, "epoch": 0.1622895622895623, "grad_norm": 19.625, "learning_rate": 1.994711880877523e-05, "loss": 0.63736305, "memory(GiB)": 15.04, "step": 1205, "train_speed(iter/s)": 0.334716 }, { "acc": 0.82516699, "epoch": 0.16296296296296298, "grad_norm": 7.71875, "learning_rate": 1.9945969037876e-05, "loss": 0.62200007, "memory(GiB)": 15.04, "step": 1210, "train_speed(iter/s)": 0.334904 }, { "acc": 0.8145359, "epoch": 0.16363636363636364, "grad_norm": 16.125, "learning_rate": 1.9944806935432127e-05, "loss": 0.56476302, "memory(GiB)": 15.04, "step": 1215, "train_speed(iter/s)": 0.335158 }, { "acc": 0.76596346, "epoch": 0.16430976430976432, "grad_norm": 9.8125, "learning_rate": 1.9943632502884448e-05, "loss": 0.91625757, "memory(GiB)": 15.04, "step": 1220, "train_speed(iter/s)": 0.335401 }, { "acc": 0.80096502, "epoch": 0.16498316498316498, "grad_norm": 7.65625, "learning_rate": 1.9942445741689093e-05, "loss": 0.54522581, "memory(GiB)": 15.04, "step": 1225, "train_speed(iter/s)": 0.335478 }, { "acc": 0.82449436, "epoch": 0.16565656565656567, "grad_norm": 7.53125, "learning_rate": 1.9941246653317465e-05, "loss": 0.61662765, "memory(GiB)": 15.04, "step": 1230, "train_speed(iter/s)": 0.335694 }, { "acc": 0.78079591, "epoch": 0.16632996632996633, "grad_norm": 8.6875, "learning_rate": 1.9940035239256265e-05, "loss": 0.77522221, "memory(GiB)": 15.04, "step": 1235, "train_speed(iter/s)": 0.335946 }, { "acc": 0.78646274, "epoch": 0.16700336700336701, "grad_norm": 21.375, "learning_rate": 1.9938811501007462e-05, "loss": 0.62522068, "memory(GiB)": 15.04, "step": 1240, "train_speed(iter/s)": 0.336263 }, { "acc": 0.7506434, "epoch": 0.16767676767676767, "grad_norm": 7.5, "learning_rate": 1.9937575440088316e-05, "loss": 0.62290888, "memory(GiB)": 15.04, "step": 1245, "train_speed(iter/s)": 0.336456 }, { "acc": 0.84636002, "epoch": 0.16835016835016836, "grad_norm": 8.875, "learning_rate": 1.993632705803136e-05, "loss": 0.60505009, "memory(GiB)": 15.04, "step": 1250, "train_speed(iter/s)": 0.336644 }, { "acc": 0.80847054, "epoch": 0.16902356902356902, "grad_norm": 14.875, "learning_rate": 1.993506635638441e-05, "loss": 0.63604989, "memory(GiB)": 15.04, "step": 1255, "train_speed(iter/s)": 0.336913 }, { "acc": 0.87230434, "epoch": 0.1696969696969697, "grad_norm": 8.375, "learning_rate": 1.9933793336710545e-05, "loss": 0.50676417, "memory(GiB)": 15.04, "step": 1260, "train_speed(iter/s)": 0.337162 }, { "acc": 0.80247259, "epoch": 0.17037037037037037, "grad_norm": 18.25, "learning_rate": 1.9932508000588123e-05, "loss": 0.73981152, "memory(GiB)": 15.04, "step": 1265, "train_speed(iter/s)": 0.337447 }, { "acc": 0.82226982, "epoch": 0.17104377104377105, "grad_norm": 6.5, "learning_rate": 1.9931210349610776e-05, "loss": 0.9066226, "memory(GiB)": 15.04, "step": 1270, "train_speed(iter/s)": 0.337438 }, { "acc": 0.78620706, "epoch": 0.1717171717171717, "grad_norm": 5.5, "learning_rate": 1.99299003853874e-05, "loss": 0.70232997, "memory(GiB)": 15.04, "step": 1275, "train_speed(iter/s)": 0.337262 }, { "acc": 0.78566852, "epoch": 0.1723905723905724, "grad_norm": 13.375, "learning_rate": 1.992857810954216e-05, "loss": 0.53193336, "memory(GiB)": 15.04, "step": 1280, "train_speed(iter/s)": 0.337281 }, { "acc": 0.79574518, "epoch": 0.17306397306397306, "grad_norm": 11.625, "learning_rate": 1.992724352371448e-05, "loss": 0.50628576, "memory(GiB)": 15.04, "step": 1285, "train_speed(iter/s)": 0.337486 }, { "acc": 0.79572105, "epoch": 0.17373737373737375, "grad_norm": 7.8125, "learning_rate": 1.9925896629559058e-05, "loss": 0.89431658, "memory(GiB)": 15.04, "step": 1290, "train_speed(iter/s)": 0.337792 }, { "acc": 0.76915941, "epoch": 0.1744107744107744, "grad_norm": 14.1875, "learning_rate": 1.9924537428745838e-05, "loss": 0.71194468, "memory(GiB)": 15.04, "step": 1295, "train_speed(iter/s)": 0.337887 }, { "acc": 0.84247351, "epoch": 0.1750841750841751, "grad_norm": 28.5, "learning_rate": 1.9923165922960036e-05, "loss": 0.52920589, "memory(GiB)": 15.04, "step": 1300, "train_speed(iter/s)": 0.337901 }, { "acc": 0.80220499, "epoch": 0.17575757575757575, "grad_norm": 7.9375, "learning_rate": 1.9921782113902113e-05, "loss": 0.52305183, "memory(GiB)": 15.04, "step": 1305, "train_speed(iter/s)": 0.338015 }, { "acc": 0.79102888, "epoch": 0.17643097643097644, "grad_norm": 8.375, "learning_rate": 1.992038600328779e-05, "loss": 0.77678123, "memory(GiB)": 15.04, "step": 1310, "train_speed(iter/s)": 0.33802 }, { "acc": 0.84969645, "epoch": 0.1771043771043771, "grad_norm": 7.25, "learning_rate": 1.9918977592848044e-05, "loss": 0.68033156, "memory(GiB)": 15.04, "step": 1315, "train_speed(iter/s)": 0.33813 }, { "acc": 0.84281616, "epoch": 0.17777777777777778, "grad_norm": 13.0625, "learning_rate": 1.9917556884329096e-05, "loss": 0.55574412, "memory(GiB)": 15.04, "step": 1320, "train_speed(iter/s)": 0.33842 }, { "acc": 0.82773514, "epoch": 0.17845117845117844, "grad_norm": 9.75, "learning_rate": 1.9916123879492416e-05, "loss": 0.61481857, "memory(GiB)": 15.04, "step": 1325, "train_speed(iter/s)": 0.338574 }, { "acc": 0.8670886, "epoch": 0.17912457912457913, "grad_norm": 7.375, "learning_rate": 1.9914678580114716e-05, "loss": 0.52466178, "memory(GiB)": 15.04, "step": 1330, "train_speed(iter/s)": 0.338585 }, { "acc": 0.81682062, "epoch": 0.1797979797979798, "grad_norm": 19.375, "learning_rate": 1.9913220987987963e-05, "loss": 0.61289821, "memory(GiB)": 15.04, "step": 1335, "train_speed(iter/s)": 0.338825 }, { "acc": 0.85524416, "epoch": 0.18047138047138048, "grad_norm": 12.5, "learning_rate": 1.9911751104919353e-05, "loss": 0.5054049, "memory(GiB)": 15.04, "step": 1340, "train_speed(iter/s)": 0.339015 }, { "acc": 0.82324381, "epoch": 0.18114478114478114, "grad_norm": 8.875, "learning_rate": 1.9910268932731327e-05, "loss": 0.76316633, "memory(GiB)": 15.04, "step": 1345, "train_speed(iter/s)": 0.339023 }, { "acc": 0.83471432, "epoch": 0.18181818181818182, "grad_norm": 12.5625, "learning_rate": 1.9908774473261557e-05, "loss": 0.56146412, "memory(GiB)": 15.04, "step": 1350, "train_speed(iter/s)": 0.33925 }, { "acc": 0.81195011, "epoch": 0.18249158249158248, "grad_norm": 18.625, "learning_rate": 1.9907267728362962e-05, "loss": 0.54229565, "memory(GiB)": 15.04, "step": 1355, "train_speed(iter/s)": 0.339518 }, { "acc": 0.78173838, "epoch": 0.18316498316498317, "grad_norm": 13.8125, "learning_rate": 1.990574869990368e-05, "loss": 0.72042937, "memory(GiB)": 15.04, "step": 1360, "train_speed(iter/s)": 0.339795 }, { "acc": 0.89754705, "epoch": 0.18383838383838383, "grad_norm": 7.96875, "learning_rate": 1.9904217389767084e-05, "loss": 0.35997989, "memory(GiB)": 15.04, "step": 1365, "train_speed(iter/s)": 0.339999 }, { "acc": 0.80328083, "epoch": 0.18451178451178452, "grad_norm": 9.4375, "learning_rate": 1.9902673799851777e-05, "loss": 0.64457412, "memory(GiB)": 15.04, "step": 1370, "train_speed(iter/s)": 0.340028 }, { "acc": 0.75070305, "epoch": 0.18518518518518517, "grad_norm": 15.25, "learning_rate": 1.990111793207158e-05, "loss": 0.99465656, "memory(GiB)": 15.04, "step": 1375, "train_speed(iter/s)": 0.3402 }, { "acc": 0.86041613, "epoch": 0.18585858585858586, "grad_norm": 7.4375, "learning_rate": 1.9899549788355545e-05, "loss": 0.52056928, "memory(GiB)": 15.04, "step": 1380, "train_speed(iter/s)": 0.34023 }, { "acc": 0.85896616, "epoch": 0.18653198653198652, "grad_norm": 7.4375, "learning_rate": 1.9897969370647937e-05, "loss": 0.47336903, "memory(GiB)": 15.04, "step": 1385, "train_speed(iter/s)": 0.340349 }, { "acc": 0.8049366, "epoch": 0.1872053872053872, "grad_norm": 8.625, "learning_rate": 1.9896376680908244e-05, "loss": 0.92537689, "memory(GiB)": 15.04, "step": 1390, "train_speed(iter/s)": 0.340255 }, { "acc": 0.82606783, "epoch": 0.18787878787878787, "grad_norm": 7.0625, "learning_rate": 1.989477172111117e-05, "loss": 0.65767555, "memory(GiB)": 15.04, "step": 1395, "train_speed(iter/s)": 0.340219 }, { "acc": 0.84840879, "epoch": 0.18855218855218855, "grad_norm": 9.625, "learning_rate": 1.989315449324663e-05, "loss": 0.50497999, "memory(GiB)": 15.04, "step": 1400, "train_speed(iter/s)": 0.340368 }, { "acc": 0.83024111, "epoch": 0.1892255892255892, "grad_norm": 7.8125, "learning_rate": 1.9891524999319744e-05, "loss": 0.57054515, "memory(GiB)": 15.04, "step": 1405, "train_speed(iter/s)": 0.34047 }, { "acc": 0.75613337, "epoch": 0.1898989898989899, "grad_norm": 7.84375, "learning_rate": 1.988988324135085e-05, "loss": 0.86667471, "memory(GiB)": 15.04, "step": 1410, "train_speed(iter/s)": 0.340671 }, { "acc": 0.8119998, "epoch": 0.19057239057239056, "grad_norm": 8.3125, "learning_rate": 1.988822922137549e-05, "loss": 0.8280241, "memory(GiB)": 15.04, "step": 1415, "train_speed(iter/s)": 0.340947 }, { "acc": 0.73748841, "epoch": 0.19124579124579125, "grad_norm": 14.0, "learning_rate": 1.98865629414444e-05, "loss": 0.74178867, "memory(GiB)": 15.04, "step": 1420, "train_speed(iter/s)": 0.341137 }, { "acc": 0.85077839, "epoch": 0.1919191919191919, "grad_norm": 10.1875, "learning_rate": 1.988488440362353e-05, "loss": 0.60300379, "memory(GiB)": 15.04, "step": 1425, "train_speed(iter/s)": 0.341382 }, { "acc": 0.79046402, "epoch": 0.1925925925925926, "grad_norm": 6.34375, "learning_rate": 1.9883193609994013e-05, "loss": 0.4608397, "memory(GiB)": 15.04, "step": 1430, "train_speed(iter/s)": 0.341308 }, { "acc": 0.85704985, "epoch": 0.19326599326599325, "grad_norm": 8.5625, "learning_rate": 1.9881490562652195e-05, "loss": 0.49345632, "memory(GiB)": 15.04, "step": 1435, "train_speed(iter/s)": 0.341365 }, { "acc": 0.80162678, "epoch": 0.19393939393939394, "grad_norm": 11.25, "learning_rate": 1.9879775263709597e-05, "loss": 0.75367384, "memory(GiB)": 15.04, "step": 1440, "train_speed(iter/s)": 0.341417 }, { "acc": 0.86569061, "epoch": 0.19461279461279463, "grad_norm": 24.875, "learning_rate": 1.9878047715292944e-05, "loss": 0.47033658, "memory(GiB)": 15.04, "step": 1445, "train_speed(iter/s)": 0.341514 }, { "acc": 0.89666128, "epoch": 0.19528619528619529, "grad_norm": 13.9375, "learning_rate": 1.987630791954414e-05, "loss": 0.36658845, "memory(GiB)": 15.04, "step": 1450, "train_speed(iter/s)": 0.34175 }, { "acc": 0.86566019, "epoch": 0.19595959595959597, "grad_norm": 10.375, "learning_rate": 1.9874555878620278e-05, "loss": 0.47194514, "memory(GiB)": 15.04, "step": 1455, "train_speed(iter/s)": 0.341849 }, { "acc": 0.85536938, "epoch": 0.19663299663299663, "grad_norm": 6.75, "learning_rate": 1.987279159469363e-05, "loss": 0.54388247, "memory(GiB)": 15.04, "step": 1460, "train_speed(iter/s)": 0.341891 }, { "acc": 0.83794804, "epoch": 0.19730639730639732, "grad_norm": 7.125, "learning_rate": 1.987101506995165e-05, "loss": 0.55969214, "memory(GiB)": 15.04, "step": 1465, "train_speed(iter/s)": 0.341893 }, { "acc": 0.85852814, "epoch": 0.19797979797979798, "grad_norm": 11.6875, "learning_rate": 1.9869226306596973e-05, "loss": 0.45950704, "memory(GiB)": 15.04, "step": 1470, "train_speed(iter/s)": 0.342045 }, { "acc": 0.88174915, "epoch": 0.19865319865319866, "grad_norm": 11.1875, "learning_rate": 1.98674253068474e-05, "loss": 0.37424619, "memory(GiB)": 15.04, "step": 1475, "train_speed(iter/s)": 0.342135 }, { "acc": 0.83266773, "epoch": 0.19932659932659932, "grad_norm": 7.40625, "learning_rate": 1.9865612072935904e-05, "loss": 0.83164015, "memory(GiB)": 15.04, "step": 1480, "train_speed(iter/s)": 0.342252 }, { "acc": 0.8816473, "epoch": 0.2, "grad_norm": 8.0, "learning_rate": 1.9863786607110634e-05, "loss": 0.41165295, "memory(GiB)": 15.04, "step": 1485, "train_speed(iter/s)": 0.34244 }, { "acc": 0.85913954, "epoch": 0.20067340067340067, "grad_norm": 17.625, "learning_rate": 1.98619489116349e-05, "loss": 0.44735775, "memory(GiB)": 15.04, "step": 1490, "train_speed(iter/s)": 0.342647 }, { "acc": 0.87764893, "epoch": 0.20134680134680136, "grad_norm": 6.96875, "learning_rate": 1.9860098988787175e-05, "loss": 0.3400228, "memory(GiB)": 15.04, "step": 1495, "train_speed(iter/s)": 0.342664 }, { "acc": 0.8648241, "epoch": 0.20202020202020202, "grad_norm": 13.9375, "learning_rate": 1.9858236840861087e-05, "loss": 0.48049917, "memory(GiB)": 15.04, "step": 1500, "train_speed(iter/s)": 0.342613 }, { "epoch": 0.20202020202020202, "eval_acc": 0.8393517425851624, "eval_loss": 0.6326767206192017, "eval_runtime": 109.81, "eval_samples_per_second": 1.366, "eval_steps_per_second": 1.366, "step": 1500 }, { "acc": 0.83591461, "epoch": 0.2026936026936027, "grad_norm": 9.5, "learning_rate": 1.9856362470165432e-05, "loss": 0.62246451, "memory(GiB)": 15.04, "step": 1505, "train_speed(iter/s)": 0.33423 }, { "acc": 0.81393814, "epoch": 0.20336700336700336, "grad_norm": 10.8125, "learning_rate": 1.9854475879024155e-05, "loss": 0.71856928, "memory(GiB)": 15.04, "step": 1510, "train_speed(iter/s)": 0.334249 }, { "acc": 0.85712175, "epoch": 0.20404040404040405, "grad_norm": 8.0, "learning_rate": 1.9852577069776352e-05, "loss": 0.50289125, "memory(GiB)": 15.04, "step": 1515, "train_speed(iter/s)": 0.334309 }, { "acc": 0.74970808, "epoch": 0.2047138047138047, "grad_norm": 11.9375, "learning_rate": 1.985066604477627e-05, "loss": 1.00005846, "memory(GiB)": 15.04, "step": 1520, "train_speed(iter/s)": 0.334471 }, { "acc": 0.71272602, "epoch": 0.2053872053872054, "grad_norm": 23.0, "learning_rate": 1.9848742806393293e-05, "loss": 0.90867653, "memory(GiB)": 15.04, "step": 1525, "train_speed(iter/s)": 0.334625 }, { "acc": 0.83619108, "epoch": 0.20606060606060606, "grad_norm": 7.46875, "learning_rate": 1.984680735701196e-05, "loss": 0.78301525, "memory(GiB)": 15.04, "step": 1530, "train_speed(iter/s)": 0.334686 }, { "acc": 0.86082382, "epoch": 0.20673400673400674, "grad_norm": 12.5, "learning_rate": 1.984485969903195e-05, "loss": 0.48262935, "memory(GiB)": 15.04, "step": 1535, "train_speed(iter/s)": 0.334732 }, { "acc": 0.86204128, "epoch": 0.2074074074074074, "grad_norm": 8.375, "learning_rate": 1.9842899834868063e-05, "loss": 0.51524258, "memory(GiB)": 15.04, "step": 1540, "train_speed(iter/s)": 0.334906 }, { "acc": 0.79670696, "epoch": 0.2080808080808081, "grad_norm": 9.25, "learning_rate": 1.9840927766950253e-05, "loss": 0.64929891, "memory(GiB)": 15.04, "step": 1545, "train_speed(iter/s)": 0.335055 }, { "acc": 0.85708027, "epoch": 0.20875420875420875, "grad_norm": 13.0625, "learning_rate": 1.9838943497723585e-05, "loss": 0.54325323, "memory(GiB)": 15.04, "step": 1550, "train_speed(iter/s)": 0.33524 }, { "acc": 0.89223938, "epoch": 0.20942760942760943, "grad_norm": 9.625, "learning_rate": 1.9836947029648276e-05, "loss": 0.36963723, "memory(GiB)": 15.04, "step": 1555, "train_speed(iter/s)": 0.335507 }, { "acc": 0.84713097, "epoch": 0.2101010101010101, "grad_norm": 13.25, "learning_rate": 1.9834938365199637e-05, "loss": 0.49888825, "memory(GiB)": 15.04, "step": 1560, "train_speed(iter/s)": 0.335758 }, { "acc": 0.79915905, "epoch": 0.21077441077441078, "grad_norm": 17.25, "learning_rate": 1.9832917506868135e-05, "loss": 0.7733633, "memory(GiB)": 15.04, "step": 1565, "train_speed(iter/s)": 0.33599 }, { "acc": 0.80804539, "epoch": 0.21144781144781144, "grad_norm": 10.875, "learning_rate": 1.9830884457159328e-05, "loss": 0.7196496, "memory(GiB)": 15.04, "step": 1570, "train_speed(iter/s)": 0.336234 }, { "acc": 0.82204647, "epoch": 0.21212121212121213, "grad_norm": 12.0625, "learning_rate": 1.9828839218593897e-05, "loss": 0.66851048, "memory(GiB)": 15.04, "step": 1575, "train_speed(iter/s)": 0.336436 }, { "acc": 0.84909801, "epoch": 0.2127946127946128, "grad_norm": 6.90625, "learning_rate": 1.982678179370765e-05, "loss": 0.54903288, "memory(GiB)": 15.04, "step": 1580, "train_speed(iter/s)": 0.33664 }, { "acc": 0.849652, "epoch": 0.21346801346801347, "grad_norm": 14.1875, "learning_rate": 1.982471218505148e-05, "loss": 0.5503304, "memory(GiB)": 15.04, "step": 1585, "train_speed(iter/s)": 0.336776 }, { "acc": 0.8291213, "epoch": 0.21414141414141413, "grad_norm": 8.4375, "learning_rate": 1.9822630395191408e-05, "loss": 0.49029202, "memory(GiB)": 15.04, "step": 1590, "train_speed(iter/s)": 0.337014 }, { "acc": 0.88485823, "epoch": 0.21481481481481482, "grad_norm": 7.84375, "learning_rate": 1.982053642670854e-05, "loss": 0.45210991, "memory(GiB)": 15.04, "step": 1595, "train_speed(iter/s)": 0.337155 }, { "acc": 0.82303982, "epoch": 0.21548821548821548, "grad_norm": 7.15625, "learning_rate": 1.9818430282199098e-05, "loss": 0.44473071, "memory(GiB)": 15.04, "step": 1600, "train_speed(iter/s)": 0.337222 }, { "acc": 0.82110701, "epoch": 0.21616161616161617, "grad_norm": 10.6875, "learning_rate": 1.981631196427439e-05, "loss": 0.57491765, "memory(GiB)": 15.04, "step": 1605, "train_speed(iter/s)": 0.337458 }, { "acc": 0.82914457, "epoch": 0.21683501683501682, "grad_norm": 6.625, "learning_rate": 1.981418147556082e-05, "loss": 0.72364321, "memory(GiB)": 15.04, "step": 1610, "train_speed(iter/s)": 0.337569 }, { "acc": 0.8326498, "epoch": 0.2175084175084175, "grad_norm": 13.125, "learning_rate": 1.9812038818699878e-05, "loss": 0.56981144, "memory(GiB)": 15.04, "step": 1615, "train_speed(iter/s)": 0.337762 }, { "acc": 0.82243462, "epoch": 0.21818181818181817, "grad_norm": 6.625, "learning_rate": 1.980988399634815e-05, "loss": 0.48505726, "memory(GiB)": 15.04, "step": 1620, "train_speed(iter/s)": 0.337939 }, { "acc": 0.85339508, "epoch": 0.21885521885521886, "grad_norm": 11.3125, "learning_rate": 1.9807717011177298e-05, "loss": 0.59794049, "memory(GiB)": 15.04, "step": 1625, "train_speed(iter/s)": 0.338067 }, { "acc": 0.85775013, "epoch": 0.21952861952861952, "grad_norm": 11.6875, "learning_rate": 1.9805537865874063e-05, "loss": 0.59125137, "memory(GiB)": 15.04, "step": 1630, "train_speed(iter/s)": 0.338083 }, { "acc": 0.88099813, "epoch": 0.2202020202020202, "grad_norm": 10.6875, "learning_rate": 1.9803346563140273e-05, "loss": 0.37204971, "memory(GiB)": 15.04, "step": 1635, "train_speed(iter/s)": 0.338111 }, { "acc": 0.84456615, "epoch": 0.22087542087542086, "grad_norm": 11.4375, "learning_rate": 1.9801143105692815e-05, "loss": 0.65295229, "memory(GiB)": 15.04, "step": 1640, "train_speed(iter/s)": 0.338167 }, { "acc": 0.87735825, "epoch": 0.22154882154882155, "grad_norm": 8.125, "learning_rate": 1.979892749626366e-05, "loss": 0.41986647, "memory(GiB)": 15.04, "step": 1645, "train_speed(iter/s)": 0.338334 }, { "acc": 0.81775703, "epoch": 0.2222222222222222, "grad_norm": 15.3125, "learning_rate": 1.9796699737599835e-05, "loss": 0.67741585, "memory(GiB)": 15.04, "step": 1650, "train_speed(iter/s)": 0.338542 }, { "acc": 0.8514535, "epoch": 0.2228956228956229, "grad_norm": 9.0625, "learning_rate": 1.9794459832463438e-05, "loss": 0.49367156, "memory(GiB)": 15.04, "step": 1655, "train_speed(iter/s)": 0.338657 }, { "acc": 0.83197441, "epoch": 0.22356902356902356, "grad_norm": 8.5625, "learning_rate": 1.9792207783631615e-05, "loss": 0.57320757, "memory(GiB)": 15.04, "step": 1660, "train_speed(iter/s)": 0.338744 }, { "acc": 0.83286524, "epoch": 0.22424242424242424, "grad_norm": 7.34375, "learning_rate": 1.9789943593896588e-05, "loss": 0.63577981, "memory(GiB)": 15.04, "step": 1665, "train_speed(iter/s)": 0.338755 }, { "acc": 0.80521784, "epoch": 0.2249158249158249, "grad_norm": 15.625, "learning_rate": 1.9787667266065612e-05, "loss": 0.64762154, "memory(GiB)": 15.04, "step": 1670, "train_speed(iter/s)": 0.338956 }, { "acc": 0.79904494, "epoch": 0.2255892255892256, "grad_norm": 9.375, "learning_rate": 1.9785378802961005e-05, "loss": 0.43870797, "memory(GiB)": 15.04, "step": 1675, "train_speed(iter/s)": 0.339061 }, { "acc": 0.89039049, "epoch": 0.22626262626262628, "grad_norm": 12.875, "learning_rate": 1.978307820742012e-05, "loss": 0.4549356, "memory(GiB)": 15.04, "step": 1680, "train_speed(iter/s)": 0.339174 }, { "acc": 0.87218695, "epoch": 0.22693602693602694, "grad_norm": 6.5, "learning_rate": 1.9780765482295366e-05, "loss": 0.50175328, "memory(GiB)": 15.04, "step": 1685, "train_speed(iter/s)": 0.339312 }, { "acc": 0.88136101, "epoch": 0.22760942760942762, "grad_norm": 9.25, "learning_rate": 1.9778440630454178e-05, "loss": 0.48342233, "memory(GiB)": 15.04, "step": 1690, "train_speed(iter/s)": 0.339531 }, { "acc": 0.80393095, "epoch": 0.22828282828282828, "grad_norm": 10.4375, "learning_rate": 1.9776103654779037e-05, "loss": 0.93885641, "memory(GiB)": 15.04, "step": 1695, "train_speed(iter/s)": 0.339721 }, { "acc": 0.85020695, "epoch": 0.22895622895622897, "grad_norm": 20.125, "learning_rate": 1.9773754558167442e-05, "loss": 0.71851912, "memory(GiB)": 15.04, "step": 1700, "train_speed(iter/s)": 0.339822 }, { "acc": 0.81808643, "epoch": 0.22962962962962963, "grad_norm": 9.625, "learning_rate": 1.9771393343531938e-05, "loss": 0.7361897, "memory(GiB)": 15.04, "step": 1705, "train_speed(iter/s)": 0.339862 }, { "acc": 0.7765965, "epoch": 0.23030303030303031, "grad_norm": 11.4375, "learning_rate": 1.976902001380008e-05, "loss": 0.95105314, "memory(GiB)": 15.04, "step": 1710, "train_speed(iter/s)": 0.340035 }, { "acc": 0.86265593, "epoch": 0.23097643097643097, "grad_norm": 8.9375, "learning_rate": 1.9766634571914448e-05, "loss": 0.62800045, "memory(GiB)": 15.04, "step": 1715, "train_speed(iter/s)": 0.339976 }, { "acc": 0.89045258, "epoch": 0.23164983164983166, "grad_norm": 11.875, "learning_rate": 1.9764237020832644e-05, "loss": 0.42381425, "memory(GiB)": 15.04, "step": 1720, "train_speed(iter/s)": 0.340081 }, { "acc": 0.8708931, "epoch": 0.23232323232323232, "grad_norm": 16.375, "learning_rate": 1.976182736352728e-05, "loss": 0.57231526, "memory(GiB)": 15.04, "step": 1725, "train_speed(iter/s)": 0.340121 }, { "acc": 0.78722067, "epoch": 0.232996632996633, "grad_norm": 21.875, "learning_rate": 1.9759405602985973e-05, "loss": 0.74272079, "memory(GiB)": 15.04, "step": 1730, "train_speed(iter/s)": 0.340333 }, { "acc": 0.91115456, "epoch": 0.23367003367003367, "grad_norm": 10.5, "learning_rate": 1.975697174221136e-05, "loss": 0.33090246, "memory(GiB)": 15.04, "step": 1735, "train_speed(iter/s)": 0.340501 }, { "acc": 0.79775496, "epoch": 0.23434343434343435, "grad_norm": 15.8125, "learning_rate": 1.9754525784221067e-05, "loss": 0.71790228, "memory(GiB)": 15.04, "step": 1740, "train_speed(iter/s)": 0.340611 }, { "acc": 0.83832827, "epoch": 0.235016835016835, "grad_norm": 13.875, "learning_rate": 1.975206773204772e-05, "loss": 0.47882934, "memory(GiB)": 15.04, "step": 1745, "train_speed(iter/s)": 0.340706 }, { "acc": 0.81263514, "epoch": 0.2356902356902357, "grad_norm": 9.25, "learning_rate": 1.974959758873895e-05, "loss": 0.63873296, "memory(GiB)": 15.04, "step": 1750, "train_speed(iter/s)": 0.340898 }, { "acc": 0.8787571, "epoch": 0.23636363636363636, "grad_norm": 11.1875, "learning_rate": 1.974711535735737e-05, "loss": 0.52029638, "memory(GiB)": 15.04, "step": 1755, "train_speed(iter/s)": 0.341015 }, { "acc": 0.80551605, "epoch": 0.23703703703703705, "grad_norm": 7.125, "learning_rate": 1.9744621040980584e-05, "loss": 0.53825655, "memory(GiB)": 15.04, "step": 1760, "train_speed(iter/s)": 0.341192 }, { "acc": 0.87776775, "epoch": 0.2377104377104377, "grad_norm": 16.0, "learning_rate": 1.9742114642701177e-05, "loss": 0.42201047, "memory(GiB)": 15.04, "step": 1765, "train_speed(iter/s)": 0.341277 }, { "acc": 0.88196945, "epoch": 0.2383838383838384, "grad_norm": 9.5, "learning_rate": 1.9739596165626714e-05, "loss": 0.42846918, "memory(GiB)": 15.04, "step": 1770, "train_speed(iter/s)": 0.341506 }, { "acc": 0.83472624, "epoch": 0.23905723905723905, "grad_norm": 21.625, "learning_rate": 1.9737065612879748e-05, "loss": 0.5340138, "memory(GiB)": 15.04, "step": 1775, "train_speed(iter/s)": 0.341626 }, { "acc": 0.82384624, "epoch": 0.23973063973063974, "grad_norm": 9.375, "learning_rate": 1.973452298759778e-05, "loss": 0.65406966, "memory(GiB)": 15.04, "step": 1780, "train_speed(iter/s)": 0.341785 }, { "acc": 0.88805351, "epoch": 0.2404040404040404, "grad_norm": 13.5625, "learning_rate": 1.9731968292933303e-05, "loss": 0.42406859, "memory(GiB)": 15.04, "step": 1785, "train_speed(iter/s)": 0.341915 }, { "acc": 0.86141911, "epoch": 0.24107744107744108, "grad_norm": 6.75, "learning_rate": 1.972940153205376e-05, "loss": 0.44310484, "memory(GiB)": 15.04, "step": 1790, "train_speed(iter/s)": 0.342011 }, { "acc": 0.80693083, "epoch": 0.24175084175084174, "grad_norm": 27.0, "learning_rate": 1.972682270814156e-05, "loss": 0.82178726, "memory(GiB)": 15.04, "step": 1795, "train_speed(iter/s)": 0.342255 }, { "acc": 0.86837473, "epoch": 0.24242424242424243, "grad_norm": 7.5625, "learning_rate": 1.972423182439406e-05, "loss": 0.47514634, "memory(GiB)": 15.04, "step": 1800, "train_speed(iter/s)": 0.342382 }, { "epoch": 0.24242424242424243, "eval_acc": 0.843474600794465, "eval_loss": 0.6183801889419556, "eval_runtime": 109.7755, "eval_samples_per_second": 1.366, "eval_steps_per_second": 1.366, "step": 1800 }, { "acc": 0.89186497, "epoch": 0.2430976430976431, "grad_norm": 11.1875, "learning_rate": 1.972162888402359e-05, "loss": 0.45051031, "memory(GiB)": 15.04, "step": 1805, "train_speed(iter/s)": 0.335557 }, { "acc": 0.78250394, "epoch": 0.24377104377104378, "grad_norm": 9.0, "learning_rate": 1.9719013890257402e-05, "loss": 0.64437308, "memory(GiB)": 15.04, "step": 1810, "train_speed(iter/s)": 0.335702 }, { "acc": 0.80951223, "epoch": 0.24444444444444444, "grad_norm": 8.9375, "learning_rate": 1.971638684633771e-05, "loss": 0.83825159, "memory(GiB)": 15.04, "step": 1815, "train_speed(iter/s)": 0.335812 }, { "acc": 0.86868896, "epoch": 0.24511784511784512, "grad_norm": 12.9375, "learning_rate": 1.9713747755521665e-05, "loss": 0.52982354, "memory(GiB)": 15.04, "step": 1820, "train_speed(iter/s)": 0.335908 }, { "acc": 0.90424519, "epoch": 0.24579124579124578, "grad_norm": 12.1875, "learning_rate": 1.9711096621081353e-05, "loss": 0.34413009, "memory(GiB)": 15.04, "step": 1825, "train_speed(iter/s)": 0.336133 }, { "acc": 0.73576164, "epoch": 0.24646464646464647, "grad_norm": 6.96875, "learning_rate": 1.970843344630379e-05, "loss": 1.21805191, "memory(GiB)": 15.04, "step": 1830, "train_speed(iter/s)": 0.336158 }, { "acc": 0.77103219, "epoch": 0.24713804713804713, "grad_norm": 13.5, "learning_rate": 1.9705758234490923e-05, "loss": 0.7155683, "memory(GiB)": 15.04, "step": 1835, "train_speed(iter/s)": 0.336201 }, { "acc": 0.8558444, "epoch": 0.24781144781144782, "grad_norm": 9.5625, "learning_rate": 1.9703070988959622e-05, "loss": 0.49693632, "memory(GiB)": 15.04, "step": 1840, "train_speed(iter/s)": 0.336255 }, { "acc": 0.86143408, "epoch": 0.24848484848484848, "grad_norm": 9.5625, "learning_rate": 1.9700371713041682e-05, "loss": 0.53240633, "memory(GiB)": 15.04, "step": 1845, "train_speed(iter/s)": 0.336364 }, { "acc": 0.88647976, "epoch": 0.24915824915824916, "grad_norm": 8.9375, "learning_rate": 1.969766041008381e-05, "loss": 0.44659019, "memory(GiB)": 15.04, "step": 1850, "train_speed(iter/s)": 0.336546 }, { "acc": 0.89583845, "epoch": 0.24983164983164982, "grad_norm": 7.28125, "learning_rate": 1.9694937083447614e-05, "loss": 0.3929671, "memory(GiB)": 15.04, "step": 1855, "train_speed(iter/s)": 0.336773 }, { "acc": 0.84875755, "epoch": 0.2505050505050505, "grad_norm": 10.0625, "learning_rate": 1.9692201736509632e-05, "loss": 0.55382719, "memory(GiB)": 15.04, "step": 1860, "train_speed(iter/s)": 0.336874 }, { "acc": 0.86458168, "epoch": 0.25117845117845117, "grad_norm": 10.6875, "learning_rate": 1.968945437266129e-05, "loss": 0.53511877, "memory(GiB)": 15.04, "step": 1865, "train_speed(iter/s)": 0.336995 }, { "acc": 0.85841675, "epoch": 0.2518518518518518, "grad_norm": 14.125, "learning_rate": 1.9686694995308913e-05, "loss": 0.58149495, "memory(GiB)": 15.04, "step": 1870, "train_speed(iter/s)": 0.337087 }, { "acc": 0.81333094, "epoch": 0.25252525252525254, "grad_norm": 21.0, "learning_rate": 1.9683923607873726e-05, "loss": 0.51532841, "memory(GiB)": 15.04, "step": 1875, "train_speed(iter/s)": 0.337319 }, { "acc": 0.81919813, "epoch": 0.2531986531986532, "grad_norm": 9.625, "learning_rate": 1.968114021379185e-05, "loss": 0.72619085, "memory(GiB)": 15.04, "step": 1880, "train_speed(iter/s)": 0.337285 }, { "acc": 0.79009395, "epoch": 0.25387205387205386, "grad_norm": 6.71875, "learning_rate": 1.967834481651428e-05, "loss": 0.59885988, "memory(GiB)": 15.04, "step": 1885, "train_speed(iter/s)": 0.33742 }, { "acc": 0.82559881, "epoch": 0.2545454545454545, "grad_norm": 11.125, "learning_rate": 1.9675537419506897e-05, "loss": 0.73017178, "memory(GiB)": 15.04, "step": 1890, "train_speed(iter/s)": 0.337499 }, { "acc": 0.84612713, "epoch": 0.25521885521885523, "grad_norm": 8.8125, "learning_rate": 1.9672718026250467e-05, "loss": 0.56751752, "memory(GiB)": 15.04, "step": 1895, "train_speed(iter/s)": 0.337682 }, { "acc": 0.82871618, "epoch": 0.2558922558922559, "grad_norm": 9.375, "learning_rate": 1.9669886640240622e-05, "loss": 0.53650327, "memory(GiB)": 15.04, "step": 1900, "train_speed(iter/s)": 0.337762 }, { "acc": 0.88074055, "epoch": 0.25656565656565655, "grad_norm": 9.0625, "learning_rate": 1.966704326498787e-05, "loss": 0.37428265, "memory(GiB)": 15.04, "step": 1905, "train_speed(iter/s)": 0.337908 }, { "acc": 0.74937229, "epoch": 0.2572390572390572, "grad_norm": 7.34375, "learning_rate": 1.966418790401757e-05, "loss": 0.48255777, "memory(GiB)": 15.04, "step": 1910, "train_speed(iter/s)": 0.337963 }, { "acc": 0.91495829, "epoch": 0.2579124579124579, "grad_norm": 5.0625, "learning_rate": 1.966132056086996e-05, "loss": 0.29679358, "memory(GiB)": 15.04, "step": 1915, "train_speed(iter/s)": 0.337877 }, { "acc": 0.82159767, "epoch": 0.2585858585858586, "grad_norm": 7.09375, "learning_rate": 1.9658441239100125e-05, "loss": 0.50666471, "memory(GiB)": 15.04, "step": 1920, "train_speed(iter/s)": 0.337989 }, { "acc": 0.83229828, "epoch": 0.25925925925925924, "grad_norm": 5.34375, "learning_rate": 1.9655549942278e-05, "loss": 0.54949713, "memory(GiB)": 15.04, "step": 1925, "train_speed(iter/s)": 0.33792 }, { "acc": 0.85097256, "epoch": 0.25993265993265996, "grad_norm": 6.28125, "learning_rate": 1.9652646673988373e-05, "loss": 0.48549409, "memory(GiB)": 15.04, "step": 1930, "train_speed(iter/s)": 0.337977 }, { "acc": 0.84481211, "epoch": 0.2606060606060606, "grad_norm": 9.4375, "learning_rate": 1.964973143783086e-05, "loss": 0.67328267, "memory(GiB)": 15.04, "step": 1935, "train_speed(iter/s)": 0.338016 }, { "acc": 0.8501667, "epoch": 0.2612794612794613, "grad_norm": 19.125, "learning_rate": 1.964680423741994e-05, "loss": 0.43254037, "memory(GiB)": 15.04, "step": 1940, "train_speed(iter/s)": 0.338206 }, { "acc": 0.86048279, "epoch": 0.26195286195286194, "grad_norm": 32.75, "learning_rate": 1.964386507638491e-05, "loss": 0.59896231, "memory(GiB)": 15.04, "step": 1945, "train_speed(iter/s)": 0.338329 }, { "acc": 0.88457479, "epoch": 0.26262626262626265, "grad_norm": 8.25, "learning_rate": 1.9640913958369895e-05, "loss": 0.43649497, "memory(GiB)": 15.04, "step": 1950, "train_speed(iter/s)": 0.338487 }, { "acc": 0.82918968, "epoch": 0.2632996632996633, "grad_norm": 8.375, "learning_rate": 1.963795088703385e-05, "loss": 0.50043716, "memory(GiB)": 15.04, "step": 1955, "train_speed(iter/s)": 0.338408 }, { "acc": 0.77567244, "epoch": 0.26397306397306397, "grad_norm": 19.125, "learning_rate": 1.963497586605055e-05, "loss": 0.88877125, "memory(GiB)": 15.04, "step": 1960, "train_speed(iter/s)": 0.338565 }, { "acc": 0.87920694, "epoch": 0.26464646464646463, "grad_norm": 6.65625, "learning_rate": 1.963198889910859e-05, "loss": 0.4500257, "memory(GiB)": 15.04, "step": 1965, "train_speed(iter/s)": 0.338666 }, { "acc": 0.86709795, "epoch": 0.26531986531986534, "grad_norm": 11.4375, "learning_rate": 1.962898998991136e-05, "loss": 0.35767465, "memory(GiB)": 15.04, "step": 1970, "train_speed(iter/s)": 0.33889 }, { "acc": 0.87004099, "epoch": 0.265993265993266, "grad_norm": 6.96875, "learning_rate": 1.962597914217708e-05, "loss": 0.44529519, "memory(GiB)": 15.04, "step": 1975, "train_speed(iter/s)": 0.338998 }, { "acc": 0.88034582, "epoch": 0.26666666666666666, "grad_norm": 6.40625, "learning_rate": 1.9622956359638752e-05, "loss": 0.40983272, "memory(GiB)": 15.04, "step": 1980, "train_speed(iter/s)": 0.339066 }, { "acc": 0.85966377, "epoch": 0.2673400673400673, "grad_norm": 18.875, "learning_rate": 1.9619921646044188e-05, "loss": 0.57280025, "memory(GiB)": 15.04, "step": 1985, "train_speed(iter/s)": 0.339158 }, { "acc": 0.87830267, "epoch": 0.26801346801346804, "grad_norm": 7.875, "learning_rate": 1.9616875005155988e-05, "loss": 0.5095448, "memory(GiB)": 15.04, "step": 1990, "train_speed(iter/s)": 0.339323 }, { "acc": 0.86530752, "epoch": 0.2686868686868687, "grad_norm": 6.78125, "learning_rate": 1.961381644075154e-05, "loss": 0.45437994, "memory(GiB)": 15.04, "step": 1995, "train_speed(iter/s)": 0.339361 }, { "acc": 0.84290581, "epoch": 0.26936026936026936, "grad_norm": 9.6875, "learning_rate": 1.9610745956623013e-05, "loss": 0.48863568, "memory(GiB)": 15.04, "step": 2000, "train_speed(iter/s)": 0.339462 }, { "acc": 0.88849602, "epoch": 0.27003367003367, "grad_norm": 6.96875, "learning_rate": 1.9607663556577365e-05, "loss": 0.45863638, "memory(GiB)": 15.04, "step": 2005, "train_speed(iter/s)": 0.339325 }, { "acc": 0.87045746, "epoch": 0.27070707070707073, "grad_norm": 11.5625, "learning_rate": 1.9604569244436308e-05, "loss": 0.44171586, "memory(GiB)": 15.04, "step": 2010, "train_speed(iter/s)": 0.339288 }, { "acc": 0.84935579, "epoch": 0.2713804713804714, "grad_norm": 10.1875, "learning_rate": 1.9601463024036346e-05, "loss": 0.50538421, "memory(GiB)": 15.04, "step": 2015, "train_speed(iter/s)": 0.339413 }, { "acc": 0.90092773, "epoch": 0.27205387205387205, "grad_norm": 8.75, "learning_rate": 1.959834489922874e-05, "loss": 0.38511703, "memory(GiB)": 15.04, "step": 2020, "train_speed(iter/s)": 0.339554 }, { "acc": 0.87212915, "epoch": 0.2727272727272727, "grad_norm": 6.96875, "learning_rate": 1.9595214873879494e-05, "loss": 0.35981777, "memory(GiB)": 15.04, "step": 2025, "train_speed(iter/s)": 0.339741 }, { "acc": 0.86869631, "epoch": 0.2734006734006734, "grad_norm": 11.25, "learning_rate": 1.9592072951869394e-05, "loss": 0.53693933, "memory(GiB)": 15.04, "step": 2030, "train_speed(iter/s)": 0.339871 }, { "acc": 0.81653395, "epoch": 0.2740740740740741, "grad_norm": 7.5, "learning_rate": 1.9588919137093956e-05, "loss": 0.51912298, "memory(GiB)": 15.04, "step": 2035, "train_speed(iter/s)": 0.339811 }, { "acc": 0.79491487, "epoch": 0.27474747474747474, "grad_norm": 6.625, "learning_rate": 1.9585753433463452e-05, "loss": 0.64108443, "memory(GiB)": 15.04, "step": 2040, "train_speed(iter/s)": 0.339931 }, { "acc": 0.80765038, "epoch": 0.2754208754208754, "grad_norm": 9.6875, "learning_rate": 1.958257584490289e-05, "loss": 1.07152681, "memory(GiB)": 15.04, "step": 2045, "train_speed(iter/s)": 0.339958 }, { "acc": 0.79323549, "epoch": 0.2760942760942761, "grad_norm": 7.34375, "learning_rate": 1.9579386375352015e-05, "loss": 0.79024658, "memory(GiB)": 15.04, "step": 2050, "train_speed(iter/s)": 0.339931 }, { "acc": 0.78419557, "epoch": 0.2767676767676768, "grad_norm": 6.21875, "learning_rate": 1.9576185028765296e-05, "loss": 0.72721648, "memory(GiB)": 15.04, "step": 2055, "train_speed(iter/s)": 0.340049 }, { "acc": 0.86066732, "epoch": 0.27744107744107743, "grad_norm": 7.3125, "learning_rate": 1.9572971809111944e-05, "loss": 0.53243432, "memory(GiB)": 15.04, "step": 2060, "train_speed(iter/s)": 0.340117 }, { "acc": 0.87064056, "epoch": 0.2781144781144781, "grad_norm": 8.3125, "learning_rate": 1.9569746720375873e-05, "loss": 0.63533983, "memory(GiB)": 15.04, "step": 2065, "train_speed(iter/s)": 0.340126 }, { "acc": 0.87459402, "epoch": 0.2787878787878788, "grad_norm": 13.5, "learning_rate": 1.9566509766555725e-05, "loss": 0.5083847, "memory(GiB)": 15.04, "step": 2070, "train_speed(iter/s)": 0.340285 }, { "acc": 0.86485825, "epoch": 0.27946127946127947, "grad_norm": 6.09375, "learning_rate": 1.9563260951664844e-05, "loss": 0.46479993, "memory(GiB)": 15.04, "step": 2075, "train_speed(iter/s)": 0.340277 }, { "acc": 0.87869692, "epoch": 0.2801346801346801, "grad_norm": 16.625, "learning_rate": 1.9560000279731285e-05, "loss": 0.40669374, "memory(GiB)": 15.04, "step": 2080, "train_speed(iter/s)": 0.340421 }, { "acc": 0.87450075, "epoch": 0.2808080808080808, "grad_norm": 8.0, "learning_rate": 1.9556727754797808e-05, "loss": 0.41202154, "memory(GiB)": 15.04, "step": 2085, "train_speed(iter/s)": 0.340408 }, { "acc": 0.86928625, "epoch": 0.2814814814814815, "grad_norm": 9.8125, "learning_rate": 1.9553443380921862e-05, "loss": 0.4687561, "memory(GiB)": 15.04, "step": 2090, "train_speed(iter/s)": 0.340497 }, { "acc": 0.82436953, "epoch": 0.28215488215488216, "grad_norm": 10.5625, "learning_rate": 1.955014716217559e-05, "loss": 0.53379784, "memory(GiB)": 15.04, "step": 2095, "train_speed(iter/s)": 0.34057 }, { "acc": 0.86324949, "epoch": 0.2828282828282828, "grad_norm": 13.25, "learning_rate": 1.954683910264582e-05, "loss": 0.47917733, "memory(GiB)": 15.04, "step": 2100, "train_speed(iter/s)": 0.34075 }, { "epoch": 0.2828282828282828, "eval_acc": 0.8546162196510365, "eval_loss": 0.5698094964027405, "eval_runtime": 109.4795, "eval_samples_per_second": 1.37, "eval_steps_per_second": 1.37, "step": 2100 }, { "acc": 0.8573041, "epoch": 0.2835016835016835, "grad_norm": 68.5, "learning_rate": 1.954351920643406e-05, "loss": 0.45866795, "memory(GiB)": 15.04, "step": 2105, "train_speed(iter/s)": 0.335005 }, { "acc": 0.88993645, "epoch": 0.2841750841750842, "grad_norm": 8.0, "learning_rate": 1.95401874776565e-05, "loss": 0.43657699, "memory(GiB)": 15.04, "step": 2110, "train_speed(iter/s)": 0.335161 }, { "acc": 0.85032578, "epoch": 0.28484848484848485, "grad_norm": 9.125, "learning_rate": 1.953684392044399e-05, "loss": 0.49008636, "memory(GiB)": 15.04, "step": 2115, "train_speed(iter/s)": 0.335288 }, { "acc": 0.88042259, "epoch": 0.2855218855218855, "grad_norm": 9.0625, "learning_rate": 1.953348853894205e-05, "loss": 0.46665258, "memory(GiB)": 15.04, "step": 2120, "train_speed(iter/s)": 0.335299 }, { "acc": 0.84420929, "epoch": 0.28619528619528617, "grad_norm": 6.03125, "learning_rate": 1.9530121337310866e-05, "loss": 0.4866931, "memory(GiB)": 15.04, "step": 2125, "train_speed(iter/s)": 0.335485 }, { "acc": 0.75258598, "epoch": 0.2868686868686869, "grad_norm": 10.0625, "learning_rate": 1.952674231972527e-05, "loss": 0.78863392, "memory(GiB)": 15.04, "step": 2130, "train_speed(iter/s)": 0.335674 }, { "acc": 0.80859814, "epoch": 0.28754208754208754, "grad_norm": 17.375, "learning_rate": 1.952335149037476e-05, "loss": 0.73442564, "memory(GiB)": 15.04, "step": 2135, "train_speed(iter/s)": 0.335804 }, { "acc": 0.82581291, "epoch": 0.2882154882154882, "grad_norm": 30.125, "learning_rate": 1.9519948853463453e-05, "loss": 0.47450933, "memory(GiB)": 15.04, "step": 2140, "train_speed(iter/s)": 0.33591 }, { "acc": 0.84496117, "epoch": 0.28888888888888886, "grad_norm": 11.625, "learning_rate": 1.951653441321013e-05, "loss": 0.50630031, "memory(GiB)": 15.04, "step": 2145, "train_speed(iter/s)": 0.336086 }, { "acc": 0.90782948, "epoch": 0.2895622895622896, "grad_norm": 14.5625, "learning_rate": 1.9513108173848193e-05, "loss": 0.36844997, "memory(GiB)": 15.04, "step": 2150, "train_speed(iter/s)": 0.336242 }, { "acc": 0.83160648, "epoch": 0.29023569023569024, "grad_norm": 17.5, "learning_rate": 1.950967013962568e-05, "loss": 0.74357166, "memory(GiB)": 15.04, "step": 2155, "train_speed(iter/s)": 0.336361 }, { "acc": 0.86514874, "epoch": 0.2909090909090909, "grad_norm": 11.5, "learning_rate": 1.950622031480524e-05, "loss": 0.4806139, "memory(GiB)": 15.04, "step": 2160, "train_speed(iter/s)": 0.336509 }, { "acc": 0.88152399, "epoch": 0.2915824915824916, "grad_norm": 15.5625, "learning_rate": 1.950275870366417e-05, "loss": 0.42793164, "memory(GiB)": 15.04, "step": 2165, "train_speed(iter/s)": 0.336669 }, { "acc": 0.81066751, "epoch": 0.29225589225589227, "grad_norm": 13.9375, "learning_rate": 1.9499285310494337e-05, "loss": 0.62704086, "memory(GiB)": 15.04, "step": 2170, "train_speed(iter/s)": 0.336743 }, { "acc": 0.84785643, "epoch": 0.29292929292929293, "grad_norm": 13.5, "learning_rate": 1.949580013960226e-05, "loss": 0.52745781, "memory(GiB)": 15.04, "step": 2175, "train_speed(iter/s)": 0.33674 }, { "acc": 0.82926636, "epoch": 0.2936026936026936, "grad_norm": 19.125, "learning_rate": 1.9492303195309028e-05, "loss": 0.65729084, "memory(GiB)": 15.04, "step": 2180, "train_speed(iter/s)": 0.336937 }, { "acc": 0.72600865, "epoch": 0.2942760942760943, "grad_norm": 8.1875, "learning_rate": 1.9488794481950345e-05, "loss": 0.94059401, "memory(GiB)": 15.04, "step": 2185, "train_speed(iter/s)": 0.337106 }, { "acc": 0.85088053, "epoch": 0.29494949494949496, "grad_norm": 9.8125, "learning_rate": 1.9485274003876497e-05, "loss": 0.35568237, "memory(GiB)": 15.04, "step": 2190, "train_speed(iter/s)": 0.337205 }, { "acc": 0.88804274, "epoch": 0.2956228956228956, "grad_norm": 14.8125, "learning_rate": 1.9481741765452364e-05, "loss": 0.49214945, "memory(GiB)": 15.04, "step": 2195, "train_speed(iter/s)": 0.337299 }, { "acc": 0.88171883, "epoch": 0.2962962962962963, "grad_norm": 7.8125, "learning_rate": 1.9478197771057407e-05, "loss": 0.3708432, "memory(GiB)": 15.04, "step": 2200, "train_speed(iter/s)": 0.337492 }, { "acc": 0.83816452, "epoch": 0.296969696969697, "grad_norm": 13.4375, "learning_rate": 1.9474642025085656e-05, "loss": 0.63066597, "memory(GiB)": 15.04, "step": 2205, "train_speed(iter/s)": 0.337539 }, { "acc": 0.87264824, "epoch": 0.29764309764309765, "grad_norm": 12.375, "learning_rate": 1.9471074531945716e-05, "loss": 0.42701688, "memory(GiB)": 15.04, "step": 2210, "train_speed(iter/s)": 0.337641 }, { "acc": 0.86768637, "epoch": 0.2983164983164983, "grad_norm": 14.875, "learning_rate": 1.9467495296060755e-05, "loss": 0.48693333, "memory(GiB)": 15.04, "step": 2215, "train_speed(iter/s)": 0.337826 }, { "acc": 0.85808105, "epoch": 0.298989898989899, "grad_norm": 17.125, "learning_rate": 1.9463904321868508e-05, "loss": 0.51649294, "memory(GiB)": 15.04, "step": 2220, "train_speed(iter/s)": 0.337937 }, { "acc": 0.82949419, "epoch": 0.2996632996632997, "grad_norm": 17.625, "learning_rate": 1.9460301613821246e-05, "loss": 0.4572835, "memory(GiB)": 15.04, "step": 2225, "train_speed(iter/s)": 0.338094 }, { "acc": 0.84240913, "epoch": 0.30033670033670035, "grad_norm": 10.3125, "learning_rate": 1.9456687176385806e-05, "loss": 0.74288449, "memory(GiB)": 15.04, "step": 2230, "train_speed(iter/s)": 0.338173 }, { "acc": 0.86656504, "epoch": 0.301010101010101, "grad_norm": 6.5, "learning_rate": 1.945306101404356e-05, "loss": 0.38929241, "memory(GiB)": 15.04, "step": 2235, "train_speed(iter/s)": 0.338236 }, { "acc": 0.90445004, "epoch": 0.30168350168350166, "grad_norm": 7.875, "learning_rate": 1.944942313129042e-05, "loss": 0.40537362, "memory(GiB)": 15.04, "step": 2240, "train_speed(iter/s)": 0.338265 }, { "acc": 0.86881905, "epoch": 0.3023569023569024, "grad_norm": 13.375, "learning_rate": 1.9445773532636823e-05, "loss": 0.56779408, "memory(GiB)": 15.04, "step": 2245, "train_speed(iter/s)": 0.338382 }, { "acc": 0.84482965, "epoch": 0.30303030303030304, "grad_norm": 10.5625, "learning_rate": 1.9442112222607737e-05, "loss": 0.56202984, "memory(GiB)": 15.04, "step": 2250, "train_speed(iter/s)": 0.338441 }, { "acc": 0.88069286, "epoch": 0.3037037037037037, "grad_norm": 7.625, "learning_rate": 1.9438439205742656e-05, "loss": 0.26127765, "memory(GiB)": 15.04, "step": 2255, "train_speed(iter/s)": 0.338643 }, { "acc": 0.85242128, "epoch": 0.30437710437710436, "grad_norm": 7.03125, "learning_rate": 1.9434754486595576e-05, "loss": 0.53033781, "memory(GiB)": 15.04, "step": 2260, "train_speed(iter/s)": 0.338764 }, { "acc": 0.89758568, "epoch": 0.30505050505050507, "grad_norm": 13.375, "learning_rate": 1.9431058069735016e-05, "loss": 0.3443686, "memory(GiB)": 15.04, "step": 2265, "train_speed(iter/s)": 0.338912 }, { "acc": 0.86288328, "epoch": 0.30572390572390573, "grad_norm": 14.9375, "learning_rate": 1.9427349959743983e-05, "loss": 0.44742985, "memory(GiB)": 15.04, "step": 2270, "train_speed(iter/s)": 0.339055 }, { "acc": 0.7970829, "epoch": 0.3063973063973064, "grad_norm": 23.5, "learning_rate": 1.9423630161219996e-05, "loss": 0.73262329, "memory(GiB)": 15.04, "step": 2275, "train_speed(iter/s)": 0.339145 }, { "acc": 0.90343246, "epoch": 0.30707070707070705, "grad_norm": 10.3125, "learning_rate": 1.941989867877506e-05, "loss": 0.37732604, "memory(GiB)": 15.04, "step": 2280, "train_speed(iter/s)": 0.33926 }, { "acc": 0.89101963, "epoch": 0.30774410774410776, "grad_norm": 10.375, "learning_rate": 1.9416155517035666e-05, "loss": 0.42690949, "memory(GiB)": 15.04, "step": 2285, "train_speed(iter/s)": 0.339246 }, { "acc": 0.80757065, "epoch": 0.3084175084175084, "grad_norm": 22.375, "learning_rate": 1.9412400680642785e-05, "loss": 0.57092986, "memory(GiB)": 15.04, "step": 2290, "train_speed(iter/s)": 0.339331 }, { "acc": 0.84998417, "epoch": 0.3090909090909091, "grad_norm": 9.125, "learning_rate": 1.9408634174251864e-05, "loss": 0.52270007, "memory(GiB)": 15.04, "step": 2295, "train_speed(iter/s)": 0.339404 }, { "acc": 0.89776154, "epoch": 0.30976430976430974, "grad_norm": 6.90625, "learning_rate": 1.9404856002532822e-05, "loss": 0.34066579, "memory(GiB)": 15.04, "step": 2300, "train_speed(iter/s)": 0.339485 }, { "acc": 0.87052498, "epoch": 0.31043771043771046, "grad_norm": 7.1875, "learning_rate": 1.9401066170170034e-05, "loss": 0.279761, "memory(GiB)": 15.04, "step": 2305, "train_speed(iter/s)": 0.339601 }, { "acc": 0.8039012, "epoch": 0.3111111111111111, "grad_norm": 17.25, "learning_rate": 1.939726468186234e-05, "loss": 0.89665995, "memory(GiB)": 15.04, "step": 2310, "train_speed(iter/s)": 0.339741 }, { "acc": 0.87178888, "epoch": 0.3117845117845118, "grad_norm": 7.0625, "learning_rate": 1.939345154232303e-05, "loss": 0.46687875, "memory(GiB)": 15.04, "step": 2315, "train_speed(iter/s)": 0.339855 }, { "acc": 0.77452545, "epoch": 0.31245791245791243, "grad_norm": 7.15625, "learning_rate": 1.9389626756279834e-05, "loss": 0.58602438, "memory(GiB)": 15.04, "step": 2320, "train_speed(iter/s)": 0.339912 }, { "acc": 0.87413473, "epoch": 0.31313131313131315, "grad_norm": 11.3125, "learning_rate": 1.938579032847493e-05, "loss": 0.43738194, "memory(GiB)": 15.04, "step": 2325, "train_speed(iter/s)": 0.339952 }, { "acc": 0.91057472, "epoch": 0.3138047138047138, "grad_norm": 10.9375, "learning_rate": 1.9381942263664927e-05, "loss": 0.37268085, "memory(GiB)": 15.04, "step": 2330, "train_speed(iter/s)": 0.340096 }, { "acc": 0.87649097, "epoch": 0.31447811447811447, "grad_norm": 8.4375, "learning_rate": 1.9378082566620854e-05, "loss": 0.36977, "memory(GiB)": 15.04, "step": 2335, "train_speed(iter/s)": 0.340248 }, { "acc": 0.77074413, "epoch": 0.3151515151515151, "grad_norm": 6.125, "learning_rate": 1.9374211242128185e-05, "loss": 0.65000758, "memory(GiB)": 15.04, "step": 2340, "train_speed(iter/s)": 0.340298 }, { "acc": 0.88139248, "epoch": 0.31582491582491584, "grad_norm": 11.4375, "learning_rate": 1.937032829498678e-05, "loss": 0.48605113, "memory(GiB)": 15.04, "step": 2345, "train_speed(iter/s)": 0.340348 }, { "acc": 0.8511385, "epoch": 0.3164983164983165, "grad_norm": 9.9375, "learning_rate": 1.9366433730010933e-05, "loss": 0.60395231, "memory(GiB)": 15.04, "step": 2350, "train_speed(iter/s)": 0.340361 }, { "acc": 0.85783482, "epoch": 0.31717171717171716, "grad_norm": 6.59375, "learning_rate": 1.9362527552029332e-05, "loss": 0.5698195, "memory(GiB)": 15.04, "step": 2355, "train_speed(iter/s)": 0.340345 }, { "acc": 0.81080713, "epoch": 0.3178451178451178, "grad_norm": 7.21875, "learning_rate": 1.9358609765885066e-05, "loss": 0.66538157, "memory(GiB)": 15.04, "step": 2360, "train_speed(iter/s)": 0.340421 }, { "acc": 0.86503201, "epoch": 0.31851851851851853, "grad_norm": 8.125, "learning_rate": 1.9354680376435616e-05, "loss": 0.66491618, "memory(GiB)": 15.04, "step": 2365, "train_speed(iter/s)": 0.340483 }, { "acc": 0.87831783, "epoch": 0.3191919191919192, "grad_norm": 8.875, "learning_rate": 1.9350739388552845e-05, "loss": 0.43169312, "memory(GiB)": 15.04, "step": 2370, "train_speed(iter/s)": 0.340562 }, { "acc": 0.86912212, "epoch": 0.31986531986531985, "grad_norm": 10.5625, "learning_rate": 1.934678680712301e-05, "loss": 0.47349477, "memory(GiB)": 15.04, "step": 2375, "train_speed(iter/s)": 0.340661 }, { "acc": 0.88821764, "epoch": 0.3205387205387205, "grad_norm": 15.4375, "learning_rate": 1.934282263704672e-05, "loss": 0.46184831, "memory(GiB)": 15.04, "step": 2380, "train_speed(iter/s)": 0.340781 }, { "acc": 0.89957047, "epoch": 0.3212121212121212, "grad_norm": 8.5, "learning_rate": 1.933884688323898e-05, "loss": 0.38067825, "memory(GiB)": 15.04, "step": 2385, "train_speed(iter/s)": 0.340804 }, { "acc": 0.85859814, "epoch": 0.3218855218855219, "grad_norm": 21.375, "learning_rate": 1.933485955062913e-05, "loss": 0.39313557, "memory(GiB)": 15.04, "step": 2390, "train_speed(iter/s)": 0.340922 }, { "acc": 0.86764755, "epoch": 0.32255892255892255, "grad_norm": 6.9375, "learning_rate": 1.9330860644160884e-05, "loss": 0.39578037, "memory(GiB)": 15.04, "step": 2395, "train_speed(iter/s)": 0.340853 }, { "acc": 0.8775794, "epoch": 0.32323232323232326, "grad_norm": 16.0, "learning_rate": 1.93268501687923e-05, "loss": 0.35117748, "memory(GiB)": 15.04, "step": 2400, "train_speed(iter/s)": 0.340913 }, { "epoch": 0.32323232323232326, "eval_acc": 0.8610003959350666, "eval_loss": 0.5346771478652954, "eval_runtime": 109.5685, "eval_samples_per_second": 1.369, "eval_steps_per_second": 1.369, "step": 2400 }, { "acc": 0.85465479, "epoch": 0.3239057239057239, "grad_norm": 9.75, "learning_rate": 1.9322828129495783e-05, "loss": 0.58138494, "memory(GiB)": 15.04, "step": 2405, "train_speed(iter/s)": 0.335788 }, { "acc": 0.89197845, "epoch": 0.3245791245791246, "grad_norm": 24.5, "learning_rate": 1.9318794531258064e-05, "loss": 0.41574764, "memory(GiB)": 15.04, "step": 2410, "train_speed(iter/s)": 0.335922 }, { "acc": 0.82217188, "epoch": 0.32525252525252524, "grad_norm": 9.375, "learning_rate": 1.931474937908022e-05, "loss": 0.63134084, "memory(GiB)": 15.04, "step": 2415, "train_speed(iter/s)": 0.33601 }, { "acc": 0.87278347, "epoch": 0.32592592592592595, "grad_norm": 10.1875, "learning_rate": 1.9310692677977645e-05, "loss": 0.56534538, "memory(GiB)": 15.04, "step": 2420, "train_speed(iter/s)": 0.336135 }, { "acc": 0.83628635, "epoch": 0.3265993265993266, "grad_norm": 7.0, "learning_rate": 1.930662443298006e-05, "loss": 0.56281152, "memory(GiB)": 15.04, "step": 2425, "train_speed(iter/s)": 0.33613 }, { "acc": 0.85470467, "epoch": 0.32727272727272727, "grad_norm": 7.5, "learning_rate": 1.9302544649131482e-05, "loss": 0.4939652, "memory(GiB)": 15.04, "step": 2430, "train_speed(iter/s)": 0.336209 }, { "acc": 0.89396553, "epoch": 0.32794612794612793, "grad_norm": 8.9375, "learning_rate": 1.9298453331490257e-05, "loss": 0.43196988, "memory(GiB)": 15.04, "step": 2435, "train_speed(iter/s)": 0.336355 }, { "acc": 0.82339792, "epoch": 0.32861952861952864, "grad_norm": 5.375, "learning_rate": 1.929435048512901e-05, "loss": 0.61394019, "memory(GiB)": 15.04, "step": 2440, "train_speed(iter/s)": 0.336437 }, { "acc": 0.83198853, "epoch": 0.3292929292929293, "grad_norm": 12.25, "learning_rate": 1.9290236115134677e-05, "loss": 0.47708797, "memory(GiB)": 15.04, "step": 2445, "train_speed(iter/s)": 0.336573 }, { "acc": 0.82138634, "epoch": 0.32996632996632996, "grad_norm": 14.0625, "learning_rate": 1.9286110226608465e-05, "loss": 0.55513005, "memory(GiB)": 15.04, "step": 2450, "train_speed(iter/s)": 0.336678 }, { "acc": 0.89609404, "epoch": 0.3306397306397306, "grad_norm": 11.3125, "learning_rate": 1.928197282466588e-05, "loss": 0.36580346, "memory(GiB)": 15.04, "step": 2455, "train_speed(iter/s)": 0.336814 }, { "acc": 0.78545809, "epoch": 0.33131313131313134, "grad_norm": 28.25, "learning_rate": 1.9277823914436688e-05, "loss": 0.89613228, "memory(GiB)": 15.04, "step": 2460, "train_speed(iter/s)": 0.336865 }, { "acc": 0.89025421, "epoch": 0.331986531986532, "grad_norm": 7.65625, "learning_rate": 1.927366350106494e-05, "loss": 0.36613114, "memory(GiB)": 15.04, "step": 2465, "train_speed(iter/s)": 0.336846 }, { "acc": 0.86367445, "epoch": 0.33265993265993266, "grad_norm": 8.8125, "learning_rate": 1.9269491589708927e-05, "loss": 0.4765162, "memory(GiB)": 15.04, "step": 2470, "train_speed(iter/s)": 0.336932 }, { "acc": 0.90018415, "epoch": 0.3333333333333333, "grad_norm": 7.75, "learning_rate": 1.926530818554121e-05, "loss": 0.37076304, "memory(GiB)": 15.04, "step": 2475, "train_speed(iter/s)": 0.337088 }, { "acc": 0.89008112, "epoch": 0.33400673400673403, "grad_norm": 9.0625, "learning_rate": 1.9261113293748607e-05, "loss": 0.35568271, "memory(GiB)": 15.04, "step": 2480, "train_speed(iter/s)": 0.337163 }, { "acc": 0.881637, "epoch": 0.3346801346801347, "grad_norm": 10.875, "learning_rate": 1.9256906919532162e-05, "loss": 0.39150298, "memory(GiB)": 15.04, "step": 2485, "train_speed(iter/s)": 0.337288 }, { "acc": 0.87954836, "epoch": 0.33535353535353535, "grad_norm": 6.28125, "learning_rate": 1.925268906810716e-05, "loss": 0.4459631, "memory(GiB)": 15.04, "step": 2490, "train_speed(iter/s)": 0.337353 }, { "acc": 0.86157665, "epoch": 0.336026936026936, "grad_norm": 7.96875, "learning_rate": 1.9248459744703126e-05, "loss": 0.53909798, "memory(GiB)": 15.04, "step": 2495, "train_speed(iter/s)": 0.33748 }, { "acc": 0.84323149, "epoch": 0.3367003367003367, "grad_norm": 7.53125, "learning_rate": 1.9244218954563797e-05, "loss": 0.4920475, "memory(GiB)": 15.04, "step": 2500, "train_speed(iter/s)": 0.337538 }, { "acc": 0.82223568, "epoch": 0.3373737373737374, "grad_norm": 20.25, "learning_rate": 1.923996670294713e-05, "loss": 0.52200842, "memory(GiB)": 15.04, "step": 2505, "train_speed(iter/s)": 0.337522 }, { "acc": 0.86578159, "epoch": 0.33804713804713804, "grad_norm": 14.3125, "learning_rate": 1.92357029951253e-05, "loss": 0.49261842, "memory(GiB)": 15.04, "step": 2510, "train_speed(iter/s)": 0.337617 }, { "acc": 0.89437923, "epoch": 0.3387205387205387, "grad_norm": 6.9375, "learning_rate": 1.9231427836384673e-05, "loss": 0.38732541, "memory(GiB)": 15.04, "step": 2515, "train_speed(iter/s)": 0.337674 }, { "acc": 0.88720303, "epoch": 0.3393939393939394, "grad_norm": 5.75, "learning_rate": 1.9227141232025824e-05, "loss": 0.41020632, "memory(GiB)": 15.04, "step": 2520, "train_speed(iter/s)": 0.337773 }, { "acc": 0.88686686, "epoch": 0.3400673400673401, "grad_norm": 5.8125, "learning_rate": 1.9222843187363518e-05, "loss": 0.40615201, "memory(GiB)": 15.04, "step": 2525, "train_speed(iter/s)": 0.33776 }, { "acc": 0.88341837, "epoch": 0.34074074074074073, "grad_norm": 4.71875, "learning_rate": 1.9218533707726693e-05, "loss": 0.43876266, "memory(GiB)": 15.04, "step": 2530, "train_speed(iter/s)": 0.337763 }, { "acc": 0.88013363, "epoch": 0.3414141414141414, "grad_norm": 6.28125, "learning_rate": 1.9214212798458477e-05, "loss": 0.45347929, "memory(GiB)": 15.04, "step": 2535, "train_speed(iter/s)": 0.337818 }, { "acc": 0.89623661, "epoch": 0.3420875420875421, "grad_norm": 7.0, "learning_rate": 1.9209880464916163e-05, "loss": 0.41357417, "memory(GiB)": 15.04, "step": 2540, "train_speed(iter/s)": 0.337882 }, { "acc": 0.85681429, "epoch": 0.34276094276094277, "grad_norm": 8.4375, "learning_rate": 1.9205536712471212e-05, "loss": 0.47629442, "memory(GiB)": 15.04, "step": 2545, "train_speed(iter/s)": 0.337973 }, { "acc": 0.85323639, "epoch": 0.3434343434343434, "grad_norm": 7.9375, "learning_rate": 1.920118154650924e-05, "loss": 0.42014799, "memory(GiB)": 15.04, "step": 2550, "train_speed(iter/s)": 0.33803 }, { "acc": 0.81921091, "epoch": 0.3441077441077441, "grad_norm": 7.34375, "learning_rate": 1.9196814972430013e-05, "loss": 0.67137895, "memory(GiB)": 15.04, "step": 2555, "train_speed(iter/s)": 0.338099 }, { "acc": 0.86797428, "epoch": 0.3447811447811448, "grad_norm": 23.75, "learning_rate": 1.9192436995647444e-05, "loss": 0.6300138, "memory(GiB)": 15.04, "step": 2560, "train_speed(iter/s)": 0.338222 }, { "acc": 0.86078186, "epoch": 0.34545454545454546, "grad_norm": 9.25, "learning_rate": 1.918804762158958e-05, "loss": 0.46266317, "memory(GiB)": 15.04, "step": 2565, "train_speed(iter/s)": 0.338335 }, { "acc": 0.90208426, "epoch": 0.3461279461279461, "grad_norm": 8.6875, "learning_rate": 1.918364685569861e-05, "loss": 0.348434, "memory(GiB)": 15.04, "step": 2570, "train_speed(iter/s)": 0.33846 }, { "acc": 0.87423429, "epoch": 0.3468013468013468, "grad_norm": 5.15625, "learning_rate": 1.9179234703430834e-05, "loss": 0.42522211, "memory(GiB)": 15.04, "step": 2575, "train_speed(iter/s)": 0.3385 }, { "acc": 0.83370371, "epoch": 0.3474747474747475, "grad_norm": 9.875, "learning_rate": 1.917481117025667e-05, "loss": 0.6267858, "memory(GiB)": 15.04, "step": 2580, "train_speed(iter/s)": 0.338562 }, { "acc": 0.89248533, "epoch": 0.34814814814814815, "grad_norm": 12.0625, "learning_rate": 1.917037626166066e-05, "loss": 0.38710203, "memory(GiB)": 15.04, "step": 2585, "train_speed(iter/s)": 0.338704 }, { "acc": 0.88555288, "epoch": 0.3488215488215488, "grad_norm": 10.375, "learning_rate": 1.9165929983141436e-05, "loss": 0.36460023, "memory(GiB)": 15.04, "step": 2590, "train_speed(iter/s)": 0.338806 }, { "acc": 0.85448437, "epoch": 0.34949494949494947, "grad_norm": 16.75, "learning_rate": 1.916147234021173e-05, "loss": 0.58085775, "memory(GiB)": 15.04, "step": 2595, "train_speed(iter/s)": 0.338771 }, { "acc": 0.90577812, "epoch": 0.3501683501683502, "grad_norm": 8.5, "learning_rate": 1.915700333839837e-05, "loss": 0.28706489, "memory(GiB)": 15.04, "step": 2600, "train_speed(iter/s)": 0.338854 }, { "acc": 0.83092384, "epoch": 0.35084175084175084, "grad_norm": 18.125, "learning_rate": 1.9152522983242266e-05, "loss": 0.59093304, "memory(GiB)": 15.04, "step": 2605, "train_speed(iter/s)": 0.338916 }, { "acc": 0.877106, "epoch": 0.3515151515151515, "grad_norm": 9.5625, "learning_rate": 1.9148031280298393e-05, "loss": 0.53991098, "memory(GiB)": 15.04, "step": 2610, "train_speed(iter/s)": 0.338995 }, { "acc": 0.84207191, "epoch": 0.35218855218855216, "grad_norm": 6.40625, "learning_rate": 1.9143528235135815e-05, "loss": 0.80373363, "memory(GiB)": 15.04, "step": 2615, "train_speed(iter/s)": 0.338889 }, { "acc": 0.88392582, "epoch": 0.3528619528619529, "grad_norm": 10.5, "learning_rate": 1.9139013853337644e-05, "loss": 0.37835231, "memory(GiB)": 15.04, "step": 2620, "train_speed(iter/s)": 0.338966 }, { "acc": 0.85071363, "epoch": 0.35353535353535354, "grad_norm": 5.71875, "learning_rate": 1.9134488140501046e-05, "loss": 0.42265129, "memory(GiB)": 15.04, "step": 2625, "train_speed(iter/s)": 0.338968 }, { "acc": 0.83898754, "epoch": 0.3542087542087542, "grad_norm": 15.8125, "learning_rate": 1.9129951102237254e-05, "loss": 0.74677463, "memory(GiB)": 15.04, "step": 2630, "train_speed(iter/s)": 0.338976 }, { "acc": 0.84760046, "epoch": 0.3548821548821549, "grad_norm": 8.9375, "learning_rate": 1.9125402744171523e-05, "loss": 0.4743371, "memory(GiB)": 15.04, "step": 2635, "train_speed(iter/s)": 0.339044 }, { "acc": 0.88525162, "epoch": 0.35555555555555557, "grad_norm": 8.375, "learning_rate": 1.912084307194315e-05, "loss": 0.39847014, "memory(GiB)": 15.04, "step": 2640, "train_speed(iter/s)": 0.33912 }, { "acc": 0.86372814, "epoch": 0.35622895622895623, "grad_norm": 12.5, "learning_rate": 1.9116272091205464e-05, "loss": 0.40252566, "memory(GiB)": 15.04, "step": 2645, "train_speed(iter/s)": 0.339259 }, { "acc": 0.88826094, "epoch": 0.3569023569023569, "grad_norm": 9.375, "learning_rate": 1.9111689807625812e-05, "loss": 0.47242732, "memory(GiB)": 15.04, "step": 2650, "train_speed(iter/s)": 0.339291 }, { "acc": 0.87926731, "epoch": 0.3575757575757576, "grad_norm": 6.84375, "learning_rate": 1.910709622688555e-05, "loss": 0.36133299, "memory(GiB)": 15.04, "step": 2655, "train_speed(iter/s)": 0.339335 }, { "acc": 0.76420794, "epoch": 0.35824915824915826, "grad_norm": 9.125, "learning_rate": 1.9102491354680048e-05, "loss": 0.98229446, "memory(GiB)": 15.04, "step": 2660, "train_speed(iter/s)": 0.339389 }, { "acc": 0.84489317, "epoch": 0.3589225589225589, "grad_norm": 5.875, "learning_rate": 1.9097875196718676e-05, "loss": 0.47089186, "memory(GiB)": 15.04, "step": 2665, "train_speed(iter/s)": 0.339384 }, { "acc": 0.9002903, "epoch": 0.3595959595959596, "grad_norm": 6.125, "learning_rate": 1.9093247758724786e-05, "loss": 0.29986346, "memory(GiB)": 15.04, "step": 2670, "train_speed(iter/s)": 0.339496 }, { "acc": 0.8404459, "epoch": 0.3602693602693603, "grad_norm": 13.8125, "learning_rate": 1.9088609046435732e-05, "loss": 0.56306767, "memory(GiB)": 15.04, "step": 2675, "train_speed(iter/s)": 0.339618 }, { "acc": 0.89491272, "epoch": 0.36094276094276095, "grad_norm": 12.4375, "learning_rate": 1.9083959065602834e-05, "loss": 0.30484023, "memory(GiB)": 15.04, "step": 2680, "train_speed(iter/s)": 0.339702 }, { "acc": 0.85790787, "epoch": 0.3616161616161616, "grad_norm": 7.65625, "learning_rate": 1.9079297821991384e-05, "loss": 0.60322633, "memory(GiB)": 15.04, "step": 2685, "train_speed(iter/s)": 0.33974 }, { "acc": 0.80215073, "epoch": 0.3622895622895623, "grad_norm": 9.1875, "learning_rate": 1.9074625321380645e-05, "loss": 0.72699332, "memory(GiB)": 15.04, "step": 2690, "train_speed(iter/s)": 0.339882 }, { "acc": 0.83883438, "epoch": 0.362962962962963, "grad_norm": 7.46875, "learning_rate": 1.9069941569563833e-05, "loss": 0.78730655, "memory(GiB)": 15.04, "step": 2695, "train_speed(iter/s)": 0.339985 }, { "acc": 0.89459028, "epoch": 0.36363636363636365, "grad_norm": 11.875, "learning_rate": 1.9065246572348112e-05, "loss": 0.40213513, "memory(GiB)": 15.04, "step": 2700, "train_speed(iter/s)": 0.340122 }, { "epoch": 0.36363636363636365, "eval_acc": 0.8660754467240238, "eval_loss": 0.5180864334106445, "eval_runtime": 109.593, "eval_samples_per_second": 1.369, "eval_steps_per_second": 1.369, "step": 2700 }, { "acc": 0.81508904, "epoch": 0.3643097643097643, "grad_norm": 6.96875, "learning_rate": 1.9060540335554597e-05, "loss": 0.48082347, "memory(GiB)": 15.04, "step": 2705, "train_speed(iter/s)": 0.33562 }, { "acc": 0.85648174, "epoch": 0.36498316498316496, "grad_norm": 5.9375, "learning_rate": 1.905582286501832e-05, "loss": 0.4697526, "memory(GiB)": 15.04, "step": 2710, "train_speed(iter/s)": 0.335652 }, { "acc": 0.88261299, "epoch": 0.3656565656565657, "grad_norm": 13.75, "learning_rate": 1.9051094166588265e-05, "loss": 0.38674028, "memory(GiB)": 15.04, "step": 2715, "train_speed(iter/s)": 0.335695 }, { "acc": 0.86682205, "epoch": 0.36632996632996634, "grad_norm": 8.0625, "learning_rate": 1.9046354246127322e-05, "loss": 0.53947921, "memory(GiB)": 15.04, "step": 2720, "train_speed(iter/s)": 0.335667 }, { "acc": 0.8555212, "epoch": 0.367003367003367, "grad_norm": 7.25, "learning_rate": 1.9041603109512296e-05, "loss": 0.29316967, "memory(GiB)": 15.04, "step": 2725, "train_speed(iter/s)": 0.335742 }, { "acc": 0.89408617, "epoch": 0.36767676767676766, "grad_norm": 10.4375, "learning_rate": 1.90368407626339e-05, "loss": 0.3587872, "memory(GiB)": 15.04, "step": 2730, "train_speed(iter/s)": 0.335816 }, { "acc": 0.89987421, "epoch": 0.36835016835016837, "grad_norm": 8.6875, "learning_rate": 1.9032067211396747e-05, "loss": 0.40207324, "memory(GiB)": 15.04, "step": 2735, "train_speed(iter/s)": 0.335891 }, { "acc": 0.81146135, "epoch": 0.36902356902356903, "grad_norm": 13.5625, "learning_rate": 1.9027282461719348e-05, "loss": 0.69441767, "memory(GiB)": 15.04, "step": 2740, "train_speed(iter/s)": 0.335968 }, { "acc": 0.84760532, "epoch": 0.3696969696969697, "grad_norm": 10.375, "learning_rate": 1.902248651953408e-05, "loss": 0.47364888, "memory(GiB)": 15.04, "step": 2745, "train_speed(iter/s)": 0.336091 }, { "acc": 0.85408421, "epoch": 0.37037037037037035, "grad_norm": 6.28125, "learning_rate": 1.901767939078722e-05, "loss": 0.44126396, "memory(GiB)": 15.04, "step": 2750, "train_speed(iter/s)": 0.336111 }, { "acc": 0.86288681, "epoch": 0.37104377104377106, "grad_norm": 15.0, "learning_rate": 1.9012861081438896e-05, "loss": 0.63420038, "memory(GiB)": 15.04, "step": 2755, "train_speed(iter/s)": 0.336166 }, { "acc": 0.86562033, "epoch": 0.3717171717171717, "grad_norm": 12.125, "learning_rate": 1.900803159746311e-05, "loss": 0.36843266, "memory(GiB)": 15.04, "step": 2760, "train_speed(iter/s)": 0.336268 }, { "acc": 0.84120855, "epoch": 0.3723905723905724, "grad_norm": 11.5, "learning_rate": 1.900319094484771e-05, "loss": 0.62543631, "memory(GiB)": 15.04, "step": 2765, "train_speed(iter/s)": 0.336305 }, { "acc": 0.893818, "epoch": 0.37306397306397304, "grad_norm": 11.3125, "learning_rate": 1.89983391295944e-05, "loss": 0.41919742, "memory(GiB)": 15.04, "step": 2770, "train_speed(iter/s)": 0.336404 }, { "acc": 0.87164793, "epoch": 0.37373737373737376, "grad_norm": 5.3125, "learning_rate": 1.8993476157718715e-05, "loss": 0.35139127, "memory(GiB)": 15.04, "step": 2775, "train_speed(iter/s)": 0.336479 }, { "acc": 0.86639566, "epoch": 0.3744107744107744, "grad_norm": 8.3125, "learning_rate": 1.8988602035250037e-05, "loss": 0.49558616, "memory(GiB)": 15.04, "step": 2780, "train_speed(iter/s)": 0.336593 }, { "acc": 0.83523035, "epoch": 0.3750841750841751, "grad_norm": 27.75, "learning_rate": 1.8983716768231554e-05, "loss": 0.53672395, "memory(GiB)": 15.04, "step": 2785, "train_speed(iter/s)": 0.336731 }, { "acc": 0.87954035, "epoch": 0.37575757575757573, "grad_norm": 9.8125, "learning_rate": 1.897882036272029e-05, "loss": 0.48749771, "memory(GiB)": 15.04, "step": 2790, "train_speed(iter/s)": 0.336859 }, { "acc": 0.87321129, "epoch": 0.37643097643097645, "grad_norm": 6.1875, "learning_rate": 1.8973912824787068e-05, "loss": 0.44543729, "memory(GiB)": 15.04, "step": 2795, "train_speed(iter/s)": 0.336957 }, { "acc": 0.87619419, "epoch": 0.3771043771043771, "grad_norm": 6.5625, "learning_rate": 1.8968994160516516e-05, "loss": 0.52440739, "memory(GiB)": 15.04, "step": 2800, "train_speed(iter/s)": 0.336906 }, { "acc": 0.83263006, "epoch": 0.37777777777777777, "grad_norm": 15.0, "learning_rate": 1.896406437600705e-05, "loss": 0.69280262, "memory(GiB)": 15.04, "step": 2805, "train_speed(iter/s)": 0.336973 }, { "acc": 0.82279167, "epoch": 0.3784511784511784, "grad_norm": 9.0, "learning_rate": 1.895912347737089e-05, "loss": 0.48401203, "memory(GiB)": 15.04, "step": 2810, "train_speed(iter/s)": 0.337095 }, { "acc": 0.86654186, "epoch": 0.37912457912457914, "grad_norm": 18.25, "learning_rate": 1.8954171470734023e-05, "loss": 0.41237874, "memory(GiB)": 15.04, "step": 2815, "train_speed(iter/s)": 0.337226 }, { "acc": 0.8949935, "epoch": 0.3797979797979798, "grad_norm": 10.625, "learning_rate": 1.894920836223621e-05, "loss": 0.3719039, "memory(GiB)": 15.04, "step": 2820, "train_speed(iter/s)": 0.33733 }, { "acc": 0.88594656, "epoch": 0.38047138047138046, "grad_norm": 9.0, "learning_rate": 1.894423415803098e-05, "loss": 0.44963932, "memory(GiB)": 15.04, "step": 2825, "train_speed(iter/s)": 0.337397 }, { "acc": 0.8899971, "epoch": 0.3811447811447811, "grad_norm": 14.125, "learning_rate": 1.893924886428562e-05, "loss": 0.37841399, "memory(GiB)": 15.04, "step": 2830, "train_speed(iter/s)": 0.337479 }, { "acc": 0.8806571, "epoch": 0.38181818181818183, "grad_norm": 10.625, "learning_rate": 1.8934252487181165e-05, "loss": 0.44682927, "memory(GiB)": 15.04, "step": 2835, "train_speed(iter/s)": 0.337607 }, { "acc": 0.85668678, "epoch": 0.3824915824915825, "grad_norm": 4.34375, "learning_rate": 1.8929245032912385e-05, "loss": 0.46805487, "memory(GiB)": 15.04, "step": 2840, "train_speed(iter/s)": 0.337638 }, { "acc": 0.90456429, "epoch": 0.38316498316498315, "grad_norm": 8.375, "learning_rate": 1.8924226507687793e-05, "loss": 0.43100266, "memory(GiB)": 15.04, "step": 2845, "train_speed(iter/s)": 0.337693 }, { "acc": 0.92130375, "epoch": 0.3838383838383838, "grad_norm": 8.6875, "learning_rate": 1.8919196917729623e-05, "loss": 0.34470038, "memory(GiB)": 15.04, "step": 2850, "train_speed(iter/s)": 0.337775 }, { "acc": 0.81275425, "epoch": 0.3845117845117845, "grad_norm": 8.1875, "learning_rate": 1.8914156269273833e-05, "loss": 0.79330454, "memory(GiB)": 15.04, "step": 2855, "train_speed(iter/s)": 0.337919 }, { "acc": 0.84950256, "epoch": 0.3851851851851852, "grad_norm": 19.25, "learning_rate": 1.8909104568570086e-05, "loss": 0.3624054, "memory(GiB)": 15.04, "step": 2860, "train_speed(iter/s)": 0.338038 }, { "acc": 0.91833963, "epoch": 0.38585858585858585, "grad_norm": 9.5625, "learning_rate": 1.890404182188175e-05, "loss": 0.30394173, "memory(GiB)": 15.04, "step": 2865, "train_speed(iter/s)": 0.338122 }, { "acc": 0.88120604, "epoch": 0.3865319865319865, "grad_norm": 8.5625, "learning_rate": 1.8898968035485895e-05, "loss": 0.44618464, "memory(GiB)": 15.04, "step": 2870, "train_speed(iter/s)": 0.338145 }, { "acc": 0.90447998, "epoch": 0.3872053872053872, "grad_norm": 8.25, "learning_rate": 1.8893883215673266e-05, "loss": 0.39719179, "memory(GiB)": 15.04, "step": 2875, "train_speed(iter/s)": 0.338192 }, { "acc": 0.81772985, "epoch": 0.3878787878787879, "grad_norm": 21.25, "learning_rate": 1.88887873687483e-05, "loss": 0.3955199, "memory(GiB)": 15.04, "step": 2880, "train_speed(iter/s)": 0.338299 }, { "acc": 0.85085459, "epoch": 0.38855218855218854, "grad_norm": 4.5625, "learning_rate": 1.8883680501029098e-05, "loss": 0.59997473, "memory(GiB)": 15.04, "step": 2885, "train_speed(iter/s)": 0.338406 }, { "acc": 0.83106422, "epoch": 0.38922558922558925, "grad_norm": 6.0, "learning_rate": 1.887856261884743e-05, "loss": 0.30067787, "memory(GiB)": 15.04, "step": 2890, "train_speed(iter/s)": 0.338447 }, { "acc": 0.92400789, "epoch": 0.3898989898989899, "grad_norm": 7.0, "learning_rate": 1.8873433728548716e-05, "loss": 0.23872533, "memory(GiB)": 15.04, "step": 2895, "train_speed(iter/s)": 0.338594 }, { "acc": 0.8933054, "epoch": 0.39057239057239057, "grad_norm": 9.0, "learning_rate": 1.886829383649203e-05, "loss": 0.32230327, "memory(GiB)": 15.04, "step": 2900, "train_speed(iter/s)": 0.338727 }, { "acc": 0.90259237, "epoch": 0.39124579124579123, "grad_norm": 12.75, "learning_rate": 1.886314294905009e-05, "loss": 0.30169015, "memory(GiB)": 15.04, "step": 2905, "train_speed(iter/s)": 0.338845 }, { "acc": 0.84158916, "epoch": 0.39191919191919194, "grad_norm": 6.96875, "learning_rate": 1.8857981072609236e-05, "loss": 0.45472388, "memory(GiB)": 15.04, "step": 2910, "train_speed(iter/s)": 0.338896 }, { "acc": 0.8605051, "epoch": 0.3925925925925926, "grad_norm": 6.3125, "learning_rate": 1.8852808213569443e-05, "loss": 0.54497418, "memory(GiB)": 15.04, "step": 2915, "train_speed(iter/s)": 0.33896 }, { "acc": 0.87266006, "epoch": 0.39326599326599326, "grad_norm": 7.78125, "learning_rate": 1.8847624378344293e-05, "loss": 0.48023129, "memory(GiB)": 15.04, "step": 2920, "train_speed(iter/s)": 0.339032 }, { "acc": 0.88253145, "epoch": 0.3939393939393939, "grad_norm": 6.875, "learning_rate": 1.8842429573360987e-05, "loss": 0.27666228, "memory(GiB)": 15.04, "step": 2925, "train_speed(iter/s)": 0.339151 }, { "acc": 0.88198681, "epoch": 0.39461279461279464, "grad_norm": 5.5625, "learning_rate": 1.8837223805060323e-05, "loss": 0.38521972, "memory(GiB)": 15.04, "step": 2930, "train_speed(iter/s)": 0.339195 }, { "acc": 0.85078106, "epoch": 0.3952861952861953, "grad_norm": 12.6875, "learning_rate": 1.8832007079896685e-05, "loss": 0.56066775, "memory(GiB)": 15.04, "step": 2935, "train_speed(iter/s)": 0.339241 }, { "acc": 0.86893778, "epoch": 0.39595959595959596, "grad_norm": 7.71875, "learning_rate": 1.8826779404338055e-05, "loss": 0.58338552, "memory(GiB)": 15.04, "step": 2940, "train_speed(iter/s)": 0.33937 }, { "acc": 0.87823391, "epoch": 0.3966329966329966, "grad_norm": 13.9375, "learning_rate": 1.8821540784865983e-05, "loss": 0.36951785, "memory(GiB)": 15.04, "step": 2945, "train_speed(iter/s)": 0.339399 }, { "acc": 0.81051741, "epoch": 0.39730639730639733, "grad_norm": 26.5, "learning_rate": 1.8816291227975587e-05, "loss": 0.58486147, "memory(GiB)": 15.04, "step": 2950, "train_speed(iter/s)": 0.339504 }, { "acc": 0.88986492, "epoch": 0.397979797979798, "grad_norm": 14.625, "learning_rate": 1.881103074017555e-05, "loss": 0.32249541, "memory(GiB)": 15.04, "step": 2955, "train_speed(iter/s)": 0.339629 }, { "acc": 0.77094898, "epoch": 0.39865319865319865, "grad_norm": 7.28125, "learning_rate": 1.8805759327988108e-05, "loss": 0.47732401, "memory(GiB)": 15.04, "step": 2960, "train_speed(iter/s)": 0.3397 }, { "acc": 0.87760706, "epoch": 0.3993265993265993, "grad_norm": 6.625, "learning_rate": 1.8800476997949033e-05, "loss": 0.39705057, "memory(GiB)": 15.04, "step": 2965, "train_speed(iter/s)": 0.339765 }, { "acc": 0.88057041, "epoch": 0.4, "grad_norm": 11.875, "learning_rate": 1.879518375660765e-05, "loss": 0.40552897, "memory(GiB)": 15.04, "step": 2970, "train_speed(iter/s)": 0.339757 }, { "acc": 0.90826578, "epoch": 0.4006734006734007, "grad_norm": 7.6875, "learning_rate": 1.87898796105268e-05, "loss": 0.29796383, "memory(GiB)": 15.04, "step": 2975, "train_speed(iter/s)": 0.33984 }, { "acc": 0.88463964, "epoch": 0.40134680134680134, "grad_norm": 12.5, "learning_rate": 1.8784564566282845e-05, "loss": 0.41117978, "memory(GiB)": 15.04, "step": 2980, "train_speed(iter/s)": 0.33991 }, { "acc": 0.87788677, "epoch": 0.402020202020202, "grad_norm": 6.78125, "learning_rate": 1.877923863046566e-05, "loss": 0.63874054, "memory(GiB)": 15.04, "step": 2985, "train_speed(iter/s)": 0.339976 }, { "acc": 0.90607147, "epoch": 0.4026936026936027, "grad_norm": 6.8125, "learning_rate": 1.877390180967863e-05, "loss": 0.33274183, "memory(GiB)": 15.04, "step": 2990, "train_speed(iter/s)": 0.340038 }, { "acc": 0.88634825, "epoch": 0.4033670033670034, "grad_norm": 7.03125, "learning_rate": 1.8768554110538626e-05, "loss": 0.34868686, "memory(GiB)": 15.04, "step": 2995, "train_speed(iter/s)": 0.340082 }, { "acc": 0.83088036, "epoch": 0.40404040404040403, "grad_norm": 5.78125, "learning_rate": 1.8763195539676017e-05, "loss": 0.54208879, "memory(GiB)": 15.04, "step": 3000, "train_speed(iter/s)": 0.340137 }, { "epoch": 0.40404040404040403, "eval_acc": 0.8710384184245389, "eval_loss": 0.5025013089179993, "eval_runtime": 110.2728, "eval_samples_per_second": 1.36, "eval_steps_per_second": 1.36, "step": 3000 }, { "acc": 0.82111616, "epoch": 0.4047138047138047, "grad_norm": 6.46875, "learning_rate": 1.875782610373464e-05, "loss": 0.78256674, "memory(GiB)": 15.04, "step": 3005, "train_speed(iter/s)": 0.335798 }, { "acc": 0.84464054, "epoch": 0.4053872053872054, "grad_norm": 20.75, "learning_rate": 1.8752445809371813e-05, "loss": 0.53980637, "memory(GiB)": 15.04, "step": 3010, "train_speed(iter/s)": 0.335805 }, { "acc": 0.83855562, "epoch": 0.40606060606060607, "grad_norm": 8.75, "learning_rate": 1.874705466325831e-05, "loss": 0.60607486, "memory(GiB)": 15.04, "step": 3015, "train_speed(iter/s)": 0.335864 }, { "acc": 0.90274582, "epoch": 0.4067340067340067, "grad_norm": 14.3125, "learning_rate": 1.8741652672078366e-05, "loss": 0.41548071, "memory(GiB)": 15.04, "step": 3020, "train_speed(iter/s)": 0.335997 }, { "acc": 0.92024384, "epoch": 0.4074074074074074, "grad_norm": 7.21875, "learning_rate": 1.8736239842529658e-05, "loss": 0.25771976, "memory(GiB)": 15.04, "step": 3025, "train_speed(iter/s)": 0.336048 }, { "acc": 0.89563293, "epoch": 0.4080808080808081, "grad_norm": 5.09375, "learning_rate": 1.8730816181323297e-05, "loss": 0.40562844, "memory(GiB)": 15.04, "step": 3030, "train_speed(iter/s)": 0.336048 }, { "acc": 0.81184473, "epoch": 0.40875420875420876, "grad_norm": 7.25, "learning_rate": 1.8725381695183836e-05, "loss": 0.62321434, "memory(GiB)": 15.04, "step": 3035, "train_speed(iter/s)": 0.336055 }, { "acc": 0.89695549, "epoch": 0.4094276094276094, "grad_norm": 5.0625, "learning_rate": 1.8719936390849234e-05, "loss": 0.37249773, "memory(GiB)": 15.04, "step": 3040, "train_speed(iter/s)": 0.336156 }, { "acc": 0.89363928, "epoch": 0.4101010101010101, "grad_norm": 10.0625, "learning_rate": 1.8714480275070874e-05, "loss": 0.37879419, "memory(GiB)": 15.04, "step": 3045, "train_speed(iter/s)": 0.336237 }, { "acc": 0.84344769, "epoch": 0.4107744107744108, "grad_norm": 9.1875, "learning_rate": 1.8709013354613544e-05, "loss": 0.48255649, "memory(GiB)": 15.04, "step": 3050, "train_speed(iter/s)": 0.336308 }, { "acc": 0.83006134, "epoch": 0.41144781144781145, "grad_norm": 13.6875, "learning_rate": 1.8703535636255423e-05, "loss": 0.52381067, "memory(GiB)": 15.04, "step": 3055, "train_speed(iter/s)": 0.336377 }, { "acc": 0.82723989, "epoch": 0.4121212121212121, "grad_norm": 17.25, "learning_rate": 1.869804712678807e-05, "loss": 0.5395534, "memory(GiB)": 15.04, "step": 3060, "train_speed(iter/s)": 0.33647 }, { "acc": 0.888585, "epoch": 0.41279461279461277, "grad_norm": 7.125, "learning_rate": 1.8692547833016446e-05, "loss": 0.33099582, "memory(GiB)": 15.04, "step": 3065, "train_speed(iter/s)": 0.336561 }, { "acc": 0.87514992, "epoch": 0.4134680134680135, "grad_norm": 6.5, "learning_rate": 1.8687037761758864e-05, "loss": 0.51219778, "memory(GiB)": 15.04, "step": 3070, "train_speed(iter/s)": 0.336619 }, { "acc": 0.89671497, "epoch": 0.41414141414141414, "grad_norm": 4.96875, "learning_rate": 1.8681516919847004e-05, "loss": 0.33684578, "memory(GiB)": 15.04, "step": 3075, "train_speed(iter/s)": 0.336663 }, { "acc": 0.89340353, "epoch": 0.4148148148148148, "grad_norm": 12.5, "learning_rate": 1.8675985314125903e-05, "loss": 0.43263216, "memory(GiB)": 15.04, "step": 3080, "train_speed(iter/s)": 0.336757 }, { "acc": 0.9157217, "epoch": 0.41548821548821546, "grad_norm": 10.0, "learning_rate": 1.867044295145394e-05, "loss": 0.36201947, "memory(GiB)": 15.04, "step": 3085, "train_speed(iter/s)": 0.336882 }, { "acc": 0.8681529, "epoch": 0.4161616161616162, "grad_norm": 8.25, "learning_rate": 1.8664889838702837e-05, "loss": 0.51161928, "memory(GiB)": 15.04, "step": 3090, "train_speed(iter/s)": 0.33699 }, { "acc": 0.88494596, "epoch": 0.41683501683501684, "grad_norm": 5.65625, "learning_rate": 1.8659325982757632e-05, "loss": 0.27869239, "memory(GiB)": 15.04, "step": 3095, "train_speed(iter/s)": 0.337046 }, { "acc": 0.92889404, "epoch": 0.4175084175084175, "grad_norm": 12.3125, "learning_rate": 1.86537513905167e-05, "loss": 0.27401457, "memory(GiB)": 15.04, "step": 3100, "train_speed(iter/s)": 0.337156 }, { "acc": 0.85353842, "epoch": 0.41818181818181815, "grad_norm": 7.625, "learning_rate": 1.8648166068891716e-05, "loss": 0.45773873, "memory(GiB)": 15.04, "step": 3105, "train_speed(iter/s)": 0.33723 }, { "acc": 0.8940958, "epoch": 0.41885521885521887, "grad_norm": 6.90625, "learning_rate": 1.864257002480766e-05, "loss": 0.50468621, "memory(GiB)": 15.04, "step": 3110, "train_speed(iter/s)": 0.33724 }, { "acc": 0.85336952, "epoch": 0.41952861952861953, "grad_norm": 5.5625, "learning_rate": 1.8636963265202804e-05, "loss": 0.6110342, "memory(GiB)": 15.04, "step": 3115, "train_speed(iter/s)": 0.337191 }, { "acc": 0.8752511, "epoch": 0.4202020202020202, "grad_norm": 6.125, "learning_rate": 1.863134579702872e-05, "loss": 0.36342602, "memory(GiB)": 15.04, "step": 3120, "train_speed(iter/s)": 0.337222 }, { "acc": 0.80075426, "epoch": 0.4208754208754209, "grad_norm": 14.4375, "learning_rate": 1.8625717627250225e-05, "loss": 0.64978418, "memory(GiB)": 15.04, "step": 3125, "train_speed(iter/s)": 0.337323 }, { "acc": 0.89596958, "epoch": 0.42154882154882156, "grad_norm": 5.6875, "learning_rate": 1.8620078762845443e-05, "loss": 0.40613952, "memory(GiB)": 15.04, "step": 3130, "train_speed(iter/s)": 0.337369 }, { "acc": 0.87584219, "epoch": 0.4222222222222222, "grad_norm": 7.1875, "learning_rate": 1.8614429210805737e-05, "loss": 0.47951093, "memory(GiB)": 15.04, "step": 3135, "train_speed(iter/s)": 0.337441 }, { "acc": 0.84029493, "epoch": 0.4228956228956229, "grad_norm": 7.875, "learning_rate": 1.8608768978135717e-05, "loss": 0.49031301, "memory(GiB)": 15.04, "step": 3140, "train_speed(iter/s)": 0.337518 }, { "acc": 0.92243776, "epoch": 0.4235690235690236, "grad_norm": 8.1875, "learning_rate": 1.8603098071853252e-05, "loss": 0.30771151, "memory(GiB)": 15.04, "step": 3145, "train_speed(iter/s)": 0.337583 }, { "acc": 0.89681549, "epoch": 0.42424242424242425, "grad_norm": 12.875, "learning_rate": 1.8597416498989423e-05, "loss": 0.38468418, "memory(GiB)": 15.04, "step": 3150, "train_speed(iter/s)": 0.337667 }, { "acc": 0.86587696, "epoch": 0.4249158249158249, "grad_norm": 5.6875, "learning_rate": 1.859172426658856e-05, "loss": 0.42589006, "memory(GiB)": 15.04, "step": 3155, "train_speed(iter/s)": 0.337743 }, { "acc": 0.84840202, "epoch": 0.4255892255892256, "grad_norm": 8.125, "learning_rate": 1.8586021381708186e-05, "loss": 0.58262167, "memory(GiB)": 15.04, "step": 3160, "train_speed(iter/s)": 0.337864 }, { "acc": 0.89175978, "epoch": 0.4262626262626263, "grad_norm": 8.0, "learning_rate": 1.8580307851419055e-05, "loss": 0.41947713, "memory(GiB)": 15.04, "step": 3165, "train_speed(iter/s)": 0.337862 }, { "acc": 0.88823881, "epoch": 0.42693602693602695, "grad_norm": 6.28125, "learning_rate": 1.85745836828051e-05, "loss": 0.42632914, "memory(GiB)": 15.04, "step": 3170, "train_speed(iter/s)": 0.337978 }, { "acc": 0.80676479, "epoch": 0.4276094276094276, "grad_norm": 7.59375, "learning_rate": 1.856884888296345e-05, "loss": 0.53672976, "memory(GiB)": 15.04, "step": 3175, "train_speed(iter/s)": 0.338031 }, { "acc": 0.91209641, "epoch": 0.42828282828282827, "grad_norm": 5.21875, "learning_rate": 1.8563103459004423e-05, "loss": 0.37504165, "memory(GiB)": 15.04, "step": 3180, "train_speed(iter/s)": 0.337931 }, { "acc": 0.87650108, "epoch": 0.428956228956229, "grad_norm": 5.3125, "learning_rate": 1.85573474180515e-05, "loss": 0.45593104, "memory(GiB)": 15.04, "step": 3185, "train_speed(iter/s)": 0.338001 }, { "acc": 0.89606991, "epoch": 0.42962962962962964, "grad_norm": 8.5, "learning_rate": 1.8551580767241325e-05, "loss": 0.49555717, "memory(GiB)": 15.04, "step": 3190, "train_speed(iter/s)": 0.338088 }, { "acc": 0.92269087, "epoch": 0.4303030303030303, "grad_norm": 8.3125, "learning_rate": 1.8545803513723703e-05, "loss": 0.31717582, "memory(GiB)": 15.04, "step": 3195, "train_speed(iter/s)": 0.338184 }, { "acc": 0.90833187, "epoch": 0.43097643097643096, "grad_norm": 7.21875, "learning_rate": 1.8540015664661583e-05, "loss": 0.3351985, "memory(GiB)": 15.04, "step": 3200, "train_speed(iter/s)": 0.338254 }, { "acc": 0.90001535, "epoch": 0.4316498316498317, "grad_norm": 7.0625, "learning_rate": 1.853421722723105e-05, "loss": 0.36360321, "memory(GiB)": 15.04, "step": 3205, "train_speed(iter/s)": 0.338311 }, { "acc": 0.85820007, "epoch": 0.43232323232323233, "grad_norm": 10.0625, "learning_rate": 1.8528408208621324e-05, "loss": 0.50676789, "memory(GiB)": 15.04, "step": 3210, "train_speed(iter/s)": 0.338329 }, { "acc": 0.7928196, "epoch": 0.432996632996633, "grad_norm": 7.46875, "learning_rate": 1.852258861603472e-05, "loss": 0.57429237, "memory(GiB)": 15.04, "step": 3215, "train_speed(iter/s)": 0.338351 }, { "acc": 0.87294598, "epoch": 0.43367003367003365, "grad_norm": 9.3125, "learning_rate": 1.8516758456686694e-05, "loss": 0.41843138, "memory(GiB)": 15.04, "step": 3220, "train_speed(iter/s)": 0.338414 }, { "acc": 0.90121832, "epoch": 0.43434343434343436, "grad_norm": 7.875, "learning_rate": 1.8510917737805785e-05, "loss": 0.22768776, "memory(GiB)": 15.04, "step": 3225, "train_speed(iter/s)": 0.338506 }, { "acc": 0.88239965, "epoch": 0.435016835016835, "grad_norm": 9.9375, "learning_rate": 1.850506646663363e-05, "loss": 0.4163702, "memory(GiB)": 15.04, "step": 3230, "train_speed(iter/s)": 0.338511 }, { "acc": 0.89738407, "epoch": 0.4356902356902357, "grad_norm": 16.875, "learning_rate": 1.8499204650424947e-05, "loss": 0.44875789, "memory(GiB)": 15.04, "step": 3235, "train_speed(iter/s)": 0.338602 }, { "acc": 0.86667624, "epoch": 0.43636363636363634, "grad_norm": 11.5, "learning_rate": 1.849333229644753e-05, "loss": 0.46161666, "memory(GiB)": 15.04, "step": 3240, "train_speed(iter/s)": 0.33861 }, { "acc": 0.83652563, "epoch": 0.43703703703703706, "grad_norm": 8.25, "learning_rate": 1.848744941198224e-05, "loss": 0.3679383, "memory(GiB)": 15.04, "step": 3245, "train_speed(iter/s)": 0.338589 }, { "acc": 0.88514147, "epoch": 0.4377104377104377, "grad_norm": 5.625, "learning_rate": 1.8481556004322984e-05, "loss": 0.39434249, "memory(GiB)": 15.04, "step": 3250, "train_speed(iter/s)": 0.338664 }, { "acc": 0.74523182, "epoch": 0.4383838383838384, "grad_norm": 5.96875, "learning_rate": 1.8475652080776733e-05, "loss": 0.65559001, "memory(GiB)": 15.04, "step": 3255, "train_speed(iter/s)": 0.338744 }, { "acc": 0.8988143, "epoch": 0.43905723905723903, "grad_norm": 6.3125, "learning_rate": 1.8469737648663487e-05, "loss": 0.503934, "memory(GiB)": 15.04, "step": 3260, "train_speed(iter/s)": 0.338723 }, { "acc": 0.86166887, "epoch": 0.43973063973063975, "grad_norm": 6.9375, "learning_rate": 1.846381271531627e-05, "loss": 0.40841799, "memory(GiB)": 15.04, "step": 3265, "train_speed(iter/s)": 0.338795 }, { "acc": 0.90724373, "epoch": 0.4404040404040404, "grad_norm": 6.9375, "learning_rate": 1.8457877288081132e-05, "loss": 0.3223772, "memory(GiB)": 15.04, "step": 3270, "train_speed(iter/s)": 0.338811 }, { "acc": 0.89002905, "epoch": 0.44107744107744107, "grad_norm": 9.75, "learning_rate": 1.8451931374317138e-05, "loss": 0.38144407, "memory(GiB)": 15.04, "step": 3275, "train_speed(iter/s)": 0.338923 }, { "acc": 0.8057189, "epoch": 0.4417508417508417, "grad_norm": 7.375, "learning_rate": 1.8445974981396345e-05, "loss": 0.50009413, "memory(GiB)": 15.04, "step": 3280, "train_speed(iter/s)": 0.338962 }, { "acc": 0.83952885, "epoch": 0.44242424242424244, "grad_norm": 10.1875, "learning_rate": 1.844000811670381e-05, "loss": 0.64937525, "memory(GiB)": 15.04, "step": 3285, "train_speed(iter/s)": 0.339017 }, { "acc": 0.90349655, "epoch": 0.4430976430976431, "grad_norm": 19.25, "learning_rate": 1.8434030787637576e-05, "loss": 0.38233004, "memory(GiB)": 15.04, "step": 3290, "train_speed(iter/s)": 0.339117 }, { "acc": 0.92429094, "epoch": 0.44377104377104376, "grad_norm": 7.40625, "learning_rate": 1.8428043001608646e-05, "loss": 0.28647325, "memory(GiB)": 15.04, "step": 3295, "train_speed(iter/s)": 0.339195 }, { "acc": 0.83496065, "epoch": 0.4444444444444444, "grad_norm": 16.75, "learning_rate": 1.8422044766041007e-05, "loss": 0.3958189, "memory(GiB)": 15.04, "step": 3300, "train_speed(iter/s)": 0.339266 }, { "epoch": 0.4444444444444444, "eval_acc": 0.8735746939204456, "eval_loss": 0.4853774905204773, "eval_runtime": 109.648, "eval_samples_per_second": 1.368, "eval_steps_per_second": 1.368, "step": 3300 }, { "acc": 0.87697134, "epoch": 0.44511784511784513, "grad_norm": 23.375, "learning_rate": 1.8416036088371584e-05, "loss": 0.56972561, "memory(GiB)": 15.04, "step": 3305, "train_speed(iter/s)": 0.335535 }, { "acc": 0.85543346, "epoch": 0.4457912457912458, "grad_norm": 27.625, "learning_rate": 1.8410016976050257e-05, "loss": 0.55093555, "memory(GiB)": 15.04, "step": 3310, "train_speed(iter/s)": 0.33554 }, { "acc": 0.92177601, "epoch": 0.44646464646464645, "grad_norm": 14.25, "learning_rate": 1.8403987436539852e-05, "loss": 0.26810853, "memory(GiB)": 15.04, "step": 3315, "train_speed(iter/s)": 0.335662 }, { "acc": 0.84944754, "epoch": 0.4471380471380471, "grad_norm": 11.9375, "learning_rate": 1.839794747731611e-05, "loss": 0.59606938, "memory(GiB)": 15.04, "step": 3320, "train_speed(iter/s)": 0.335694 }, { "acc": 0.80258579, "epoch": 0.4478114478114478, "grad_norm": 15.5, "learning_rate": 1.8391897105867695e-05, "loss": 0.65352468, "memory(GiB)": 15.04, "step": 3325, "train_speed(iter/s)": 0.335752 }, { "acc": 0.84866686, "epoch": 0.4484848484848485, "grad_norm": 7.71875, "learning_rate": 1.838583632969618e-05, "loss": 0.53425221, "memory(GiB)": 15.04, "step": 3330, "train_speed(iter/s)": 0.33579 }, { "acc": 0.88721209, "epoch": 0.44915824915824915, "grad_norm": 8.1875, "learning_rate": 1.837976515631604e-05, "loss": 0.50930309, "memory(GiB)": 15.04, "step": 3335, "train_speed(iter/s)": 0.335762 }, { "acc": 0.82474318, "epoch": 0.4498316498316498, "grad_norm": 12.9375, "learning_rate": 1.8373683593254646e-05, "loss": 0.47711983, "memory(GiB)": 15.04, "step": 3340, "train_speed(iter/s)": 0.335867 }, { "acc": 0.90135469, "epoch": 0.4505050505050505, "grad_norm": 13.0625, "learning_rate": 1.8367591648052242e-05, "loss": 0.33111401, "memory(GiB)": 15.04, "step": 3345, "train_speed(iter/s)": 0.335958 }, { "acc": 0.88928232, "epoch": 0.4511784511784512, "grad_norm": 5.4375, "learning_rate": 1.8361489328261947e-05, "loss": 0.34135718, "memory(GiB)": 15.04, "step": 3350, "train_speed(iter/s)": 0.336026 }, { "acc": 0.83216152, "epoch": 0.45185185185185184, "grad_norm": 10.0625, "learning_rate": 1.835537664144974e-05, "loss": 0.46434398, "memory(GiB)": 15.04, "step": 3355, "train_speed(iter/s)": 0.336126 }, { "acc": 0.87009315, "epoch": 0.45252525252525255, "grad_norm": 16.5, "learning_rate": 1.8349253595194465e-05, "loss": 0.44999747, "memory(GiB)": 15.04, "step": 3360, "train_speed(iter/s)": 0.336234 }, { "acc": 0.77190375, "epoch": 0.4531986531986532, "grad_norm": 6.25, "learning_rate": 1.8343120197087798e-05, "loss": 0.6485055, "memory(GiB)": 15.04, "step": 3365, "train_speed(iter/s)": 0.33628 }, { "acc": 0.89529934, "epoch": 0.45387205387205387, "grad_norm": 9.5, "learning_rate": 1.8336976454734254e-05, "loss": 0.42717724, "memory(GiB)": 15.04, "step": 3370, "train_speed(iter/s)": 0.336309 }, { "acc": 0.8757637, "epoch": 0.45454545454545453, "grad_norm": 8.625, "learning_rate": 1.8330822375751172e-05, "loss": 0.390833, "memory(GiB)": 15.04, "step": 3375, "train_speed(iter/s)": 0.336342 }, { "acc": 0.86099453, "epoch": 0.45521885521885525, "grad_norm": 11.125, "learning_rate": 1.8324657967768712e-05, "loss": 0.55917716, "memory(GiB)": 15.04, "step": 3380, "train_speed(iter/s)": 0.336374 }, { "acc": 0.88323507, "epoch": 0.4558922558922559, "grad_norm": 13.9375, "learning_rate": 1.8318483238429835e-05, "loss": 0.32566054, "memory(GiB)": 15.04, "step": 3385, "train_speed(iter/s)": 0.336483 }, { "acc": 0.89567146, "epoch": 0.45656565656565656, "grad_norm": 7.8125, "learning_rate": 1.8312298195390303e-05, "loss": 0.35909457, "memory(GiB)": 15.04, "step": 3390, "train_speed(iter/s)": 0.33651 }, { "acc": 0.92252321, "epoch": 0.4572390572390572, "grad_norm": 5.875, "learning_rate": 1.8306102846318664e-05, "loss": 0.28103931, "memory(GiB)": 15.04, "step": 3395, "train_speed(iter/s)": 0.33656 }, { "acc": 0.88294439, "epoch": 0.45791245791245794, "grad_norm": 9.25, "learning_rate": 1.8299897198896234e-05, "loss": 0.58328576, "memory(GiB)": 15.04, "step": 3400, "train_speed(iter/s)": 0.336662 }, { "acc": 0.89842358, "epoch": 0.4585858585858586, "grad_norm": 9.6875, "learning_rate": 1.829368126081712e-05, "loss": 0.32496204, "memory(GiB)": 15.04, "step": 3405, "train_speed(iter/s)": 0.336753 }, { "acc": 0.88026924, "epoch": 0.45925925925925926, "grad_norm": 8.25, "learning_rate": 1.828745503978816e-05, "loss": 0.48563657, "memory(GiB)": 15.04, "step": 3410, "train_speed(iter/s)": 0.336855 }, { "acc": 0.8944335, "epoch": 0.4599326599326599, "grad_norm": 4.40625, "learning_rate": 1.8281218543528973e-05, "loss": 0.30072145, "memory(GiB)": 15.04, "step": 3415, "train_speed(iter/s)": 0.336902 }, { "acc": 0.84510555, "epoch": 0.46060606060606063, "grad_norm": 14.125, "learning_rate": 1.8274971779771888e-05, "loss": 0.46908517, "memory(GiB)": 15.04, "step": 3420, "train_speed(iter/s)": 0.336992 }, { "acc": 0.89817324, "epoch": 0.4612794612794613, "grad_norm": 10.6875, "learning_rate": 1.826871475626198e-05, "loss": 0.32642584, "memory(GiB)": 15.04, "step": 3425, "train_speed(iter/s)": 0.337076 }, { "acc": 0.88077135, "epoch": 0.46195286195286195, "grad_norm": 7.09375, "learning_rate": 1.8262447480757048e-05, "loss": 0.32293191, "memory(GiB)": 15.04, "step": 3430, "train_speed(iter/s)": 0.337165 }, { "acc": 0.89517422, "epoch": 0.4626262626262626, "grad_norm": 10.5625, "learning_rate": 1.8256169961027588e-05, "loss": 0.33811147, "memory(GiB)": 15.04, "step": 3435, "train_speed(iter/s)": 0.337221 }, { "acc": 0.75822539, "epoch": 0.4632996632996633, "grad_norm": 11.5, "learning_rate": 1.8249882204856802e-05, "loss": 0.66904917, "memory(GiB)": 15.04, "step": 3440, "train_speed(iter/s)": 0.337323 }, { "acc": 0.89435711, "epoch": 0.463973063973064, "grad_norm": 6.5, "learning_rate": 1.82435842200406e-05, "loss": 0.27627466, "memory(GiB)": 15.04, "step": 3445, "train_speed(iter/s)": 0.33741 }, { "acc": 0.9228344, "epoch": 0.46464646464646464, "grad_norm": 8.6875, "learning_rate": 1.823727601438755e-05, "loss": 0.33515551, "memory(GiB)": 15.04, "step": 3450, "train_speed(iter/s)": 0.337459 }, { "acc": 0.8139946, "epoch": 0.4653198653198653, "grad_norm": 7.59375, "learning_rate": 1.82309575957189e-05, "loss": 0.40257473, "memory(GiB)": 15.04, "step": 3455, "train_speed(iter/s)": 0.337525 }, { "acc": 0.90620852, "epoch": 0.465993265993266, "grad_norm": 14.5, "learning_rate": 1.8224628971868573e-05, "loss": 0.33643632, "memory(GiB)": 15.04, "step": 3460, "train_speed(iter/s)": 0.337598 }, { "acc": 0.87207336, "epoch": 0.4666666666666667, "grad_norm": 21.125, "learning_rate": 1.821829015068313e-05, "loss": 0.50049963, "memory(GiB)": 15.04, "step": 3465, "train_speed(iter/s)": 0.337654 }, { "acc": 0.91413269, "epoch": 0.46734006734006733, "grad_norm": 7.4375, "learning_rate": 1.821194114002178e-05, "loss": 0.29297369, "memory(GiB)": 15.04, "step": 3470, "train_speed(iter/s)": 0.337695 }, { "acc": 0.82051849, "epoch": 0.468013468013468, "grad_norm": 9.375, "learning_rate": 1.820558194775637e-05, "loss": 0.56539397, "memory(GiB)": 15.04, "step": 3475, "train_speed(iter/s)": 0.33781 }, { "acc": 0.89184208, "epoch": 0.4686868686868687, "grad_norm": 6.15625, "learning_rate": 1.8199212581771366e-05, "loss": 0.40346231, "memory(GiB)": 15.04, "step": 3480, "train_speed(iter/s)": 0.33786 }, { "acc": 0.88671398, "epoch": 0.46936026936026937, "grad_norm": 7.0, "learning_rate": 1.8192833049963848e-05, "loss": 0.49412446, "memory(GiB)": 15.04, "step": 3485, "train_speed(iter/s)": 0.337936 }, { "acc": 0.83467436, "epoch": 0.47003367003367, "grad_norm": 9.3125, "learning_rate": 1.8186443360243502e-05, "loss": 0.43304119, "memory(GiB)": 15.04, "step": 3490, "train_speed(iter/s)": 0.337992 }, { "acc": 0.78681512, "epoch": 0.4707070707070707, "grad_norm": 9.9375, "learning_rate": 1.81800435205326e-05, "loss": 1.1245203, "memory(GiB)": 15.04, "step": 3495, "train_speed(iter/s)": 0.338099 }, { "acc": 0.87781572, "epoch": 0.4713804713804714, "grad_norm": 9.25, "learning_rate": 1.8173633538766018e-05, "loss": 0.43102632, "memory(GiB)": 15.04, "step": 3500, "train_speed(iter/s)": 0.338126 }, { "acc": 0.85797892, "epoch": 0.47205387205387206, "grad_norm": 14.1875, "learning_rate": 1.8167213422891187e-05, "loss": 0.49299965, "memory(GiB)": 15.04, "step": 3505, "train_speed(iter/s)": 0.338127 }, { "acc": 0.79441729, "epoch": 0.4727272727272727, "grad_norm": 16.125, "learning_rate": 1.8160783180868108e-05, "loss": 0.60936694, "memory(GiB)": 15.04, "step": 3510, "train_speed(iter/s)": 0.338192 }, { "acc": 0.89987812, "epoch": 0.4734006734006734, "grad_norm": 10.6875, "learning_rate": 1.8154342820669346e-05, "loss": 0.31670492, "memory(GiB)": 15.04, "step": 3515, "train_speed(iter/s)": 0.338282 }, { "acc": 0.84160137, "epoch": 0.4740740740740741, "grad_norm": 11.375, "learning_rate": 1.8147892350279997e-05, "loss": 0.65960035, "memory(GiB)": 15.04, "step": 3520, "train_speed(iter/s)": 0.338413 }, { "acc": 0.86717596, "epoch": 0.47474747474747475, "grad_norm": 11.5, "learning_rate": 1.8141431777697707e-05, "loss": 0.46436634, "memory(GiB)": 15.04, "step": 3525, "train_speed(iter/s)": 0.338476 }, { "acc": 0.83819246, "epoch": 0.4754208754208754, "grad_norm": 22.375, "learning_rate": 1.8134961110932634e-05, "loss": 0.65677876, "memory(GiB)": 15.04, "step": 3530, "train_speed(iter/s)": 0.338566 }, { "acc": 0.91159954, "epoch": 0.47609427609427607, "grad_norm": 5.96875, "learning_rate": 1.812848035800746e-05, "loss": 0.29325655, "memory(GiB)": 15.04, "step": 3535, "train_speed(iter/s)": 0.338581 }, { "acc": 0.87480431, "epoch": 0.4767676767676768, "grad_norm": 13.0625, "learning_rate": 1.8121989526957364e-05, "loss": 0.50847125, "memory(GiB)": 15.04, "step": 3540, "train_speed(iter/s)": 0.338624 }, { "acc": 0.88437653, "epoch": 0.47744107744107744, "grad_norm": 7.40625, "learning_rate": 1.8115488625830032e-05, "loss": 0.41104035, "memory(GiB)": 15.04, "step": 3545, "train_speed(iter/s)": 0.338657 }, { "acc": 0.91306934, "epoch": 0.4781144781144781, "grad_norm": 8.9375, "learning_rate": 1.8108977662685628e-05, "loss": 0.32102196, "memory(GiB)": 15.04, "step": 3550, "train_speed(iter/s)": 0.338739 }, { "acc": 0.81103878, "epoch": 0.47878787878787876, "grad_norm": 20.125, "learning_rate": 1.8102456645596787e-05, "loss": 0.53531909, "memory(GiB)": 15.04, "step": 3555, "train_speed(iter/s)": 0.338796 }, { "acc": 0.88863211, "epoch": 0.4794612794612795, "grad_norm": 12.75, "learning_rate": 1.8095925582648624e-05, "loss": 0.33367224, "memory(GiB)": 15.04, "step": 3560, "train_speed(iter/s)": 0.338885 }, { "acc": 0.82931652, "epoch": 0.48013468013468014, "grad_norm": 10.25, "learning_rate": 1.8089384481938694e-05, "loss": 0.43920531, "memory(GiB)": 15.04, "step": 3565, "train_speed(iter/s)": 0.33892 }, { "acc": 0.90031691, "epoch": 0.4808080808080808, "grad_norm": 6.46875, "learning_rate": 1.8082833351577003e-05, "loss": 0.29520645, "memory(GiB)": 15.04, "step": 3570, "train_speed(iter/s)": 0.338981 }, { "acc": 0.86380301, "epoch": 0.48148148148148145, "grad_norm": 11.4375, "learning_rate": 1.8076272199685996e-05, "loss": 0.43042107, "memory(GiB)": 15.04, "step": 3575, "train_speed(iter/s)": 0.339054 }, { "acc": 0.89454012, "epoch": 0.48215488215488217, "grad_norm": 7.6875, "learning_rate": 1.806970103440054e-05, "loss": 0.21537628, "memory(GiB)": 15.04, "step": 3580, "train_speed(iter/s)": 0.339125 }, { "acc": 0.85797701, "epoch": 0.48282828282828283, "grad_norm": 7.28125, "learning_rate": 1.8063119863867915e-05, "loss": 0.60251961, "memory(GiB)": 15.04, "step": 3585, "train_speed(iter/s)": 0.339012 }, { "acc": 0.89630127, "epoch": 0.4835016835016835, "grad_norm": 6.625, "learning_rate": 1.805652869624781e-05, "loss": 0.27930336, "memory(GiB)": 15.04, "step": 3590, "train_speed(iter/s)": 0.339137 }, { "acc": 0.83644896, "epoch": 0.4841750841750842, "grad_norm": 15.875, "learning_rate": 1.804992753971231e-05, "loss": 0.58859396, "memory(GiB)": 15.04, "step": 3595, "train_speed(iter/s)": 0.339189 }, { "acc": 0.85748091, "epoch": 0.48484848484848486, "grad_norm": 6.625, "learning_rate": 1.8043316402445876e-05, "loss": 0.5261065, "memory(GiB)": 15.04, "step": 3600, "train_speed(iter/s)": 0.33925 }, { "epoch": 0.48484848484848486, "eval_acc": 0.876711972764365, "eval_loss": 0.4761183559894562, "eval_runtime": 109.805, "eval_samples_per_second": 1.366, "eval_steps_per_second": 1.366, "step": 3600 }, { "acc": 0.83587589, "epoch": 0.4855218855218855, "grad_norm": 9.8125, "learning_rate": 1.8036695292645356e-05, "loss": 0.38770075, "memory(GiB)": 15.04, "step": 3605, "train_speed(iter/s)": 0.33586 }, { "acc": 0.90010643, "epoch": 0.4861952861952862, "grad_norm": 9.6875, "learning_rate": 1.8030064218519952e-05, "loss": 0.43593073, "memory(GiB)": 15.04, "step": 3610, "train_speed(iter/s)": 0.335904 }, { "acc": 0.88875904, "epoch": 0.4868686868686869, "grad_norm": 11.875, "learning_rate": 1.8023423188291227e-05, "loss": 0.30031426, "memory(GiB)": 15.04, "step": 3615, "train_speed(iter/s)": 0.335967 }, { "acc": 0.84387503, "epoch": 0.48754208754208755, "grad_norm": 9.25, "learning_rate": 1.8016772210193086e-05, "loss": 0.80649157, "memory(GiB)": 15.04, "step": 3620, "train_speed(iter/s)": 0.336027 }, { "acc": 0.83444624, "epoch": 0.4882154882154882, "grad_norm": 6.90625, "learning_rate": 1.8010111292471765e-05, "loss": 0.44340019, "memory(GiB)": 15.04, "step": 3625, "train_speed(iter/s)": 0.336048 }, { "acc": 0.89580994, "epoch": 0.4888888888888889, "grad_norm": 12.0625, "learning_rate": 1.8003440443385827e-05, "loss": 0.41821451, "memory(GiB)": 15.04, "step": 3630, "train_speed(iter/s)": 0.336115 }, { "acc": 0.85760431, "epoch": 0.4895622895622896, "grad_norm": 17.5, "learning_rate": 1.7996759671206148e-05, "loss": 0.43475323, "memory(GiB)": 15.04, "step": 3635, "train_speed(iter/s)": 0.336135 }, { "acc": 0.88512459, "epoch": 0.49023569023569025, "grad_norm": 8.0625, "learning_rate": 1.7990068984215905e-05, "loss": 0.2988543, "memory(GiB)": 15.04, "step": 3640, "train_speed(iter/s)": 0.336223 }, { "acc": 0.77889462, "epoch": 0.4909090909090909, "grad_norm": 7.34375, "learning_rate": 1.7983368390710576e-05, "loss": 0.62918062, "memory(GiB)": 15.04, "step": 3645, "train_speed(iter/s)": 0.33624 }, { "acc": 0.90301514, "epoch": 0.49158249158249157, "grad_norm": 5.71875, "learning_rate": 1.797665789899791e-05, "loss": 0.32716761, "memory(GiB)": 15.04, "step": 3650, "train_speed(iter/s)": 0.33628 }, { "acc": 0.89472647, "epoch": 0.4922558922558923, "grad_norm": 12.125, "learning_rate": 1.796993751739793e-05, "loss": 0.46433377, "memory(GiB)": 15.04, "step": 3655, "train_speed(iter/s)": 0.336342 }, { "acc": 0.84266872, "epoch": 0.49292929292929294, "grad_norm": 8.375, "learning_rate": 1.7963207254242933e-05, "loss": 0.54928746, "memory(GiB)": 15.04, "step": 3660, "train_speed(iter/s)": 0.336405 }, { "acc": 0.89203091, "epoch": 0.4936026936026936, "grad_norm": 4.21875, "learning_rate": 1.795646711787746e-05, "loss": 0.41538334, "memory(GiB)": 15.04, "step": 3665, "train_speed(iter/s)": 0.336362 }, { "acc": 0.89859972, "epoch": 0.49427609427609426, "grad_norm": 10.8125, "learning_rate": 1.7949717116658282e-05, "loss": 0.31999536, "memory(GiB)": 15.04, "step": 3670, "train_speed(iter/s)": 0.336473 }, { "acc": 0.91256189, "epoch": 0.494949494949495, "grad_norm": 5.90625, "learning_rate": 1.7942957258954425e-05, "loss": 0.34010236, "memory(GiB)": 15.04, "step": 3675, "train_speed(iter/s)": 0.33655 }, { "acc": 0.88669271, "epoch": 0.49562289562289563, "grad_norm": 17.625, "learning_rate": 1.7936187553147108e-05, "loss": 0.49786587, "memory(GiB)": 15.04, "step": 3680, "train_speed(iter/s)": 0.33657 }, { "acc": 0.88359947, "epoch": 0.4962962962962963, "grad_norm": 9.1875, "learning_rate": 1.7929408007629788e-05, "loss": 0.48834329, "memory(GiB)": 15.04, "step": 3685, "train_speed(iter/s)": 0.336623 }, { "acc": 0.9091341, "epoch": 0.49696969696969695, "grad_norm": 9.5625, "learning_rate": 1.79226186308081e-05, "loss": 0.29978616, "memory(GiB)": 15.04, "step": 3690, "train_speed(iter/s)": 0.336687 }, { "acc": 0.87099781, "epoch": 0.49764309764309766, "grad_norm": 11.0625, "learning_rate": 1.7915819431099882e-05, "loss": 0.41647654, "memory(GiB)": 15.04, "step": 3695, "train_speed(iter/s)": 0.336752 }, { "acc": 0.88838787, "epoch": 0.4983164983164983, "grad_norm": 7.875, "learning_rate": 1.790901041693514e-05, "loss": 0.45208883, "memory(GiB)": 15.04, "step": 3700, "train_speed(iter/s)": 0.336698 }, { "acc": 0.88066473, "epoch": 0.498989898989899, "grad_norm": 7.0625, "learning_rate": 1.7902191596756058e-05, "loss": 0.35533628, "memory(GiB)": 15.04, "step": 3705, "train_speed(iter/s)": 0.336749 }, { "acc": 0.85620632, "epoch": 0.49966329966329964, "grad_norm": 7.59375, "learning_rate": 1.7895362979016975e-05, "loss": 0.51151724, "memory(GiB)": 15.04, "step": 3710, "train_speed(iter/s)": 0.336765 }, { "acc": 0.89235287, "epoch": 0.5003367003367003, "grad_norm": 7.90625, "learning_rate": 1.7888524572184375e-05, "loss": 0.42579594, "memory(GiB)": 15.04, "step": 3715, "train_speed(iter/s)": 0.336862 }, { "acc": 0.85100842, "epoch": 0.501010101010101, "grad_norm": 13.25, "learning_rate": 1.7881676384736876e-05, "loss": 0.5550694, "memory(GiB)": 15.04, "step": 3720, "train_speed(iter/s)": 0.336955 }, { "acc": 0.84736996, "epoch": 0.5016835016835017, "grad_norm": 7.34375, "learning_rate": 1.7874818425165233e-05, "loss": 0.57301402, "memory(GiB)": 15.04, "step": 3725, "train_speed(iter/s)": 0.337066 }, { "acc": 0.90914621, "epoch": 0.5023569023569023, "grad_norm": 7.90625, "learning_rate": 1.7867950701972313e-05, "loss": 0.30549459, "memory(GiB)": 15.04, "step": 3730, "train_speed(iter/s)": 0.337125 }, { "acc": 0.91866875, "epoch": 0.503030303030303, "grad_norm": 8.1875, "learning_rate": 1.7861073223673084e-05, "loss": 0.28113177, "memory(GiB)": 15.04, "step": 3735, "train_speed(iter/s)": 0.3372 }, { "acc": 0.90182066, "epoch": 0.5037037037037037, "grad_norm": 16.5, "learning_rate": 1.785418599879461e-05, "loss": 0.36212847, "memory(GiB)": 15.04, "step": 3740, "train_speed(iter/s)": 0.337301 }, { "acc": 0.79256129, "epoch": 0.5043771043771044, "grad_norm": 24.25, "learning_rate": 1.7847289035876044e-05, "loss": 0.93178387, "memory(GiB)": 15.04, "step": 3745, "train_speed(iter/s)": 0.337414 }, { "acc": 0.83150692, "epoch": 0.5050505050505051, "grad_norm": 14.25, "learning_rate": 1.7840382343468604e-05, "loss": 0.67089224, "memory(GiB)": 15.04, "step": 3750, "train_speed(iter/s)": 0.337449 }, { "acc": 0.89720764, "epoch": 0.5057239057239057, "grad_norm": 6.84375, "learning_rate": 1.7833465930135586e-05, "loss": 0.37244151, "memory(GiB)": 15.04, "step": 3755, "train_speed(iter/s)": 0.337492 }, { "acc": 0.86719933, "epoch": 0.5063973063973064, "grad_norm": 9.5, "learning_rate": 1.782653980445232e-05, "loss": 0.59741712, "memory(GiB)": 15.04, "step": 3760, "train_speed(iter/s)": 0.33755 }, { "acc": 0.8698164, "epoch": 0.5070707070707071, "grad_norm": 10.4375, "learning_rate": 1.7819603975006195e-05, "loss": 0.38258035, "memory(GiB)": 15.04, "step": 3765, "train_speed(iter/s)": 0.337623 }, { "acc": 0.90045404, "epoch": 0.5077441077441077, "grad_norm": 5.84375, "learning_rate": 1.781265845039662e-05, "loss": 0.42641959, "memory(GiB)": 15.04, "step": 3770, "train_speed(iter/s)": 0.337552 }, { "acc": 0.86632509, "epoch": 0.5084175084175084, "grad_norm": 7.6875, "learning_rate": 1.7805703239235023e-05, "loss": 0.51779404, "memory(GiB)": 15.04, "step": 3775, "train_speed(iter/s)": 0.337629 }, { "acc": 0.82737207, "epoch": 0.509090909090909, "grad_norm": 11.4375, "learning_rate": 1.7798738350144854e-05, "loss": 0.61180573, "memory(GiB)": 15.04, "step": 3780, "train_speed(iter/s)": 0.337738 }, { "acc": 0.91728687, "epoch": 0.5097643097643098, "grad_norm": 6.0625, "learning_rate": 1.7791763791761557e-05, "loss": 0.25166643, "memory(GiB)": 15.04, "step": 3785, "train_speed(iter/s)": 0.337811 }, { "acc": 0.88268843, "epoch": 0.5104377104377105, "grad_norm": 4.59375, "learning_rate": 1.7784779572732558e-05, "loss": 0.43862414, "memory(GiB)": 15.04, "step": 3790, "train_speed(iter/s)": 0.337802 }, { "acc": 0.86389875, "epoch": 0.5111111111111111, "grad_norm": 10.4375, "learning_rate": 1.7777785701717266e-05, "loss": 0.41195416, "memory(GiB)": 15.04, "step": 3795, "train_speed(iter/s)": 0.337886 }, { "acc": 0.8191576, "epoch": 0.5117845117845118, "grad_norm": 7.40625, "learning_rate": 1.7770782187387056e-05, "loss": 0.74752345, "memory(GiB)": 15.04, "step": 3800, "train_speed(iter/s)": 0.337868 }, { "acc": 0.86887932, "epoch": 0.5124579124579125, "grad_norm": 7.03125, "learning_rate": 1.776376903842526e-05, "loss": 0.45682001, "memory(GiB)": 15.04, "step": 3805, "train_speed(iter/s)": 0.337834 }, { "acc": 0.92683773, "epoch": 0.5131313131313131, "grad_norm": 12.4375, "learning_rate": 1.7756746263527157e-05, "loss": 0.31054833, "memory(GiB)": 15.04, "step": 3810, "train_speed(iter/s)": 0.337921 }, { "acc": 0.89547215, "epoch": 0.5138047138047138, "grad_norm": 5.875, "learning_rate": 1.774971387139996e-05, "loss": 0.45491533, "memory(GiB)": 15.04, "step": 3815, "train_speed(iter/s)": 0.337943 }, { "acc": 0.88838444, "epoch": 0.5144781144781144, "grad_norm": 12.3125, "learning_rate": 1.7742671870762805e-05, "loss": 0.34393697, "memory(GiB)": 15.04, "step": 3820, "train_speed(iter/s)": 0.337996 }, { "acc": 0.85375404, "epoch": 0.5151515151515151, "grad_norm": 9.3125, "learning_rate": 1.7735620270346735e-05, "loss": 0.64072185, "memory(GiB)": 15.04, "step": 3825, "train_speed(iter/s)": 0.338096 }, { "acc": 0.89439125, "epoch": 0.5158249158249159, "grad_norm": 7.84375, "learning_rate": 1.7728559078894708e-05, "loss": 0.34980464, "memory(GiB)": 15.04, "step": 3830, "train_speed(iter/s)": 0.33818 }, { "acc": 0.89482927, "epoch": 0.5164983164983165, "grad_norm": 15.375, "learning_rate": 1.7721488305161566e-05, "loss": 0.42905107, "memory(GiB)": 15.04, "step": 3835, "train_speed(iter/s)": 0.33823 }, { "acc": 0.78987341, "epoch": 0.5171717171717172, "grad_norm": 19.125, "learning_rate": 1.7714407957914033e-05, "loss": 0.84220772, "memory(GiB)": 15.04, "step": 3840, "train_speed(iter/s)": 0.338284 }, { "acc": 0.8020134, "epoch": 0.5178451178451179, "grad_norm": 18.0, "learning_rate": 1.77073180459307e-05, "loss": 0.80393257, "memory(GiB)": 15.04, "step": 3845, "train_speed(iter/s)": 0.338378 }, { "acc": 0.85574732, "epoch": 0.5185185185185185, "grad_norm": 13.0625, "learning_rate": 1.7700218578002018e-05, "loss": 0.60742431, "memory(GiB)": 15.04, "step": 3850, "train_speed(iter/s)": 0.338399 }, { "acc": 0.90230808, "epoch": 0.5191919191919192, "grad_norm": 8.625, "learning_rate": 1.7693109562930294e-05, "loss": 0.48803911, "memory(GiB)": 15.04, "step": 3855, "train_speed(iter/s)": 0.338495 }, { "acc": 0.9076643, "epoch": 0.5198653198653199, "grad_norm": 11.25, "learning_rate": 1.7685991009529658e-05, "loss": 0.30903282, "memory(GiB)": 15.04, "step": 3860, "train_speed(iter/s)": 0.338563 }, { "acc": 0.71694846, "epoch": 0.5205387205387205, "grad_norm": 10.5, "learning_rate": 1.7678862926626076e-05, "loss": 0.96086855, "memory(GiB)": 15.04, "step": 3865, "train_speed(iter/s)": 0.338651 }, { "acc": 0.91138239, "epoch": 0.5212121212121212, "grad_norm": 5.46875, "learning_rate": 1.767172532305733e-05, "loss": 0.37817876, "memory(GiB)": 15.04, "step": 3870, "train_speed(iter/s)": 0.338704 }, { "acc": 0.91213226, "epoch": 0.5218855218855218, "grad_norm": 7.5625, "learning_rate": 1.7664578207672997e-05, "loss": 0.33159304, "memory(GiB)": 15.04, "step": 3875, "train_speed(iter/s)": 0.338766 }, { "acc": 0.94331083, "epoch": 0.5225589225589226, "grad_norm": 6.875, "learning_rate": 1.765742158933446e-05, "loss": 0.23964734, "memory(GiB)": 15.04, "step": 3880, "train_speed(iter/s)": 0.338776 }, { "acc": 0.88471804, "epoch": 0.5232323232323233, "grad_norm": 7.25, "learning_rate": 1.765025547691487e-05, "loss": 0.45875759, "memory(GiB)": 15.04, "step": 3885, "train_speed(iter/s)": 0.338707 }, { "acc": 0.89637489, "epoch": 0.5239057239057239, "grad_norm": 11.0, "learning_rate": 1.7643079879299163e-05, "loss": 0.44263835, "memory(GiB)": 15.04, "step": 3890, "train_speed(iter/s)": 0.338725 }, { "acc": 0.86330814, "epoch": 0.5245791245791246, "grad_norm": 8.875, "learning_rate": 1.7635894805384024e-05, "loss": 0.54586911, "memory(GiB)": 15.04, "step": 3895, "train_speed(iter/s)": 0.338773 }, { "acc": 0.82347183, "epoch": 0.5252525252525253, "grad_norm": 10.5625, "learning_rate": 1.7628700264077893e-05, "loss": 0.66672564, "memory(GiB)": 15.04, "step": 3900, "train_speed(iter/s)": 0.338762 }, { "epoch": 0.5252525252525253, "eval_acc": 0.8781347962382445, "eval_loss": 0.4679408669471741, "eval_runtime": 109.8842, "eval_samples_per_second": 1.365, "eval_steps_per_second": 1.365, "step": 3900 }, { "acc": 0.77695065, "epoch": 0.5259259259259259, "grad_norm": 18.625, "learning_rate": 1.7621496264300954e-05, "loss": 0.54761453, "memory(GiB)": 15.04, "step": 3905, "train_speed(iter/s)": 0.335549 }, { "acc": 0.86843939, "epoch": 0.5265993265993266, "grad_norm": 8.5625, "learning_rate": 1.76142828149851e-05, "loss": 0.45215411, "memory(GiB)": 15.04, "step": 3910, "train_speed(iter/s)": 0.335647 }, { "acc": 0.86568508, "epoch": 0.5272727272727272, "grad_norm": 7.59375, "learning_rate": 1.760705992507396e-05, "loss": 0.52686167, "memory(GiB)": 15.04, "step": 3915, "train_speed(iter/s)": 0.335723 }, { "acc": 0.84859962, "epoch": 0.5279461279461279, "grad_norm": 11.875, "learning_rate": 1.7599827603522858e-05, "loss": 0.56251554, "memory(GiB)": 15.04, "step": 3920, "train_speed(iter/s)": 0.335692 }, { "acc": 0.84980879, "epoch": 0.5286195286195287, "grad_norm": 8.3125, "learning_rate": 1.7592585859298808e-05, "loss": 0.41706219, "memory(GiB)": 15.04, "step": 3925, "train_speed(iter/s)": 0.335597 }, { "acc": 0.82573633, "epoch": 0.5292929292929293, "grad_norm": 5.4375, "learning_rate": 1.7585334701380518e-05, "loss": 0.57327018, "memory(GiB)": 15.04, "step": 3930, "train_speed(iter/s)": 0.335688 }, { "acc": 0.80837812, "epoch": 0.52996632996633, "grad_norm": 9.0, "learning_rate": 1.757807413875836e-05, "loss": 0.77750101, "memory(GiB)": 15.04, "step": 3935, "train_speed(iter/s)": 0.335716 }, { "acc": 0.92627468, "epoch": 0.5306397306397307, "grad_norm": 5.53125, "learning_rate": 1.7570804180434368e-05, "loss": 0.32449107, "memory(GiB)": 15.04, "step": 3940, "train_speed(iter/s)": 0.335731 }, { "acc": 0.9044364, "epoch": 0.5313131313131313, "grad_norm": 7.15625, "learning_rate": 1.7563524835422224e-05, "loss": 0.4426527, "memory(GiB)": 15.04, "step": 3945, "train_speed(iter/s)": 0.335705 }, { "acc": 0.87809496, "epoch": 0.531986531986532, "grad_norm": 10.0, "learning_rate": 1.7556236112747253e-05, "loss": 0.47075071, "memory(GiB)": 15.04, "step": 3950, "train_speed(iter/s)": 0.335751 }, { "acc": 0.92612886, "epoch": 0.5326599326599326, "grad_norm": 11.5625, "learning_rate": 1.7548938021446398e-05, "loss": 0.27105238, "memory(GiB)": 15.04, "step": 3955, "train_speed(iter/s)": 0.335827 }, { "acc": 0.91240883, "epoch": 0.5333333333333333, "grad_norm": 6.09375, "learning_rate": 1.7541630570568227e-05, "loss": 0.27689917, "memory(GiB)": 15.04, "step": 3960, "train_speed(iter/s)": 0.335874 }, { "acc": 0.87582722, "epoch": 0.534006734006734, "grad_norm": 9.3125, "learning_rate": 1.7534313769172908e-05, "loss": 0.48547306, "memory(GiB)": 15.04, "step": 3965, "train_speed(iter/s)": 0.335879 }, { "acc": 0.88297787, "epoch": 0.5346801346801346, "grad_norm": 14.125, "learning_rate": 1.7526987626332202e-05, "loss": 0.45881667, "memory(GiB)": 15.04, "step": 3970, "train_speed(iter/s)": 0.335924 }, { "acc": 0.81192646, "epoch": 0.5353535353535354, "grad_norm": 11.0, "learning_rate": 1.7519652151129458e-05, "loss": 0.8739994, "memory(GiB)": 15.04, "step": 3975, "train_speed(iter/s)": 0.335987 }, { "acc": 0.88407602, "epoch": 0.5360269360269361, "grad_norm": 10.5, "learning_rate": 1.7512307352659583e-05, "loss": 0.27493479, "memory(GiB)": 15.04, "step": 3980, "train_speed(iter/s)": 0.336073 }, { "acc": 0.90319691, "epoch": 0.5367003367003367, "grad_norm": 7.25, "learning_rate": 1.7504953240029053e-05, "loss": 0.3337369, "memory(GiB)": 15.04, "step": 3985, "train_speed(iter/s)": 0.336143 }, { "acc": 0.914468, "epoch": 0.5373737373737374, "grad_norm": 7.15625, "learning_rate": 1.7497589822355892e-05, "loss": 0.30370355, "memory(GiB)": 15.04, "step": 3990, "train_speed(iter/s)": 0.336166 }, { "acc": 0.8577981, "epoch": 0.538047138047138, "grad_norm": 8.875, "learning_rate": 1.7490217108769663e-05, "loss": 0.5094347, "memory(GiB)": 15.04, "step": 3995, "train_speed(iter/s)": 0.33625 }, { "acc": 0.90495567, "epoch": 0.5387205387205387, "grad_norm": 7.59375, "learning_rate": 1.7482835108411442e-05, "loss": 0.36858811, "memory(GiB)": 15.04, "step": 4000, "train_speed(iter/s)": 0.336304 }, { "acc": 0.89041739, "epoch": 0.5393939393939394, "grad_norm": 4.84375, "learning_rate": 1.7475443830433835e-05, "loss": 0.32242258, "memory(GiB)": 15.04, "step": 4005, "train_speed(iter/s)": 0.336248 }, { "acc": 0.89687119, "epoch": 0.54006734006734, "grad_norm": 6.5625, "learning_rate": 1.7468043284000945e-05, "loss": 0.29208932, "memory(GiB)": 15.04, "step": 4010, "train_speed(iter/s)": 0.336335 }, { "acc": 0.82246227, "epoch": 0.5407407407407407, "grad_norm": 10.8125, "learning_rate": 1.746063347828836e-05, "loss": 0.56478887, "memory(GiB)": 15.04, "step": 4015, "train_speed(iter/s)": 0.336412 }, { "acc": 0.87356806, "epoch": 0.5414141414141415, "grad_norm": 7.90625, "learning_rate": 1.7453214422483154e-05, "loss": 0.48184762, "memory(GiB)": 15.04, "step": 4020, "train_speed(iter/s)": 0.336486 }, { "acc": 0.84914217, "epoch": 0.5420875420875421, "grad_norm": 11.625, "learning_rate": 1.744578612578387e-05, "loss": 0.42026148, "memory(GiB)": 15.04, "step": 4025, "train_speed(iter/s)": 0.336521 }, { "acc": 0.8880044, "epoch": 0.5427609427609428, "grad_norm": 8.5625, "learning_rate": 1.7438348597400513e-05, "loss": 0.34287093, "memory(GiB)": 15.04, "step": 4030, "train_speed(iter/s)": 0.336597 }, { "acc": 0.89508591, "epoch": 0.5434343434343434, "grad_norm": 11.375, "learning_rate": 1.7430901846554525e-05, "loss": 0.33280551, "memory(GiB)": 15.04, "step": 4035, "train_speed(iter/s)": 0.336685 }, { "acc": 0.88267012, "epoch": 0.5441077441077441, "grad_norm": 7.46875, "learning_rate": 1.7423445882478785e-05, "loss": 0.36910105, "memory(GiB)": 15.04, "step": 4040, "train_speed(iter/s)": 0.336762 }, { "acc": 0.88442879, "epoch": 0.5447811447811448, "grad_norm": 7.53125, "learning_rate": 1.74159807144176e-05, "loss": 0.42848067, "memory(GiB)": 15.04, "step": 4045, "train_speed(iter/s)": 0.336796 }, { "acc": 0.90293417, "epoch": 0.5454545454545454, "grad_norm": 10.8125, "learning_rate": 1.7408506351626677e-05, "loss": 0.37314644, "memory(GiB)": 15.04, "step": 4050, "train_speed(iter/s)": 0.336881 }, { "acc": 0.94608707, "epoch": 0.5461279461279461, "grad_norm": 10.75, "learning_rate": 1.740102280337314e-05, "loss": 0.19708145, "memory(GiB)": 15.04, "step": 4055, "train_speed(iter/s)": 0.336948 }, { "acc": 0.86210232, "epoch": 0.5468013468013468, "grad_norm": 14.625, "learning_rate": 1.7393530078935486e-05, "loss": 0.54546967, "memory(GiB)": 15.04, "step": 4060, "train_speed(iter/s)": 0.337023 }, { "acc": 0.8828249, "epoch": 0.5474747474747474, "grad_norm": 5.84375, "learning_rate": 1.73860281876036e-05, "loss": 0.53822279, "memory(GiB)": 15.04, "step": 4065, "train_speed(iter/s)": 0.33703 }, { "acc": 0.91557751, "epoch": 0.5481481481481482, "grad_norm": 11.8125, "learning_rate": 1.7378517138678727e-05, "loss": 0.40204964, "memory(GiB)": 15.04, "step": 4070, "train_speed(iter/s)": 0.337059 }, { "acc": 0.88881645, "epoch": 0.5488215488215489, "grad_norm": 9.625, "learning_rate": 1.7370996941473464e-05, "loss": 0.50532298, "memory(GiB)": 15.04, "step": 4075, "train_speed(iter/s)": 0.337095 }, { "acc": 0.8979125, "epoch": 0.5494949494949495, "grad_norm": 18.0, "learning_rate": 1.736346760531176e-05, "loss": 0.31720853, "memory(GiB)": 15.04, "step": 4080, "train_speed(iter/s)": 0.337157 }, { "acc": 0.84728556, "epoch": 0.5501683501683502, "grad_norm": 8.3125, "learning_rate": 1.7355929139528888e-05, "loss": 0.45086961, "memory(GiB)": 15.04, "step": 4085, "train_speed(iter/s)": 0.33714 }, { "acc": 0.92506981, "epoch": 0.5508417508417508, "grad_norm": 4.0625, "learning_rate": 1.7348381553471436e-05, "loss": 0.2887996, "memory(GiB)": 15.04, "step": 4090, "train_speed(iter/s)": 0.337177 }, { "acc": 0.91318417, "epoch": 0.5515151515151515, "grad_norm": 9.5625, "learning_rate": 1.734082485649731e-05, "loss": 0.40977402, "memory(GiB)": 15.04, "step": 4095, "train_speed(iter/s)": 0.337245 }, { "acc": 0.89119549, "epoch": 0.5521885521885522, "grad_norm": 8.0, "learning_rate": 1.7333259057975705e-05, "loss": 0.31618371, "memory(GiB)": 15.04, "step": 4100, "train_speed(iter/s)": 0.337301 }, { "acc": 0.91062832, "epoch": 0.5528619528619528, "grad_norm": 7.15625, "learning_rate": 1.7325684167287105e-05, "loss": 0.28954463, "memory(GiB)": 15.04, "step": 4105, "train_speed(iter/s)": 0.337332 }, { "acc": 0.92623892, "epoch": 0.5535353535353535, "grad_norm": 5.75, "learning_rate": 1.731810019382326e-05, "loss": 0.25716453, "memory(GiB)": 15.04, "step": 4110, "train_speed(iter/s)": 0.337401 }, { "acc": 0.92022734, "epoch": 0.5542087542087543, "grad_norm": 10.0, "learning_rate": 1.731050714698719e-05, "loss": 0.25363297, "memory(GiB)": 15.04, "step": 4115, "train_speed(iter/s)": 0.337441 }, { "acc": 0.8096302, "epoch": 0.5548821548821549, "grad_norm": 4.0625, "learning_rate": 1.730290503619316e-05, "loss": 0.65786781, "memory(GiB)": 15.04, "step": 4120, "train_speed(iter/s)": 0.337443 }, { "acc": 0.84931622, "epoch": 0.5555555555555556, "grad_norm": 12.0625, "learning_rate": 1.7295293870866677e-05, "loss": 0.58855181, "memory(GiB)": 15.04, "step": 4125, "train_speed(iter/s)": 0.337539 }, { "acc": 0.9130374, "epoch": 0.5562289562289562, "grad_norm": 14.4375, "learning_rate": 1.7287673660444464e-05, "loss": 0.28229191, "memory(GiB)": 15.04, "step": 4130, "train_speed(iter/s)": 0.337626 }, { "acc": 0.88506327, "epoch": 0.5569023569023569, "grad_norm": 11.1875, "learning_rate": 1.728004441437447e-05, "loss": 0.47019825, "memory(GiB)": 15.04, "step": 4135, "train_speed(iter/s)": 0.337605 }, { "acc": 0.84497614, "epoch": 0.5575757575757576, "grad_norm": 5.78125, "learning_rate": 1.7272406142115846e-05, "loss": 0.4880383, "memory(GiB)": 15.04, "step": 4140, "train_speed(iter/s)": 0.337599 }, { "acc": 0.89238911, "epoch": 0.5582491582491582, "grad_norm": 7.03125, "learning_rate": 1.7264758853138923e-05, "loss": 0.37703359, "memory(GiB)": 15.04, "step": 4145, "train_speed(iter/s)": 0.337609 }, { "acc": 0.89045725, "epoch": 0.5589225589225589, "grad_norm": 6.59375, "learning_rate": 1.7257102556925227e-05, "loss": 0.4140079, "memory(GiB)": 15.04, "step": 4150, "train_speed(iter/s)": 0.337668 }, { "acc": 0.91018286, "epoch": 0.5595959595959596, "grad_norm": 21.375, "learning_rate": 1.7249437262967436e-05, "loss": 0.40824814, "memory(GiB)": 15.04, "step": 4155, "train_speed(iter/s)": 0.337687 }, { "acc": 0.870082, "epoch": 0.5602693602693603, "grad_norm": 6.21875, "learning_rate": 1.7241762980769398e-05, "loss": 0.37111406, "memory(GiB)": 15.04, "step": 4160, "train_speed(iter/s)": 0.337636 }, { "acc": 0.86445503, "epoch": 0.560942760942761, "grad_norm": 16.625, "learning_rate": 1.7234079719846092e-05, "loss": 0.63412347, "memory(GiB)": 15.04, "step": 4165, "train_speed(iter/s)": 0.337721 }, { "acc": 0.91648846, "epoch": 0.5616161616161616, "grad_norm": 8.8125, "learning_rate": 1.722638748972364e-05, "loss": 0.32388, "memory(GiB)": 15.04, "step": 4170, "train_speed(iter/s)": 0.3378 }, { "acc": 0.89812613, "epoch": 0.5622895622895623, "grad_norm": 13.0625, "learning_rate": 1.7218686299939286e-05, "loss": 0.31773405, "memory(GiB)": 15.04, "step": 4175, "train_speed(iter/s)": 0.337759 }, { "acc": 0.90312166, "epoch": 0.562962962962963, "grad_norm": 6.125, "learning_rate": 1.721097616004137e-05, "loss": 0.44310713, "memory(GiB)": 15.04, "step": 4180, "train_speed(iter/s)": 0.337812 }, { "acc": 0.90160627, "epoch": 0.5636363636363636, "grad_norm": 4.4375, "learning_rate": 1.7203257079589334e-05, "loss": 0.44974608, "memory(GiB)": 15.04, "step": 4185, "train_speed(iter/s)": 0.337836 }, { "acc": 0.86169586, "epoch": 0.5643097643097643, "grad_norm": 10.125, "learning_rate": 1.7195529068153715e-05, "loss": 0.37513566, "memory(GiB)": 15.04, "step": 4190, "train_speed(iter/s)": 0.337902 }, { "acc": 0.88859873, "epoch": 0.564983164983165, "grad_norm": 11.75, "learning_rate": 1.718779213531611e-05, "loss": 0.4264184, "memory(GiB)": 15.04, "step": 4195, "train_speed(iter/s)": 0.337944 }, { "acc": 0.88831396, "epoch": 0.5656565656565656, "grad_norm": 8.25, "learning_rate": 1.7180046290669182e-05, "loss": 0.51441598, "memory(GiB)": 15.04, "step": 4200, "train_speed(iter/s)": 0.338041 }, { "epoch": 0.5656565656565656, "eval_acc": 0.8814506728833316, "eval_loss": 0.4589705467224121, "eval_runtime": 109.7591, "eval_samples_per_second": 1.367, "eval_steps_per_second": 1.367, "step": 4200 }, { "acc": 0.86627464, "epoch": 0.5663299663299664, "grad_norm": 8.9375, "learning_rate": 1.7172291543816647e-05, "loss": 0.43737912, "memory(GiB)": 15.04, "step": 4205, "train_speed(iter/s)": 0.335068 }, { "acc": 0.84894361, "epoch": 0.567003367003367, "grad_norm": 13.75, "learning_rate": 1.716452790437325e-05, "loss": 0.45851641, "memory(GiB)": 15.04, "step": 4210, "train_speed(iter/s)": 0.335161 }, { "acc": 0.92653151, "epoch": 0.5676767676767677, "grad_norm": 9.9375, "learning_rate": 1.7156755381964773e-05, "loss": 0.23524013, "memory(GiB)": 15.04, "step": 4215, "train_speed(iter/s)": 0.335236 }, { "acc": 0.81595592, "epoch": 0.5683501683501684, "grad_norm": 26.125, "learning_rate": 1.7148973986228e-05, "loss": 0.46322184, "memory(GiB)": 15.04, "step": 4220, "train_speed(iter/s)": 0.335287 }, { "acc": 0.90123253, "epoch": 0.569023569023569, "grad_norm": 8.1875, "learning_rate": 1.7141183726810727e-05, "loss": 0.358708, "memory(GiB)": 15.04, "step": 4225, "train_speed(iter/s)": 0.335338 }, { "acc": 0.92097464, "epoch": 0.5696969696969697, "grad_norm": 8.1875, "learning_rate": 1.713338461337173e-05, "loss": 0.29511561, "memory(GiB)": 15.04, "step": 4230, "train_speed(iter/s)": 0.335371 }, { "acc": 0.87373762, "epoch": 0.5703703703703704, "grad_norm": 8.9375, "learning_rate": 1.7125576655580765e-05, "loss": 0.23357124, "memory(GiB)": 15.04, "step": 4235, "train_speed(iter/s)": 0.33544 }, { "acc": 0.84742498, "epoch": 0.571043771043771, "grad_norm": 5.65625, "learning_rate": 1.7117759863118562e-05, "loss": 0.40855536, "memory(GiB)": 15.04, "step": 4240, "train_speed(iter/s)": 0.335503 }, { "acc": 0.89106922, "epoch": 0.5717171717171717, "grad_norm": 9.1875, "learning_rate": 1.7109934245676797e-05, "loss": 0.44124908, "memory(GiB)": 15.04, "step": 4245, "train_speed(iter/s)": 0.335585 }, { "acc": 0.85798492, "epoch": 0.5723905723905723, "grad_norm": 14.8125, "learning_rate": 1.7102099812958086e-05, "loss": 0.56557817, "memory(GiB)": 15.04, "step": 4250, "train_speed(iter/s)": 0.335641 }, { "acc": 0.85664806, "epoch": 0.573063973063973, "grad_norm": 12.5, "learning_rate": 1.7094256574675984e-05, "loss": 0.44262295, "memory(GiB)": 15.04, "step": 4255, "train_speed(iter/s)": 0.335669 }, { "acc": 0.91525526, "epoch": 0.5737373737373738, "grad_norm": 6.71875, "learning_rate": 1.7086404540554947e-05, "loss": 0.27817523, "memory(GiB)": 15.04, "step": 4260, "train_speed(iter/s)": 0.3357 }, { "acc": 0.85219793, "epoch": 0.5744107744107744, "grad_norm": 6.78125, "learning_rate": 1.7078543720330357e-05, "loss": 0.6271781, "memory(GiB)": 15.04, "step": 4265, "train_speed(iter/s)": 0.335788 }, { "acc": 0.91735287, "epoch": 0.5750841750841751, "grad_norm": 10.875, "learning_rate": 1.707067412374848e-05, "loss": 0.35268691, "memory(GiB)": 15.04, "step": 4270, "train_speed(iter/s)": 0.335868 }, { "acc": 0.90828514, "epoch": 0.5757575757575758, "grad_norm": 6.96875, "learning_rate": 1.7062795760566453e-05, "loss": 0.30698793, "memory(GiB)": 15.04, "step": 4275, "train_speed(iter/s)": 0.335937 }, { "acc": 0.87601233, "epoch": 0.5764309764309764, "grad_norm": 6.125, "learning_rate": 1.7054908640552302e-05, "loss": 0.37530935, "memory(GiB)": 15.04, "step": 4280, "train_speed(iter/s)": 0.336009 }, { "acc": 0.87714987, "epoch": 0.5771043771043771, "grad_norm": 8.75, "learning_rate": 1.7047012773484898e-05, "loss": 0.33812811, "memory(GiB)": 15.04, "step": 4285, "train_speed(iter/s)": 0.33605 }, { "acc": 0.88150272, "epoch": 0.5777777777777777, "grad_norm": 8.375, "learning_rate": 1.703910816915396e-05, "loss": 0.36193902, "memory(GiB)": 15.04, "step": 4290, "train_speed(iter/s)": 0.336074 }, { "acc": 0.89709082, "epoch": 0.5784511784511784, "grad_norm": 7.46875, "learning_rate": 1.7031194837360035e-05, "loss": 0.27638795, "memory(GiB)": 15.04, "step": 4295, "train_speed(iter/s)": 0.336157 }, { "acc": 0.90212574, "epoch": 0.5791245791245792, "grad_norm": 11.5, "learning_rate": 1.7023272787914496e-05, "loss": 0.41099176, "memory(GiB)": 15.04, "step": 4300, "train_speed(iter/s)": 0.336195 }, { "acc": 0.91579752, "epoch": 0.5797979797979798, "grad_norm": 9.25, "learning_rate": 1.701534203063953e-05, "loss": 0.30125055, "memory(GiB)": 15.04, "step": 4305, "train_speed(iter/s)": 0.336291 }, { "acc": 0.92083387, "epoch": 0.5804713804713805, "grad_norm": 6.75, "learning_rate": 1.7007402575368107e-05, "loss": 0.3330127, "memory(GiB)": 15.04, "step": 4310, "train_speed(iter/s)": 0.336333 }, { "acc": 0.91086092, "epoch": 0.5811447811447812, "grad_norm": 5.4375, "learning_rate": 1.6999454431943997e-05, "loss": 0.31445036, "memory(GiB)": 15.04, "step": 4315, "train_speed(iter/s)": 0.336361 }, { "acc": 0.8398551, "epoch": 0.5818181818181818, "grad_norm": 6.40625, "learning_rate": 1.6991497610221722e-05, "loss": 0.51611238, "memory(GiB)": 15.04, "step": 4320, "train_speed(iter/s)": 0.336398 }, { "acc": 0.85949497, "epoch": 0.5824915824915825, "grad_norm": 7.28125, "learning_rate": 1.6983532120066583e-05, "loss": 0.61256056, "memory(GiB)": 15.04, "step": 4325, "train_speed(iter/s)": 0.336384 }, { "acc": 0.80934429, "epoch": 0.5831649831649832, "grad_norm": 6.875, "learning_rate": 1.6975557971354622e-05, "loss": 0.26548476, "memory(GiB)": 15.04, "step": 4330, "train_speed(iter/s)": 0.336428 }, { "acc": 0.90867815, "epoch": 0.5838383838383838, "grad_norm": 7.8125, "learning_rate": 1.6967575173972614e-05, "loss": 0.38837395, "memory(GiB)": 15.04, "step": 4335, "train_speed(iter/s)": 0.336432 }, { "acc": 0.88366899, "epoch": 0.5845117845117845, "grad_norm": 9.25, "learning_rate": 1.6959583737818053e-05, "loss": 0.42913079, "memory(GiB)": 15.04, "step": 4340, "train_speed(iter/s)": 0.336457 }, { "acc": 0.84328384, "epoch": 0.5851851851851851, "grad_norm": 15.4375, "learning_rate": 1.6951583672799153e-05, "loss": 0.54969959, "memory(GiB)": 15.04, "step": 4345, "train_speed(iter/s)": 0.336537 }, { "acc": 0.92732067, "epoch": 0.5858585858585859, "grad_norm": 10.875, "learning_rate": 1.6943574988834828e-05, "loss": 0.25192871, "memory(GiB)": 15.04, "step": 4350, "train_speed(iter/s)": 0.336583 }, { "acc": 0.8462472, "epoch": 0.5865319865319866, "grad_norm": 5.75, "learning_rate": 1.6935557695854666e-05, "loss": 0.50387387, "memory(GiB)": 15.04, "step": 4355, "train_speed(iter/s)": 0.336632 }, { "acc": 0.8854928, "epoch": 0.5872053872053872, "grad_norm": 11.1875, "learning_rate": 1.6927531803798937e-05, "loss": 0.41608453, "memory(GiB)": 15.04, "step": 4360, "train_speed(iter/s)": 0.336677 }, { "acc": 0.90479069, "epoch": 0.5878787878787879, "grad_norm": 7.34375, "learning_rate": 1.691949732261857e-05, "loss": 0.25519066, "memory(GiB)": 15.04, "step": 4365, "train_speed(iter/s)": 0.336709 }, { "acc": 0.88697033, "epoch": 0.5885521885521886, "grad_norm": 9.1875, "learning_rate": 1.6911454262275153e-05, "loss": 0.33217278, "memory(GiB)": 15.04, "step": 4370, "train_speed(iter/s)": 0.336792 }, { "acc": 0.83444138, "epoch": 0.5892255892255892, "grad_norm": 7.9375, "learning_rate": 1.6903402632740893e-05, "loss": 0.48537288, "memory(GiB)": 15.04, "step": 4375, "train_speed(iter/s)": 0.336835 }, { "acc": 0.88983879, "epoch": 0.5898989898989899, "grad_norm": 8.3125, "learning_rate": 1.6895342443998637e-05, "loss": 0.37899985, "memory(GiB)": 15.04, "step": 4380, "train_speed(iter/s)": 0.336899 }, { "acc": 0.91641159, "epoch": 0.5905723905723905, "grad_norm": 6.84375, "learning_rate": 1.6887273706041833e-05, "loss": 0.28896179, "memory(GiB)": 15.04, "step": 4385, "train_speed(iter/s)": 0.33691 }, { "acc": 0.87947559, "epoch": 0.5912457912457912, "grad_norm": 7.03125, "learning_rate": 1.687919642887454e-05, "loss": 0.53417592, "memory(GiB)": 15.04, "step": 4390, "train_speed(iter/s)": 0.336984 }, { "acc": 0.89173651, "epoch": 0.591919191919192, "grad_norm": 12.1875, "learning_rate": 1.6871110622511394e-05, "loss": 0.40582504, "memory(GiB)": 15.04, "step": 4395, "train_speed(iter/s)": 0.337027 }, { "acc": 0.88862801, "epoch": 0.5925925925925926, "grad_norm": 16.5, "learning_rate": 1.6863016296977613e-05, "loss": 0.40161347, "memory(GiB)": 15.04, "step": 4400, "train_speed(iter/s)": 0.33707 }, { "acc": 0.89179201, "epoch": 0.5932659932659933, "grad_norm": 8.0, "learning_rate": 1.6854913462308972e-05, "loss": 0.47911062, "memory(GiB)": 15.04, "step": 4405, "train_speed(iter/s)": 0.337085 }, { "acc": 0.88523207, "epoch": 0.593939393939394, "grad_norm": 10.75, "learning_rate": 1.6846802128551803e-05, "loss": 0.39476483, "memory(GiB)": 15.04, "step": 4410, "train_speed(iter/s)": 0.337137 }, { "acc": 0.89964447, "epoch": 0.5946127946127946, "grad_norm": 5.4375, "learning_rate": 1.6838682305762972e-05, "loss": 0.35869577, "memory(GiB)": 15.04, "step": 4415, "train_speed(iter/s)": 0.337133 }, { "acc": 0.91826773, "epoch": 0.5952861952861953, "grad_norm": 12.0625, "learning_rate": 1.6830554004009863e-05, "loss": 0.34219422, "memory(GiB)": 15.04, "step": 4420, "train_speed(iter/s)": 0.337174 }, { "acc": 0.89292841, "epoch": 0.5959595959595959, "grad_norm": 5.5625, "learning_rate": 1.6822417233370387e-05, "loss": 0.36768606, "memory(GiB)": 15.04, "step": 4425, "train_speed(iter/s)": 0.33716 }, { "acc": 0.87695522, "epoch": 0.5966329966329966, "grad_norm": 7.375, "learning_rate": 1.6814272003932943e-05, "loss": 0.36812396, "memory(GiB)": 15.04, "step": 4430, "train_speed(iter/s)": 0.337195 }, { "acc": 0.89027824, "epoch": 0.5973063973063973, "grad_norm": 9.4375, "learning_rate": 1.6806118325796425e-05, "loss": 0.34605751, "memory(GiB)": 15.04, "step": 4435, "train_speed(iter/s)": 0.337258 }, { "acc": 0.92887344, "epoch": 0.597979797979798, "grad_norm": 12.0, "learning_rate": 1.67979562090702e-05, "loss": 0.27661517, "memory(GiB)": 15.04, "step": 4440, "train_speed(iter/s)": 0.337316 }, { "acc": 0.88897724, "epoch": 0.5986531986531987, "grad_norm": 15.375, "learning_rate": 1.6789785663874096e-05, "loss": 0.42862206, "memory(GiB)": 15.04, "step": 4445, "train_speed(iter/s)": 0.337418 }, { "acc": 0.91717243, "epoch": 0.5993265993265994, "grad_norm": 15.6875, "learning_rate": 1.6781606700338386e-05, "loss": 0.31442499, "memory(GiB)": 15.04, "step": 4450, "train_speed(iter/s)": 0.33748 }, { "acc": 0.87508106, "epoch": 0.6, "grad_norm": 17.25, "learning_rate": 1.6773419328603796e-05, "loss": 0.5540029, "memory(GiB)": 15.04, "step": 4455, "train_speed(iter/s)": 0.337426 }, { "acc": 0.89905529, "epoch": 0.6006734006734007, "grad_norm": 6.21875, "learning_rate": 1.6765223558821465e-05, "loss": 0.30281894, "memory(GiB)": 15.04, "step": 4460, "train_speed(iter/s)": 0.337459 }, { "acc": 0.86200438, "epoch": 0.6013468013468013, "grad_norm": 13.0, "learning_rate": 1.675701940115294e-05, "loss": 0.52247534, "memory(GiB)": 15.04, "step": 4465, "train_speed(iter/s)": 0.337513 }, { "acc": 0.89549341, "epoch": 0.602020202020202, "grad_norm": 10.125, "learning_rate": 1.6748806865770188e-05, "loss": 0.39380765, "memory(GiB)": 15.04, "step": 4470, "train_speed(iter/s)": 0.337485 }, { "acc": 0.85047359, "epoch": 0.6026936026936027, "grad_norm": 8.3125, "learning_rate": 1.674058596285554e-05, "loss": 0.58578529, "memory(GiB)": 15.04, "step": 4475, "train_speed(iter/s)": 0.337566 }, { "acc": 0.88651133, "epoch": 0.6033670033670033, "grad_norm": 5.5625, "learning_rate": 1.6732356702601716e-05, "loss": 0.48212433, "memory(GiB)": 15.04, "step": 4480, "train_speed(iter/s)": 0.337543 }, { "acc": 0.84304943, "epoch": 0.604040404040404, "grad_norm": 7.15625, "learning_rate": 1.672411909521179e-05, "loss": 0.57582579, "memory(GiB)": 15.04, "step": 4485, "train_speed(iter/s)": 0.337575 }, { "acc": 0.94032316, "epoch": 0.6047138047138048, "grad_norm": 6.90625, "learning_rate": 1.6715873150899184e-05, "loss": 0.24680698, "memory(GiB)": 15.04, "step": 4490, "train_speed(iter/s)": 0.337627 }, { "acc": 0.92611914, "epoch": 0.6053872053872054, "grad_norm": 7.21875, "learning_rate": 1.6707618879887673e-05, "loss": 0.26302812, "memory(GiB)": 15.04, "step": 4495, "train_speed(iter/s)": 0.337637 }, { "acc": 0.88908119, "epoch": 0.6060606060606061, "grad_norm": 8.625, "learning_rate": 1.6699356292411336e-05, "loss": 0.33910556, "memory(GiB)": 15.04, "step": 4500, "train_speed(iter/s)": 0.337705 }, { "epoch": 0.6060606060606061, "eval_acc": 0.8837320067739204, "eval_loss": 0.44876089692115784, "eval_runtime": 109.7203, "eval_samples_per_second": 1.367, "eval_steps_per_second": 1.367, "step": 4500 }, { "acc": 0.90021839, "epoch": 0.6067340067340067, "grad_norm": 6.375, "learning_rate": 1.669108539871457e-05, "loss": 0.28921244, "memory(GiB)": 15.04, "step": 4505, "train_speed(iter/s)": 0.334934 }, { "acc": 0.86491404, "epoch": 0.6074074074074074, "grad_norm": 7.375, "learning_rate": 1.6682806209052077e-05, "loss": 0.32596858, "memory(GiB)": 15.04, "step": 4510, "train_speed(iter/s)": 0.334937 }, { "acc": 0.90164213, "epoch": 0.6080808080808081, "grad_norm": 4.9375, "learning_rate": 1.6674518733688833e-05, "loss": 0.4585031, "memory(GiB)": 15.04, "step": 4515, "train_speed(iter/s)": 0.334899 }, { "acc": 0.91891603, "epoch": 0.6087542087542087, "grad_norm": 8.375, "learning_rate": 1.6666222982900098e-05, "loss": 0.27369349, "memory(GiB)": 15.04, "step": 4520, "train_speed(iter/s)": 0.334966 }, { "acc": 0.83691006, "epoch": 0.6094276094276094, "grad_norm": 6.09375, "learning_rate": 1.665791896697139e-05, "loss": 0.54756508, "memory(GiB)": 15.04, "step": 4525, "train_speed(iter/s)": 0.334992 }, { "acc": 0.88414259, "epoch": 0.6101010101010101, "grad_norm": 5.625, "learning_rate": 1.6649606696198467e-05, "loss": 0.54539962, "memory(GiB)": 15.04, "step": 4530, "train_speed(iter/s)": 0.335015 }, { "acc": 0.89783468, "epoch": 0.6107744107744107, "grad_norm": 17.5, "learning_rate": 1.664128618088733e-05, "loss": 0.43428464, "memory(GiB)": 15.04, "step": 4535, "train_speed(iter/s)": 0.335089 }, { "acc": 0.88710184, "epoch": 0.6114478114478115, "grad_norm": 19.375, "learning_rate": 1.6632957431354192e-05, "loss": 0.4390841, "memory(GiB)": 15.04, "step": 4540, "train_speed(iter/s)": 0.335119 }, { "acc": 0.92598915, "epoch": 0.6121212121212121, "grad_norm": 8.875, "learning_rate": 1.6624620457925494e-05, "loss": 0.25133462, "memory(GiB)": 15.04, "step": 4545, "train_speed(iter/s)": 0.33516 }, { "acc": 0.85548468, "epoch": 0.6127946127946128, "grad_norm": 6.03125, "learning_rate": 1.6616275270937858e-05, "loss": 0.60639415, "memory(GiB)": 15.04, "step": 4550, "train_speed(iter/s)": 0.335206 }, { "acc": 0.89241152, "epoch": 0.6134680134680135, "grad_norm": 5.8125, "learning_rate": 1.660792188073809e-05, "loss": 0.39169483, "memory(GiB)": 15.04, "step": 4555, "train_speed(iter/s)": 0.33528 }, { "acc": 0.91160355, "epoch": 0.6141414141414141, "grad_norm": 6.40625, "learning_rate": 1.659956029768317e-05, "loss": 0.3247937, "memory(GiB)": 15.04, "step": 4560, "train_speed(iter/s)": 0.335304 }, { "acc": 0.91380854, "epoch": 0.6148148148148148, "grad_norm": 9.75, "learning_rate": 1.659119053214024e-05, "loss": 0.32185378, "memory(GiB)": 15.04, "step": 4565, "train_speed(iter/s)": 0.33535 }, { "acc": 0.89268837, "epoch": 0.6154882154882155, "grad_norm": 8.4375, "learning_rate": 1.658281259448658e-05, "loss": 0.3313453, "memory(GiB)": 15.04, "step": 4570, "train_speed(iter/s)": 0.33537 }, { "acc": 0.84350071, "epoch": 0.6161616161616161, "grad_norm": 13.875, "learning_rate": 1.657442649510961e-05, "loss": 0.41612873, "memory(GiB)": 15.04, "step": 4575, "train_speed(iter/s)": 0.335462 }, { "acc": 0.90122662, "epoch": 0.6168350168350168, "grad_norm": 15.9375, "learning_rate": 1.656603224440686e-05, "loss": 0.30698125, "memory(GiB)": 15.04, "step": 4580, "train_speed(iter/s)": 0.335542 }, { "acc": 0.89745512, "epoch": 0.6175084175084176, "grad_norm": 18.625, "learning_rate": 1.655762985278597e-05, "loss": 0.37742386, "memory(GiB)": 15.04, "step": 4585, "train_speed(iter/s)": 0.335545 }, { "acc": 0.8741374, "epoch": 0.6181818181818182, "grad_norm": 8.0, "learning_rate": 1.6549219330664677e-05, "loss": 0.55189834, "memory(GiB)": 15.04, "step": 4590, "train_speed(iter/s)": 0.335564 }, { "acc": 0.86944952, "epoch": 0.6188552188552189, "grad_norm": 5.0, "learning_rate": 1.6540800688470798e-05, "loss": 0.54992094, "memory(GiB)": 15.04, "step": 4595, "train_speed(iter/s)": 0.33558 }, { "acc": 0.85860176, "epoch": 0.6195286195286195, "grad_norm": 10.6875, "learning_rate": 1.6532373936642217e-05, "loss": 0.42584996, "memory(GiB)": 15.04, "step": 4600, "train_speed(iter/s)": 0.335598 }, { "acc": 0.85607958, "epoch": 0.6202020202020202, "grad_norm": 9.5, "learning_rate": 1.652393908562687e-05, "loss": 0.66726885, "memory(GiB)": 15.04, "step": 4605, "train_speed(iter/s)": 0.335608 }, { "acc": 0.83417387, "epoch": 0.6208754208754209, "grad_norm": 12.0625, "learning_rate": 1.6515496145882733e-05, "loss": 0.44307995, "memory(GiB)": 15.04, "step": 4610, "train_speed(iter/s)": 0.335684 }, { "acc": 0.91452293, "epoch": 0.6215488215488215, "grad_norm": 10.125, "learning_rate": 1.6507045127877817e-05, "loss": 0.35714803, "memory(GiB)": 15.04, "step": 4615, "train_speed(iter/s)": 0.335741 }, { "acc": 0.91882572, "epoch": 0.6222222222222222, "grad_norm": 10.0625, "learning_rate": 1.649858604209015e-05, "loss": 0.3434411, "memory(GiB)": 15.04, "step": 4620, "train_speed(iter/s)": 0.335791 }, { "acc": 0.92277594, "epoch": 0.622895622895623, "grad_norm": 6.875, "learning_rate": 1.6490118899007755e-05, "loss": 0.23106558, "memory(GiB)": 15.04, "step": 4625, "train_speed(iter/s)": 0.335855 }, { "acc": 0.84693232, "epoch": 0.6235690235690236, "grad_norm": 11.25, "learning_rate": 1.6481643709128654e-05, "loss": 0.65914745, "memory(GiB)": 15.04, "step": 4630, "train_speed(iter/s)": 0.33585 }, { "acc": 0.89393358, "epoch": 0.6242424242424243, "grad_norm": 20.0, "learning_rate": 1.6473160482960837e-05, "loss": 0.43507428, "memory(GiB)": 15.04, "step": 4635, "train_speed(iter/s)": 0.335843 }, { "acc": 0.90842285, "epoch": 0.6249158249158249, "grad_norm": 5.90625, "learning_rate": 1.6464669231022257e-05, "loss": 0.38126285, "memory(GiB)": 15.04, "step": 4640, "train_speed(iter/s)": 0.335899 }, { "acc": 0.85284615, "epoch": 0.6255892255892256, "grad_norm": 6.625, "learning_rate": 1.6456169963840832e-05, "loss": 0.50575981, "memory(GiB)": 15.04, "step": 4645, "train_speed(iter/s)": 0.335962 }, { "acc": 0.91851702, "epoch": 0.6262626262626263, "grad_norm": 5.53125, "learning_rate": 1.6447662691954402e-05, "loss": 0.27984991, "memory(GiB)": 15.04, "step": 4650, "train_speed(iter/s)": 0.335947 }, { "acc": 0.89429655, "epoch": 0.6269360269360269, "grad_norm": 9.9375, "learning_rate": 1.6439147425910743e-05, "loss": 0.29749134, "memory(GiB)": 15.04, "step": 4655, "train_speed(iter/s)": 0.336041 }, { "acc": 0.83321552, "epoch": 0.6276094276094276, "grad_norm": 7.0625, "learning_rate": 1.643062417626753e-05, "loss": 0.48077607, "memory(GiB)": 15.04, "step": 4660, "train_speed(iter/s)": 0.336087 }, { "acc": 0.88771658, "epoch": 0.6282828282828283, "grad_norm": 5.875, "learning_rate": 1.6422092953592353e-05, "loss": 0.43149128, "memory(GiB)": 15.04, "step": 4665, "train_speed(iter/s)": 0.336139 }, { "acc": 0.89234009, "epoch": 0.6289562289562289, "grad_norm": 6.625, "learning_rate": 1.6413553768462672e-05, "loss": 0.33808124, "memory(GiB)": 15.04, "step": 4670, "train_speed(iter/s)": 0.336178 }, { "acc": 0.86847658, "epoch": 0.6296296296296297, "grad_norm": 7.8125, "learning_rate": 1.6405006631465826e-05, "loss": 0.50474515, "memory(GiB)": 15.04, "step": 4675, "train_speed(iter/s)": 0.336129 }, { "acc": 0.88128033, "epoch": 0.6303030303030303, "grad_norm": 7.0625, "learning_rate": 1.6396451553199014e-05, "loss": 0.29793146, "memory(GiB)": 15.04, "step": 4680, "train_speed(iter/s)": 0.336178 }, { "acc": 0.83775768, "epoch": 0.630976430976431, "grad_norm": 12.125, "learning_rate": 1.638788854426928e-05, "loss": 0.65453782, "memory(GiB)": 15.04, "step": 4685, "train_speed(iter/s)": 0.33623 }, { "acc": 0.88658571, "epoch": 0.6316498316498317, "grad_norm": 6.53125, "learning_rate": 1.6379317615293505e-05, "loss": 0.46322603, "memory(GiB)": 15.04, "step": 4690, "train_speed(iter/s)": 0.336229 }, { "acc": 0.88868914, "epoch": 0.6323232323232323, "grad_norm": 8.25, "learning_rate": 1.6370738776898378e-05, "loss": 0.36851487, "memory(GiB)": 15.04, "step": 4695, "train_speed(iter/s)": 0.336241 }, { "acc": 0.93904266, "epoch": 0.632996632996633, "grad_norm": 5.96875, "learning_rate": 1.6362152039720407e-05, "loss": 0.26145096, "memory(GiB)": 15.04, "step": 4700, "train_speed(iter/s)": 0.336295 }, { "acc": 0.88688774, "epoch": 0.6336700336700337, "grad_norm": 11.25, "learning_rate": 1.6353557414405883e-05, "loss": 0.32266824, "memory(GiB)": 15.04, "step": 4705, "train_speed(iter/s)": 0.336379 }, { "acc": 0.79992952, "epoch": 0.6343434343434343, "grad_norm": 21.25, "learning_rate": 1.634495491161089e-05, "loss": 0.50860252, "memory(GiB)": 15.04, "step": 4710, "train_speed(iter/s)": 0.336427 }, { "acc": 0.76752977, "epoch": 0.635016835016835, "grad_norm": 18.125, "learning_rate": 1.6336344542001264e-05, "loss": 0.54262133, "memory(GiB)": 15.04, "step": 4715, "train_speed(iter/s)": 0.336448 }, { "acc": 0.85343208, "epoch": 0.6356902356902356, "grad_norm": 6.3125, "learning_rate": 1.632772631625261e-05, "loss": 0.57416587, "memory(GiB)": 15.04, "step": 4720, "train_speed(iter/s)": 0.336476 }, { "acc": 0.92890425, "epoch": 0.6363636363636364, "grad_norm": 9.0625, "learning_rate": 1.631910024505025e-05, "loss": 0.29194543, "memory(GiB)": 15.04, "step": 4725, "train_speed(iter/s)": 0.336545 }, { "acc": 0.94128084, "epoch": 0.6370370370370371, "grad_norm": 14.4375, "learning_rate": 1.631046633908927e-05, "loss": 0.23905036, "memory(GiB)": 15.04, "step": 4730, "train_speed(iter/s)": 0.336607 }, { "acc": 0.87926884, "epoch": 0.6377104377104377, "grad_norm": 8.0625, "learning_rate": 1.6301824609074432e-05, "loss": 0.348334, "memory(GiB)": 15.04, "step": 4735, "train_speed(iter/s)": 0.336634 }, { "acc": 0.9110774, "epoch": 0.6383838383838384, "grad_norm": 8.3125, "learning_rate": 1.6293175065720223e-05, "loss": 0.22420795, "memory(GiB)": 15.04, "step": 4740, "train_speed(iter/s)": 0.33669 }, { "acc": 0.86740751, "epoch": 0.6390572390572391, "grad_norm": 14.25, "learning_rate": 1.628451771975081e-05, "loss": 0.43412094, "memory(GiB)": 15.04, "step": 4745, "train_speed(iter/s)": 0.336757 }, { "acc": 0.88887815, "epoch": 0.6397306397306397, "grad_norm": 6.65625, "learning_rate": 1.627585258190003e-05, "loss": 0.48215976, "memory(GiB)": 15.04, "step": 4750, "train_speed(iter/s)": 0.336804 }, { "acc": 0.85770273, "epoch": 0.6404040404040404, "grad_norm": 8.8125, "learning_rate": 1.6267179662911385e-05, "loss": 0.42769055, "memory(GiB)": 15.04, "step": 4755, "train_speed(iter/s)": 0.336766 }, { "acc": 0.90283947, "epoch": 0.641077441077441, "grad_norm": 7.625, "learning_rate": 1.6258498973538028e-05, "loss": 0.24866667, "memory(GiB)": 15.04, "step": 4760, "train_speed(iter/s)": 0.336822 }, { "acc": 0.89991875, "epoch": 0.6417508417508417, "grad_norm": 12.625, "learning_rate": 1.6249810524542736e-05, "loss": 0.35716062, "memory(GiB)": 15.04, "step": 4765, "train_speed(iter/s)": 0.33688 }, { "acc": 0.89296923, "epoch": 0.6424242424242425, "grad_norm": 10.25, "learning_rate": 1.624111432669792e-05, "loss": 0.35033152, "memory(GiB)": 15.04, "step": 4770, "train_speed(iter/s)": 0.336927 }, { "acc": 0.82698908, "epoch": 0.6430976430976431, "grad_norm": 6.0, "learning_rate": 1.6232410390785584e-05, "loss": 0.43431349, "memory(GiB)": 15.04, "step": 4775, "train_speed(iter/s)": 0.336905 }, { "acc": 0.89962473, "epoch": 0.6437710437710438, "grad_norm": 9.1875, "learning_rate": 1.6223698727597337e-05, "loss": 0.26351688, "memory(GiB)": 15.04, "step": 4780, "train_speed(iter/s)": 0.33694 }, { "acc": 0.94725761, "epoch": 0.6444444444444445, "grad_norm": 6.21875, "learning_rate": 1.621497934793437e-05, "loss": 0.21798928, "memory(GiB)": 15.04, "step": 4785, "train_speed(iter/s)": 0.336996 }, { "acc": 0.88771954, "epoch": 0.6451178451178451, "grad_norm": 5.96875, "learning_rate": 1.620625226260743e-05, "loss": 0.62674317, "memory(GiB)": 15.04, "step": 4790, "train_speed(iter/s)": 0.337041 }, { "acc": 0.86005669, "epoch": 0.6457912457912458, "grad_norm": 6.9375, "learning_rate": 1.619751748243683e-05, "loss": 0.55922494, "memory(GiB)": 15.04, "step": 4795, "train_speed(iter/s)": 0.337096 }, { "acc": 0.8449975, "epoch": 0.6464646464646465, "grad_norm": 9.4375, "learning_rate": 1.618877501825241e-05, "loss": 0.45132298, "memory(GiB)": 15.04, "step": 4800, "train_speed(iter/s)": 0.337157 }, { "epoch": 0.6464646464646465, "eval_acc": 0.8843698897504879, "eval_loss": 0.44451630115509033, "eval_runtime": 109.8236, "eval_samples_per_second": 1.366, "eval_steps_per_second": 1.366, "step": 4800 }, { "acc": 0.89971037, "epoch": 0.6471380471380471, "grad_norm": 6.96875, "learning_rate": 1.618002488089355e-05, "loss": 0.37263277, "memory(GiB)": 15.04, "step": 4805, "train_speed(iter/s)": 0.33463 }, { "acc": 0.89048262, "epoch": 0.6478114478114478, "grad_norm": 6.3125, "learning_rate": 1.617126708120914e-05, "loss": 0.43812232, "memory(GiB)": 15.04, "step": 4810, "train_speed(iter/s)": 0.334681 }, { "acc": 0.90058737, "epoch": 0.6484848484848484, "grad_norm": 8.3125, "learning_rate": 1.6162501630057566e-05, "loss": 0.31114213, "memory(GiB)": 15.04, "step": 4815, "train_speed(iter/s)": 0.334756 }, { "acc": 0.90134754, "epoch": 0.6491582491582492, "grad_norm": 14.3125, "learning_rate": 1.6153728538306705e-05, "loss": 0.25759342, "memory(GiB)": 15.04, "step": 4820, "train_speed(iter/s)": 0.33483 }, { "acc": 0.87186127, "epoch": 0.6498316498316499, "grad_norm": 10.0, "learning_rate": 1.6144947816833902e-05, "loss": 0.37279108, "memory(GiB)": 15.04, "step": 4825, "train_speed(iter/s)": 0.334884 }, { "acc": 0.84917154, "epoch": 0.6505050505050505, "grad_norm": 10.8125, "learning_rate": 1.6136159476525968e-05, "loss": 0.52433534, "memory(GiB)": 15.04, "step": 4830, "train_speed(iter/s)": 0.334875 }, { "acc": 0.89658546, "epoch": 0.6511784511784512, "grad_norm": 8.3125, "learning_rate": 1.6127363528279158e-05, "loss": 0.349422, "memory(GiB)": 15.04, "step": 4835, "train_speed(iter/s)": 0.334896 }, { "acc": 0.87986097, "epoch": 0.6518518518518519, "grad_norm": 20.125, "learning_rate": 1.611855998299916e-05, "loss": 0.42859497, "memory(GiB)": 15.04, "step": 4840, "train_speed(iter/s)": 0.334958 }, { "acc": 0.83318729, "epoch": 0.6525252525252525, "grad_norm": 11.125, "learning_rate": 1.6109748851601078e-05, "loss": 0.37896371, "memory(GiB)": 15.04, "step": 4845, "train_speed(iter/s)": 0.335057 }, { "acc": 0.87710838, "epoch": 0.6531986531986532, "grad_norm": 7.21875, "learning_rate": 1.6100930145009427e-05, "loss": 0.44344339, "memory(GiB)": 15.04, "step": 4850, "train_speed(iter/s)": 0.335064 }, { "acc": 0.84607038, "epoch": 0.6538720538720538, "grad_norm": 9.25, "learning_rate": 1.6092103874158113e-05, "loss": 0.72218566, "memory(GiB)": 15.04, "step": 4855, "train_speed(iter/s)": 0.335096 }, { "acc": 0.75290537, "epoch": 0.6545454545454545, "grad_norm": 12.4375, "learning_rate": 1.608327004999041e-05, "loss": 1.13738747, "memory(GiB)": 15.04, "step": 4860, "train_speed(iter/s)": 0.335145 }, { "acc": 0.82909489, "epoch": 0.6552188552188553, "grad_norm": 9.25, "learning_rate": 1.6074428683458972e-05, "loss": 0.49320173, "memory(GiB)": 15.04, "step": 4865, "train_speed(iter/s)": 0.335121 }, { "acc": 0.86532259, "epoch": 0.6558922558922559, "grad_norm": 6.25, "learning_rate": 1.60655797855258e-05, "loss": 0.34194508, "memory(GiB)": 15.04, "step": 4870, "train_speed(iter/s)": 0.335165 }, { "acc": 0.83790216, "epoch": 0.6565656565656566, "grad_norm": 14.1875, "learning_rate": 1.605672336716223e-05, "loss": 0.71522827, "memory(GiB)": 15.04, "step": 4875, "train_speed(iter/s)": 0.335208 }, { "acc": 0.91066542, "epoch": 0.6572390572390573, "grad_norm": 6.96875, "learning_rate": 1.6047859439348923e-05, "loss": 0.28191049, "memory(GiB)": 15.04, "step": 4880, "train_speed(iter/s)": 0.335235 }, { "acc": 0.90884056, "epoch": 0.6579124579124579, "grad_norm": 13.5, "learning_rate": 1.6038988013075848e-05, "loss": 0.29040275, "memory(GiB)": 15.04, "step": 4885, "train_speed(iter/s)": 0.335306 }, { "acc": 0.83176765, "epoch": 0.6585858585858586, "grad_norm": 5.3125, "learning_rate": 1.603010909934228e-05, "loss": 0.79500623, "memory(GiB)": 15.04, "step": 4890, "train_speed(iter/s)": 0.335385 }, { "acc": 0.86799116, "epoch": 0.6592592592592592, "grad_norm": 5.78125, "learning_rate": 1.6021222709156768e-05, "loss": 0.43854384, "memory(GiB)": 15.04, "step": 4895, "train_speed(iter/s)": 0.335437 }, { "acc": 0.82422266, "epoch": 0.6599326599326599, "grad_norm": 10.5, "learning_rate": 1.6012328853537133e-05, "loss": 0.73316464, "memory(GiB)": 15.04, "step": 4900, "train_speed(iter/s)": 0.335516 }, { "acc": 0.81474895, "epoch": 0.6606060606060606, "grad_norm": 12.25, "learning_rate": 1.600342754351045e-05, "loss": 0.39320011, "memory(GiB)": 15.04, "step": 4905, "train_speed(iter/s)": 0.335504 }, { "acc": 0.89751692, "epoch": 0.6612794612794612, "grad_norm": 8.0625, "learning_rate": 1.5994518790113048e-05, "loss": 0.37473979, "memory(GiB)": 15.04, "step": 4910, "train_speed(iter/s)": 0.335571 }, { "acc": 0.85417538, "epoch": 0.661952861952862, "grad_norm": 5.53125, "learning_rate": 1.5985602604390473e-05, "loss": 0.32387593, "memory(GiB)": 15.04, "step": 4915, "train_speed(iter/s)": 0.335632 }, { "acc": 0.83993015, "epoch": 0.6626262626262627, "grad_norm": 12.0625, "learning_rate": 1.597667899739749e-05, "loss": 0.68531394, "memory(GiB)": 15.04, "step": 4920, "train_speed(iter/s)": 0.335674 }, { "acc": 0.87181358, "epoch": 0.6632996632996633, "grad_norm": 11.0, "learning_rate": 1.5967747980198058e-05, "loss": 0.43111091, "memory(GiB)": 15.04, "step": 4925, "train_speed(iter/s)": 0.33574 }, { "acc": 0.90364523, "epoch": 0.663973063973064, "grad_norm": 7.0625, "learning_rate": 1.595880956386534e-05, "loss": 0.39029117, "memory(GiB)": 15.04, "step": 4930, "train_speed(iter/s)": 0.33578 }, { "acc": 0.93159332, "epoch": 0.6646464646464646, "grad_norm": 6.6875, "learning_rate": 1.5949863759481653e-05, "loss": 0.28740838, "memory(GiB)": 15.04, "step": 4935, "train_speed(iter/s)": 0.335823 }, { "acc": 0.81072521, "epoch": 0.6653198653198653, "grad_norm": 5.90625, "learning_rate": 1.594091057813849e-05, "loss": 0.46846576, "memory(GiB)": 15.04, "step": 4940, "train_speed(iter/s)": 0.335769 }, { "acc": 0.90675364, "epoch": 0.665993265993266, "grad_norm": 10.875, "learning_rate": 1.593195003093648e-05, "loss": 0.31139865, "memory(GiB)": 15.04, "step": 4945, "train_speed(iter/s)": 0.335806 }, { "acc": 0.92784157, "epoch": 0.6666666666666666, "grad_norm": 13.75, "learning_rate": 1.59229821289854e-05, "loss": 0.27862949, "memory(GiB)": 15.04, "step": 4950, "train_speed(iter/s)": 0.335856 }, { "acc": 0.91999216, "epoch": 0.6673400673400673, "grad_norm": 5.96875, "learning_rate": 1.5914006883404115e-05, "loss": 0.28840609, "memory(GiB)": 15.04, "step": 4955, "train_speed(iter/s)": 0.335821 }, { "acc": 0.89679012, "epoch": 0.6680134680134681, "grad_norm": 6.71875, "learning_rate": 1.5905024305320632e-05, "loss": 0.47731857, "memory(GiB)": 15.04, "step": 4960, "train_speed(iter/s)": 0.335851 }, { "acc": 0.85383654, "epoch": 0.6686868686868687, "grad_norm": 5.375, "learning_rate": 1.589603440587203e-05, "loss": 0.33904052, "memory(GiB)": 15.04, "step": 4965, "train_speed(iter/s)": 0.335858 }, { "acc": 0.9142395, "epoch": 0.6693602693602694, "grad_norm": 12.25, "learning_rate": 1.588703719620446e-05, "loss": 0.30967436, "memory(GiB)": 15.04, "step": 4970, "train_speed(iter/s)": 0.335931 }, { "acc": 0.85769234, "epoch": 0.67003367003367, "grad_norm": 11.4375, "learning_rate": 1.5878032687473147e-05, "loss": 0.4702589, "memory(GiB)": 15.04, "step": 4975, "train_speed(iter/s)": 0.336 }, { "acc": 0.84484854, "epoch": 0.6707070707070707, "grad_norm": 11.25, "learning_rate": 1.5869020890842367e-05, "loss": 0.26913323, "memory(GiB)": 15.04, "step": 4980, "train_speed(iter/s)": 0.336065 }, { "acc": 0.8702199, "epoch": 0.6713804713804714, "grad_norm": 12.375, "learning_rate": 1.586000181748542e-05, "loss": 0.45575948, "memory(GiB)": 15.04, "step": 4985, "train_speed(iter/s)": 0.336122 }, { "acc": 0.85966482, "epoch": 0.672053872053872, "grad_norm": 7.21875, "learning_rate": 1.5850975478584643e-05, "loss": 0.57842598, "memory(GiB)": 15.04, "step": 4990, "train_speed(iter/s)": 0.336161 }, { "acc": 0.87984657, "epoch": 0.6727272727272727, "grad_norm": 8.0625, "learning_rate": 1.584194188533137e-05, "loss": 0.45647306, "memory(GiB)": 15.04, "step": 4995, "train_speed(iter/s)": 0.336208 }, { "acc": 0.92159386, "epoch": 0.6734006734006734, "grad_norm": 8.4375, "learning_rate": 1.5832901048925932e-05, "loss": 0.25176334, "memory(GiB)": 15.04, "step": 5000, "train_speed(iter/s)": 0.336284 }, { "acc": 0.82076206, "epoch": 0.674074074074074, "grad_norm": 14.125, "learning_rate": 1.5823852980577647e-05, "loss": 0.49896278, "memory(GiB)": 15.04, "step": 5005, "train_speed(iter/s)": 0.336272 }, { "acc": 0.84987173, "epoch": 0.6747474747474748, "grad_norm": 23.625, "learning_rate": 1.5814797691504788e-05, "loss": 0.530128, "memory(GiB)": 15.04, "step": 5010, "train_speed(iter/s)": 0.336299 }, { "acc": 0.91046906, "epoch": 0.6754208754208754, "grad_norm": 8.0, "learning_rate": 1.5805735192934596e-05, "loss": 0.34117546, "memory(GiB)": 15.04, "step": 5015, "train_speed(iter/s)": 0.336355 }, { "acc": 0.84760551, "epoch": 0.6760942760942761, "grad_norm": 13.5625, "learning_rate": 1.579666549610323e-05, "loss": 0.57944155, "memory(GiB)": 15.04, "step": 5020, "train_speed(iter/s)": 0.336431 }, { "acc": 0.91725817, "epoch": 0.6767676767676768, "grad_norm": 9.3125, "learning_rate": 1.5787588612255796e-05, "loss": 0.35195091, "memory(GiB)": 15.04, "step": 5025, "train_speed(iter/s)": 0.336493 }, { "acc": 0.93613453, "epoch": 0.6774410774410774, "grad_norm": 9.4375, "learning_rate": 1.5778504552646293e-05, "loss": 0.26168051, "memory(GiB)": 15.04, "step": 5030, "train_speed(iter/s)": 0.336542 }, { "acc": 0.88962126, "epoch": 0.6781144781144781, "grad_norm": 9.0, "learning_rate": 1.5769413328537626e-05, "loss": 0.22385941, "memory(GiB)": 15.04, "step": 5035, "train_speed(iter/s)": 0.336629 }, { "acc": 0.83950338, "epoch": 0.6787878787878788, "grad_norm": 10.5625, "learning_rate": 1.5760314951201585e-05, "loss": 0.63088756, "memory(GiB)": 15.04, "step": 5040, "train_speed(iter/s)": 0.336704 }, { "acc": 0.88710566, "epoch": 0.6794612794612794, "grad_norm": 10.75, "learning_rate": 1.575120943191882e-05, "loss": 0.35499609, "memory(GiB)": 15.04, "step": 5045, "train_speed(iter/s)": 0.33669 }, { "acc": 0.94044094, "epoch": 0.6801346801346801, "grad_norm": 10.8125, "learning_rate": 1.5742096781978847e-05, "loss": 0.27930114, "memory(GiB)": 15.04, "step": 5050, "train_speed(iter/s)": 0.336745 }, { "acc": 0.82651567, "epoch": 0.6808080808080809, "grad_norm": 9.375, "learning_rate": 1.573297701268001e-05, "loss": 0.37496407, "memory(GiB)": 15.04, "step": 5055, "train_speed(iter/s)": 0.336772 }, { "acc": 0.84456654, "epoch": 0.6814814814814815, "grad_norm": 3.984375, "learning_rate": 1.572385013532949e-05, "loss": 0.4290081, "memory(GiB)": 15.04, "step": 5060, "train_speed(iter/s)": 0.336806 }, { "acc": 0.83932276, "epoch": 0.6821548821548822, "grad_norm": 21.625, "learning_rate": 1.571471616124328e-05, "loss": 0.5563108, "memory(GiB)": 15.04, "step": 5065, "train_speed(iter/s)": 0.336824 }, { "acc": 0.76784258, "epoch": 0.6828282828282828, "grad_norm": 13.625, "learning_rate": 1.5705575101746166e-05, "loss": 0.78404527, "memory(GiB)": 15.04, "step": 5070, "train_speed(iter/s)": 0.336833 }, { "acc": 0.8337821, "epoch": 0.6835016835016835, "grad_norm": 9.375, "learning_rate": 1.569642696817173e-05, "loss": 0.59160681, "memory(GiB)": 15.04, "step": 5075, "train_speed(iter/s)": 0.336854 }, { "acc": 0.85700722, "epoch": 0.6841750841750842, "grad_norm": 8.0, "learning_rate": 1.5687271771862302e-05, "loss": 0.50799479, "memory(GiB)": 15.04, "step": 5080, "train_speed(iter/s)": 0.336908 }, { "acc": 0.89061489, "epoch": 0.6848484848484848, "grad_norm": 9.25, "learning_rate": 1.5678109524169002e-05, "loss": 0.36893265, "memory(GiB)": 15.04, "step": 5085, "train_speed(iter/s)": 0.336942 }, { "acc": 0.84866123, "epoch": 0.6855218855218855, "grad_norm": 9.5, "learning_rate": 1.5668940236451667e-05, "loss": 0.78797088, "memory(GiB)": 15.04, "step": 5090, "train_speed(iter/s)": 0.337012 }, { "acc": 0.90500269, "epoch": 0.6861952861952862, "grad_norm": 26.125, "learning_rate": 1.565976392007887e-05, "loss": 0.33935552, "memory(GiB)": 15.04, "step": 5095, "train_speed(iter/s)": 0.33708 }, { "acc": 0.88241081, "epoch": 0.6868686868686869, "grad_norm": 5.6875, "learning_rate": 1.5650580586427903e-05, "loss": 0.51824064, "memory(GiB)": 15.04, "step": 5100, "train_speed(iter/s)": 0.337077 }, { "epoch": 0.6868686868686869, "eval_acc": 0.8866254713028907, "eval_loss": 0.4350052773952484, "eval_runtime": 109.8937, "eval_samples_per_second": 1.365, "eval_steps_per_second": 1.365, "step": 5100 }, { "acc": 0.84023781, "epoch": 0.6875420875420876, "grad_norm": 10.6875, "learning_rate": 1.564139024688475e-05, "loss": 0.63163309, "memory(GiB)": 15.04, "step": 5105, "train_speed(iter/s)": 0.334689 }, { "acc": 0.83299837, "epoch": 0.6882154882154882, "grad_norm": 13.0625, "learning_rate": 1.5632192912844084e-05, "loss": 0.861168, "memory(GiB)": 15.04, "step": 5110, "train_speed(iter/s)": 0.334736 }, { "acc": 0.93508368, "epoch": 0.6888888888888889, "grad_norm": 4.59375, "learning_rate": 1.562298859570926e-05, "loss": 0.26280169, "memory(GiB)": 15.04, "step": 5115, "train_speed(iter/s)": 0.334729 }, { "acc": 0.91097832, "epoch": 0.6895622895622896, "grad_norm": 6.25, "learning_rate": 1.5613777306892278e-05, "loss": 0.32177091, "memory(GiB)": 15.04, "step": 5120, "train_speed(iter/s)": 0.334798 }, { "acc": 0.83395815, "epoch": 0.6902356902356902, "grad_norm": 7.1875, "learning_rate": 1.560455905781378e-05, "loss": 0.52251482, "memory(GiB)": 15.04, "step": 5125, "train_speed(iter/s)": 0.334854 }, { "acc": 0.87596321, "epoch": 0.6909090909090909, "grad_norm": 7.4375, "learning_rate": 1.559533385990306e-05, "loss": 0.53407445, "memory(GiB)": 15.04, "step": 5130, "train_speed(iter/s)": 0.334883 }, { "acc": 0.86092014, "epoch": 0.6915824915824916, "grad_norm": 7.3125, "learning_rate": 1.5586101724598003e-05, "loss": 0.63142657, "memory(GiB)": 15.04, "step": 5135, "train_speed(iter/s)": 0.334941 }, { "acc": 0.90926075, "epoch": 0.6922558922558922, "grad_norm": 9.0625, "learning_rate": 1.5576862663345104e-05, "loss": 0.33125496, "memory(GiB)": 15.04, "step": 5140, "train_speed(iter/s)": 0.335006 }, { "acc": 0.93036556, "epoch": 0.692929292929293, "grad_norm": 6.78125, "learning_rate": 1.5567616687599446e-05, "loss": 0.24624259, "memory(GiB)": 15.04, "step": 5145, "train_speed(iter/s)": 0.335039 }, { "acc": 0.87925148, "epoch": 0.6936026936026936, "grad_norm": 5.25, "learning_rate": 1.5558363808824682e-05, "loss": 0.39352739, "memory(GiB)": 15.04, "step": 5150, "train_speed(iter/s)": 0.335049 }, { "acc": 0.81072083, "epoch": 0.6942760942760943, "grad_norm": 9.125, "learning_rate": 1.5549104038493034e-05, "loss": 0.67198267, "memory(GiB)": 15.04, "step": 5155, "train_speed(iter/s)": 0.335072 }, { "acc": 0.82437325, "epoch": 0.694949494949495, "grad_norm": 5.78125, "learning_rate": 1.5539837388085253e-05, "loss": 0.66641488, "memory(GiB)": 15.04, "step": 5160, "train_speed(iter/s)": 0.335105 }, { "acc": 0.80968819, "epoch": 0.6956228956228956, "grad_norm": 25.5, "learning_rate": 1.5530563869090633e-05, "loss": 0.79818001, "memory(GiB)": 15.04, "step": 5165, "train_speed(iter/s)": 0.335141 }, { "acc": 0.90361757, "epoch": 0.6962962962962963, "grad_norm": 10.5625, "learning_rate": 1.5521283493006975e-05, "loss": 0.38003852, "memory(GiB)": 15.04, "step": 5170, "train_speed(iter/s)": 0.335183 }, { "acc": 0.93689108, "epoch": 0.696969696969697, "grad_norm": 7.875, "learning_rate": 1.551199627134059e-05, "loss": 0.22883389, "memory(GiB)": 15.04, "step": 5175, "train_speed(iter/s)": 0.335212 }, { "acc": 0.8980669, "epoch": 0.6976430976430976, "grad_norm": 5.09375, "learning_rate": 1.5502702215606272e-05, "loss": 0.39488235, "memory(GiB)": 15.04, "step": 5180, "train_speed(iter/s)": 0.335239 }, { "acc": 0.86305151, "epoch": 0.6983164983164983, "grad_norm": 11.125, "learning_rate": 1.5493401337327282e-05, "loss": 0.30057523, "memory(GiB)": 15.04, "step": 5185, "train_speed(iter/s)": 0.335304 }, { "acc": 0.77735128, "epoch": 0.6989898989898989, "grad_norm": 5.40625, "learning_rate": 1.5484093648035357e-05, "loss": 1.3391305, "memory(GiB)": 15.04, "step": 5190, "train_speed(iter/s)": 0.335318 }, { "acc": 0.90310974, "epoch": 0.6996632996632997, "grad_norm": 7.1875, "learning_rate": 1.547477915927066e-05, "loss": 0.32603617, "memory(GiB)": 15.04, "step": 5195, "train_speed(iter/s)": 0.335306 }, { "acc": 0.88939428, "epoch": 0.7003367003367004, "grad_norm": 5.03125, "learning_rate": 1.5465457882581797e-05, "loss": 0.41364112, "memory(GiB)": 15.04, "step": 5200, "train_speed(iter/s)": 0.335289 }, { "acc": 0.8884407, "epoch": 0.701010101010101, "grad_norm": 9.5625, "learning_rate": 1.5456129829525784e-05, "loss": 0.26326492, "memory(GiB)": 15.04, "step": 5205, "train_speed(iter/s)": 0.335371 }, { "acc": 0.91765385, "epoch": 0.7016835016835017, "grad_norm": 10.5625, "learning_rate": 1.544679501166804e-05, "loss": 0.31347032, "memory(GiB)": 15.04, "step": 5210, "train_speed(iter/s)": 0.33542 }, { "acc": 0.91512489, "epoch": 0.7023569023569024, "grad_norm": 7.0, "learning_rate": 1.5437453440582372e-05, "loss": 0.29753859, "memory(GiB)": 15.04, "step": 5215, "train_speed(iter/s)": 0.335418 }, { "acc": 0.89700146, "epoch": 0.703030303030303, "grad_norm": 6.46875, "learning_rate": 1.542810512785096e-05, "loss": 0.33366153, "memory(GiB)": 15.04, "step": 5220, "train_speed(iter/s)": 0.33542 }, { "acc": 0.92242832, "epoch": 0.7037037037037037, "grad_norm": 7.15625, "learning_rate": 1.5418750085064343e-05, "loss": 0.2627866, "memory(GiB)": 15.04, "step": 5225, "train_speed(iter/s)": 0.335449 }, { "acc": 0.88747625, "epoch": 0.7043771043771043, "grad_norm": 23.375, "learning_rate": 1.5409388323821403e-05, "loss": 0.4003212, "memory(GiB)": 15.04, "step": 5230, "train_speed(iter/s)": 0.335531 }, { "acc": 0.93946686, "epoch": 0.705050505050505, "grad_norm": 7.78125, "learning_rate": 1.5400019855729353e-05, "loss": 0.20857615, "memory(GiB)": 15.04, "step": 5235, "train_speed(iter/s)": 0.335616 }, { "acc": 0.93392143, "epoch": 0.7057239057239058, "grad_norm": 5.375, "learning_rate": 1.539064469240372e-05, "loss": 0.23336976, "memory(GiB)": 15.04, "step": 5240, "train_speed(iter/s)": 0.335687 }, { "acc": 0.90790997, "epoch": 0.7063973063973064, "grad_norm": 10.0625, "learning_rate": 1.5381262845468336e-05, "loss": 0.28862476, "memory(GiB)": 15.04, "step": 5245, "train_speed(iter/s)": 0.335738 }, { "acc": 0.88231773, "epoch": 0.7070707070707071, "grad_norm": 8.0625, "learning_rate": 1.537187432655531e-05, "loss": 0.54753809, "memory(GiB)": 15.04, "step": 5250, "train_speed(iter/s)": 0.335788 }, { "acc": 0.90225554, "epoch": 0.7077441077441078, "grad_norm": 8.9375, "learning_rate": 1.536247914730504e-05, "loss": 0.38348703, "memory(GiB)": 15.04, "step": 5255, "train_speed(iter/s)": 0.335785 }, { "acc": 0.89147949, "epoch": 0.7084175084175084, "grad_norm": 6.90625, "learning_rate": 1.535307731936616e-05, "loss": 0.40294757, "memory(GiB)": 15.04, "step": 5260, "train_speed(iter/s)": 0.335827 }, { "acc": 0.85820866, "epoch": 0.7090909090909091, "grad_norm": 8.125, "learning_rate": 1.5343668854395574e-05, "loss": 0.85416241, "memory(GiB)": 15.04, "step": 5265, "train_speed(iter/s)": 0.335769 }, { "acc": 0.89848557, "epoch": 0.7097643097643098, "grad_norm": 9.9375, "learning_rate": 1.5334253764058387e-05, "loss": 0.36731422, "memory(GiB)": 15.04, "step": 5270, "train_speed(iter/s)": 0.335817 }, { "acc": 0.80676889, "epoch": 0.7104377104377104, "grad_norm": 10.5, "learning_rate": 1.5324832060027938e-05, "loss": 0.36697528, "memory(GiB)": 15.04, "step": 5275, "train_speed(iter/s)": 0.335857 }, { "acc": 0.92737293, "epoch": 0.7111111111111111, "grad_norm": 6.25, "learning_rate": 1.531540375398576e-05, "loss": 0.35305939, "memory(GiB)": 15.04, "step": 5280, "train_speed(iter/s)": 0.335899 }, { "acc": 0.90856133, "epoch": 0.7117845117845117, "grad_norm": 9.8125, "learning_rate": 1.5305968857621572e-05, "loss": 0.2692791, "memory(GiB)": 15.04, "step": 5285, "train_speed(iter/s)": 0.335917 }, { "acc": 0.88260841, "epoch": 0.7124579124579125, "grad_norm": 5.75, "learning_rate": 1.5296527382633262e-05, "loss": 0.32560964, "memory(GiB)": 15.04, "step": 5290, "train_speed(iter/s)": 0.335979 }, { "acc": 0.8704318, "epoch": 0.7131313131313132, "grad_norm": 11.375, "learning_rate": 1.5287079340726874e-05, "loss": 0.48602247, "memory(GiB)": 15.04, "step": 5295, "train_speed(iter/s)": 0.335989 }, { "acc": 0.88656054, "epoch": 0.7138047138047138, "grad_norm": 14.25, "learning_rate": 1.5277624743616597e-05, "loss": 0.41485868, "memory(GiB)": 15.04, "step": 5300, "train_speed(iter/s)": 0.336018 }, { "acc": 0.89056721, "epoch": 0.7144781144781145, "grad_norm": 13.3125, "learning_rate": 1.526816360302475e-05, "loss": 0.34193959, "memory(GiB)": 15.04, "step": 5305, "train_speed(iter/s)": 0.33607 }, { "acc": 0.93542423, "epoch": 0.7151515151515152, "grad_norm": 12.0, "learning_rate": 1.5258695930681757e-05, "loss": 0.2437969, "memory(GiB)": 15.04, "step": 5310, "train_speed(iter/s)": 0.336119 }, { "acc": 0.86053991, "epoch": 0.7158249158249158, "grad_norm": 8.4375, "learning_rate": 1.5249221738326147e-05, "loss": 0.72318025, "memory(GiB)": 15.04, "step": 5315, "train_speed(iter/s)": 0.336183 }, { "acc": 0.77432585, "epoch": 0.7164983164983165, "grad_norm": 4.84375, "learning_rate": 1.5239741037704531e-05, "loss": 0.46462812, "memory(GiB)": 15.04, "step": 5320, "train_speed(iter/s)": 0.336247 }, { "acc": 0.88148537, "epoch": 0.7171717171717171, "grad_norm": 8.0625, "learning_rate": 1.5230253840571585e-05, "loss": 0.48239102, "memory(GiB)": 15.04, "step": 5325, "train_speed(iter/s)": 0.336308 }, { "acc": 0.85648298, "epoch": 0.7178451178451178, "grad_norm": 5.46875, "learning_rate": 1.522076015869005e-05, "loss": 0.56495943, "memory(GiB)": 15.04, "step": 5330, "train_speed(iter/s)": 0.336338 }, { "acc": 0.8542367, "epoch": 0.7185185185185186, "grad_norm": 7.65625, "learning_rate": 1.5211260003830695e-05, "loss": 0.53311505, "memory(GiB)": 15.04, "step": 5335, "train_speed(iter/s)": 0.336398 }, { "acc": 0.91923809, "epoch": 0.7191919191919192, "grad_norm": 5.9375, "learning_rate": 1.5201753387772327e-05, "loss": 0.30996644, "memory(GiB)": 15.04, "step": 5340, "train_speed(iter/s)": 0.336457 }, { "acc": 0.90340796, "epoch": 0.7198653198653199, "grad_norm": 18.125, "learning_rate": 1.519224032230175e-05, "loss": 0.45624275, "memory(GiB)": 15.04, "step": 5345, "train_speed(iter/s)": 0.336531 }, { "acc": 0.92789698, "epoch": 0.7205387205387206, "grad_norm": 3.25, "learning_rate": 1.5182720819213772e-05, "loss": 0.27522459, "memory(GiB)": 15.04, "step": 5350, "train_speed(iter/s)": 0.33658 }, { "acc": 0.89438334, "epoch": 0.7212121212121212, "grad_norm": 7.9375, "learning_rate": 1.5173194890311189e-05, "loss": 0.39296758, "memory(GiB)": 15.04, "step": 5355, "train_speed(iter/s)": 0.336614 }, { "acc": 0.91784678, "epoch": 0.7218855218855219, "grad_norm": 7.03125, "learning_rate": 1.5163662547404752e-05, "loss": 0.30004089, "memory(GiB)": 15.04, "step": 5360, "train_speed(iter/s)": 0.336586 }, { "acc": 0.91970224, "epoch": 0.7225589225589225, "grad_norm": 15.5625, "learning_rate": 1.5154123802313173e-05, "loss": 0.29415617, "memory(GiB)": 15.04, "step": 5365, "train_speed(iter/s)": 0.336656 }, { "acc": 0.91070747, "epoch": 0.7232323232323232, "grad_norm": 5.9375, "learning_rate": 1.5144578666863095e-05, "loss": 0.3511692, "memory(GiB)": 15.04, "step": 5370, "train_speed(iter/s)": 0.33668 }, { "acc": 0.88041229, "epoch": 0.7239057239057239, "grad_norm": 22.75, "learning_rate": 1.513502715288909e-05, "loss": 0.62181106, "memory(GiB)": 15.04, "step": 5375, "train_speed(iter/s)": 0.336748 }, { "acc": 0.88765354, "epoch": 0.7245791245791245, "grad_norm": 6.75, "learning_rate": 1.512546927223364e-05, "loss": 0.32409241, "memory(GiB)": 15.04, "step": 5380, "train_speed(iter/s)": 0.336784 }, { "acc": 0.92649918, "epoch": 0.7252525252525253, "grad_norm": 8.8125, "learning_rate": 1.5115905036747109e-05, "loss": 0.33871119, "memory(GiB)": 15.04, "step": 5385, "train_speed(iter/s)": 0.336837 }, { "acc": 0.7644515, "epoch": 0.725925925925926, "grad_norm": 7.6875, "learning_rate": 1.5106334458287753e-05, "loss": 0.89487886, "memory(GiB)": 15.04, "step": 5390, "train_speed(iter/s)": 0.336855 }, { "acc": 0.87101507, "epoch": 0.7265993265993266, "grad_norm": 7.5, "learning_rate": 1.5096757548721685e-05, "loss": 0.38332465, "memory(GiB)": 15.04, "step": 5395, "train_speed(iter/s)": 0.336881 }, { "acc": 0.89714222, "epoch": 0.7272727272727273, "grad_norm": 12.625, "learning_rate": 1.5087174319922873e-05, "loss": 0.39275725, "memory(GiB)": 15.04, "step": 5400, "train_speed(iter/s)": 0.33689 }, { "epoch": 0.7272727272727273, "eval_acc": 0.8893768729978299, "eval_loss": 0.42885732650756836, "eval_runtime": 110.102, "eval_samples_per_second": 1.362, "eval_steps_per_second": 1.362, "step": 5400 }, { "acc": 0.91678247, "epoch": 0.7279461279461279, "grad_norm": 12.0, "learning_rate": 1.5077584783773112e-05, "loss": 0.31800861, "memory(GiB)": 15.04, "step": 5405, "train_speed(iter/s)": 0.33467 }, { "acc": 0.92087908, "epoch": 0.7286195286195286, "grad_norm": 6.5625, "learning_rate": 1.5067988952162026e-05, "loss": 0.22192085, "memory(GiB)": 15.04, "step": 5410, "train_speed(iter/s)": 0.334707 }, { "acc": 0.91095114, "epoch": 0.7292929292929293, "grad_norm": 7.75, "learning_rate": 1.505838683698704e-05, "loss": 0.34651372, "memory(GiB)": 15.04, "step": 5415, "train_speed(iter/s)": 0.334748 }, { "acc": 0.91784496, "epoch": 0.7299663299663299, "grad_norm": 5.59375, "learning_rate": 1.504877845015337e-05, "loss": 0.31098709, "memory(GiB)": 15.04, "step": 5420, "train_speed(iter/s)": 0.334746 }, { "acc": 0.89352703, "epoch": 0.7306397306397306, "grad_norm": 12.0, "learning_rate": 1.5039163803574006e-05, "loss": 0.33507795, "memory(GiB)": 15.04, "step": 5425, "train_speed(iter/s)": 0.334791 }, { "acc": 0.85959349, "epoch": 0.7313131313131314, "grad_norm": 8.125, "learning_rate": 1.5029542909169706e-05, "loss": 0.33322883, "memory(GiB)": 15.04, "step": 5430, "train_speed(iter/s)": 0.334819 }, { "acc": 0.83815746, "epoch": 0.731986531986532, "grad_norm": 7.5625, "learning_rate": 1.5019915778868965e-05, "loss": 0.58955793, "memory(GiB)": 15.04, "step": 5435, "train_speed(iter/s)": 0.334867 }, { "acc": 0.84119377, "epoch": 0.7326599326599327, "grad_norm": 9.9375, "learning_rate": 1.5010282424608016e-05, "loss": 0.5419806, "memory(GiB)": 15.04, "step": 5440, "train_speed(iter/s)": 0.334891 }, { "acc": 0.91294098, "epoch": 0.7333333333333333, "grad_norm": 13.125, "learning_rate": 1.5000642858330805e-05, "loss": 0.2769104, "memory(GiB)": 15.04, "step": 5445, "train_speed(iter/s)": 0.334947 }, { "acc": 0.8449502, "epoch": 0.734006734006734, "grad_norm": 7.96875, "learning_rate": 1.4990997091988989e-05, "loss": 0.38201289, "memory(GiB)": 15.04, "step": 5450, "train_speed(iter/s)": 0.334942 }, { "acc": 0.93443613, "epoch": 0.7346801346801347, "grad_norm": 4.78125, "learning_rate": 1.4981345137541898e-05, "loss": 0.24895678, "memory(GiB)": 15.04, "step": 5455, "train_speed(iter/s)": 0.334957 }, { "acc": 0.91406021, "epoch": 0.7353535353535353, "grad_norm": 8.1875, "learning_rate": 1.4971687006956545e-05, "loss": 0.34090152, "memory(GiB)": 15.04, "step": 5460, "train_speed(iter/s)": 0.334957 }, { "acc": 0.88916616, "epoch": 0.736026936026936, "grad_norm": 9.1875, "learning_rate": 1.4962022712207598e-05, "loss": 0.38841276, "memory(GiB)": 15.04, "step": 5465, "train_speed(iter/s)": 0.334992 }, { "acc": 0.87221527, "epoch": 0.7367003367003367, "grad_norm": 8.5, "learning_rate": 1.4952352265277363e-05, "loss": 0.47804179, "memory(GiB)": 15.04, "step": 5470, "train_speed(iter/s)": 0.335001 }, { "acc": 0.91606455, "epoch": 0.7373737373737373, "grad_norm": 6.375, "learning_rate": 1.494267567815578e-05, "loss": 0.31870043, "memory(GiB)": 15.04, "step": 5475, "train_speed(iter/s)": 0.335005 }, { "acc": 0.92161064, "epoch": 0.7380471380471381, "grad_norm": 7.3125, "learning_rate": 1.49329929628404e-05, "loss": 0.26377363, "memory(GiB)": 15.04, "step": 5480, "train_speed(iter/s)": 0.335015 }, { "acc": 0.91865702, "epoch": 0.7387205387205387, "grad_norm": 6.46875, "learning_rate": 1.4923304131336371e-05, "loss": 0.19029766, "memory(GiB)": 15.04, "step": 5485, "train_speed(iter/s)": 0.33505 }, { "acc": 0.86236744, "epoch": 0.7393939393939394, "grad_norm": 12.5625, "learning_rate": 1.4913609195656427e-05, "loss": 0.57456055, "memory(GiB)": 15.04, "step": 5490, "train_speed(iter/s)": 0.335079 }, { "acc": 0.85322609, "epoch": 0.7400673400673401, "grad_norm": 6.125, "learning_rate": 1.4903908167820862e-05, "loss": 0.47080436, "memory(GiB)": 15.04, "step": 5495, "train_speed(iter/s)": 0.335091 }, { "acc": 0.76626668, "epoch": 0.7407407407407407, "grad_norm": 9.3125, "learning_rate": 1.4894201059857536e-05, "loss": 0.58807654, "memory(GiB)": 15.04, "step": 5500, "train_speed(iter/s)": 0.335139 }, { "acc": 0.85379887, "epoch": 0.7414141414141414, "grad_norm": 17.125, "learning_rate": 1.4884487883801837e-05, "loss": 0.47466197, "memory(GiB)": 15.04, "step": 5505, "train_speed(iter/s)": 0.335088 }, { "acc": 0.92498798, "epoch": 0.7420875420875421, "grad_norm": 6.625, "learning_rate": 1.487476865169668e-05, "loss": 0.25954266, "memory(GiB)": 15.04, "step": 5510, "train_speed(iter/s)": 0.335153 }, { "acc": 0.86269741, "epoch": 0.7427609427609427, "grad_norm": 19.0, "learning_rate": 1.4865043375592493e-05, "loss": 0.65633082, "memory(GiB)": 15.04, "step": 5515, "train_speed(iter/s)": 0.335191 }, { "acc": 0.92104807, "epoch": 0.7434343434343434, "grad_norm": 5.0625, "learning_rate": 1.4855312067547187e-05, "loss": 0.33057408, "memory(GiB)": 15.04, "step": 5520, "train_speed(iter/s)": 0.335231 }, { "acc": 0.89448395, "epoch": 0.7441077441077442, "grad_norm": 10.5, "learning_rate": 1.4845574739626167e-05, "loss": 0.34970629, "memory(GiB)": 15.04, "step": 5525, "train_speed(iter/s)": 0.335235 }, { "acc": 0.86514912, "epoch": 0.7447811447811448, "grad_norm": 7.4375, "learning_rate": 1.4835831403902288e-05, "loss": 0.40157342, "memory(GiB)": 15.04, "step": 5530, "train_speed(iter/s)": 0.335189 }, { "acc": 0.8821455, "epoch": 0.7454545454545455, "grad_norm": 5.65625, "learning_rate": 1.482608207245586e-05, "loss": 0.44763246, "memory(GiB)": 15.04, "step": 5535, "train_speed(iter/s)": 0.335147 }, { "acc": 0.90621281, "epoch": 0.7461279461279461, "grad_norm": 6.59375, "learning_rate": 1.4816326757374627e-05, "loss": 0.27925975, "memory(GiB)": 15.04, "step": 5540, "train_speed(iter/s)": 0.335197 }, { "acc": 0.83214693, "epoch": 0.7468013468013468, "grad_norm": 12.5625, "learning_rate": 1.4806565470753747e-05, "loss": 0.59721742, "memory(GiB)": 15.04, "step": 5545, "train_speed(iter/s)": 0.335195 }, { "acc": 0.88033695, "epoch": 0.7474747474747475, "grad_norm": 14.8125, "learning_rate": 1.4796798224695787e-05, "loss": 0.37324944, "memory(GiB)": 15.04, "step": 5550, "train_speed(iter/s)": 0.335178 }, { "acc": 0.89488153, "epoch": 0.7481481481481481, "grad_norm": 9.8125, "learning_rate": 1.4787025031310706e-05, "loss": 0.58029065, "memory(GiB)": 15.04, "step": 5555, "train_speed(iter/s)": 0.335221 }, { "acc": 0.85531311, "epoch": 0.7488215488215488, "grad_norm": 19.875, "learning_rate": 1.4777245902715827e-05, "loss": 0.50645776, "memory(GiB)": 15.04, "step": 5560, "train_speed(iter/s)": 0.335265 }, { "acc": 0.9132328, "epoch": 0.7494949494949495, "grad_norm": 7.875, "learning_rate": 1.4767460851035838e-05, "loss": 0.27047372, "memory(GiB)": 15.04, "step": 5565, "train_speed(iter/s)": 0.335324 }, { "acc": 0.89535303, "epoch": 0.7501683501683502, "grad_norm": 7.21875, "learning_rate": 1.475766988840277e-05, "loss": 0.42600985, "memory(GiB)": 15.04, "step": 5570, "train_speed(iter/s)": 0.335372 }, { "acc": 0.8320179, "epoch": 0.7508417508417509, "grad_norm": 6.6875, "learning_rate": 1.4747873026955986e-05, "loss": 0.37951889, "memory(GiB)": 15.04, "step": 5575, "train_speed(iter/s)": 0.335425 }, { "acc": 0.86511326, "epoch": 0.7515151515151515, "grad_norm": 14.1875, "learning_rate": 1.4738070278842152e-05, "loss": 0.5175148, "memory(GiB)": 15.04, "step": 5580, "train_speed(iter/s)": 0.335493 }, { "acc": 0.80341415, "epoch": 0.7521885521885522, "grad_norm": 11.25, "learning_rate": 1.4728261656215243e-05, "loss": 0.8064867, "memory(GiB)": 15.04, "step": 5585, "train_speed(iter/s)": 0.335538 }, { "acc": 0.85371695, "epoch": 0.7528619528619529, "grad_norm": 17.625, "learning_rate": 1.4718447171236514e-05, "loss": 0.46963062, "memory(GiB)": 15.04, "step": 5590, "train_speed(iter/s)": 0.335607 }, { "acc": 0.84597397, "epoch": 0.7535353535353535, "grad_norm": 7.21875, "learning_rate": 1.4708626836074489e-05, "loss": 0.335937, "memory(GiB)": 15.04, "step": 5595, "train_speed(iter/s)": 0.33565 }, { "acc": 0.90138521, "epoch": 0.7542087542087542, "grad_norm": 9.1875, "learning_rate": 1.4698800662904948e-05, "loss": 0.32173617, "memory(GiB)": 15.04, "step": 5600, "train_speed(iter/s)": 0.335678 }, { "acc": 0.89959269, "epoch": 0.7548821548821549, "grad_norm": 5.03125, "learning_rate": 1.46889686639109e-05, "loss": 0.27304873, "memory(GiB)": 15.04, "step": 5605, "train_speed(iter/s)": 0.335685 }, { "acc": 0.87979374, "epoch": 0.7555555555555555, "grad_norm": 10.375, "learning_rate": 1.467913085128259e-05, "loss": 0.36787922, "memory(GiB)": 15.04, "step": 5610, "train_speed(iter/s)": 0.335717 }, { "acc": 0.9064332, "epoch": 0.7562289562289563, "grad_norm": 12.0625, "learning_rate": 1.4669287237217458e-05, "loss": 0.33975303, "memory(GiB)": 15.04, "step": 5615, "train_speed(iter/s)": 0.335765 }, { "acc": 0.89176922, "epoch": 0.7569023569023569, "grad_norm": 8.625, "learning_rate": 1.4659437833920149e-05, "loss": 0.29958005, "memory(GiB)": 15.04, "step": 5620, "train_speed(iter/s)": 0.335801 }, { "acc": 0.89875574, "epoch": 0.7575757575757576, "grad_norm": 9.375, "learning_rate": 1.464958265360248e-05, "loss": 0.32082832, "memory(GiB)": 15.04, "step": 5625, "train_speed(iter/s)": 0.33582 }, { "acc": 0.93867149, "epoch": 0.7582491582491583, "grad_norm": 5.6875, "learning_rate": 1.4639721708483428e-05, "loss": 0.1997571, "memory(GiB)": 15.04, "step": 5630, "train_speed(iter/s)": 0.335842 }, { "acc": 0.89692726, "epoch": 0.7589225589225589, "grad_norm": 7.46875, "learning_rate": 1.462985501078912e-05, "loss": 0.38000989, "memory(GiB)": 15.04, "step": 5635, "train_speed(iter/s)": 0.335852 }, { "acc": 0.90923653, "epoch": 0.7595959595959596, "grad_norm": 11.25, "learning_rate": 1.4619982572752816e-05, "loss": 0.28477805, "memory(GiB)": 15.04, "step": 5640, "train_speed(iter/s)": 0.335893 }, { "acc": 0.80711994, "epoch": 0.7602693602693603, "grad_norm": 13.8125, "learning_rate": 1.4610104406614897e-05, "loss": 0.64299107, "memory(GiB)": 15.04, "step": 5645, "train_speed(iter/s)": 0.335968 }, { "acc": 0.84009895, "epoch": 0.7609427609427609, "grad_norm": 42.25, "learning_rate": 1.4600220524622838e-05, "loss": 0.65882978, "memory(GiB)": 15.04, "step": 5650, "train_speed(iter/s)": 0.335993 }, { "acc": 0.84721203, "epoch": 0.7616161616161616, "grad_norm": 18.75, "learning_rate": 1.459033093903121e-05, "loss": 0.42750006, "memory(GiB)": 15.04, "step": 5655, "train_speed(iter/s)": 0.336039 }, { "acc": 0.93362293, "epoch": 0.7622895622895622, "grad_norm": 5.0625, "learning_rate": 1.4580435662101642e-05, "loss": 0.20740576, "memory(GiB)": 15.04, "step": 5660, "train_speed(iter/s)": 0.336086 }, { "acc": 0.86373415, "epoch": 0.762962962962963, "grad_norm": 6.0, "learning_rate": 1.4570534706102835e-05, "loss": 0.41281152, "memory(GiB)": 15.04, "step": 5665, "train_speed(iter/s)": 0.33612 }, { "acc": 0.89508467, "epoch": 0.7636363636363637, "grad_norm": 4.84375, "learning_rate": 1.4560628083310523e-05, "loss": 0.39009066, "memory(GiB)": 15.04, "step": 5670, "train_speed(iter/s)": 0.336198 }, { "acc": 0.87871475, "epoch": 0.7643097643097643, "grad_norm": 10.0625, "learning_rate": 1.4550715806007461e-05, "loss": 0.54682922, "memory(GiB)": 15.04, "step": 5675, "train_speed(iter/s)": 0.33618 }, { "acc": 0.85852232, "epoch": 0.764983164983165, "grad_norm": 10.4375, "learning_rate": 1.4540797886483429e-05, "loss": 0.70370178, "memory(GiB)": 15.04, "step": 5680, "train_speed(iter/s)": 0.336219 }, { "acc": 0.92808132, "epoch": 0.7656565656565657, "grad_norm": 9.5, "learning_rate": 1.4530874337035188e-05, "loss": 0.27153614, "memory(GiB)": 15.04, "step": 5685, "train_speed(iter/s)": 0.336278 }, { "acc": 0.91171694, "epoch": 0.7663299663299663, "grad_norm": 12.5, "learning_rate": 1.4520945169966487e-05, "loss": 0.32486353, "memory(GiB)": 15.04, "step": 5690, "train_speed(iter/s)": 0.336306 }, { "acc": 0.86730728, "epoch": 0.767003367003367, "grad_norm": 10.4375, "learning_rate": 1.4511010397588044e-05, "loss": 0.69123378, "memory(GiB)": 15.04, "step": 5695, "train_speed(iter/s)": 0.336353 }, { "acc": 0.931775, "epoch": 0.7676767676767676, "grad_norm": 7.46875, "learning_rate": 1.4501070032217515e-05, "loss": 0.28064499, "memory(GiB)": 15.04, "step": 5700, "train_speed(iter/s)": 0.336418 }, { "epoch": 0.7676767676767676, "eval_acc": 0.8887719748492237, "eval_loss": 0.4258766770362854, "eval_runtime": 110.5315, "eval_samples_per_second": 1.357, "eval_steps_per_second": 1.357, "step": 5700 }, { "acc": 0.86454439, "epoch": 0.7683501683501683, "grad_norm": 15.875, "learning_rate": 1.44911240861795e-05, "loss": 0.37299004, "memory(GiB)": 15.04, "step": 5705, "train_speed(iter/s)": 0.334292 }, { "acc": 0.8620121, "epoch": 0.769023569023569, "grad_norm": 6.03125, "learning_rate": 1.4481172571805515e-05, "loss": 0.39818571, "memory(GiB)": 15.04, "step": 5710, "train_speed(iter/s)": 0.334331 }, { "acc": 0.90279598, "epoch": 0.7696969696969697, "grad_norm": 9.25, "learning_rate": 1.4471215501433978e-05, "loss": 0.32868023, "memory(GiB)": 15.04, "step": 5715, "train_speed(iter/s)": 0.334383 }, { "acc": 0.90749531, "epoch": 0.7703703703703704, "grad_norm": 11.875, "learning_rate": 1.44612528874102e-05, "loss": 0.24195716, "memory(GiB)": 15.04, "step": 5720, "train_speed(iter/s)": 0.334467 }, { "acc": 0.92181034, "epoch": 0.7710437710437711, "grad_norm": 11.125, "learning_rate": 1.4451284742086363e-05, "loss": 0.30442991, "memory(GiB)": 15.04, "step": 5725, "train_speed(iter/s)": 0.334473 }, { "acc": 0.87175598, "epoch": 0.7717171717171717, "grad_norm": 6.59375, "learning_rate": 1.4441311077821505e-05, "loss": 0.61085787, "memory(GiB)": 15.04, "step": 5730, "train_speed(iter/s)": 0.334528 }, { "acc": 0.91578388, "epoch": 0.7723905723905724, "grad_norm": 12.625, "learning_rate": 1.443133190698151e-05, "loss": 0.2823009, "memory(GiB)": 15.04, "step": 5735, "train_speed(iter/s)": 0.334578 }, { "acc": 0.89059925, "epoch": 0.773063973063973, "grad_norm": 12.125, "learning_rate": 1.4421347241939085e-05, "loss": 0.4863658, "memory(GiB)": 15.04, "step": 5740, "train_speed(iter/s)": 0.334593 }, { "acc": 0.9332159, "epoch": 0.7737373737373737, "grad_norm": 9.4375, "learning_rate": 1.4411357095073761e-05, "loss": 0.35421152, "memory(GiB)": 15.04, "step": 5745, "train_speed(iter/s)": 0.334644 }, { "acc": 0.92666492, "epoch": 0.7744107744107744, "grad_norm": 6.03125, "learning_rate": 1.4401361478771847e-05, "loss": 0.25091832, "memory(GiB)": 15.04, "step": 5750, "train_speed(iter/s)": 0.334685 }, { "acc": 0.91748695, "epoch": 0.775084175084175, "grad_norm": 9.0, "learning_rate": 1.4391360405426447e-05, "loss": 0.3200659, "memory(GiB)": 15.04, "step": 5755, "train_speed(iter/s)": 0.334717 }, { "acc": 0.89587498, "epoch": 0.7757575757575758, "grad_norm": 10.875, "learning_rate": 1.4381353887437426e-05, "loss": 0.47424874, "memory(GiB)": 15.04, "step": 5760, "train_speed(iter/s)": 0.33474 }, { "acc": 0.91452494, "epoch": 0.7764309764309765, "grad_norm": 13.6875, "learning_rate": 1.43713419372114e-05, "loss": 0.33517017, "memory(GiB)": 15.04, "step": 5765, "train_speed(iter/s)": 0.334776 }, { "acc": 0.87971125, "epoch": 0.7771043771043771, "grad_norm": 7.53125, "learning_rate": 1.4361324567161723e-05, "loss": 0.31353204, "memory(GiB)": 15.04, "step": 5770, "train_speed(iter/s)": 0.334803 }, { "acc": 0.75419145, "epoch": 0.7777777777777778, "grad_norm": 14.6875, "learning_rate": 1.4351301789708465e-05, "loss": 0.72678986, "memory(GiB)": 15.04, "step": 5775, "train_speed(iter/s)": 0.334848 }, { "acc": 0.8997736, "epoch": 0.7784511784511785, "grad_norm": 10.25, "learning_rate": 1.43412736172784e-05, "loss": 0.36349277, "memory(GiB)": 15.04, "step": 5780, "train_speed(iter/s)": 0.334887 }, { "acc": 0.87870064, "epoch": 0.7791245791245791, "grad_norm": 12.125, "learning_rate": 1.4331240062304996e-05, "loss": 0.39855471, "memory(GiB)": 15.04, "step": 5785, "train_speed(iter/s)": 0.334888 }, { "acc": 0.89457045, "epoch": 0.7797979797979798, "grad_norm": 5.78125, "learning_rate": 1.432120113722839e-05, "loss": 0.3033524, "memory(GiB)": 15.04, "step": 5790, "train_speed(iter/s)": 0.33492 }, { "acc": 0.9024951, "epoch": 0.7804713804713804, "grad_norm": 10.0625, "learning_rate": 1.4311156854495378e-05, "loss": 0.41811175, "memory(GiB)": 15.04, "step": 5795, "train_speed(iter/s)": 0.334956 }, { "acc": 0.86573696, "epoch": 0.7811447811447811, "grad_norm": 18.125, "learning_rate": 1.4301107226559399e-05, "loss": 0.31564784, "memory(GiB)": 15.04, "step": 5800, "train_speed(iter/s)": 0.335035 }, { "acc": 0.90242767, "epoch": 0.7818181818181819, "grad_norm": 8.1875, "learning_rate": 1.4291052265880521e-05, "loss": 0.35957475, "memory(GiB)": 15.04, "step": 5805, "train_speed(iter/s)": 0.335085 }, { "acc": 0.88742027, "epoch": 0.7824915824915825, "grad_norm": 8.625, "learning_rate": 1.4280991984925421e-05, "loss": 0.3438597, "memory(GiB)": 15.04, "step": 5810, "train_speed(iter/s)": 0.335131 }, { "acc": 0.88498249, "epoch": 0.7831649831649832, "grad_norm": 10.4375, "learning_rate": 1.4270926396167374e-05, "loss": 0.60539603, "memory(GiB)": 15.04, "step": 5815, "train_speed(iter/s)": 0.335169 }, { "acc": 0.90841513, "epoch": 0.7838383838383839, "grad_norm": 10.0, "learning_rate": 1.4260855512086236e-05, "loss": 0.3165911, "memory(GiB)": 15.04, "step": 5820, "train_speed(iter/s)": 0.335184 }, { "acc": 0.82462864, "epoch": 0.7845117845117845, "grad_norm": 7.6875, "learning_rate": 1.4250779345168428e-05, "loss": 0.47752819, "memory(GiB)": 15.04, "step": 5825, "train_speed(iter/s)": 0.335219 }, { "acc": 0.93924837, "epoch": 0.7851851851851852, "grad_norm": 6.34375, "learning_rate": 1.4240697907906922e-05, "loss": 0.19553621, "memory(GiB)": 15.04, "step": 5830, "train_speed(iter/s)": 0.335274 }, { "acc": 0.83240747, "epoch": 0.7858585858585858, "grad_norm": 13.0625, "learning_rate": 1.423061121280122e-05, "loss": 0.84983807, "memory(GiB)": 15.04, "step": 5835, "train_speed(iter/s)": 0.335349 }, { "acc": 0.92390738, "epoch": 0.7865319865319865, "grad_norm": 12.0625, "learning_rate": 1.422051927235735e-05, "loss": 0.23929789, "memory(GiB)": 15.04, "step": 5840, "train_speed(iter/s)": 0.335399 }, { "acc": 0.93130398, "epoch": 0.7872053872053872, "grad_norm": 12.3125, "learning_rate": 1.4210422099087837e-05, "loss": 0.22142253, "memory(GiB)": 15.04, "step": 5845, "train_speed(iter/s)": 0.335458 }, { "acc": 0.9127141, "epoch": 0.7878787878787878, "grad_norm": 12.625, "learning_rate": 1.4200319705511698e-05, "loss": 0.37274418, "memory(GiB)": 15.04, "step": 5850, "train_speed(iter/s)": 0.335495 }, { "acc": 0.84959421, "epoch": 0.7885521885521886, "grad_norm": 10.8125, "learning_rate": 1.4190212104154422e-05, "loss": 0.63109818, "memory(GiB)": 15.04, "step": 5855, "train_speed(iter/s)": 0.335559 }, { "acc": 0.81121473, "epoch": 0.7892255892255893, "grad_norm": 7.46875, "learning_rate": 1.4180099307547952e-05, "loss": 0.52778053, "memory(GiB)": 15.04, "step": 5860, "train_speed(iter/s)": 0.335577 }, { "acc": 0.8930335, "epoch": 0.7898989898989899, "grad_norm": 9.9375, "learning_rate": 1.4169981328230676e-05, "loss": 0.36625161, "memory(GiB)": 15.04, "step": 5865, "train_speed(iter/s)": 0.335604 }, { "acc": 0.86561136, "epoch": 0.7905723905723906, "grad_norm": 10.5, "learning_rate": 1.4159858178747406e-05, "loss": 0.60013657, "memory(GiB)": 15.04, "step": 5870, "train_speed(iter/s)": 0.335668 }, { "acc": 0.92761574, "epoch": 0.7912457912457912, "grad_norm": 6.90625, "learning_rate": 1.4149729871649363e-05, "loss": 0.27232482, "memory(GiB)": 15.04, "step": 5875, "train_speed(iter/s)": 0.335704 }, { "acc": 0.90905743, "epoch": 0.7919191919191919, "grad_norm": 14.1875, "learning_rate": 1.4139596419494167e-05, "loss": 0.38415422, "memory(GiB)": 15.04, "step": 5880, "train_speed(iter/s)": 0.335758 }, { "acc": 0.88108702, "epoch": 0.7925925925925926, "grad_norm": 11.25, "learning_rate": 1.412945783484581e-05, "loss": 0.38889017, "memory(GiB)": 15.04, "step": 5885, "train_speed(iter/s)": 0.335827 }, { "acc": 0.90847778, "epoch": 0.7932659932659932, "grad_norm": 8.4375, "learning_rate": 1.4119314130274655e-05, "loss": 0.39114232, "memory(GiB)": 15.04, "step": 5890, "train_speed(iter/s)": 0.335872 }, { "acc": 0.93595724, "epoch": 0.793939393939394, "grad_norm": 15.625, "learning_rate": 1.4109165318357409e-05, "loss": 0.30136671, "memory(GiB)": 15.04, "step": 5895, "train_speed(iter/s)": 0.33592 }, { "acc": 0.94308987, "epoch": 0.7946127946127947, "grad_norm": 5.40625, "learning_rate": 1.4099011411677115e-05, "loss": 0.29036045, "memory(GiB)": 15.04, "step": 5900, "train_speed(iter/s)": 0.335952 }, { "acc": 0.80220032, "epoch": 0.7952861952861953, "grad_norm": 5.03125, "learning_rate": 1.4088852422823125e-05, "loss": 0.53842239, "memory(GiB)": 15.04, "step": 5905, "train_speed(iter/s)": 0.335929 }, { "acc": 0.90533876, "epoch": 0.795959595959596, "grad_norm": 7.9375, "learning_rate": 1.4078688364391097e-05, "loss": 0.31355054, "memory(GiB)": 15.04, "step": 5910, "train_speed(iter/s)": 0.335926 }, { "acc": 0.83117151, "epoch": 0.7966329966329966, "grad_norm": 6.34375, "learning_rate": 1.4068519248982976e-05, "loss": 0.54436059, "memory(GiB)": 15.04, "step": 5915, "train_speed(iter/s)": 0.335966 }, { "acc": 0.81759272, "epoch": 0.7973063973063973, "grad_norm": 16.625, "learning_rate": 1.4058345089206981e-05, "loss": 0.66891346, "memory(GiB)": 15.04, "step": 5920, "train_speed(iter/s)": 0.335993 }, { "acc": 0.90339985, "epoch": 0.797979797979798, "grad_norm": 6.15625, "learning_rate": 1.4048165897677572e-05, "loss": 0.34642906, "memory(GiB)": 15.04, "step": 5925, "train_speed(iter/s)": 0.33602 }, { "acc": 0.8771596, "epoch": 0.7986531986531986, "grad_norm": 5.4375, "learning_rate": 1.4037981687015459e-05, "loss": 0.53214455, "memory(GiB)": 15.04, "step": 5930, "train_speed(iter/s)": 0.336046 }, { "acc": 0.90402775, "epoch": 0.7993265993265993, "grad_norm": 6.6875, "learning_rate": 1.402779246984757e-05, "loss": 0.33912227, "memory(GiB)": 15.04, "step": 5935, "train_speed(iter/s)": 0.336051 }, { "acc": 0.9046689, "epoch": 0.8, "grad_norm": 13.5, "learning_rate": 1.4017598258807042e-05, "loss": 0.35386438, "memory(GiB)": 15.04, "step": 5940, "train_speed(iter/s)": 0.336088 }, { "acc": 0.93352156, "epoch": 0.8006734006734006, "grad_norm": 6.5, "learning_rate": 1.4007399066533203e-05, "loss": 0.20259643, "memory(GiB)": 15.04, "step": 5945, "train_speed(iter/s)": 0.336132 }, { "acc": 0.85727062, "epoch": 0.8013468013468014, "grad_norm": 7.125, "learning_rate": 1.3997194905671558e-05, "loss": 0.54228849, "memory(GiB)": 15.04, "step": 5950, "train_speed(iter/s)": 0.336163 }, { "acc": 0.89675951, "epoch": 0.802020202020202, "grad_norm": 11.5, "learning_rate": 1.3986985788873772e-05, "loss": 0.39463236, "memory(GiB)": 15.04, "step": 5955, "train_speed(iter/s)": 0.336178 }, { "acc": 0.9124465, "epoch": 0.8026936026936027, "grad_norm": 8.1875, "learning_rate": 1.3976771728797651e-05, "loss": 0.31261494, "memory(GiB)": 15.04, "step": 5960, "train_speed(iter/s)": 0.336201 }, { "acc": 0.89148684, "epoch": 0.8033670033670034, "grad_norm": 6.125, "learning_rate": 1.396655273810714e-05, "loss": 0.34783628, "memory(GiB)": 15.04, "step": 5965, "train_speed(iter/s)": 0.336242 }, { "acc": 0.87334671, "epoch": 0.804040404040404, "grad_norm": 5.5625, "learning_rate": 1.3956328829472286e-05, "loss": 0.457757, "memory(GiB)": 15.04, "step": 5970, "train_speed(iter/s)": 0.336217 }, { "acc": 0.88221893, "epoch": 0.8047138047138047, "grad_norm": 21.25, "learning_rate": 1.3946100015569237e-05, "loss": 0.35697987, "memory(GiB)": 15.04, "step": 5975, "train_speed(iter/s)": 0.336257 }, { "acc": 0.90383415, "epoch": 0.8053872053872054, "grad_norm": 6.375, "learning_rate": 1.3935866309080225e-05, "loss": 0.20525901, "memory(GiB)": 15.04, "step": 5980, "train_speed(iter/s)": 0.336311 }, { "acc": 0.85681868, "epoch": 0.806060606060606, "grad_norm": 6.3125, "learning_rate": 1.3925627722693549e-05, "loss": 0.49668331, "memory(GiB)": 15.04, "step": 5985, "train_speed(iter/s)": 0.336315 }, { "acc": 0.87076035, "epoch": 0.8067340067340067, "grad_norm": 7.5, "learning_rate": 1.3915384269103553e-05, "loss": 0.32496142, "memory(GiB)": 15.04, "step": 5990, "train_speed(iter/s)": 0.336339 }, { "acc": 0.94674578, "epoch": 0.8074074074074075, "grad_norm": 4.5625, "learning_rate": 1.3905135961010623e-05, "loss": 0.20143983, "memory(GiB)": 15.04, "step": 5995, "train_speed(iter/s)": 0.336373 }, { "acc": 0.91290941, "epoch": 0.8080808080808081, "grad_norm": 22.125, "learning_rate": 1.3894882811121155e-05, "loss": 0.38342104, "memory(GiB)": 15.04, "step": 6000, "train_speed(iter/s)": 0.336415 }, { "epoch": 0.8080808080808081, "eval_acc": 0.8914740951777989, "eval_loss": 0.42146316170692444, "eval_runtime": 109.8221, "eval_samples_per_second": 1.366, "eval_steps_per_second": 1.366, "step": 6000 }, { "acc": 0.9116683, "epoch": 0.8087542087542088, "grad_norm": 7.03125, "learning_rate": 1.3884624832147558e-05, "loss": 0.25487845, "memory(GiB)": 15.04, "step": 6005, "train_speed(iter/s)": 0.334333 }, { "acc": 0.89876242, "epoch": 0.8094276094276094, "grad_norm": 6.59375, "learning_rate": 1.387436203680822e-05, "loss": 0.36956875, "memory(GiB)": 15.04, "step": 6010, "train_speed(iter/s)": 0.3343 }, { "acc": 0.87323742, "epoch": 0.8101010101010101, "grad_norm": 7.0, "learning_rate": 1.3864094437827502e-05, "loss": 0.40369444, "memory(GiB)": 15.04, "step": 6015, "train_speed(iter/s)": 0.334279 }, { "acc": 0.90273552, "epoch": 0.8107744107744108, "grad_norm": 14.6875, "learning_rate": 1.3853822047935727e-05, "loss": 0.30668547, "memory(GiB)": 15.04, "step": 6020, "train_speed(iter/s)": 0.334331 }, { "acc": 0.88416748, "epoch": 0.8114478114478114, "grad_norm": 12.625, "learning_rate": 1.3843544879869151e-05, "loss": 0.44226246, "memory(GiB)": 15.04, "step": 6025, "train_speed(iter/s)": 0.33436 }, { "acc": 0.92972078, "epoch": 0.8121212121212121, "grad_norm": 6.4375, "learning_rate": 1.3833262946369959e-05, "loss": 0.25983531, "memory(GiB)": 15.04, "step": 6030, "train_speed(iter/s)": 0.334384 }, { "acc": 0.83782463, "epoch": 0.8127946127946128, "grad_norm": 5.40625, "learning_rate": 1.3822976260186237e-05, "loss": 0.37785616, "memory(GiB)": 15.04, "step": 6035, "train_speed(iter/s)": 0.334426 }, { "acc": 0.88986654, "epoch": 0.8134680134680135, "grad_norm": 13.625, "learning_rate": 1.3812684834071976e-05, "loss": 0.43530312, "memory(GiB)": 15.04, "step": 6040, "train_speed(iter/s)": 0.334442 }, { "acc": 0.89353771, "epoch": 0.8141414141414142, "grad_norm": 5.96875, "learning_rate": 1.3802388680787033e-05, "loss": 0.37451477, "memory(GiB)": 15.04, "step": 6045, "train_speed(iter/s)": 0.334502 }, { "acc": 0.88501167, "epoch": 0.8148148148148148, "grad_norm": 8.125, "learning_rate": 1.3792087813097132e-05, "loss": 0.42273898, "memory(GiB)": 15.04, "step": 6050, "train_speed(iter/s)": 0.334552 }, { "acc": 0.9307291, "epoch": 0.8154882154882155, "grad_norm": 8.5625, "learning_rate": 1.3781782243773836e-05, "loss": 0.2445611, "memory(GiB)": 15.04, "step": 6055, "train_speed(iter/s)": 0.334605 }, { "acc": 0.88975878, "epoch": 0.8161616161616162, "grad_norm": 8.25, "learning_rate": 1.3771471985594545e-05, "loss": 0.28871515, "memory(GiB)": 15.04, "step": 6060, "train_speed(iter/s)": 0.334643 }, { "acc": 0.79767194, "epoch": 0.8168350168350168, "grad_norm": 62.0, "learning_rate": 1.3761157051342469e-05, "loss": 0.74234476, "memory(GiB)": 15.04, "step": 6065, "train_speed(iter/s)": 0.334657 }, { "acc": 0.8930418, "epoch": 0.8175084175084175, "grad_norm": 5.34375, "learning_rate": 1.375083745380661e-05, "loss": 0.48860698, "memory(GiB)": 15.04, "step": 6070, "train_speed(iter/s)": 0.334652 }, { "acc": 0.92133274, "epoch": 0.8181818181818182, "grad_norm": 6.90625, "learning_rate": 1.3740513205781768e-05, "loss": 0.25783746, "memory(GiB)": 15.04, "step": 6075, "train_speed(iter/s)": 0.334671 }, { "acc": 0.88961411, "epoch": 0.8188552188552188, "grad_norm": 14.8125, "learning_rate": 1.3730184320068484e-05, "loss": 0.41849647, "memory(GiB)": 15.04, "step": 6080, "train_speed(iter/s)": 0.334647 }, { "acc": 0.91754131, "epoch": 0.8195286195286196, "grad_norm": 7.53125, "learning_rate": 1.3719850809473076e-05, "loss": 0.32514465, "memory(GiB)": 15.04, "step": 6085, "train_speed(iter/s)": 0.334632 }, { "acc": 0.89978971, "epoch": 0.8202020202020202, "grad_norm": 10.3125, "learning_rate": 1.3709512686807578e-05, "loss": 0.47395735, "memory(GiB)": 15.04, "step": 6090, "train_speed(iter/s)": 0.334675 }, { "acc": 0.92563581, "epoch": 0.8208754208754209, "grad_norm": 12.1875, "learning_rate": 1.3699169964889746e-05, "loss": 0.29105644, "memory(GiB)": 15.04, "step": 6095, "train_speed(iter/s)": 0.334726 }, { "acc": 0.9316227, "epoch": 0.8215488215488216, "grad_norm": 7.71875, "learning_rate": 1.3688822656543044e-05, "loss": 0.23778965, "memory(GiB)": 15.04, "step": 6100, "train_speed(iter/s)": 0.334742 }, { "acc": 0.88348303, "epoch": 0.8222222222222222, "grad_norm": 8.125, "learning_rate": 1.3678470774596615e-05, "loss": 0.2383842, "memory(GiB)": 15.04, "step": 6105, "train_speed(iter/s)": 0.334782 }, { "acc": 0.92106791, "epoch": 0.8228956228956229, "grad_norm": 12.6875, "learning_rate": 1.366811433188528e-05, "loss": 0.25929103, "memory(GiB)": 15.04, "step": 6110, "train_speed(iter/s)": 0.334839 }, { "acc": 0.9059947, "epoch": 0.8235690235690236, "grad_norm": 7.0, "learning_rate": 1.3657753341249506e-05, "loss": 0.29663203, "memory(GiB)": 15.04, "step": 6115, "train_speed(iter/s)": 0.334848 }, { "acc": 0.906668, "epoch": 0.8242424242424242, "grad_norm": 5.40625, "learning_rate": 1.3647387815535407e-05, "loss": 0.27174668, "memory(GiB)": 15.04, "step": 6120, "train_speed(iter/s)": 0.334847 }, { "acc": 0.89451036, "epoch": 0.8249158249158249, "grad_norm": 20.0, "learning_rate": 1.3637017767594718e-05, "loss": 0.33886986, "memory(GiB)": 15.04, "step": 6125, "train_speed(iter/s)": 0.334894 }, { "acc": 0.84128714, "epoch": 0.8255892255892255, "grad_norm": 15.0, "learning_rate": 1.362664321028477e-05, "loss": 0.66911936, "memory(GiB)": 15.04, "step": 6130, "train_speed(iter/s)": 0.334914 }, { "acc": 0.90483494, "epoch": 0.8262626262626263, "grad_norm": 7.125, "learning_rate": 1.3616264156468509e-05, "loss": 0.43964629, "memory(GiB)": 15.04, "step": 6135, "train_speed(iter/s)": 0.334958 }, { "acc": 0.88671465, "epoch": 0.826936026936027, "grad_norm": 15.25, "learning_rate": 1.360588061901443e-05, "loss": 0.45865536, "memory(GiB)": 15.04, "step": 6140, "train_speed(iter/s)": 0.334997 }, { "acc": 0.8760848, "epoch": 0.8276094276094276, "grad_norm": 7.65625, "learning_rate": 1.3595492610796604e-05, "loss": 0.53596601, "memory(GiB)": 15.04, "step": 6145, "train_speed(iter/s)": 0.335031 }, { "acc": 0.9154603, "epoch": 0.8282828282828283, "grad_norm": 12.0, "learning_rate": 1.3585100144694637e-05, "loss": 0.38792026, "memory(GiB)": 15.04, "step": 6150, "train_speed(iter/s)": 0.335013 }, { "acc": 0.92935333, "epoch": 0.828956228956229, "grad_norm": 25.125, "learning_rate": 1.3574703233593663e-05, "loss": 0.26510258, "memory(GiB)": 15.04, "step": 6155, "train_speed(iter/s)": 0.33504 }, { "acc": 0.90549126, "epoch": 0.8296296296296296, "grad_norm": 6.0, "learning_rate": 1.3564301890384333e-05, "loss": 0.25224361, "memory(GiB)": 15.04, "step": 6160, "train_speed(iter/s)": 0.335069 }, { "acc": 0.77151456, "epoch": 0.8303030303030303, "grad_norm": 11.875, "learning_rate": 1.3553896127962785e-05, "loss": 0.59610553, "memory(GiB)": 15.04, "step": 6165, "train_speed(iter/s)": 0.335109 }, { "acc": 0.89386921, "epoch": 0.8309764309764309, "grad_norm": 9.625, "learning_rate": 1.3543485959230644e-05, "loss": 0.36909132, "memory(GiB)": 15.04, "step": 6170, "train_speed(iter/s)": 0.335162 }, { "acc": 0.89750032, "epoch": 0.8316498316498316, "grad_norm": 15.4375, "learning_rate": 1.3533071397094992e-05, "loss": 0.32972035, "memory(GiB)": 15.04, "step": 6175, "train_speed(iter/s)": 0.335223 }, { "acc": 0.88388786, "epoch": 0.8323232323232324, "grad_norm": 6.78125, "learning_rate": 1.3522652454468359e-05, "loss": 0.52773705, "memory(GiB)": 15.04, "step": 6180, "train_speed(iter/s)": 0.335262 }, { "acc": 0.91888924, "epoch": 0.832996632996633, "grad_norm": 4.90625, "learning_rate": 1.3512229144268712e-05, "loss": 0.24450941, "memory(GiB)": 15.04, "step": 6185, "train_speed(iter/s)": 0.33531 }, { "acc": 0.94059229, "epoch": 0.8336700336700337, "grad_norm": 4.53125, "learning_rate": 1.3501801479419423e-05, "loss": 0.20464752, "memory(GiB)": 15.04, "step": 6190, "train_speed(iter/s)": 0.335347 }, { "acc": 0.84352541, "epoch": 0.8343434343434344, "grad_norm": 6.875, "learning_rate": 1.3491369472849275e-05, "loss": 0.36902907, "memory(GiB)": 15.04, "step": 6195, "train_speed(iter/s)": 0.33539 }, { "acc": 0.90285397, "epoch": 0.835016835016835, "grad_norm": 7.28125, "learning_rate": 1.3480933137492423e-05, "loss": 0.50938802, "memory(GiB)": 15.04, "step": 6200, "train_speed(iter/s)": 0.335342 }, { "acc": 0.86406784, "epoch": 0.8356902356902357, "grad_norm": 16.75, "learning_rate": 1.3470492486288394e-05, "loss": 0.57964182, "memory(GiB)": 15.04, "step": 6205, "train_speed(iter/s)": 0.335369 }, { "acc": 0.85142422, "epoch": 0.8363636363636363, "grad_norm": 4.125, "learning_rate": 1.3460047532182068e-05, "loss": 0.52853956, "memory(GiB)": 15.04, "step": 6210, "train_speed(iter/s)": 0.335396 }, { "acc": 0.90336504, "epoch": 0.837037037037037, "grad_norm": 8.375, "learning_rate": 1.344959828812366e-05, "loss": 0.39982934, "memory(GiB)": 15.04, "step": 6215, "train_speed(iter/s)": 0.335434 }, { "acc": 0.92998228, "epoch": 0.8377104377104377, "grad_norm": 6.375, "learning_rate": 1.3439144767068699e-05, "loss": 0.25489178, "memory(GiB)": 15.04, "step": 6220, "train_speed(iter/s)": 0.335396 }, { "acc": 0.9138279, "epoch": 0.8383838383838383, "grad_norm": 10.9375, "learning_rate": 1.342868698197802e-05, "loss": 0.23432858, "memory(GiB)": 15.04, "step": 6225, "train_speed(iter/s)": 0.335433 }, { "acc": 0.93223724, "epoch": 0.8390572390572391, "grad_norm": 9.0, "learning_rate": 1.3418224945817747e-05, "loss": 0.22406673, "memory(GiB)": 15.04, "step": 6230, "train_speed(iter/s)": 0.335507 }, { "acc": 0.91276436, "epoch": 0.8397306397306398, "grad_norm": 11.6875, "learning_rate": 1.340775867155927e-05, "loss": 0.28521492, "memory(GiB)": 15.04, "step": 6235, "train_speed(iter/s)": 0.335553 }, { "acc": 0.86696482, "epoch": 0.8404040404040404, "grad_norm": 5.59375, "learning_rate": 1.3397288172179237e-05, "loss": 0.60414639, "memory(GiB)": 15.04, "step": 6240, "train_speed(iter/s)": 0.335602 }, { "acc": 0.89616995, "epoch": 0.8410774410774411, "grad_norm": 7.78125, "learning_rate": 1.3386813460659532e-05, "loss": 0.26624556, "memory(GiB)": 15.04, "step": 6245, "train_speed(iter/s)": 0.335636 }, { "acc": 0.909869, "epoch": 0.8417508417508418, "grad_norm": 9.75, "learning_rate": 1.3376334549987262e-05, "loss": 0.36458156, "memory(GiB)": 15.04, "step": 6250, "train_speed(iter/s)": 0.335691 }, { "acc": 0.89609671, "epoch": 0.8424242424242424, "grad_norm": 21.0, "learning_rate": 1.3365851453154744e-05, "loss": 0.40891066, "memory(GiB)": 15.04, "step": 6255, "train_speed(iter/s)": 0.335739 }, { "acc": 0.80581694, "epoch": 0.8430976430976431, "grad_norm": 6.28125, "learning_rate": 1.335536418315948e-05, "loss": 0.46938071, "memory(GiB)": 15.04, "step": 6260, "train_speed(iter/s)": 0.335723 }, { "acc": 0.84534855, "epoch": 0.8437710437710437, "grad_norm": 16.5, "learning_rate": 1.3344872753004155e-05, "loss": 0.42049317, "memory(GiB)": 15.04, "step": 6265, "train_speed(iter/s)": 0.335734 }, { "acc": 0.75441942, "epoch": 0.8444444444444444, "grad_norm": 20.875, "learning_rate": 1.3334377175696596e-05, "loss": 1.06415768, "memory(GiB)": 15.04, "step": 6270, "train_speed(iter/s)": 0.335805 }, { "acc": 0.894419, "epoch": 0.8451178451178452, "grad_norm": 7.21875, "learning_rate": 1.3323877464249787e-05, "loss": 0.40499711, "memory(GiB)": 15.04, "step": 6275, "train_speed(iter/s)": 0.335828 }, { "acc": 0.89972925, "epoch": 0.8457912457912458, "grad_norm": 5.3125, "learning_rate": 1.3313373631681832e-05, "loss": 0.28698552, "memory(GiB)": 15.04, "step": 6280, "train_speed(iter/s)": 0.335847 }, { "acc": 0.92284803, "epoch": 0.8464646464646465, "grad_norm": 4.75, "learning_rate": 1.3302865691015943e-05, "loss": 0.24218967, "memory(GiB)": 15.04, "step": 6285, "train_speed(iter/s)": 0.335839 }, { "acc": 0.87736807, "epoch": 0.8471380471380472, "grad_norm": 5.84375, "learning_rate": 1.3292353655280426e-05, "loss": 0.41942592, "memory(GiB)": 15.04, "step": 6290, "train_speed(iter/s)": 0.335863 }, { "acc": 0.9020483, "epoch": 0.8478114478114478, "grad_norm": 12.5, "learning_rate": 1.3281837537508668e-05, "loss": 0.38974915, "memory(GiB)": 15.04, "step": 6295, "train_speed(iter/s)": 0.335896 }, { "acc": 0.84610481, "epoch": 0.8484848484848485, "grad_norm": 7.96875, "learning_rate": 1.3271317350739112e-05, "loss": 0.42944093, "memory(GiB)": 15.04, "step": 6300, "train_speed(iter/s)": 0.335927 }, { "epoch": 0.8484848484848485, "eval_acc": 0.891844344081504, "eval_loss": 0.41997501254081726, "eval_runtime": 109.7774, "eval_samples_per_second": 1.366, "eval_steps_per_second": 1.366, "step": 6300 }, { "acc": 0.86580458, "epoch": 0.8491582491582491, "grad_norm": 14.0, "learning_rate": 1.3260793108015254e-05, "loss": 0.59847932, "memory(GiB)": 15.04, "step": 6305, "train_speed(iter/s)": 0.333975 }, { "acc": 0.85472517, "epoch": 0.8498316498316498, "grad_norm": 5.59375, "learning_rate": 1.3250264822385605e-05, "loss": 0.32474654, "memory(GiB)": 15.04, "step": 6310, "train_speed(iter/s)": 0.334025 }, { "acc": 0.87372379, "epoch": 0.8505050505050505, "grad_norm": 9.375, "learning_rate": 1.3239732506903707e-05, "loss": 0.53135829, "memory(GiB)": 15.04, "step": 6315, "train_speed(iter/s)": 0.334025 }, { "acc": 0.83493977, "epoch": 0.8511784511784511, "grad_norm": 15.8125, "learning_rate": 1.3229196174628078e-05, "loss": 0.62159672, "memory(GiB)": 15.04, "step": 6320, "train_speed(iter/s)": 0.33409 }, { "acc": 0.87602005, "epoch": 0.8518518518518519, "grad_norm": 15.6875, "learning_rate": 1.3218655838622232e-05, "loss": 0.50940185, "memory(GiB)": 15.04, "step": 6325, "train_speed(iter/s)": 0.334123 }, { "acc": 0.93578596, "epoch": 0.8525252525252526, "grad_norm": 9.0625, "learning_rate": 1.3208111511954641e-05, "loss": 0.29616165, "memory(GiB)": 15.04, "step": 6330, "train_speed(iter/s)": 0.33415 }, { "acc": 0.91249781, "epoch": 0.8531986531986532, "grad_norm": 7.125, "learning_rate": 1.3197563207698729e-05, "loss": 0.27347684, "memory(GiB)": 15.04, "step": 6335, "train_speed(iter/s)": 0.334187 }, { "acc": 0.92190247, "epoch": 0.8538720538720539, "grad_norm": 6.8125, "learning_rate": 1.3187010938932842e-05, "loss": 0.24746497, "memory(GiB)": 15.04, "step": 6340, "train_speed(iter/s)": 0.334208 }, { "acc": 0.93268328, "epoch": 0.8545454545454545, "grad_norm": 6.5625, "learning_rate": 1.317645471874025e-05, "loss": 0.37441986, "memory(GiB)": 15.04, "step": 6345, "train_speed(iter/s)": 0.334261 }, { "acc": 0.92450418, "epoch": 0.8552188552188552, "grad_norm": 13.5, "learning_rate": 1.3165894560209118e-05, "loss": 0.30913606, "memory(GiB)": 15.04, "step": 6350, "train_speed(iter/s)": 0.334303 }, { "acc": 0.90470371, "epoch": 0.8558922558922559, "grad_norm": 15.6875, "learning_rate": 1.3155330476432497e-05, "loss": 0.36733928, "memory(GiB)": 15.04, "step": 6355, "train_speed(iter/s)": 0.334335 }, { "acc": 0.92587414, "epoch": 0.8565656565656565, "grad_norm": 9.6875, "learning_rate": 1.3144762480508306e-05, "loss": 0.33982816, "memory(GiB)": 15.04, "step": 6360, "train_speed(iter/s)": 0.334366 }, { "acc": 0.92668858, "epoch": 0.8572390572390572, "grad_norm": 3.953125, "learning_rate": 1.313419058553931e-05, "loss": 0.27973926, "memory(GiB)": 15.04, "step": 6365, "train_speed(iter/s)": 0.334311 }, { "acc": 0.87961559, "epoch": 0.857912457912458, "grad_norm": 7.34375, "learning_rate": 1.312361480463311e-05, "loss": 0.32757173, "memory(GiB)": 15.04, "step": 6370, "train_speed(iter/s)": 0.334382 }, { "acc": 0.88145189, "epoch": 0.8585858585858586, "grad_norm": 7.9375, "learning_rate": 1.3113035150902122e-05, "loss": 0.24424481, "memory(GiB)": 15.04, "step": 6375, "train_speed(iter/s)": 0.334421 }, { "acc": 0.92479296, "epoch": 0.8592592592592593, "grad_norm": 9.5625, "learning_rate": 1.3102451637463572e-05, "loss": 0.28027315, "memory(GiB)": 15.04, "step": 6380, "train_speed(iter/s)": 0.334473 }, { "acc": 0.94058914, "epoch": 0.8599326599326599, "grad_norm": 4.65625, "learning_rate": 1.3091864277439461e-05, "loss": 0.22502601, "memory(GiB)": 15.04, "step": 6385, "train_speed(iter/s)": 0.334502 }, { "acc": 0.89868279, "epoch": 0.8606060606060606, "grad_norm": 8.375, "learning_rate": 1.308127308395657e-05, "loss": 0.36915667, "memory(GiB)": 15.04, "step": 6390, "train_speed(iter/s)": 0.334537 }, { "acc": 0.89056492, "epoch": 0.8612794612794613, "grad_norm": 4.65625, "learning_rate": 1.3070678070146424e-05, "loss": 0.50626311, "memory(GiB)": 15.04, "step": 6395, "train_speed(iter/s)": 0.334567 }, { "acc": 0.91969786, "epoch": 0.8619528619528619, "grad_norm": 7.78125, "learning_rate": 1.3060079249145288e-05, "loss": 0.27441006, "memory(GiB)": 15.04, "step": 6400, "train_speed(iter/s)": 0.334602 }, { "acc": 0.88433943, "epoch": 0.8626262626262626, "grad_norm": 13.875, "learning_rate": 1.3049476634094147e-05, "loss": 0.47878642, "memory(GiB)": 15.04, "step": 6405, "train_speed(iter/s)": 0.334659 }, { "acc": 0.8990777, "epoch": 0.8632996632996633, "grad_norm": 7.03125, "learning_rate": 1.3038870238138694e-05, "loss": 0.45147247, "memory(GiB)": 15.04, "step": 6410, "train_speed(iter/s)": 0.334685 }, { "acc": 0.88826809, "epoch": 0.863973063973064, "grad_norm": 4.4375, "learning_rate": 1.3028260074429304e-05, "loss": 0.38848712, "memory(GiB)": 15.04, "step": 6415, "train_speed(iter/s)": 0.334679 }, { "acc": 0.90999537, "epoch": 0.8646464646464647, "grad_norm": 10.375, "learning_rate": 1.3017646156121026e-05, "loss": 0.29897921, "memory(GiB)": 15.04, "step": 6420, "train_speed(iter/s)": 0.334736 }, { "acc": 0.91853495, "epoch": 0.8653198653198653, "grad_norm": 12.0625, "learning_rate": 1.3007028496373561e-05, "loss": 0.28005025, "memory(GiB)": 15.04, "step": 6425, "train_speed(iter/s)": 0.334787 }, { "acc": 0.8939868, "epoch": 0.865993265993266, "grad_norm": 6.1875, "learning_rate": 1.2996407108351256e-05, "loss": 0.43116326, "memory(GiB)": 15.04, "step": 6430, "train_speed(iter/s)": 0.334741 }, { "acc": 0.89180508, "epoch": 0.8666666666666667, "grad_norm": 7.46875, "learning_rate": 1.2985782005223077e-05, "loss": 0.29863009, "memory(GiB)": 15.04, "step": 6435, "train_speed(iter/s)": 0.334797 }, { "acc": 0.93351583, "epoch": 0.8673400673400673, "grad_norm": 8.875, "learning_rate": 1.2975153200162592e-05, "loss": 0.21974232, "memory(GiB)": 15.04, "step": 6440, "train_speed(iter/s)": 0.334834 }, { "acc": 0.89463205, "epoch": 0.868013468013468, "grad_norm": 7.09375, "learning_rate": 1.2964520706347963e-05, "loss": 0.40170579, "memory(GiB)": 15.04, "step": 6445, "train_speed(iter/s)": 0.334853 }, { "acc": 0.91431379, "epoch": 0.8686868686868687, "grad_norm": 10.5625, "learning_rate": 1.2953884536961925e-05, "loss": 0.29531341, "memory(GiB)": 15.04, "step": 6450, "train_speed(iter/s)": 0.334872 }, { "acc": 0.93744087, "epoch": 0.8693602693602693, "grad_norm": 8.625, "learning_rate": 1.2943244705191772e-05, "loss": 0.24498906, "memory(GiB)": 15.04, "step": 6455, "train_speed(iter/s)": 0.334875 }, { "acc": 0.81136913, "epoch": 0.87003367003367, "grad_norm": 6.9375, "learning_rate": 1.2932601224229333e-05, "loss": 0.56593361, "memory(GiB)": 15.04, "step": 6460, "train_speed(iter/s)": 0.334891 }, { "acc": 0.9021183, "epoch": 0.8707070707070707, "grad_norm": 8.5625, "learning_rate": 1.2921954107270966e-05, "loss": 0.48122501, "memory(GiB)": 15.04, "step": 6465, "train_speed(iter/s)": 0.334934 }, { "acc": 0.91989908, "epoch": 0.8713804713804714, "grad_norm": 6.46875, "learning_rate": 1.2911303367517541e-05, "loss": 0.27749155, "memory(GiB)": 15.04, "step": 6470, "train_speed(iter/s)": 0.33499 }, { "acc": 0.93246155, "epoch": 0.8720538720538721, "grad_norm": 5.34375, "learning_rate": 1.2900649018174407e-05, "loss": 0.27817974, "memory(GiB)": 15.04, "step": 6475, "train_speed(iter/s)": 0.335037 }, { "acc": 0.92235565, "epoch": 0.8727272727272727, "grad_norm": 5.6875, "learning_rate": 1.2889991072451404e-05, "loss": 0.28813252, "memory(GiB)": 15.04, "step": 6480, "train_speed(iter/s)": 0.335094 }, { "acc": 0.78675847, "epoch": 0.8734006734006734, "grad_norm": 38.25, "learning_rate": 1.287932954356282e-05, "loss": 0.74885602, "memory(GiB)": 15.04, "step": 6485, "train_speed(iter/s)": 0.335152 }, { "acc": 0.90806704, "epoch": 0.8740740740740741, "grad_norm": 7.46875, "learning_rate": 1.286866444472739e-05, "loss": 0.48535357, "memory(GiB)": 15.04, "step": 6490, "train_speed(iter/s)": 0.33519 }, { "acc": 0.86187124, "epoch": 0.8747474747474747, "grad_norm": 10.375, "learning_rate": 1.2857995789168272e-05, "loss": 0.41511011, "memory(GiB)": 15.04, "step": 6495, "train_speed(iter/s)": 0.335235 }, { "acc": 0.87467813, "epoch": 0.8754208754208754, "grad_norm": 7.5625, "learning_rate": 1.2847323590113039e-05, "loss": 0.41906748, "memory(GiB)": 15.04, "step": 6500, "train_speed(iter/s)": 0.335259 }, { "acc": 0.88989744, "epoch": 0.8760942760942761, "grad_norm": 18.0, "learning_rate": 1.2836647860793653e-05, "loss": 0.3864934, "memory(GiB)": 15.04, "step": 6505, "train_speed(iter/s)": 0.335239 }, { "acc": 0.86787643, "epoch": 0.8767676767676768, "grad_norm": 11.3125, "learning_rate": 1.2825968614446456e-05, "loss": 0.44939446, "memory(GiB)": 15.04, "step": 6510, "train_speed(iter/s)": 0.335285 }, { "acc": 0.80952492, "epoch": 0.8774410774410775, "grad_norm": 7.375, "learning_rate": 1.2815285864312148e-05, "loss": 0.69332781, "memory(GiB)": 15.04, "step": 6515, "train_speed(iter/s)": 0.335328 }, { "acc": 0.87130203, "epoch": 0.8781144781144781, "grad_norm": 13.625, "learning_rate": 1.2804599623635771e-05, "loss": 0.45426693, "memory(GiB)": 15.04, "step": 6520, "train_speed(iter/s)": 0.335367 }, { "acc": 0.92601309, "epoch": 0.8787878787878788, "grad_norm": 6.34375, "learning_rate": 1.2793909905666703e-05, "loss": 0.24447515, "memory(GiB)": 15.04, "step": 6525, "train_speed(iter/s)": 0.335405 }, { "acc": 0.9039216, "epoch": 0.8794612794612795, "grad_norm": 9.9375, "learning_rate": 1.278321672365863e-05, "loss": 0.37623487, "memory(GiB)": 15.04, "step": 6530, "train_speed(iter/s)": 0.335451 }, { "acc": 0.88665333, "epoch": 0.8801346801346801, "grad_norm": 7.65625, "learning_rate": 1.2772520090869525e-05, "loss": 0.3670491, "memory(GiB)": 15.04, "step": 6535, "train_speed(iter/s)": 0.335478 }, { "acc": 0.84381504, "epoch": 0.8808080808080808, "grad_norm": 7.71875, "learning_rate": 1.2761820020561649e-05, "loss": 0.5703146, "memory(GiB)": 15.04, "step": 6540, "train_speed(iter/s)": 0.335515 }, { "acc": 0.88605452, "epoch": 0.8814814814814815, "grad_norm": 7.84375, "learning_rate": 1.2751116526001519e-05, "loss": 0.67496691, "memory(GiB)": 15.04, "step": 6545, "train_speed(iter/s)": 0.335553 }, { "acc": 0.8932662, "epoch": 0.8821548821548821, "grad_norm": 10.625, "learning_rate": 1.2740409620459906e-05, "loss": 0.37912779, "memory(GiB)": 15.04, "step": 6550, "train_speed(iter/s)": 0.335575 }, { "acc": 0.93029604, "epoch": 0.8828282828282829, "grad_norm": 9.75, "learning_rate": 1.2729699317211799e-05, "loss": 0.28949127, "memory(GiB)": 15.04, "step": 6555, "train_speed(iter/s)": 0.33562 }, { "acc": 0.89841614, "epoch": 0.8835016835016835, "grad_norm": 4.78125, "learning_rate": 1.2718985629536408e-05, "loss": 0.32285652, "memory(GiB)": 15.04, "step": 6560, "train_speed(iter/s)": 0.335603 }, { "acc": 0.82716017, "epoch": 0.8841750841750842, "grad_norm": 6.3125, "learning_rate": 1.2708268570717138e-05, "loss": 0.85316229, "memory(GiB)": 15.04, "step": 6565, "train_speed(iter/s)": 0.335654 }, { "acc": 0.903016, "epoch": 0.8848484848484849, "grad_norm": 4.90625, "learning_rate": 1.2697548154041564e-05, "loss": 0.45106721, "memory(GiB)": 15.04, "step": 6570, "train_speed(iter/s)": 0.335682 }, { "acc": 0.94780951, "epoch": 0.8855218855218855, "grad_norm": 8.875, "learning_rate": 1.268682439280144e-05, "loss": 0.17702886, "memory(GiB)": 15.04, "step": 6575, "train_speed(iter/s)": 0.335723 }, { "acc": 0.91942225, "epoch": 0.8861952861952862, "grad_norm": 11.9375, "learning_rate": 1.2676097300292659e-05, "loss": 0.32949536, "memory(GiB)": 15.04, "step": 6580, "train_speed(iter/s)": 0.335714 }, { "acc": 0.900741, "epoch": 0.8868686868686869, "grad_norm": 8.0, "learning_rate": 1.2665366889815237e-05, "loss": 0.39808414, "memory(GiB)": 15.04, "step": 6585, "train_speed(iter/s)": 0.335729 }, { "acc": 0.90363607, "epoch": 0.8875420875420875, "grad_norm": 9.375, "learning_rate": 1.2654633174673321e-05, "loss": 0.35671287, "memory(GiB)": 15.04, "step": 6590, "train_speed(iter/s)": 0.33575 }, { "acc": 0.85732718, "epoch": 0.8882154882154882, "grad_norm": 4.90625, "learning_rate": 1.2643896168175137e-05, "loss": 0.635991, "memory(GiB)": 15.04, "step": 6595, "train_speed(iter/s)": 0.335704 }, { "acc": 0.85181875, "epoch": 0.8888888888888888, "grad_norm": 8.25, "learning_rate": 1.2633155883633009e-05, "loss": 0.42253537, "memory(GiB)": 15.04, "step": 6600, "train_speed(iter/s)": 0.335715 }, { "epoch": 0.8888888888888888, "eval_acc": 0.8936819510176637, "eval_loss": 0.4132220149040222, "eval_runtime": 110.1754, "eval_samples_per_second": 1.361, "eval_steps_per_second": 1.361, "step": 6600 }, { "acc": 0.89893084, "epoch": 0.8895622895622896, "grad_norm": 10.0, "learning_rate": 1.2622412334363307e-05, "loss": 0.313694, "memory(GiB)": 15.04, "step": 6605, "train_speed(iter/s)": 0.333881 }, { "acc": 0.92133274, "epoch": 0.8902356902356903, "grad_norm": 7.90625, "learning_rate": 1.2611665533686464e-05, "loss": 0.29927416, "memory(GiB)": 15.04, "step": 6610, "train_speed(iter/s)": 0.33393 }, { "acc": 0.77386642, "epoch": 0.8909090909090909, "grad_norm": 9.4375, "learning_rate": 1.2600915494926937e-05, "loss": 0.86817532, "memory(GiB)": 15.04, "step": 6615, "train_speed(iter/s)": 0.333975 }, { "acc": 0.90358448, "epoch": 0.8915824915824916, "grad_norm": 9.75, "learning_rate": 1.25901622314132e-05, "loss": 0.36347694, "memory(GiB)": 15.04, "step": 6620, "train_speed(iter/s)": 0.334032 }, { "acc": 0.86857395, "epoch": 0.8922558922558923, "grad_norm": 10.0, "learning_rate": 1.2579405756477723e-05, "loss": 0.51028337, "memory(GiB)": 15.04, "step": 6625, "train_speed(iter/s)": 0.334083 }, { "acc": 0.9134038, "epoch": 0.8929292929292929, "grad_norm": 11.0625, "learning_rate": 1.2568646083456963e-05, "loss": 0.33207953, "memory(GiB)": 15.04, "step": 6630, "train_speed(iter/s)": 0.334128 }, { "acc": 0.90302343, "epoch": 0.8936026936026936, "grad_norm": 8.0625, "learning_rate": 1.2557883225691331e-05, "loss": 0.36005437, "memory(GiB)": 15.04, "step": 6635, "train_speed(iter/s)": 0.334167 }, { "acc": 0.90513153, "epoch": 0.8942760942760942, "grad_norm": 7.0, "learning_rate": 1.2547117196525202e-05, "loss": 0.46652622, "memory(GiB)": 15.04, "step": 6640, "train_speed(iter/s)": 0.334143 }, { "acc": 0.88937864, "epoch": 0.8949494949494949, "grad_norm": 18.5, "learning_rate": 1.2536348009306871e-05, "loss": 0.42260156, "memory(GiB)": 15.04, "step": 6645, "train_speed(iter/s)": 0.334196 }, { "acc": 0.84189262, "epoch": 0.8956228956228957, "grad_norm": 15.0, "learning_rate": 1.2525575677388552e-05, "loss": 0.52167392, "memory(GiB)": 15.04, "step": 6650, "train_speed(iter/s)": 0.334213 }, { "acc": 0.8754921, "epoch": 0.8962962962962963, "grad_norm": 17.375, "learning_rate": 1.251480021412636e-05, "loss": 0.32569854, "memory(GiB)": 15.04, "step": 6655, "train_speed(iter/s)": 0.334265 }, { "acc": 0.92789869, "epoch": 0.896969696969697, "grad_norm": 15.625, "learning_rate": 1.2504021632880294e-05, "loss": 0.30449035, "memory(GiB)": 15.04, "step": 6660, "train_speed(iter/s)": 0.334247 }, { "acc": 0.88237553, "epoch": 0.8976430976430977, "grad_norm": 14.0625, "learning_rate": 1.249323994701421e-05, "loss": 0.36932364, "memory(GiB)": 15.04, "step": 6665, "train_speed(iter/s)": 0.334317 }, { "acc": 0.92880669, "epoch": 0.8983164983164983, "grad_norm": 4.78125, "learning_rate": 1.2482455169895822e-05, "loss": 0.28703864, "memory(GiB)": 15.04, "step": 6670, "train_speed(iter/s)": 0.334359 }, { "acc": 0.89129295, "epoch": 0.898989898989899, "grad_norm": 10.75, "learning_rate": 1.2471667314896674e-05, "loss": 0.37683971, "memory(GiB)": 15.04, "step": 6675, "train_speed(iter/s)": 0.334417 }, { "acc": 0.91925354, "epoch": 0.8996632996632996, "grad_norm": 11.0, "learning_rate": 1.2460876395392126e-05, "loss": 0.25543945, "memory(GiB)": 15.04, "step": 6680, "train_speed(iter/s)": 0.33446 }, { "acc": 0.82515974, "epoch": 0.9003367003367003, "grad_norm": 6.1875, "learning_rate": 1.2450082424761336e-05, "loss": 0.51135798, "memory(GiB)": 15.04, "step": 6685, "train_speed(iter/s)": 0.3345 }, { "acc": 0.94111023, "epoch": 0.901010101010101, "grad_norm": 8.3125, "learning_rate": 1.2439285416387248e-05, "loss": 0.23880255, "memory(GiB)": 15.04, "step": 6690, "train_speed(iter/s)": 0.334507 }, { "acc": 0.89929008, "epoch": 0.9016835016835016, "grad_norm": 6.09375, "learning_rate": 1.2428485383656565e-05, "loss": 0.3595645, "memory(GiB)": 15.04, "step": 6695, "train_speed(iter/s)": 0.334512 }, { "acc": 0.92091246, "epoch": 0.9023569023569024, "grad_norm": 13.6875, "learning_rate": 1.2417682339959755e-05, "loss": 0.28632832, "memory(GiB)": 15.04, "step": 6700, "train_speed(iter/s)": 0.334487 }, { "acc": 0.90725622, "epoch": 0.9030303030303031, "grad_norm": 6.15625, "learning_rate": 1.2406876298691006e-05, "loss": 0.32967763, "memory(GiB)": 15.04, "step": 6705, "train_speed(iter/s)": 0.334526 }, { "acc": 0.83416014, "epoch": 0.9037037037037037, "grad_norm": 8.25, "learning_rate": 1.2396067273248224e-05, "loss": 0.64012122, "memory(GiB)": 15.04, "step": 6710, "train_speed(iter/s)": 0.334581 }, { "acc": 0.9107769, "epoch": 0.9043771043771044, "grad_norm": 9.75, "learning_rate": 1.2385255277033022e-05, "loss": 0.39353929, "memory(GiB)": 15.04, "step": 6715, "train_speed(iter/s)": 0.334603 }, { "acc": 0.88990326, "epoch": 0.9050505050505051, "grad_norm": 11.9375, "learning_rate": 1.2374440323450685e-05, "loss": 0.35531456, "memory(GiB)": 15.04, "step": 6720, "train_speed(iter/s)": 0.334648 }, { "acc": 0.88896084, "epoch": 0.9057239057239057, "grad_norm": 8.375, "learning_rate": 1.2363622425910173e-05, "loss": 0.2434422, "memory(GiB)": 15.04, "step": 6725, "train_speed(iter/s)": 0.334704 }, { "acc": 0.94749165, "epoch": 0.9063973063973064, "grad_norm": 6.3125, "learning_rate": 1.2352801597824098e-05, "loss": 0.20872343, "memory(GiB)": 15.04, "step": 6730, "train_speed(iter/s)": 0.334752 }, { "acc": 0.84323606, "epoch": 0.907070707070707, "grad_norm": 10.9375, "learning_rate": 1.2341977852608698e-05, "loss": 0.67311592, "memory(GiB)": 15.04, "step": 6735, "train_speed(iter/s)": 0.334801 }, { "acc": 0.9264308, "epoch": 0.9077441077441077, "grad_norm": 8.75, "learning_rate": 1.2331151203683832e-05, "loss": 0.28203523, "memory(GiB)": 15.04, "step": 6740, "train_speed(iter/s)": 0.334853 }, { "acc": 0.87206812, "epoch": 0.9084175084175085, "grad_norm": 12.0625, "learning_rate": 1.2320321664472958e-05, "loss": 0.39646292, "memory(GiB)": 15.04, "step": 6745, "train_speed(iter/s)": 0.334901 }, { "acc": 0.92845144, "epoch": 0.9090909090909091, "grad_norm": 4.34375, "learning_rate": 1.2309489248403121e-05, "loss": 0.31394038, "memory(GiB)": 15.04, "step": 6750, "train_speed(iter/s)": 0.334909 }, { "acc": 0.85453014, "epoch": 0.9097643097643098, "grad_norm": 14.6875, "learning_rate": 1.229865396890493e-05, "loss": 0.53902259, "memory(GiB)": 15.04, "step": 6755, "train_speed(iter/s)": 0.334952 }, { "acc": 0.92648659, "epoch": 0.9104377104377105, "grad_norm": 9.0625, "learning_rate": 1.2287815839412543e-05, "loss": 0.2476675, "memory(GiB)": 15.04, "step": 6760, "train_speed(iter/s)": 0.334979 }, { "acc": 0.94983616, "epoch": 0.9111111111111111, "grad_norm": 7.1875, "learning_rate": 1.227697487336365e-05, "loss": 0.21728773, "memory(GiB)": 15.04, "step": 6765, "train_speed(iter/s)": 0.335031 }, { "acc": 0.8695796, "epoch": 0.9117845117845118, "grad_norm": 13.4375, "learning_rate": 1.2266131084199467e-05, "loss": 0.48047376, "memory(GiB)": 15.04, "step": 6770, "train_speed(iter/s)": 0.33503 }, { "acc": 0.90174141, "epoch": 0.9124579124579124, "grad_norm": 7.09375, "learning_rate": 1.22552844853647e-05, "loss": 0.35227265, "memory(GiB)": 15.04, "step": 6775, "train_speed(iter/s)": 0.335068 }, { "acc": 0.92144661, "epoch": 0.9131313131313131, "grad_norm": 11.3125, "learning_rate": 1.2244435090307542e-05, "loss": 0.35098794, "memory(GiB)": 15.04, "step": 6780, "train_speed(iter/s)": 0.335109 }, { "acc": 0.91706858, "epoch": 0.9138047138047138, "grad_norm": 6.03125, "learning_rate": 1.2233582912479658e-05, "loss": 0.28804533, "memory(GiB)": 15.04, "step": 6785, "train_speed(iter/s)": 0.335141 }, { "acc": 0.89750051, "epoch": 0.9144781144781144, "grad_norm": 9.25, "learning_rate": 1.2222727965336151e-05, "loss": 0.38108807, "memory(GiB)": 15.04, "step": 6790, "train_speed(iter/s)": 0.33518 }, { "acc": 0.91727934, "epoch": 0.9151515151515152, "grad_norm": 7.84375, "learning_rate": 1.2211870262335574e-05, "loss": 0.40908642, "memory(GiB)": 15.04, "step": 6795, "train_speed(iter/s)": 0.335223 }, { "acc": 0.92927933, "epoch": 0.9158249158249159, "grad_norm": 6.5, "learning_rate": 1.2201009816939886e-05, "loss": 0.23740695, "memory(GiB)": 15.04, "step": 6800, "train_speed(iter/s)": 0.335266 }, { "acc": 0.88172426, "epoch": 0.9164983164983165, "grad_norm": 5.71875, "learning_rate": 1.2190146642614444e-05, "loss": 0.4228384, "memory(GiB)": 15.04, "step": 6805, "train_speed(iter/s)": 0.33528 }, { "acc": 0.94038963, "epoch": 0.9171717171717172, "grad_norm": 7.0625, "learning_rate": 1.2179280752828e-05, "loss": 0.22256649, "memory(GiB)": 15.04, "step": 6810, "train_speed(iter/s)": 0.335311 }, { "acc": 0.87440615, "epoch": 0.9178451178451178, "grad_norm": 4.875, "learning_rate": 1.2168412161052654e-05, "loss": 0.6039371, "memory(GiB)": 15.04, "step": 6815, "train_speed(iter/s)": 0.335342 }, { "acc": 0.93131466, "epoch": 0.9185185185185185, "grad_norm": 14.125, "learning_rate": 1.215754088076388e-05, "loss": 0.24507437, "memory(GiB)": 15.04, "step": 6820, "train_speed(iter/s)": 0.335399 }, { "acc": 0.8701457, "epoch": 0.9191919191919192, "grad_norm": 8.1875, "learning_rate": 1.2146666925440467e-05, "loss": 0.42550588, "memory(GiB)": 15.04, "step": 6825, "train_speed(iter/s)": 0.335434 }, { "acc": 0.91583204, "epoch": 0.9198653198653198, "grad_norm": 7.4375, "learning_rate": 1.2135790308564527e-05, "loss": 0.41826954, "memory(GiB)": 15.04, "step": 6830, "train_speed(iter/s)": 0.335431 }, { "acc": 0.92180929, "epoch": 0.9205387205387205, "grad_norm": 6.90625, "learning_rate": 1.2124911043621472e-05, "loss": 0.20690181, "memory(GiB)": 15.04, "step": 6835, "train_speed(iter/s)": 0.335485 }, { "acc": 0.88640709, "epoch": 0.9212121212121213, "grad_norm": 6.84375, "learning_rate": 1.2114029144099997e-05, "loss": 0.39851635, "memory(GiB)": 15.04, "step": 6840, "train_speed(iter/s)": 0.335497 }, { "acc": 0.91542492, "epoch": 0.9218855218855219, "grad_norm": 17.5, "learning_rate": 1.2103144623492065e-05, "loss": 0.25600076, "memory(GiB)": 15.04, "step": 6845, "train_speed(iter/s)": 0.335553 }, { "acc": 0.90082397, "epoch": 0.9225589225589226, "grad_norm": 5.4375, "learning_rate": 1.2092257495292884e-05, "loss": 0.38325059, "memory(GiB)": 15.04, "step": 6850, "train_speed(iter/s)": 0.335574 }, { "acc": 0.85676031, "epoch": 0.9232323232323232, "grad_norm": 13.5, "learning_rate": 1.2081367773000901e-05, "loss": 0.4454113, "memory(GiB)": 15.04, "step": 6855, "train_speed(iter/s)": 0.33563 }, { "acc": 0.88738985, "epoch": 0.9239057239057239, "grad_norm": 7.0, "learning_rate": 1.2070475470117772e-05, "loss": 0.57540016, "memory(GiB)": 15.04, "step": 6860, "train_speed(iter/s)": 0.33567 }, { "acc": 0.94030704, "epoch": 0.9245791245791246, "grad_norm": 6.125, "learning_rate": 1.2059580600148362e-05, "loss": 0.21582599, "memory(GiB)": 15.04, "step": 6865, "train_speed(iter/s)": 0.335706 }, { "acc": 0.84658775, "epoch": 0.9252525252525252, "grad_norm": 13.625, "learning_rate": 1.2048683176600714e-05, "loss": 0.41957035, "memory(GiB)": 15.04, "step": 6870, "train_speed(iter/s)": 0.335708 }, { "acc": 0.85046186, "epoch": 0.9259259259259259, "grad_norm": 13.9375, "learning_rate": 1.2037783212986032e-05, "loss": 0.53905044, "memory(GiB)": 15.04, "step": 6875, "train_speed(iter/s)": 0.335689 }, { "acc": 0.88738012, "epoch": 0.9265993265993266, "grad_norm": 17.125, "learning_rate": 1.202688072281868e-05, "loss": 0.4662982, "memory(GiB)": 15.04, "step": 6880, "train_speed(iter/s)": 0.335725 }, { "acc": 0.88524971, "epoch": 0.9272727272727272, "grad_norm": 7.875, "learning_rate": 1.2015975719616142e-05, "loss": 0.34116085, "memory(GiB)": 15.04, "step": 6885, "train_speed(iter/s)": 0.335773 }, { "acc": 0.92624969, "epoch": 0.927946127946128, "grad_norm": 8.25, "learning_rate": 1.200506821689903e-05, "loss": 0.23012111, "memory(GiB)": 15.04, "step": 6890, "train_speed(iter/s)": 0.335804 }, { "acc": 0.90716, "epoch": 0.9286195286195286, "grad_norm": 8.9375, "learning_rate": 1.1994158228191048e-05, "loss": 0.32048681, "memory(GiB)": 15.04, "step": 6895, "train_speed(iter/s)": 0.335827 }, { "acc": 0.91858282, "epoch": 0.9292929292929293, "grad_norm": 8.25, "learning_rate": 1.1983245767018983e-05, "loss": 0.30142379, "memory(GiB)": 15.04, "step": 6900, "train_speed(iter/s)": 0.335826 }, { "epoch": 0.9292929292929293, "eval_acc": 0.8917195774165503, "eval_loss": 0.42348816990852356, "eval_runtime": 109.8053, "eval_samples_per_second": 1.366, "eval_steps_per_second": 1.366, "step": 6900 }, { "acc": 0.90472231, "epoch": 0.92996632996633, "grad_norm": 5.59375, "learning_rate": 1.197233084691269e-05, "loss": 0.41530919, "memory(GiB)": 15.04, "step": 6905, "train_speed(iter/s)": 0.334045 }, { "acc": 0.92123251, "epoch": 0.9306397306397306, "grad_norm": 14.625, "learning_rate": 1.196141348140507e-05, "loss": 0.34660089, "memory(GiB)": 15.04, "step": 6910, "train_speed(iter/s)": 0.334068 }, { "acc": 0.84240036, "epoch": 0.9313131313131313, "grad_norm": 33.0, "learning_rate": 1.1950493684032052e-05, "loss": 0.61305223, "memory(GiB)": 15.04, "step": 6915, "train_speed(iter/s)": 0.334123 }, { "acc": 0.85562725, "epoch": 0.931986531986532, "grad_norm": 7.4375, "learning_rate": 1.1939571468332593e-05, "loss": 0.56112657, "memory(GiB)": 15.04, "step": 6920, "train_speed(iter/s)": 0.334131 }, { "acc": 0.89052143, "epoch": 0.9326599326599326, "grad_norm": 10.0, "learning_rate": 1.1928646847848639e-05, "loss": 0.38444917, "memory(GiB)": 15.04, "step": 6925, "train_speed(iter/s)": 0.334166 }, { "acc": 0.88111267, "epoch": 0.9333333333333333, "grad_norm": 9.5625, "learning_rate": 1.1917719836125118e-05, "loss": 0.52355328, "memory(GiB)": 15.04, "step": 6930, "train_speed(iter/s)": 0.334224 }, { "acc": 0.93286581, "epoch": 0.934006734006734, "grad_norm": 8.6875, "learning_rate": 1.1906790446709922e-05, "loss": 0.26666851, "memory(GiB)": 15.04, "step": 6935, "train_speed(iter/s)": 0.334265 }, { "acc": 0.84633722, "epoch": 0.9346801346801347, "grad_norm": 21.75, "learning_rate": 1.1895858693153892e-05, "loss": 0.66107178, "memory(GiB)": 15.04, "step": 6940, "train_speed(iter/s)": 0.33427 }, { "acc": 0.86352634, "epoch": 0.9353535353535354, "grad_norm": 10.25, "learning_rate": 1.1884924589010805e-05, "loss": 0.36745579, "memory(GiB)": 15.04, "step": 6945, "train_speed(iter/s)": 0.334324 }, { "acc": 0.84268894, "epoch": 0.936026936026936, "grad_norm": 8.875, "learning_rate": 1.1873988147837347e-05, "loss": 0.5519784, "memory(GiB)": 15.04, "step": 6950, "train_speed(iter/s)": 0.334349 }, { "acc": 0.91525087, "epoch": 0.9367003367003367, "grad_norm": 6.90625, "learning_rate": 1.1863049383193103e-05, "loss": 0.28298798, "memory(GiB)": 15.04, "step": 6955, "train_speed(iter/s)": 0.33438 }, { "acc": 0.89190407, "epoch": 0.9373737373737374, "grad_norm": 8.75, "learning_rate": 1.1852108308640535e-05, "loss": 0.22739718, "memory(GiB)": 15.04, "step": 6960, "train_speed(iter/s)": 0.334425 }, { "acc": 0.9698102, "epoch": 0.938047138047138, "grad_norm": 11.875, "learning_rate": 1.184116493774498e-05, "loss": 0.12442456, "memory(GiB)": 15.04, "step": 6965, "train_speed(iter/s)": 0.334486 }, { "acc": 0.84846802, "epoch": 0.9387205387205387, "grad_norm": 7.71875, "learning_rate": 1.183021928407461e-05, "loss": 0.65881367, "memory(GiB)": 15.04, "step": 6970, "train_speed(iter/s)": 0.334462 }, { "acc": 0.91213017, "epoch": 0.9393939393939394, "grad_norm": 7.03125, "learning_rate": 1.1819271361200435e-05, "loss": 0.32764463, "memory(GiB)": 15.04, "step": 6975, "train_speed(iter/s)": 0.334474 }, { "acc": 0.9128233, "epoch": 0.94006734006734, "grad_norm": 8.5625, "learning_rate": 1.1808321182696271e-05, "loss": 0.32920997, "memory(GiB)": 15.04, "step": 6980, "train_speed(iter/s)": 0.334502 }, { "acc": 0.9396574, "epoch": 0.9407407407407408, "grad_norm": 9.25, "learning_rate": 1.179736876213874e-05, "loss": 0.20859499, "memory(GiB)": 15.04, "step": 6985, "train_speed(iter/s)": 0.334565 }, { "acc": 0.86029367, "epoch": 0.9414141414141414, "grad_norm": 13.5, "learning_rate": 1.1786414113107236e-05, "loss": 0.5239604, "memory(GiB)": 15.04, "step": 6990, "train_speed(iter/s)": 0.334586 }, { "acc": 0.90799704, "epoch": 0.9420875420875421, "grad_norm": 7.625, "learning_rate": 1.1775457249183922e-05, "loss": 0.44904799, "memory(GiB)": 15.04, "step": 6995, "train_speed(iter/s)": 0.334574 }, { "acc": 0.91598625, "epoch": 0.9427609427609428, "grad_norm": 8.75, "learning_rate": 1.1764498183953701e-05, "loss": 0.31580322, "memory(GiB)": 15.04, "step": 7000, "train_speed(iter/s)": 0.334615 }, { "acc": 0.85193005, "epoch": 0.9434343434343434, "grad_norm": 14.75, "learning_rate": 1.1753536931004211e-05, "loss": 0.65886221, "memory(GiB)": 15.04, "step": 7005, "train_speed(iter/s)": 0.334488 }, { "acc": 0.89839611, "epoch": 0.9441077441077441, "grad_norm": 3.765625, "learning_rate": 1.1742573503925794e-05, "loss": 0.46067529, "memory(GiB)": 15.04, "step": 7010, "train_speed(iter/s)": 0.334519 }, { "acc": 0.90813351, "epoch": 0.9447811447811448, "grad_norm": 7.125, "learning_rate": 1.1731607916311503e-05, "loss": 0.3260469, "memory(GiB)": 15.04, "step": 7015, "train_speed(iter/s)": 0.334521 }, { "acc": 0.91946726, "epoch": 0.9454545454545454, "grad_norm": 6.375, "learning_rate": 1.1720640181757055e-05, "loss": 0.26359484, "memory(GiB)": 15.04, "step": 7020, "train_speed(iter/s)": 0.334562 }, { "acc": 0.8410737, "epoch": 0.9461279461279462, "grad_norm": 6.96875, "learning_rate": 1.1709670313860835e-05, "loss": 0.40910587, "memory(GiB)": 15.04, "step": 7025, "train_speed(iter/s)": 0.334598 }, { "acc": 0.94576159, "epoch": 0.9468013468013468, "grad_norm": 10.25, "learning_rate": 1.1698698326223872e-05, "loss": 0.23115597, "memory(GiB)": 15.04, "step": 7030, "train_speed(iter/s)": 0.33464 }, { "acc": 0.91468258, "epoch": 0.9474747474747475, "grad_norm": 4.5, "learning_rate": 1.1687724232449823e-05, "loss": 0.30396674, "memory(GiB)": 15.04, "step": 7035, "train_speed(iter/s)": 0.334677 }, { "acc": 0.91524286, "epoch": 0.9481481481481482, "grad_norm": 7.75, "learning_rate": 1.1676748046144957e-05, "loss": 0.35731499, "memory(GiB)": 15.04, "step": 7040, "train_speed(iter/s)": 0.3347 }, { "acc": 0.82774048, "epoch": 0.9488215488215488, "grad_norm": 9.0, "learning_rate": 1.1665769780918139e-05, "loss": 0.62082992, "memory(GiB)": 15.04, "step": 7045, "train_speed(iter/s)": 0.334749 }, { "acc": 0.90356588, "epoch": 0.9494949494949495, "grad_norm": 7.25, "learning_rate": 1.1654789450380805e-05, "loss": 0.42908516, "memory(GiB)": 15.04, "step": 7050, "train_speed(iter/s)": 0.334731 }, { "acc": 0.92638025, "epoch": 0.9501683501683502, "grad_norm": 7.0625, "learning_rate": 1.1643807068146964e-05, "loss": 0.25998495, "memory(GiB)": 15.04, "step": 7055, "train_speed(iter/s)": 0.334743 }, { "acc": 0.94073849, "epoch": 0.9508417508417508, "grad_norm": 7.71875, "learning_rate": 1.1632822647833155e-05, "loss": 0.25604682, "memory(GiB)": 15.04, "step": 7060, "train_speed(iter/s)": 0.334807 }, { "acc": 0.95495081, "epoch": 0.9515151515151515, "grad_norm": 5.21875, "learning_rate": 1.1621836203058452e-05, "loss": 0.25055363, "memory(GiB)": 15.04, "step": 7065, "train_speed(iter/s)": 0.334855 }, { "acc": 0.89045639, "epoch": 0.9521885521885521, "grad_norm": 7.34375, "learning_rate": 1.1610847747444435e-05, "loss": 0.44178624, "memory(GiB)": 15.04, "step": 7070, "train_speed(iter/s)": 0.334892 }, { "acc": 0.89794683, "epoch": 0.9528619528619529, "grad_norm": 24.75, "learning_rate": 1.1599857294615184e-05, "loss": 0.53782544, "memory(GiB)": 15.04, "step": 7075, "train_speed(iter/s)": 0.334906 }, { "acc": 0.88415861, "epoch": 0.9535353535353536, "grad_norm": 11.3125, "learning_rate": 1.1588864858197246e-05, "loss": 0.42191024, "memory(GiB)": 15.04, "step": 7080, "train_speed(iter/s)": 0.334954 }, { "acc": 0.87026701, "epoch": 0.9542087542087542, "grad_norm": 7.65625, "learning_rate": 1.1577870451819633e-05, "loss": 0.62059903, "memory(GiB)": 15.04, "step": 7085, "train_speed(iter/s)": 0.334844 }, { "acc": 0.87935076, "epoch": 0.9548821548821549, "grad_norm": 26.75, "learning_rate": 1.15668740891138e-05, "loss": 0.42825451, "memory(GiB)": 15.04, "step": 7090, "train_speed(iter/s)": 0.334906 }, { "acc": 0.88299179, "epoch": 0.9555555555555556, "grad_norm": 5.21875, "learning_rate": 1.1555875783713627e-05, "loss": 0.41017833, "memory(GiB)": 15.04, "step": 7095, "train_speed(iter/s)": 0.334944 }, { "acc": 0.88696423, "epoch": 0.9562289562289562, "grad_norm": 13.8125, "learning_rate": 1.1544875549255396e-05, "loss": 0.3364373, "memory(GiB)": 15.04, "step": 7100, "train_speed(iter/s)": 0.334975 }, { "acc": 0.93549919, "epoch": 0.9569023569023569, "grad_norm": 6.34375, "learning_rate": 1.1533873399377792e-05, "loss": 0.25334518, "memory(GiB)": 15.04, "step": 7105, "train_speed(iter/s)": 0.335026 }, { "acc": 0.92688274, "epoch": 0.9575757575757575, "grad_norm": 8.3125, "learning_rate": 1.1522869347721863e-05, "loss": 0.30110722, "memory(GiB)": 15.04, "step": 7110, "train_speed(iter/s)": 0.335076 }, { "acc": 0.91134062, "epoch": 0.9582491582491582, "grad_norm": 5.875, "learning_rate": 1.151186340793103e-05, "loss": 0.29459972, "memory(GiB)": 15.04, "step": 7115, "train_speed(iter/s)": 0.335103 }, { "acc": 0.89624596, "epoch": 0.958922558922559, "grad_norm": 10.3125, "learning_rate": 1.150085559365104e-05, "loss": 0.35935357, "memory(GiB)": 15.04, "step": 7120, "train_speed(iter/s)": 0.335115 }, { "acc": 0.88149929, "epoch": 0.9595959595959596, "grad_norm": 8.6875, "learning_rate": 1.1489845918529971e-05, "loss": 0.32850969, "memory(GiB)": 15.04, "step": 7125, "train_speed(iter/s)": 0.335121 }, { "acc": 0.88787622, "epoch": 0.9602693602693603, "grad_norm": 7.90625, "learning_rate": 1.1478834396218208e-05, "loss": 0.3603116, "memory(GiB)": 15.04, "step": 7130, "train_speed(iter/s)": 0.335139 }, { "acc": 0.90232334, "epoch": 0.960942760942761, "grad_norm": 14.375, "learning_rate": 1.1467821040368423e-05, "loss": 0.46831064, "memory(GiB)": 15.04, "step": 7135, "train_speed(iter/s)": 0.335192 }, { "acc": 0.90447445, "epoch": 0.9616161616161616, "grad_norm": 4.90625, "learning_rate": 1.145680586463557e-05, "loss": 0.2544513, "memory(GiB)": 15.04, "step": 7140, "train_speed(iter/s)": 0.335236 }, { "acc": 0.90101051, "epoch": 0.9622895622895623, "grad_norm": 9.875, "learning_rate": 1.1445788882676848e-05, "loss": 0.33424101, "memory(GiB)": 15.04, "step": 7145, "train_speed(iter/s)": 0.335282 }, { "acc": 0.93389111, "epoch": 0.9629629629629629, "grad_norm": 7.84375, "learning_rate": 1.14347701081517e-05, "loss": 0.19712846, "memory(GiB)": 15.04, "step": 7150, "train_speed(iter/s)": 0.335336 }, { "acc": 0.89447041, "epoch": 0.9636363636363636, "grad_norm": 10.0625, "learning_rate": 1.1423749554721799e-05, "loss": 0.25301185, "memory(GiB)": 15.04, "step": 7155, "train_speed(iter/s)": 0.335379 }, { "acc": 0.84386959, "epoch": 0.9643097643097643, "grad_norm": 6.125, "learning_rate": 1.1412727236051012e-05, "loss": 0.4289156, "memory(GiB)": 15.04, "step": 7160, "train_speed(iter/s)": 0.335409 }, { "acc": 0.83983965, "epoch": 0.9649831649831649, "grad_norm": 9.375, "learning_rate": 1.1401703165805398e-05, "loss": 0.37435858, "memory(GiB)": 15.04, "step": 7165, "train_speed(iter/s)": 0.335473 }, { "acc": 0.9178792, "epoch": 0.9656565656565657, "grad_norm": 16.0, "learning_rate": 1.139067735765319e-05, "loss": 0.32167628, "memory(GiB)": 15.04, "step": 7170, "train_speed(iter/s)": 0.335511 }, { "acc": 0.89984436, "epoch": 0.9663299663299664, "grad_norm": 8.625, "learning_rate": 1.1379649825264781e-05, "loss": 0.22635565, "memory(GiB)": 15.04, "step": 7175, "train_speed(iter/s)": 0.33555 }, { "acc": 0.8739892, "epoch": 0.967003367003367, "grad_norm": 8.1875, "learning_rate": 1.1368620582312684e-05, "loss": 0.39380751, "memory(GiB)": 15.04, "step": 7180, "train_speed(iter/s)": 0.335599 }, { "acc": 0.92747126, "epoch": 0.9676767676767677, "grad_norm": 9.875, "learning_rate": 1.1357589642471556e-05, "loss": 0.19782375, "memory(GiB)": 15.04, "step": 7185, "train_speed(iter/s)": 0.33564 }, { "acc": 0.91667948, "epoch": 0.9683501683501684, "grad_norm": 7.53125, "learning_rate": 1.1346557019418144e-05, "loss": 0.38190329, "memory(GiB)": 15.04, "step": 7190, "train_speed(iter/s)": 0.33565 }, { "acc": 0.93237314, "epoch": 0.969023569023569, "grad_norm": 7.625, "learning_rate": 1.1335522726831278e-05, "loss": 0.27520278, "memory(GiB)": 15.04, "step": 7195, "train_speed(iter/s)": 0.335657 }, { "acc": 0.83783598, "epoch": 0.9696969696969697, "grad_norm": 13.5625, "learning_rate": 1.1324486778391872e-05, "loss": 0.40921426, "memory(GiB)": 15.04, "step": 7200, "train_speed(iter/s)": 0.335689 }, { "epoch": 0.9696969696969697, "eval_acc": 0.8939006043970545, "eval_loss": 0.4129711389541626, "eval_runtime": 109.8029, "eval_samples_per_second": 1.366, "eval_steps_per_second": 1.366, "step": 7200 }, { "acc": 0.9090373, "epoch": 0.9703703703703703, "grad_norm": 4.5, "learning_rate": 1.131344918778288e-05, "loss": 0.45668912, "memory(GiB)": 15.04, "step": 7205, "train_speed(iter/s)": 0.334004 }, { "acc": 0.84333715, "epoch": 0.971043771043771, "grad_norm": 17.875, "learning_rate": 1.1302409968689301e-05, "loss": 0.59896164, "memory(GiB)": 15.04, "step": 7210, "train_speed(iter/s)": 0.334016 }, { "acc": 0.918678, "epoch": 0.9717171717171718, "grad_norm": 13.9375, "learning_rate": 1.129136913479815e-05, "loss": 0.33298759, "memory(GiB)": 15.04, "step": 7215, "train_speed(iter/s)": 0.334048 }, { "acc": 0.8880929, "epoch": 0.9723905723905724, "grad_norm": 12.8125, "learning_rate": 1.128032669979844e-05, "loss": 0.38684759, "memory(GiB)": 15.04, "step": 7220, "train_speed(iter/s)": 0.334108 }, { "acc": 0.90387564, "epoch": 0.9730639730639731, "grad_norm": 5.65625, "learning_rate": 1.1269282677381177e-05, "loss": 0.30169425, "memory(GiB)": 15.04, "step": 7225, "train_speed(iter/s)": 0.334122 }, { "acc": 0.90293312, "epoch": 0.9737373737373738, "grad_norm": 9.9375, "learning_rate": 1.1258237081239324e-05, "loss": 0.31314328, "memory(GiB)": 15.04, "step": 7230, "train_speed(iter/s)": 0.334171 }, { "acc": 0.85950842, "epoch": 0.9744107744107744, "grad_norm": 21.5, "learning_rate": 1.1247189925067812e-05, "loss": 0.51640587, "memory(GiB)": 15.04, "step": 7235, "train_speed(iter/s)": 0.334217 }, { "acc": 0.90892792, "epoch": 0.9750841750841751, "grad_norm": 4.8125, "learning_rate": 1.123614122256349e-05, "loss": 0.31192939, "memory(GiB)": 15.04, "step": 7240, "train_speed(iter/s)": 0.334196 }, { "acc": 0.94763403, "epoch": 0.9757575757575757, "grad_norm": 5.0, "learning_rate": 1.1225090987425134e-05, "loss": 0.25039723, "memory(GiB)": 15.04, "step": 7245, "train_speed(iter/s)": 0.334213 }, { "acc": 0.87983856, "epoch": 0.9764309764309764, "grad_norm": 11.8125, "learning_rate": 1.1214039233353413e-05, "loss": 0.40090127, "memory(GiB)": 15.04, "step": 7250, "train_speed(iter/s)": 0.334251 }, { "acc": 0.90854416, "epoch": 0.9771043771043771, "grad_norm": 4.375, "learning_rate": 1.1202985974050884e-05, "loss": 0.39664159, "memory(GiB)": 15.04, "step": 7255, "train_speed(iter/s)": 0.33428 }, { "acc": 0.88465996, "epoch": 0.9777777777777777, "grad_norm": 7.75, "learning_rate": 1.119193122322197e-05, "loss": 0.43538961, "memory(GiB)": 15.04, "step": 7260, "train_speed(iter/s)": 0.334268 }, { "acc": 0.89416714, "epoch": 0.9784511784511785, "grad_norm": 7.9375, "learning_rate": 1.1180874994572946e-05, "loss": 0.30163293, "memory(GiB)": 15.04, "step": 7265, "train_speed(iter/s)": 0.334288 }, { "acc": 0.89555483, "epoch": 0.9791245791245792, "grad_norm": 5.25, "learning_rate": 1.1169817301811911e-05, "loss": 0.30586894, "memory(GiB)": 15.04, "step": 7270, "train_speed(iter/s)": 0.334282 }, { "acc": 0.89075212, "epoch": 0.9797979797979798, "grad_norm": 7.46875, "learning_rate": 1.1158758158648786e-05, "loss": 0.4049346, "memory(GiB)": 15.04, "step": 7275, "train_speed(iter/s)": 0.334323 }, { "acc": 0.88342791, "epoch": 0.9804713804713805, "grad_norm": 6.90625, "learning_rate": 1.1147697578795287e-05, "loss": 0.37540126, "memory(GiB)": 15.04, "step": 7280, "train_speed(iter/s)": 0.334364 }, { "acc": 0.8891819, "epoch": 0.9811447811447811, "grad_norm": 7.71875, "learning_rate": 1.1136635575964916e-05, "loss": 0.33928139, "memory(GiB)": 15.04, "step": 7285, "train_speed(iter/s)": 0.334394 }, { "acc": 0.80094166, "epoch": 0.9818181818181818, "grad_norm": 16.625, "learning_rate": 1.1125572163872936e-05, "loss": 0.76204133, "memory(GiB)": 15.04, "step": 7290, "train_speed(iter/s)": 0.334436 }, { "acc": 0.92983532, "epoch": 0.9824915824915825, "grad_norm": 12.0625, "learning_rate": 1.1114507356236354e-05, "loss": 0.25833547, "memory(GiB)": 15.04, "step": 7295, "train_speed(iter/s)": 0.334443 }, { "acc": 0.87583199, "epoch": 0.9831649831649831, "grad_norm": 21.375, "learning_rate": 1.1103441166773911e-05, "loss": 0.66637115, "memory(GiB)": 15.04, "step": 7300, "train_speed(iter/s)": 0.334493 }, { "acc": 0.94558496, "epoch": 0.9838383838383838, "grad_norm": 8.8125, "learning_rate": 1.1092373609206064e-05, "loss": 0.20286543, "memory(GiB)": 15.04, "step": 7305, "train_speed(iter/s)": 0.334534 }, { "acc": 0.90613937, "epoch": 0.9845117845117846, "grad_norm": 9.625, "learning_rate": 1.108130469725496e-05, "loss": 0.38738461, "memory(GiB)": 15.04, "step": 7310, "train_speed(iter/s)": 0.33459 }, { "acc": 0.90719738, "epoch": 0.9851851851851852, "grad_norm": 6.96875, "learning_rate": 1.1070234444644432e-05, "loss": 0.40077448, "memory(GiB)": 15.04, "step": 7315, "train_speed(iter/s)": 0.334619 }, { "acc": 0.91612635, "epoch": 0.9858585858585859, "grad_norm": 8.4375, "learning_rate": 1.1059162865099969e-05, "loss": 0.33301401, "memory(GiB)": 15.04, "step": 7320, "train_speed(iter/s)": 0.334645 }, { "acc": 0.94373732, "epoch": 0.9865319865319865, "grad_norm": 5.375, "learning_rate": 1.1048089972348705e-05, "loss": 0.20315571, "memory(GiB)": 15.04, "step": 7325, "train_speed(iter/s)": 0.334692 }, { "acc": 0.93327198, "epoch": 0.9872053872053872, "grad_norm": 6.125, "learning_rate": 1.1037015780119412e-05, "loss": 0.23666122, "memory(GiB)": 15.04, "step": 7330, "train_speed(iter/s)": 0.334721 }, { "acc": 0.83606768, "epoch": 0.9878787878787879, "grad_norm": 11.1875, "learning_rate": 1.1025940302142461e-05, "loss": 0.47933102, "memory(GiB)": 15.04, "step": 7335, "train_speed(iter/s)": 0.33475 }, { "acc": 0.89996281, "epoch": 0.9885521885521885, "grad_norm": 9.3125, "learning_rate": 1.1014863552149823e-05, "loss": 0.38059247, "memory(GiB)": 15.04, "step": 7340, "train_speed(iter/s)": 0.334786 }, { "acc": 0.86802759, "epoch": 0.9892255892255892, "grad_norm": 11.9375, "learning_rate": 1.1003785543875045e-05, "loss": 0.47677841, "memory(GiB)": 15.04, "step": 7345, "train_speed(iter/s)": 0.334832 }, { "acc": 0.88216543, "epoch": 0.98989898989899, "grad_norm": 10.75, "learning_rate": 1.0992706291053237e-05, "loss": 0.26247544, "memory(GiB)": 15.04, "step": 7350, "train_speed(iter/s)": 0.33482 }, { "acc": 0.94089928, "epoch": 0.9905723905723905, "grad_norm": 6.46875, "learning_rate": 1.0981625807421043e-05, "loss": 0.20868149, "memory(GiB)": 15.04, "step": 7355, "train_speed(iter/s)": 0.334842 }, { "acc": 0.92048578, "epoch": 0.9912457912457913, "grad_norm": 10.5, "learning_rate": 1.0970544106716649e-05, "loss": 0.29919848, "memory(GiB)": 15.04, "step": 7360, "train_speed(iter/s)": 0.334893 }, { "acc": 0.89477882, "epoch": 0.9919191919191919, "grad_norm": 6.28125, "learning_rate": 1.0959461202679735e-05, "loss": 0.45753975, "memory(GiB)": 15.04, "step": 7365, "train_speed(iter/s)": 0.334937 }, { "acc": 0.90612965, "epoch": 0.9925925925925926, "grad_norm": 30.25, "learning_rate": 1.0948377109051481e-05, "loss": 0.39858489, "memory(GiB)": 15.04, "step": 7370, "train_speed(iter/s)": 0.334978 }, { "acc": 0.90982237, "epoch": 0.9932659932659933, "grad_norm": 5.09375, "learning_rate": 1.0937291839574532e-05, "loss": 0.31556346, "memory(GiB)": 15.04, "step": 7375, "train_speed(iter/s)": 0.334975 }, { "acc": 0.94280615, "epoch": 0.9939393939393939, "grad_norm": 8.4375, "learning_rate": 1.0926205407993007e-05, "loss": 0.25707188, "memory(GiB)": 15.04, "step": 7380, "train_speed(iter/s)": 0.334987 }, { "acc": 0.90444498, "epoch": 0.9946127946127946, "grad_norm": 9.4375, "learning_rate": 1.0915117828052457e-05, "loss": 0.35674734, "memory(GiB)": 15.04, "step": 7385, "train_speed(iter/s)": 0.335024 }, { "acc": 0.90108099, "epoch": 0.9952861952861953, "grad_norm": 9.875, "learning_rate": 1.0904029113499852e-05, "loss": 0.32676821, "memory(GiB)": 15.04, "step": 7390, "train_speed(iter/s)": 0.335062 }, { "acc": 0.93637123, "epoch": 0.9959595959595959, "grad_norm": 7.59375, "learning_rate": 1.0892939278083577e-05, "loss": 0.24299397, "memory(GiB)": 15.04, "step": 7395, "train_speed(iter/s)": 0.335104 }, { "acc": 0.91700153, "epoch": 0.9966329966329966, "grad_norm": 5.84375, "learning_rate": 1.08818483355534e-05, "loss": 0.29521022, "memory(GiB)": 15.04, "step": 7400, "train_speed(iter/s)": 0.335135 }, { "acc": 0.85686979, "epoch": 0.9973063973063973, "grad_norm": 13.8125, "learning_rate": 1.0870756299660466e-05, "loss": 0.4336318, "memory(GiB)": 15.04, "step": 7405, "train_speed(iter/s)": 0.335186 }, { "acc": 0.87050056, "epoch": 0.997979797979798, "grad_norm": 11.8125, "learning_rate": 1.085966318415728e-05, "loss": 0.57098708, "memory(GiB)": 15.04, "step": 7410, "train_speed(iter/s)": 0.335217 }, { "acc": 0.91617241, "epoch": 0.9986531986531987, "grad_norm": 4.6875, "learning_rate": 1.0848569002797674e-05, "loss": 0.33973608, "memory(GiB)": 15.04, "step": 7415, "train_speed(iter/s)": 0.335261 }, { "acc": 0.84956837, "epoch": 0.9993265993265993, "grad_norm": 8.3125, "learning_rate": 1.083747376933681e-05, "loss": 0.4439743, "memory(GiB)": 15.04, "step": 7420, "train_speed(iter/s)": 0.335282 }, { "acc": 0.91084309, "epoch": 1.0, "grad_norm": 3.515625, "learning_rate": 1.082637749753115e-05, "loss": 0.33124869, "memory(GiB)": 15.04, "step": 7425, "train_speed(iter/s)": 0.335267 }, { "acc": 0.91813211, "epoch": 1.0006734006734006, "grad_norm": 6.03125, "learning_rate": 1.0815280201138451e-05, "loss": 0.3201786, "memory(GiB)": 15.04, "step": 7430, "train_speed(iter/s)": 0.335285 }, { "acc": 0.87773285, "epoch": 1.0013468013468014, "grad_norm": 7.5625, "learning_rate": 1.080418189391773e-05, "loss": 0.43178754, "memory(GiB)": 15.04, "step": 7435, "train_speed(iter/s)": 0.335303 }, { "acc": 0.90512619, "epoch": 1.002020202020202, "grad_norm": 5.75, "learning_rate": 1.0793082589629264e-05, "loss": 0.46502805, "memory(GiB)": 15.04, "step": 7440, "train_speed(iter/s)": 0.335306 }, { "acc": 0.83395872, "epoch": 1.0026936026936026, "grad_norm": 9.75, "learning_rate": 1.0781982302034563e-05, "loss": 0.53723869, "memory(GiB)": 15.04, "step": 7445, "train_speed(iter/s)": 0.335343 }, { "acc": 0.81441259, "epoch": 1.0033670033670035, "grad_norm": 12.0625, "learning_rate": 1.077088104489636e-05, "loss": 0.49833088, "memory(GiB)": 15.04, "step": 7450, "train_speed(iter/s)": 0.335361 }, { "acc": 0.90716105, "epoch": 1.004040404040404, "grad_norm": 16.125, "learning_rate": 1.0759778831978585e-05, "loss": 0.46950393, "memory(GiB)": 15.04, "step": 7455, "train_speed(iter/s)": 0.335399 }, { "acc": 0.92520227, "epoch": 1.0047138047138047, "grad_norm": 5.71875, "learning_rate": 1.0748675677046356e-05, "loss": 0.27084908, "memory(GiB)": 15.04, "step": 7460, "train_speed(iter/s)": 0.335432 }, { "acc": 0.8178484, "epoch": 1.0053872053872055, "grad_norm": 12.0, "learning_rate": 1.0737571593865963e-05, "loss": 0.31178315, "memory(GiB)": 15.04, "step": 7465, "train_speed(iter/s)": 0.335465 }, { "acc": 0.91998367, "epoch": 1.006060606060606, "grad_norm": 5.375, "learning_rate": 1.0726466596204836e-05, "loss": 0.28148334, "memory(GiB)": 15.04, "step": 7470, "train_speed(iter/s)": 0.335501 }, { "acc": 0.85789928, "epoch": 1.0067340067340067, "grad_norm": 4.21875, "learning_rate": 1.0715360697831547e-05, "loss": 0.26114526, "memory(GiB)": 15.04, "step": 7475, "train_speed(iter/s)": 0.335504 }, { "acc": 0.930474, "epoch": 1.0074074074074073, "grad_norm": 5.625, "learning_rate": 1.0704253912515787e-05, "loss": 0.28305035, "memory(GiB)": 15.04, "step": 7480, "train_speed(iter/s)": 0.335516 }, { "acc": 0.88864784, "epoch": 1.0080808080808081, "grad_norm": 7.09375, "learning_rate": 1.0693146254028342e-05, "loss": 0.39999893, "memory(GiB)": 15.04, "step": 7485, "train_speed(iter/s)": 0.335522 }, { "acc": 0.92239819, "epoch": 1.0087542087542087, "grad_norm": 17.125, "learning_rate": 1.0682037736141078e-05, "loss": 0.24749136, "memory(GiB)": 15.04, "step": 7490, "train_speed(iter/s)": 0.335561 }, { "acc": 0.9252677, "epoch": 1.0094276094276093, "grad_norm": 10.1875, "learning_rate": 1.0670928372626932e-05, "loss": 0.26599128, "memory(GiB)": 15.04, "step": 7495, "train_speed(iter/s)": 0.335574 }, { "acc": 0.90563536, "epoch": 1.0101010101010102, "grad_norm": 14.9375, "learning_rate": 1.0659818177259886e-05, "loss": 0.46962347, "memory(GiB)": 15.04, "step": 7500, "train_speed(iter/s)": 0.335579 }, { "epoch": 1.0101010101010102, "eval_acc": 0.8932916324312863, "eval_loss": 0.41181936860084534, "eval_runtime": 109.715, "eval_samples_per_second": 1.367, "eval_steps_per_second": 1.367, "step": 7500 }, { "acc": 0.92174358, "epoch": 1.0107744107744108, "grad_norm": 6.875, "learning_rate": 1.0648707163814957e-05, "loss": 0.29238164, "memory(GiB)": 15.04, "step": 7505, "train_speed(iter/s)": 0.333896 }, { "acc": 0.84976702, "epoch": 1.0114478114478114, "grad_norm": 10.5, "learning_rate": 1.0637595346068173e-05, "loss": 0.43344164, "memory(GiB)": 15.04, "step": 7510, "train_speed(iter/s)": 0.333939 }, { "acc": 0.89667292, "epoch": 1.0121212121212122, "grad_norm": 22.125, "learning_rate": 1.062648273779656e-05, "loss": 0.46906109, "memory(GiB)": 15.04, "step": 7515, "train_speed(iter/s)": 0.333972 }, { "acc": 0.87726889, "epoch": 1.0127946127946128, "grad_norm": 15.5, "learning_rate": 1.0615369352778122e-05, "loss": 0.32460937, "memory(GiB)": 15.04, "step": 7520, "train_speed(iter/s)": 0.333997 }, { "acc": 0.89613562, "epoch": 1.0134680134680134, "grad_norm": 12.9375, "learning_rate": 1.0604255204791831e-05, "loss": 0.48519855, "memory(GiB)": 15.04, "step": 7525, "train_speed(iter/s)": 0.334044 }, { "acc": 0.89590931, "epoch": 1.0141414141414142, "grad_norm": 7.34375, "learning_rate": 1.0593140307617604e-05, "loss": 0.43767009, "memory(GiB)": 15.04, "step": 7530, "train_speed(iter/s)": 0.333996 }, { "acc": 0.78652587, "epoch": 1.0148148148148148, "grad_norm": 14.5, "learning_rate": 1.0582024675036282e-05, "loss": 1.00399866, "memory(GiB)": 15.04, "step": 7535, "train_speed(iter/s)": 0.333975 }, { "acc": 0.90799513, "epoch": 1.0154882154882154, "grad_norm": 5.9375, "learning_rate": 1.0570908320829625e-05, "loss": 0.31158729, "memory(GiB)": 15.04, "step": 7540, "train_speed(iter/s)": 0.333995 }, { "acc": 0.90999622, "epoch": 1.0161616161616163, "grad_norm": 6.0625, "learning_rate": 1.055979125878028e-05, "loss": 0.31318541, "memory(GiB)": 15.04, "step": 7545, "train_speed(iter/s)": 0.334028 }, { "acc": 0.92610779, "epoch": 1.0168350168350169, "grad_norm": 7.8125, "learning_rate": 1.0548673502671776e-05, "loss": 0.26086628, "memory(GiB)": 15.04, "step": 7550, "train_speed(iter/s)": 0.334072 }, { "acc": 0.82934303, "epoch": 1.0175084175084175, "grad_norm": 8.0, "learning_rate": 1.0537555066288503e-05, "loss": 0.51228423, "memory(GiB)": 15.04, "step": 7555, "train_speed(iter/s)": 0.334116 }, { "acc": 0.87914476, "epoch": 1.018181818181818, "grad_norm": 6.5, "learning_rate": 1.0526435963415695e-05, "loss": 0.61368141, "memory(GiB)": 15.04, "step": 7560, "train_speed(iter/s)": 0.33416 }, { "acc": 0.92587614, "epoch": 1.018855218855219, "grad_norm": 11.0625, "learning_rate": 1.051531620783941e-05, "loss": 0.2348141, "memory(GiB)": 15.04, "step": 7565, "train_speed(iter/s)": 0.334197 }, { "acc": 0.90355387, "epoch": 1.0195286195286195, "grad_norm": 7.46875, "learning_rate": 1.0504195813346511e-05, "loss": 0.37921202, "memory(GiB)": 15.04, "step": 7570, "train_speed(iter/s)": 0.334233 }, { "acc": 0.93372288, "epoch": 1.02020202020202, "grad_norm": 8.125, "learning_rate": 1.0493074793724665e-05, "loss": 0.21990647, "memory(GiB)": 15.04, "step": 7575, "train_speed(iter/s)": 0.33427 }, { "acc": 0.91960363, "epoch": 1.020875420875421, "grad_norm": 13.9375, "learning_rate": 1.0481953162762302e-05, "loss": 0.28417826, "memory(GiB)": 15.04, "step": 7580, "train_speed(iter/s)": 0.33429 }, { "acc": 0.91729288, "epoch": 1.0215488215488215, "grad_norm": 5.65625, "learning_rate": 1.047083093424862e-05, "loss": 0.24105039, "memory(GiB)": 15.04, "step": 7585, "train_speed(iter/s)": 0.334287 }, { "acc": 0.85426855, "epoch": 1.0222222222222221, "grad_norm": 15.8125, "learning_rate": 1.045970812197355e-05, "loss": 0.43264589, "memory(GiB)": 15.04, "step": 7590, "train_speed(iter/s)": 0.334299 }, { "acc": 0.90263243, "epoch": 1.022895622895623, "grad_norm": 12.4375, "learning_rate": 1.0448584739727752e-05, "loss": 0.39218068, "memory(GiB)": 15.04, "step": 7595, "train_speed(iter/s)": 0.334333 }, { "acc": 0.88058176, "epoch": 1.0235690235690236, "grad_norm": 6.78125, "learning_rate": 1.0437460801302586e-05, "loss": 0.6238133, "memory(GiB)": 15.04, "step": 7600, "train_speed(iter/s)": 0.334378 }, { "acc": 0.89416704, "epoch": 1.0242424242424242, "grad_norm": 6.53125, "learning_rate": 1.0426336320490112e-05, "loss": 0.40273023, "memory(GiB)": 15.04, "step": 7605, "train_speed(iter/s)": 0.334388 }, { "acc": 0.86264629, "epoch": 1.024915824915825, "grad_norm": 4.9375, "learning_rate": 1.0415211311083053e-05, "loss": 0.52320361, "memory(GiB)": 15.04, "step": 7610, "train_speed(iter/s)": 0.334381 }, { "acc": 0.92643089, "epoch": 1.0255892255892256, "grad_norm": 6.1875, "learning_rate": 1.0404085786874792e-05, "loss": 0.20462449, "memory(GiB)": 15.04, "step": 7615, "train_speed(iter/s)": 0.334432 }, { "acc": 0.93033438, "epoch": 1.0262626262626262, "grad_norm": 6.1875, "learning_rate": 1.0392959761659348e-05, "loss": 0.24749184, "memory(GiB)": 15.04, "step": 7620, "train_speed(iter/s)": 0.334479 }, { "acc": 0.92216396, "epoch": 1.026936026936027, "grad_norm": 9.8125, "learning_rate": 1.038183324923136e-05, "loss": 0.39832199, "memory(GiB)": 15.04, "step": 7625, "train_speed(iter/s)": 0.334519 }, { "acc": 0.9161252, "epoch": 1.0276094276094276, "grad_norm": 9.5625, "learning_rate": 1.0370706263386083e-05, "loss": 0.25948415, "memory(GiB)": 15.04, "step": 7630, "train_speed(iter/s)": 0.334551 }, { "acc": 0.89475193, "epoch": 1.0282828282828282, "grad_norm": 12.1875, "learning_rate": 1.035957881791934e-05, "loss": 0.48776073, "memory(GiB)": 15.04, "step": 7635, "train_speed(iter/s)": 0.334579 }, { "acc": 0.87355528, "epoch": 1.028956228956229, "grad_norm": 4.46875, "learning_rate": 1.034845092662754e-05, "loss": 0.31079061, "memory(GiB)": 15.04, "step": 7640, "train_speed(iter/s)": 0.334613 }, { "acc": 0.91904821, "epoch": 1.0296296296296297, "grad_norm": 5.59375, "learning_rate": 1.0337322603307631e-05, "loss": 0.26461391, "memory(GiB)": 15.04, "step": 7645, "train_speed(iter/s)": 0.334624 }, { "acc": 0.89145298, "epoch": 1.0303030303030303, "grad_norm": 8.625, "learning_rate": 1.032619386175711e-05, "loss": 0.49507813, "memory(GiB)": 15.04, "step": 7650, "train_speed(iter/s)": 0.334664 }, { "acc": 0.89694061, "epoch": 1.0309764309764309, "grad_norm": 6.09375, "learning_rate": 1.0315064715773983e-05, "loss": 0.38549335, "memory(GiB)": 15.04, "step": 7655, "train_speed(iter/s)": 0.33469 }, { "acc": 0.93146105, "epoch": 1.0316498316498317, "grad_norm": 5.15625, "learning_rate": 1.0303935179156762e-05, "loss": 0.22203453, "memory(GiB)": 15.04, "step": 7660, "train_speed(iter/s)": 0.334726 }, { "acc": 0.9121562, "epoch": 1.0323232323232323, "grad_norm": 6.375, "learning_rate": 1.0292805265704442e-05, "loss": 0.3183358, "memory(GiB)": 15.04, "step": 7665, "train_speed(iter/s)": 0.33477 }, { "acc": 0.89644375, "epoch": 1.032996632996633, "grad_norm": 11.25, "learning_rate": 1.0281674989216483e-05, "loss": 0.39723806, "memory(GiB)": 15.04, "step": 7670, "train_speed(iter/s)": 0.334797 }, { "acc": 0.90520725, "epoch": 1.0336700336700337, "grad_norm": 6.09375, "learning_rate": 1.0270544363492803e-05, "loss": 0.41350541, "memory(GiB)": 15.04, "step": 7675, "train_speed(iter/s)": 0.334764 }, { "acc": 0.91033335, "epoch": 1.0343434343434343, "grad_norm": 7.84375, "learning_rate": 1.0259413402333743e-05, "loss": 0.38219395, "memory(GiB)": 15.04, "step": 7680, "train_speed(iter/s)": 0.334789 }, { "acc": 0.89836197, "epoch": 1.035016835016835, "grad_norm": 11.5, "learning_rate": 1.0248282119540065e-05, "loss": 0.29855702, "memory(GiB)": 15.04, "step": 7685, "train_speed(iter/s)": 0.334833 }, { "acc": 0.88722687, "epoch": 1.0356902356902358, "grad_norm": 7.4375, "learning_rate": 1.023715052891293e-05, "loss": 0.58175788, "memory(GiB)": 15.04, "step": 7690, "train_speed(iter/s)": 0.334852 }, { "acc": 0.89256382, "epoch": 1.0363636363636364, "grad_norm": 6.875, "learning_rate": 1.0226018644253874e-05, "loss": 0.3838747, "memory(GiB)": 15.04, "step": 7695, "train_speed(iter/s)": 0.334865 }, { "acc": 0.87644978, "epoch": 1.037037037037037, "grad_norm": 5.15625, "learning_rate": 1.0214886479364811e-05, "loss": 0.54048905, "memory(GiB)": 15.04, "step": 7700, "train_speed(iter/s)": 0.334906 }, { "acc": 0.91026125, "epoch": 1.0377104377104378, "grad_norm": 11.375, "learning_rate": 1.0203754048047994e-05, "loss": 0.38877017, "memory(GiB)": 15.04, "step": 7705, "train_speed(iter/s)": 0.334939 }, { "acc": 0.9281456, "epoch": 1.0383838383838384, "grad_norm": 5.5625, "learning_rate": 1.0192621364106003e-05, "loss": 0.30660315, "memory(GiB)": 15.04, "step": 7710, "train_speed(iter/s)": 0.334969 }, { "acc": 0.90934849, "epoch": 1.039057239057239, "grad_norm": 10.5, "learning_rate": 1.0181488441341738e-05, "loss": 0.34135413, "memory(GiB)": 15.04, "step": 7715, "train_speed(iter/s)": 0.335007 }, { "acc": 0.89176168, "epoch": 1.0397306397306396, "grad_norm": 13.375, "learning_rate": 1.0170355293558389e-05, "loss": 0.37481618, "memory(GiB)": 15.04, "step": 7720, "train_speed(iter/s)": 0.335022 }, { "acc": 0.90818529, "epoch": 1.0404040404040404, "grad_norm": 6.96875, "learning_rate": 1.0159221934559435e-05, "loss": 0.28701985, "memory(GiB)": 15.04, "step": 7725, "train_speed(iter/s)": 0.335053 }, { "acc": 0.92640896, "epoch": 1.041077441077441, "grad_norm": 6.96875, "learning_rate": 1.0148088378148604e-05, "loss": 0.3465523, "memory(GiB)": 15.04, "step": 7730, "train_speed(iter/s)": 0.33508 }, { "acc": 0.92203531, "epoch": 1.0417508417508416, "grad_norm": 8.375, "learning_rate": 1.013695463812988e-05, "loss": 0.24410901, "memory(GiB)": 15.04, "step": 7735, "train_speed(iter/s)": 0.335127 }, { "acc": 0.89423656, "epoch": 1.0424242424242425, "grad_norm": 8.3125, "learning_rate": 1.0125820728307463e-05, "loss": 0.56631393, "memory(GiB)": 15.04, "step": 7740, "train_speed(iter/s)": 0.335166 }, { "acc": 0.93116884, "epoch": 1.043097643097643, "grad_norm": 9.625, "learning_rate": 1.0114686662485776e-05, "loss": 0.26160462, "memory(GiB)": 15.04, "step": 7745, "train_speed(iter/s)": 0.335201 }, { "acc": 0.84082174, "epoch": 1.0437710437710437, "grad_norm": 7.5, "learning_rate": 1.0103552454469427e-05, "loss": 0.66812701, "memory(GiB)": 15.04, "step": 7750, "train_speed(iter/s)": 0.335207 }, { "acc": 0.83968897, "epoch": 1.0444444444444445, "grad_norm": 7.46875, "learning_rate": 1.0092418118063202e-05, "loss": 0.53751125, "memory(GiB)": 15.04, "step": 7755, "train_speed(iter/s)": 0.335239 }, { "acc": 0.88964825, "epoch": 1.0451178451178451, "grad_norm": 15.125, "learning_rate": 1.0081283667072053e-05, "loss": 0.48736072, "memory(GiB)": 15.04, "step": 7760, "train_speed(iter/s)": 0.335221 }, { "acc": 0.88361464, "epoch": 1.0457912457912457, "grad_norm": 7.71875, "learning_rate": 1.0070149115301062e-05, "loss": 0.33016255, "memory(GiB)": 15.04, "step": 7765, "train_speed(iter/s)": 0.335268 }, { "acc": 0.90930042, "epoch": 1.0464646464646465, "grad_norm": 6.96875, "learning_rate": 1.0059014476555444e-05, "loss": 0.29139402, "memory(GiB)": 15.04, "step": 7770, "train_speed(iter/s)": 0.335293 }, { "acc": 0.86369629, "epoch": 1.0471380471380471, "grad_norm": 11.75, "learning_rate": 1.004787976464052e-05, "loss": 0.42862406, "memory(GiB)": 15.04, "step": 7775, "train_speed(iter/s)": 0.335312 }, { "acc": 0.8694375, "epoch": 1.0478114478114477, "grad_norm": 8.25, "learning_rate": 1.0036744993361703e-05, "loss": 0.47458825, "memory(GiB)": 15.04, "step": 7780, "train_speed(iter/s)": 0.335347 }, { "acc": 0.9060977, "epoch": 1.0484848484848486, "grad_norm": 9.3125, "learning_rate": 1.0025610176524477e-05, "loss": 0.31830359, "memory(GiB)": 15.04, "step": 7785, "train_speed(iter/s)": 0.335326 }, { "acc": 0.92732201, "epoch": 1.0491582491582492, "grad_norm": 12.875, "learning_rate": 1.0014475327934381e-05, "loss": 0.25894213, "memory(GiB)": 15.04, "step": 7790, "train_speed(iter/s)": 0.335372 }, { "acc": 0.92505512, "epoch": 1.0498316498316498, "grad_norm": 9.3125, "learning_rate": 1.0003340461396999e-05, "loss": 0.28632023, "memory(GiB)": 15.04, "step": 7795, "train_speed(iter/s)": 0.335398 }, { "acc": 0.93215771, "epoch": 1.0505050505050506, "grad_norm": 8.375, "learning_rate": 9.992205590717936e-06, "loss": 0.23813188, "memory(GiB)": 15.04, "step": 7800, "train_speed(iter/s)": 0.335438 }, { "epoch": 1.0505050505050506, "eval_acc": 0.8949522150060721, "eval_loss": 0.4078538119792938, "eval_runtime": 109.7415, "eval_samples_per_second": 1.367, "eval_steps_per_second": 1.367, "step": 7800 }, { "acc": 0.92810698, "epoch": 1.0511784511784512, "grad_norm": 9.875, "learning_rate": 9.981070729702795e-06, "loss": 0.2444669, "memory(GiB)": 15.04, "step": 7805, "train_speed(iter/s)": 0.333911 }, { "acc": 0.85111637, "epoch": 1.0518518518518518, "grad_norm": 6.3125, "learning_rate": 9.969935892157182e-06, "loss": 0.68557639, "memory(GiB)": 15.04, "step": 7810, "train_speed(iter/s)": 0.333935 }, { "acc": 0.85432844, "epoch": 1.0525252525252524, "grad_norm": 7.34375, "learning_rate": 9.958801091886654e-06, "loss": 0.37782602, "memory(GiB)": 15.04, "step": 7815, "train_speed(iter/s)": 0.333973 }, { "acc": 0.93435078, "epoch": 1.0531986531986532, "grad_norm": 8.1875, "learning_rate": 9.947666342696742e-06, "loss": 0.21125178, "memory(GiB)": 15.04, "step": 7820, "train_speed(iter/s)": 0.333977 }, { "acc": 0.92402878, "epoch": 1.0538720538720538, "grad_norm": 10.875, "learning_rate": 9.936531658392894e-06, "loss": 0.26423035, "memory(GiB)": 15.04, "step": 7825, "train_speed(iter/s)": 0.334024 }, { "acc": 0.92936287, "epoch": 1.0545454545454545, "grad_norm": 8.1875, "learning_rate": 9.925397052780491e-06, "loss": 0.23406742, "memory(GiB)": 15.04, "step": 7830, "train_speed(iter/s)": 0.334044 }, { "acc": 0.8435791, "epoch": 1.0552188552188553, "grad_norm": 8.1875, "learning_rate": 9.91426253966482e-06, "loss": 0.48717136, "memory(GiB)": 15.04, "step": 7835, "train_speed(iter/s)": 0.334048 }, { "acc": 0.85716352, "epoch": 1.0558922558922559, "grad_norm": 6.03125, "learning_rate": 9.903128132851036e-06, "loss": 0.71113405, "memory(GiB)": 15.04, "step": 7840, "train_speed(iter/s)": 0.334045 }, { "acc": 0.89160318, "epoch": 1.0565656565656565, "grad_norm": 12.125, "learning_rate": 9.89199384614418e-06, "loss": 0.48323383, "memory(GiB)": 15.04, "step": 7845, "train_speed(iter/s)": 0.3341 }, { "acc": 0.93457861, "epoch": 1.0572390572390573, "grad_norm": 7.0625, "learning_rate": 9.880859693349129e-06, "loss": 0.23514717, "memory(GiB)": 15.04, "step": 7850, "train_speed(iter/s)": 0.334146 }, { "acc": 0.90697279, "epoch": 1.057912457912458, "grad_norm": 6.53125, "learning_rate": 9.869725688270609e-06, "loss": 0.29730036, "memory(GiB)": 15.04, "step": 7855, "train_speed(iter/s)": 0.33419 }, { "acc": 0.92781496, "epoch": 1.0585858585858585, "grad_norm": 8.1875, "learning_rate": 9.85859184471315e-06, "loss": 0.28114614, "memory(GiB)": 15.04, "step": 7860, "train_speed(iter/s)": 0.334225 }, { "acc": 0.8981905, "epoch": 1.0592592592592593, "grad_norm": 4.46875, "learning_rate": 9.84745817648109e-06, "loss": 0.26651657, "memory(GiB)": 15.04, "step": 7865, "train_speed(iter/s)": 0.334207 }, { "acc": 0.92359161, "epoch": 1.05993265993266, "grad_norm": 6.0, "learning_rate": 9.836324697378546e-06, "loss": 0.21482296, "memory(GiB)": 15.04, "step": 7870, "train_speed(iter/s)": 0.334248 }, { "acc": 0.86257191, "epoch": 1.0606060606060606, "grad_norm": 10.125, "learning_rate": 9.8251914212094e-06, "loss": 0.37756774, "memory(GiB)": 15.04, "step": 7875, "train_speed(iter/s)": 0.334297 }, { "acc": 0.86224775, "epoch": 1.0612794612794614, "grad_norm": 12.0, "learning_rate": 9.814058361777282e-06, "loss": 0.56825967, "memory(GiB)": 15.04, "step": 7880, "train_speed(iter/s)": 0.334338 }, { "acc": 0.76086092, "epoch": 1.061952861952862, "grad_norm": 4.9375, "learning_rate": 9.802925532885562e-06, "loss": 0.5710423, "memory(GiB)": 15.04, "step": 7885, "train_speed(iter/s)": 0.334364 }, { "acc": 0.8451333, "epoch": 1.0626262626262626, "grad_norm": 7.75, "learning_rate": 9.791792948337308e-06, "loss": 0.49727669, "memory(GiB)": 15.04, "step": 7890, "train_speed(iter/s)": 0.334383 }, { "acc": 0.92011862, "epoch": 1.0632996632996634, "grad_norm": 7.625, "learning_rate": 9.780660621935304e-06, "loss": 0.33122869, "memory(GiB)": 15.04, "step": 7895, "train_speed(iter/s)": 0.334383 }, { "acc": 0.84289303, "epoch": 1.063973063973064, "grad_norm": 11.5, "learning_rate": 9.76952856748199e-06, "loss": 0.53285923, "memory(GiB)": 15.04, "step": 7900, "train_speed(iter/s)": 0.334374 }, { "acc": 0.89361753, "epoch": 1.0646464646464646, "grad_norm": 10.5, "learning_rate": 9.758396798779493e-06, "loss": 0.47205482, "memory(GiB)": 15.04, "step": 7905, "train_speed(iter/s)": 0.334424 }, { "acc": 0.87449894, "epoch": 1.0653198653198652, "grad_norm": 9.25, "learning_rate": 9.747265329629578e-06, "loss": 0.35123575, "memory(GiB)": 15.04, "step": 7910, "train_speed(iter/s)": 0.334435 }, { "acc": 0.92202873, "epoch": 1.065993265993266, "grad_norm": 7.28125, "learning_rate": 9.736134173833629e-06, "loss": 0.21412907, "memory(GiB)": 15.04, "step": 7915, "train_speed(iter/s)": 0.334464 }, { "acc": 0.89996395, "epoch": 1.0666666666666667, "grad_norm": 10.6875, "learning_rate": 9.725003345192652e-06, "loss": 0.59659462, "memory(GiB)": 15.04, "step": 7920, "train_speed(iter/s)": 0.334492 }, { "acc": 0.89880562, "epoch": 1.0673400673400673, "grad_norm": 6.71875, "learning_rate": 9.713872857507242e-06, "loss": 0.49608169, "memory(GiB)": 15.04, "step": 7925, "train_speed(iter/s)": 0.334479 }, { "acc": 0.91478395, "epoch": 1.068013468013468, "grad_norm": 6.46875, "learning_rate": 9.702742724577573e-06, "loss": 0.31762419, "memory(GiB)": 15.04, "step": 7930, "train_speed(iter/s)": 0.334505 }, { "acc": 0.92808399, "epoch": 1.0686868686868687, "grad_norm": 6.53125, "learning_rate": 9.691612960203385e-06, "loss": 0.27756267, "memory(GiB)": 15.04, "step": 7935, "train_speed(iter/s)": 0.334537 }, { "acc": 0.86438789, "epoch": 1.0693602693602693, "grad_norm": 5.28125, "learning_rate": 9.68048357818395e-06, "loss": 0.49684315, "memory(GiB)": 15.04, "step": 7940, "train_speed(iter/s)": 0.334567 }, { "acc": 0.94519253, "epoch": 1.0700336700336701, "grad_norm": 5.0, "learning_rate": 9.669354592318072e-06, "loss": 0.26527648, "memory(GiB)": 15.04, "step": 7945, "train_speed(iter/s)": 0.334609 }, { "acc": 0.90258961, "epoch": 1.0707070707070707, "grad_norm": 8.125, "learning_rate": 9.658226016404065e-06, "loss": 0.4713563, "memory(GiB)": 15.04, "step": 7950, "train_speed(iter/s)": 0.334619 }, { "acc": 0.9072814, "epoch": 1.0713804713804713, "grad_norm": 8.0625, "learning_rate": 9.647097864239728e-06, "loss": 0.30933635, "memory(GiB)": 15.04, "step": 7955, "train_speed(iter/s)": 0.334651 }, { "acc": 0.92713327, "epoch": 1.0720538720538721, "grad_norm": 2.390625, "learning_rate": 9.63597014962235e-06, "loss": 0.2353159, "memory(GiB)": 15.04, "step": 7960, "train_speed(iter/s)": 0.334681 }, { "acc": 0.83974371, "epoch": 1.0727272727272728, "grad_norm": 6.34375, "learning_rate": 9.624842886348654e-06, "loss": 0.42585888, "memory(GiB)": 15.04, "step": 7965, "train_speed(iter/s)": 0.334715 }, { "acc": 0.87823381, "epoch": 1.0734006734006734, "grad_norm": 6.9375, "learning_rate": 9.613716088214827e-06, "loss": 0.31924102, "memory(GiB)": 15.04, "step": 7970, "train_speed(iter/s)": 0.334722 }, { "acc": 0.90528374, "epoch": 1.074074074074074, "grad_norm": 7.0625, "learning_rate": 9.602589769016461e-06, "loss": 0.25827124, "memory(GiB)": 15.04, "step": 7975, "train_speed(iter/s)": 0.334775 }, { "acc": 0.93018322, "epoch": 1.0747474747474748, "grad_norm": 7.78125, "learning_rate": 9.591463942548565e-06, "loss": 0.2314424, "memory(GiB)": 15.04, "step": 7980, "train_speed(iter/s)": 0.334794 }, { "acc": 0.8915287, "epoch": 1.0754208754208754, "grad_norm": 15.375, "learning_rate": 9.580338622605541e-06, "loss": 0.24742427, "memory(GiB)": 15.04, "step": 7985, "train_speed(iter/s)": 0.334846 }, { "acc": 0.94772692, "epoch": 1.076094276094276, "grad_norm": 7.78125, "learning_rate": 9.569213822981142e-06, "loss": 0.24680769, "memory(GiB)": 15.04, "step": 7990, "train_speed(iter/s)": 0.334871 }, { "acc": 0.89446325, "epoch": 1.0767676767676768, "grad_norm": 4.9375, "learning_rate": 9.558089557468506e-06, "loss": 0.47502527, "memory(GiB)": 15.04, "step": 7995, "train_speed(iter/s)": 0.334899 }, { "acc": 0.84791069, "epoch": 1.0774410774410774, "grad_norm": 6.96875, "learning_rate": 9.546965839860077e-06, "loss": 0.33400559, "memory(GiB)": 15.04, "step": 8000, "train_speed(iter/s)": 0.334924 }, { "acc": 0.93035774, "epoch": 1.078114478114478, "grad_norm": 5.0625, "learning_rate": 9.535842683947642e-06, "loss": 0.30570438, "memory(GiB)": 15.04, "step": 8005, "train_speed(iter/s)": 0.334873 }, { "acc": 0.84862366, "epoch": 1.0787878787878789, "grad_norm": 11.6875, "learning_rate": 9.52472010352229e-06, "loss": 0.37180922, "memory(GiB)": 15.04, "step": 8010, "train_speed(iter/s)": 0.334905 }, { "acc": 0.8975028, "epoch": 1.0794612794612795, "grad_norm": 6.03125, "learning_rate": 9.513598112374383e-06, "loss": 0.34270852, "memory(GiB)": 15.04, "step": 8015, "train_speed(iter/s)": 0.334896 }, { "acc": 0.87457514, "epoch": 1.08013468013468, "grad_norm": 9.75, "learning_rate": 9.502476724293569e-06, "loss": 0.44135003, "memory(GiB)": 15.04, "step": 8020, "train_speed(iter/s)": 0.334948 }, { "acc": 0.92260237, "epoch": 1.0808080808080809, "grad_norm": 6.6875, "learning_rate": 9.49135595306873e-06, "loss": 0.26781821, "memory(GiB)": 15.04, "step": 8025, "train_speed(iter/s)": 0.334976 }, { "acc": 0.91439962, "epoch": 1.0814814814814815, "grad_norm": 7.0, "learning_rate": 9.480235812488003e-06, "loss": 0.30432069, "memory(GiB)": 15.04, "step": 8030, "train_speed(iter/s)": 0.335021 }, { "acc": 0.92259712, "epoch": 1.082154882154882, "grad_norm": 4.875, "learning_rate": 9.46911631633873e-06, "loss": 0.29038439, "memory(GiB)": 15.04, "step": 8035, "train_speed(iter/s)": 0.335033 }, { "acc": 0.92675104, "epoch": 1.082828282828283, "grad_norm": 12.8125, "learning_rate": 9.457997478407453e-06, "loss": 0.23149812, "memory(GiB)": 15.04, "step": 8040, "train_speed(iter/s)": 0.335066 }, { "acc": 0.89729252, "epoch": 1.0835016835016835, "grad_norm": 8.5, "learning_rate": 9.446879312479909e-06, "loss": 0.41467619, "memory(GiB)": 15.04, "step": 8045, "train_speed(iter/s)": 0.335056 }, { "acc": 0.83994951, "epoch": 1.0841750841750841, "grad_norm": 9.4375, "learning_rate": 9.43576183234099e-06, "loss": 0.4559495, "memory(GiB)": 15.04, "step": 8050, "train_speed(iter/s)": 0.335098 }, { "acc": 0.92020178, "epoch": 1.084848484848485, "grad_norm": 5.5625, "learning_rate": 9.424645051774744e-06, "loss": 0.31291816, "memory(GiB)": 15.04, "step": 8055, "train_speed(iter/s)": 0.335117 }, { "acc": 0.89911575, "epoch": 1.0855218855218856, "grad_norm": 9.9375, "learning_rate": 9.413528984564354e-06, "loss": 0.25901747, "memory(GiB)": 15.04, "step": 8060, "train_speed(iter/s)": 0.335135 }, { "acc": 0.91648579, "epoch": 1.0861952861952862, "grad_norm": 6.34375, "learning_rate": 9.402413644492108e-06, "loss": 0.31045237, "memory(GiB)": 15.04, "step": 8065, "train_speed(iter/s)": 0.335162 }, { "acc": 0.88398638, "epoch": 1.0868686868686868, "grad_norm": 11.3125, "learning_rate": 9.391299045339409e-06, "loss": 0.35305567, "memory(GiB)": 15.04, "step": 8070, "train_speed(iter/s)": 0.335161 }, { "acc": 0.85972319, "epoch": 1.0875420875420876, "grad_norm": 12.0, "learning_rate": 9.380185200886722e-06, "loss": 0.51338773, "memory(GiB)": 15.04, "step": 8075, "train_speed(iter/s)": 0.335131 }, { "acc": 0.91868744, "epoch": 1.0882154882154882, "grad_norm": 6.125, "learning_rate": 9.36907212491359e-06, "loss": 0.28593349, "memory(GiB)": 15.04, "step": 8080, "train_speed(iter/s)": 0.335153 }, { "acc": 0.86711378, "epoch": 1.0888888888888888, "grad_norm": 4.53125, "learning_rate": 9.357959831198603e-06, "loss": 0.5976912, "memory(GiB)": 15.04, "step": 8085, "train_speed(iter/s)": 0.335185 }, { "acc": 0.92944174, "epoch": 1.0895622895622896, "grad_norm": 12.125, "learning_rate": 9.34684833351937e-06, "loss": 0.32996786, "memory(GiB)": 15.04, "step": 8090, "train_speed(iter/s)": 0.335182 }, { "acc": 0.91718426, "epoch": 1.0902356902356902, "grad_norm": 7.0625, "learning_rate": 9.33573764565253e-06, "loss": 0.28149164, "memory(GiB)": 15.04, "step": 8095, "train_speed(iter/s)": 0.33522 }, { "acc": 0.91738691, "epoch": 1.0909090909090908, "grad_norm": 8.375, "learning_rate": 9.324627781373699e-06, "loss": 0.35977407, "memory(GiB)": 15.04, "step": 8100, "train_speed(iter/s)": 0.335243 }, { "epoch": 1.0909090909090908, "eval_acc": 0.8937275796581557, "eval_loss": 0.41258442401885986, "eval_runtime": 109.7511, "eval_samples_per_second": 1.367, "eval_steps_per_second": 1.367, "step": 8100 }, { "acc": 0.89032269, "epoch": 1.0915824915824917, "grad_norm": 8.1875, "learning_rate": 9.313518754457482e-06, "loss": 0.30160944, "memory(GiB)": 15.04, "step": 8105, "train_speed(iter/s)": 0.333743 }, { "acc": 0.87330008, "epoch": 1.0922558922558923, "grad_norm": 6.5625, "learning_rate": 9.302410578677456e-06, "loss": 0.35457988, "memory(GiB)": 15.04, "step": 8110, "train_speed(iter/s)": 0.333777 }, { "acc": 0.90822659, "epoch": 1.0929292929292929, "grad_norm": 8.0625, "learning_rate": 9.291303267806117e-06, "loss": 0.26888912, "memory(GiB)": 15.04, "step": 8115, "train_speed(iter/s)": 0.333815 }, { "acc": 0.91463413, "epoch": 1.0936026936026937, "grad_norm": 5.96875, "learning_rate": 9.280196835614916e-06, "loss": 0.30835445, "memory(GiB)": 15.04, "step": 8120, "train_speed(iter/s)": 0.333812 }, { "acc": 0.87086973, "epoch": 1.0942760942760943, "grad_norm": 14.0, "learning_rate": 9.269091295874193e-06, "loss": 0.4832408, "memory(GiB)": 15.04, "step": 8125, "train_speed(iter/s)": 0.33385 }, { "acc": 0.94408932, "epoch": 1.094949494949495, "grad_norm": 7.1875, "learning_rate": 9.257986662353192e-06, "loss": 0.26666822, "memory(GiB)": 15.04, "step": 8130, "train_speed(iter/s)": 0.333875 }, { "acc": 0.91334887, "epoch": 1.0956228956228957, "grad_norm": 5.21875, "learning_rate": 9.246882948820038e-06, "loss": 0.26005599, "memory(GiB)": 15.04, "step": 8135, "train_speed(iter/s)": 0.333909 }, { "acc": 0.91448956, "epoch": 1.0962962962962963, "grad_norm": 10.625, "learning_rate": 9.235780169041702e-06, "loss": 0.28071885, "memory(GiB)": 15.04, "step": 8140, "train_speed(iter/s)": 0.333926 }, { "acc": 0.92785759, "epoch": 1.096969696969697, "grad_norm": 15.4375, "learning_rate": 9.22467833678401e-06, "loss": 0.24446368, "memory(GiB)": 15.04, "step": 8145, "train_speed(iter/s)": 0.333966 }, { "acc": 0.93406315, "epoch": 1.0976430976430978, "grad_norm": 6.40625, "learning_rate": 9.2135774658116e-06, "loss": 0.23972368, "memory(GiB)": 15.04, "step": 8150, "train_speed(iter/s)": 0.333988 }, { "acc": 0.8919508, "epoch": 1.0983164983164984, "grad_norm": 7.34375, "learning_rate": 9.202477569887932e-06, "loss": 0.41952968, "memory(GiB)": 15.04, "step": 8155, "train_speed(iter/s)": 0.33394 }, { "acc": 0.90486822, "epoch": 1.098989898989899, "grad_norm": 7.65625, "learning_rate": 9.191378662775253e-06, "loss": 0.27907114, "memory(GiB)": 15.04, "step": 8160, "train_speed(iter/s)": 0.333987 }, { "acc": 0.94088945, "epoch": 1.0996632996632996, "grad_norm": 5.71875, "learning_rate": 9.180280758234575e-06, "loss": 0.19171976, "memory(GiB)": 15.04, "step": 8165, "train_speed(iter/s)": 0.333981 }, { "acc": 0.91611347, "epoch": 1.1003367003367004, "grad_norm": 6.03125, "learning_rate": 9.169183870025682e-06, "loss": 0.2421226, "memory(GiB)": 15.04, "step": 8170, "train_speed(iter/s)": 0.334011 }, { "acc": 0.91907644, "epoch": 1.101010101010101, "grad_norm": 8.1875, "learning_rate": 9.158088011907081e-06, "loss": 0.33632712, "memory(GiB)": 15.04, "step": 8175, "train_speed(iter/s)": 0.333984 }, { "acc": 0.8622076, "epoch": 1.1016835016835016, "grad_norm": 15.875, "learning_rate": 9.146993197636015e-06, "loss": 0.46791787, "memory(GiB)": 15.04, "step": 8180, "train_speed(iter/s)": 0.333996 }, { "acc": 0.88847284, "epoch": 1.1023569023569024, "grad_norm": 11.625, "learning_rate": 9.135899440968435e-06, "loss": 0.40275388, "memory(GiB)": 15.04, "step": 8185, "train_speed(iter/s)": 0.334046 }, { "acc": 0.92154865, "epoch": 1.103030303030303, "grad_norm": 6.65625, "learning_rate": 9.12480675565896e-06, "loss": 0.30134928, "memory(GiB)": 15.04, "step": 8190, "train_speed(iter/s)": 0.334075 }, { "acc": 0.91268005, "epoch": 1.1037037037037036, "grad_norm": 7.8125, "learning_rate": 9.11371515546091e-06, "loss": 0.21697178, "memory(GiB)": 15.04, "step": 8195, "train_speed(iter/s)": 0.334113 }, { "acc": 0.91839352, "epoch": 1.1043771043771045, "grad_norm": 7.875, "learning_rate": 9.10262465412623e-06, "loss": 0.23478787, "memory(GiB)": 15.04, "step": 8200, "train_speed(iter/s)": 0.334159 }, { "acc": 0.94088354, "epoch": 1.105050505050505, "grad_norm": 8.1875, "learning_rate": 9.091535265405528e-06, "loss": 0.22988636, "memory(GiB)": 15.04, "step": 8205, "train_speed(iter/s)": 0.334196 }, { "acc": 0.83029928, "epoch": 1.1057239057239057, "grad_norm": 7.125, "learning_rate": 9.080447003048016e-06, "loss": 0.7598547, "memory(GiB)": 15.04, "step": 8210, "train_speed(iter/s)": 0.334215 }, { "acc": 0.90943527, "epoch": 1.1063973063973065, "grad_norm": 9.0, "learning_rate": 9.069359880801518e-06, "loss": 0.28078151, "memory(GiB)": 15.04, "step": 8215, "train_speed(iter/s)": 0.334258 }, { "acc": 0.91230097, "epoch": 1.107070707070707, "grad_norm": 5.625, "learning_rate": 9.05827391241244e-06, "loss": 0.28510058, "memory(GiB)": 15.04, "step": 8220, "train_speed(iter/s)": 0.33424 }, { "acc": 0.92466831, "epoch": 1.1077441077441077, "grad_norm": 6.34375, "learning_rate": 9.04718911162576e-06, "loss": 0.24214656, "memory(GiB)": 15.04, "step": 8225, "train_speed(iter/s)": 0.334284 }, { "acc": 0.91876364, "epoch": 1.1084175084175083, "grad_norm": 4.4375, "learning_rate": 9.036105492185003e-06, "loss": 0.22657027, "memory(GiB)": 15.04, "step": 8230, "train_speed(iter/s)": 0.334324 }, { "acc": 0.86891813, "epoch": 1.1090909090909091, "grad_norm": 14.0625, "learning_rate": 9.025023067832239e-06, "loss": 0.31826494, "memory(GiB)": 15.04, "step": 8235, "train_speed(iter/s)": 0.334355 }, { "acc": 0.93562517, "epoch": 1.1097643097643097, "grad_norm": 5.3125, "learning_rate": 9.013941852308046e-06, "loss": 0.17317027, "memory(GiB)": 15.04, "step": 8240, "train_speed(iter/s)": 0.334385 }, { "acc": 0.88516407, "epoch": 1.1104377104377103, "grad_norm": 12.75, "learning_rate": 9.00286185935151e-06, "loss": 0.35928683, "memory(GiB)": 15.04, "step": 8245, "train_speed(iter/s)": 0.334401 }, { "acc": 0.88753223, "epoch": 1.1111111111111112, "grad_norm": 10.0, "learning_rate": 8.991783102700203e-06, "loss": 0.60402765, "memory(GiB)": 15.04, "step": 8250, "train_speed(iter/s)": 0.334426 }, { "acc": 0.94273224, "epoch": 1.1117845117845118, "grad_norm": 7.5, "learning_rate": 8.980705596090154e-06, "loss": 0.24888756, "memory(GiB)": 15.04, "step": 8255, "train_speed(iter/s)": 0.334436 }, { "acc": 0.93126106, "epoch": 1.1124579124579124, "grad_norm": 6.59375, "learning_rate": 8.969629353255855e-06, "loss": 0.45041056, "memory(GiB)": 15.04, "step": 8260, "train_speed(iter/s)": 0.334467 }, { "acc": 0.92297277, "epoch": 1.1131313131313132, "grad_norm": 14.4375, "learning_rate": 8.958554387930216e-06, "loss": 0.27016611, "memory(GiB)": 15.04, "step": 8265, "train_speed(iter/s)": 0.3345 }, { "acc": 0.90376577, "epoch": 1.1138047138047138, "grad_norm": 8.0625, "learning_rate": 8.947480713844578e-06, "loss": 0.31735523, "memory(GiB)": 15.04, "step": 8270, "train_speed(iter/s)": 0.334519 }, { "acc": 0.91011925, "epoch": 1.1144781144781144, "grad_norm": 7.15625, "learning_rate": 8.936408344728676e-06, "loss": 0.33862257, "memory(GiB)": 15.04, "step": 8275, "train_speed(iter/s)": 0.334533 }, { "acc": 0.87243767, "epoch": 1.1151515151515152, "grad_norm": 11.875, "learning_rate": 8.92533729431062e-06, "loss": 0.65410371, "memory(GiB)": 15.04, "step": 8280, "train_speed(iter/s)": 0.334538 }, { "acc": 0.91496019, "epoch": 1.1158249158249158, "grad_norm": 12.25, "learning_rate": 8.914267576316898e-06, "loss": 0.35794642, "memory(GiB)": 15.04, "step": 8285, "train_speed(iter/s)": 0.334567 }, { "acc": 0.82232246, "epoch": 1.1164983164983164, "grad_norm": 9.375, "learning_rate": 8.903199204472329e-06, "loss": 0.51466088, "memory(GiB)": 15.04, "step": 8290, "train_speed(iter/s)": 0.334615 }, { "acc": 0.9278451, "epoch": 1.1171717171717173, "grad_norm": 9.125, "learning_rate": 8.892132192500082e-06, "loss": 0.28508511, "memory(GiB)": 15.04, "step": 8295, "train_speed(iter/s)": 0.334637 }, { "acc": 0.84856863, "epoch": 1.1178451178451179, "grad_norm": 5.3125, "learning_rate": 8.881066554121625e-06, "loss": 0.67713985, "memory(GiB)": 15.04, "step": 8300, "train_speed(iter/s)": 0.334639 }, { "acc": 0.92465773, "epoch": 1.1185185185185185, "grad_norm": 12.4375, "learning_rate": 8.87000230305673e-06, "loss": 0.34076834, "memory(GiB)": 15.04, "step": 8305, "train_speed(iter/s)": 0.334669 }, { "acc": 0.8347146, "epoch": 1.1191919191919193, "grad_norm": 26.375, "learning_rate": 8.85893945302345e-06, "loss": 0.42291145, "memory(GiB)": 15.04, "step": 8310, "train_speed(iter/s)": 0.334698 }, { "acc": 0.90365276, "epoch": 1.11986531986532, "grad_norm": 6.90625, "learning_rate": 8.847878017738097e-06, "loss": 0.34632828, "memory(GiB)": 15.04, "step": 8315, "train_speed(iter/s)": 0.334721 }, { "acc": 0.91383142, "epoch": 1.1205387205387205, "grad_norm": 20.625, "learning_rate": 8.836818010915226e-06, "loss": 0.36084635, "memory(GiB)": 15.04, "step": 8320, "train_speed(iter/s)": 0.334764 }, { "acc": 0.92835274, "epoch": 1.121212121212121, "grad_norm": 9.4375, "learning_rate": 8.825759446267634e-06, "loss": 0.25349114, "memory(GiB)": 15.04, "step": 8325, "train_speed(iter/s)": 0.334799 }, { "acc": 0.89334583, "epoch": 1.121885521885522, "grad_norm": 5.5625, "learning_rate": 8.814702337506311e-06, "loss": 0.5174253, "memory(GiB)": 15.04, "step": 8330, "train_speed(iter/s)": 0.33481 }, { "acc": 0.88716631, "epoch": 1.1225589225589225, "grad_norm": 16.25, "learning_rate": 8.803646698340463e-06, "loss": 0.27215698, "memory(GiB)": 15.04, "step": 8335, "train_speed(iter/s)": 0.334855 }, { "acc": 0.8864501, "epoch": 1.1232323232323231, "grad_norm": 6.84375, "learning_rate": 8.792592542477451e-06, "loss": 0.34078074, "memory(GiB)": 15.04, "step": 8340, "train_speed(iter/s)": 0.334895 }, { "acc": 0.89195995, "epoch": 1.123905723905724, "grad_norm": 11.25, "learning_rate": 8.781539883622818e-06, "loss": 0.46099796, "memory(GiB)": 15.04, "step": 8345, "train_speed(iter/s)": 0.334932 }, { "acc": 0.84340038, "epoch": 1.1245791245791246, "grad_norm": 13.1875, "learning_rate": 8.770488735480244e-06, "loss": 0.60841222, "memory(GiB)": 15.04, "step": 8350, "train_speed(iter/s)": 0.334954 }, { "acc": 0.82589006, "epoch": 1.1252525252525252, "grad_norm": 9.6875, "learning_rate": 8.759439111751523e-06, "loss": 0.52863579, "memory(GiB)": 15.04, "step": 8355, "train_speed(iter/s)": 0.334984 }, { "acc": 0.89610262, "epoch": 1.125925925925926, "grad_norm": 10.0625, "learning_rate": 8.748391026136582e-06, "loss": 0.36142957, "memory(GiB)": 15.04, "step": 8360, "train_speed(iter/s)": 0.335005 }, { "acc": 0.90552168, "epoch": 1.1265993265993266, "grad_norm": 5.40625, "learning_rate": 8.737344492333417e-06, "loss": 0.34427059, "memory(GiB)": 15.04, "step": 8365, "train_speed(iter/s)": 0.334991 }, { "acc": 0.91735144, "epoch": 1.1272727272727272, "grad_norm": 8.0625, "learning_rate": 8.72629952403812e-06, "loss": 0.26472263, "memory(GiB)": 15.04, "step": 8370, "train_speed(iter/s)": 0.335022 }, { "acc": 0.84729338, "epoch": 1.127946127946128, "grad_norm": 20.125, "learning_rate": 8.715256134944831e-06, "loss": 0.56816874, "memory(GiB)": 15.04, "step": 8375, "train_speed(iter/s)": 0.335051 }, { "acc": 0.91038513, "epoch": 1.1286195286195286, "grad_norm": 14.0625, "learning_rate": 8.704214338745735e-06, "loss": 0.29438369, "memory(GiB)": 15.04, "step": 8380, "train_speed(iter/s)": 0.335078 }, { "acc": 0.82746258, "epoch": 1.1292929292929292, "grad_norm": 7.59375, "learning_rate": 8.693174149131042e-06, "loss": 0.83370857, "memory(GiB)": 15.04, "step": 8385, "train_speed(iter/s)": 0.335125 }, { "acc": 0.88657513, "epoch": 1.12996632996633, "grad_norm": 12.75, "learning_rate": 8.68213557978897e-06, "loss": 0.42116256, "memory(GiB)": 15.04, "step": 8390, "train_speed(iter/s)": 0.335136 }, { "acc": 0.91205168, "epoch": 1.1306397306397307, "grad_norm": 11.3125, "learning_rate": 8.671098644405726e-06, "loss": 0.43715248, "memory(GiB)": 15.04, "step": 8395, "train_speed(iter/s)": 0.335124 }, { "acc": 0.85746775, "epoch": 1.1313131313131313, "grad_norm": 8.625, "learning_rate": 8.660063356665498e-06, "loss": 0.60507174, "memory(GiB)": 15.04, "step": 8400, "train_speed(iter/s)": 0.335164 }, { "epoch": 1.1313131313131313, "eval_acc": 0.8942114714270625, "eval_loss": 0.4126920700073242, "eval_runtime": 109.7133, "eval_samples_per_second": 1.367, "eval_steps_per_second": 1.367, "step": 8400 }, { "acc": 0.89846535, "epoch": 1.131986531986532, "grad_norm": 10.4375, "learning_rate": 8.649029730250418e-06, "loss": 0.4828084, "memory(GiB)": 15.04, "step": 8405, "train_speed(iter/s)": 0.333698 }, { "acc": 0.84215689, "epoch": 1.1326599326599327, "grad_norm": 9.3125, "learning_rate": 8.637997778840577e-06, "loss": 0.51783433, "memory(GiB)": 15.04, "step": 8410, "train_speed(iter/s)": 0.333741 }, { "acc": 0.91730366, "epoch": 1.1333333333333333, "grad_norm": 14.0625, "learning_rate": 8.626967516113968e-06, "loss": 0.29312508, "memory(GiB)": 15.04, "step": 8415, "train_speed(iter/s)": 0.333762 }, { "acc": 0.92496166, "epoch": 1.134006734006734, "grad_norm": 5.5, "learning_rate": 8.615938955746508e-06, "loss": 0.37733843, "memory(GiB)": 15.04, "step": 8420, "train_speed(iter/s)": 0.333765 }, { "acc": 0.8840188, "epoch": 1.1346801346801347, "grad_norm": 7.53125, "learning_rate": 8.604912111411998e-06, "loss": 0.33286364, "memory(GiB)": 15.04, "step": 8425, "train_speed(iter/s)": 0.333805 }, { "acc": 0.92879047, "epoch": 1.1353535353535353, "grad_norm": 8.8125, "learning_rate": 8.5938869967821e-06, "loss": 0.26940417, "memory(GiB)": 15.04, "step": 8430, "train_speed(iter/s)": 0.333816 }, { "acc": 0.84250507, "epoch": 1.136026936026936, "grad_norm": 23.625, "learning_rate": 8.582863625526351e-06, "loss": 0.83595047, "memory(GiB)": 15.04, "step": 8435, "train_speed(iter/s)": 0.333845 }, { "acc": 0.90578241, "epoch": 1.1367003367003368, "grad_norm": 6.75, "learning_rate": 8.571842011312111e-06, "loss": 0.42703586, "memory(GiB)": 15.04, "step": 8440, "train_speed(iter/s)": 0.333846 }, { "acc": 0.92058296, "epoch": 1.1373737373737374, "grad_norm": 5.21875, "learning_rate": 8.560822167804567e-06, "loss": 0.27597072, "memory(GiB)": 15.04, "step": 8445, "train_speed(iter/s)": 0.333846 }, { "acc": 0.91123238, "epoch": 1.138047138047138, "grad_norm": 18.875, "learning_rate": 8.549804108666717e-06, "loss": 0.40116172, "memory(GiB)": 15.04, "step": 8450, "train_speed(iter/s)": 0.333879 }, { "acc": 0.92618265, "epoch": 1.1387205387205388, "grad_norm": 9.6875, "learning_rate": 8.538787847559332e-06, "loss": 0.25792506, "memory(GiB)": 15.04, "step": 8455, "train_speed(iter/s)": 0.333916 }, { "acc": 0.92828245, "epoch": 1.1393939393939394, "grad_norm": 6.59375, "learning_rate": 8.52777339814097e-06, "loss": 0.19393549, "memory(GiB)": 15.04, "step": 8460, "train_speed(iter/s)": 0.333945 }, { "acc": 0.92836123, "epoch": 1.14006734006734, "grad_norm": 6.4375, "learning_rate": 8.516760774067927e-06, "loss": 0.31603966, "memory(GiB)": 15.04, "step": 8465, "train_speed(iter/s)": 0.333983 }, { "acc": 0.94478445, "epoch": 1.1407407407407408, "grad_norm": 6.5, "learning_rate": 8.505749988994247e-06, "loss": 0.24762452, "memory(GiB)": 15.04, "step": 8470, "train_speed(iter/s)": 0.333978 }, { "acc": 0.83944941, "epoch": 1.1414141414141414, "grad_norm": 11.375, "learning_rate": 8.494741056571693e-06, "loss": 0.82006397, "memory(GiB)": 15.04, "step": 8475, "train_speed(iter/s)": 0.333993 }, { "acc": 0.89516182, "epoch": 1.142087542087542, "grad_norm": 6.75, "learning_rate": 8.483733990449725e-06, "loss": 0.38534896, "memory(GiB)": 15.04, "step": 8480, "train_speed(iter/s)": 0.334006 }, { "acc": 0.8848381, "epoch": 1.1427609427609426, "grad_norm": 9.0625, "learning_rate": 8.472728804275496e-06, "loss": 0.41047716, "memory(GiB)": 15.04, "step": 8485, "train_speed(iter/s)": 0.334045 }, { "acc": 0.7664793, "epoch": 1.1434343434343435, "grad_norm": 20.625, "learning_rate": 8.46172551169382e-06, "loss": 0.73766646, "memory(GiB)": 15.04, "step": 8490, "train_speed(iter/s)": 0.334091 }, { "acc": 0.82757874, "epoch": 1.144107744107744, "grad_norm": 11.0, "learning_rate": 8.450724126347169e-06, "loss": 0.59155359, "memory(GiB)": 15.04, "step": 8495, "train_speed(iter/s)": 0.334104 }, { "acc": 0.91042652, "epoch": 1.144781144781145, "grad_norm": 3.6875, "learning_rate": 8.439724661875657e-06, "loss": 0.31208146, "memory(GiB)": 15.04, "step": 8500, "train_speed(iter/s)": 0.334089 }, { "acc": 0.95034885, "epoch": 1.1454545454545455, "grad_norm": 5.09375, "learning_rate": 8.428727131916996e-06, "loss": 0.19542702, "memory(GiB)": 15.04, "step": 8505, "train_speed(iter/s)": 0.33408 }, { "acc": 0.90736942, "epoch": 1.146127946127946, "grad_norm": 8.8125, "learning_rate": 8.417731550106526e-06, "loss": 0.40063429, "memory(GiB)": 15.04, "step": 8510, "train_speed(iter/s)": 0.33411 }, { "acc": 0.89917545, "epoch": 1.1468013468013467, "grad_norm": 7.46875, "learning_rate": 8.406737930077143e-06, "loss": 0.36739912, "memory(GiB)": 15.04, "step": 8515, "train_speed(iter/s)": 0.334111 }, { "acc": 0.90421133, "epoch": 1.1474747474747475, "grad_norm": 11.6875, "learning_rate": 8.395746285459333e-06, "loss": 0.31956413, "memory(GiB)": 15.04, "step": 8520, "train_speed(iter/s)": 0.334126 }, { "acc": 0.88792953, "epoch": 1.1481481481481481, "grad_norm": 18.0, "learning_rate": 8.38475662988113e-06, "loss": 0.60335183, "memory(GiB)": 15.04, "step": 8525, "train_speed(iter/s)": 0.334166 }, { "acc": 0.87934303, "epoch": 1.1488215488215487, "grad_norm": 7.1875, "learning_rate": 8.373768976968088e-06, "loss": 0.53611398, "memory(GiB)": 15.04, "step": 8530, "train_speed(iter/s)": 0.334193 }, { "acc": 0.94821157, "epoch": 1.1494949494949496, "grad_norm": 6.40625, "learning_rate": 8.362783340343294e-06, "loss": 0.19109724, "memory(GiB)": 15.04, "step": 8535, "train_speed(iter/s)": 0.334229 }, { "acc": 0.85498476, "epoch": 1.1501683501683502, "grad_norm": 4.78125, "learning_rate": 8.351799733627322e-06, "loss": 0.26411538, "memory(GiB)": 15.04, "step": 8540, "train_speed(iter/s)": 0.33427 }, { "acc": 0.94778214, "epoch": 1.1508417508417508, "grad_norm": 7.34375, "learning_rate": 8.340818170438239e-06, "loss": 0.19438212, "memory(GiB)": 15.04, "step": 8545, "train_speed(iter/s)": 0.334308 }, { "acc": 0.93892117, "epoch": 1.1515151515151516, "grad_norm": 5.34375, "learning_rate": 8.329838664391578e-06, "loss": 0.26018372, "memory(GiB)": 15.04, "step": 8550, "train_speed(iter/s)": 0.33432 }, { "acc": 0.872229, "epoch": 1.1521885521885522, "grad_norm": 11.4375, "learning_rate": 8.318861229100309e-06, "loss": 0.30497761, "memory(GiB)": 15.04, "step": 8555, "train_speed(iter/s)": 0.334359 }, { "acc": 0.87170715, "epoch": 1.1528619528619528, "grad_norm": 7.875, "learning_rate": 8.307885878174853e-06, "loss": 0.37016969, "memory(GiB)": 15.04, "step": 8560, "train_speed(iter/s)": 0.334365 }, { "acc": 0.86693535, "epoch": 1.1535353535353536, "grad_norm": 7.21875, "learning_rate": 8.296912625223034e-06, "loss": 0.58177495, "memory(GiB)": 15.04, "step": 8565, "train_speed(iter/s)": 0.334398 }, { "acc": 0.85612116, "epoch": 1.1542087542087542, "grad_norm": 6.90625, "learning_rate": 8.285941483850073e-06, "loss": 0.5755075, "memory(GiB)": 15.04, "step": 8570, "train_speed(iter/s)": 0.334441 }, { "acc": 0.87171307, "epoch": 1.1548821548821548, "grad_norm": 6.15625, "learning_rate": 8.274972467658589e-06, "loss": 0.52622843, "memory(GiB)": 15.04, "step": 8575, "train_speed(iter/s)": 0.334449 }, { "acc": 0.93473549, "epoch": 1.1555555555555554, "grad_norm": 6.21875, "learning_rate": 8.264005590248544e-06, "loss": 0.20657101, "memory(GiB)": 15.04, "step": 8580, "train_speed(iter/s)": 0.334462 }, { "acc": 0.91747732, "epoch": 1.1562289562289563, "grad_norm": 6.90625, "learning_rate": 8.253040865217269e-06, "loss": 0.31283977, "memory(GiB)": 15.04, "step": 8585, "train_speed(iter/s)": 0.334494 }, { "acc": 0.90545425, "epoch": 1.1569023569023569, "grad_norm": 5.625, "learning_rate": 8.242078306159408e-06, "loss": 0.34917042, "memory(GiB)": 15.04, "step": 8590, "train_speed(iter/s)": 0.334515 }, { "acc": 0.85144119, "epoch": 1.1575757575757575, "grad_norm": 14.625, "learning_rate": 8.231117926666932e-06, "loss": 0.37468622, "memory(GiB)": 15.04, "step": 8595, "train_speed(iter/s)": 0.334544 }, { "acc": 0.83755226, "epoch": 1.1582491582491583, "grad_norm": 17.25, "learning_rate": 8.220159740329113e-06, "loss": 0.25008359, "memory(GiB)": 15.04, "step": 8600, "train_speed(iter/s)": 0.334571 }, { "acc": 0.84674406, "epoch": 1.158922558922559, "grad_norm": 8.375, "learning_rate": 8.209203760732483e-06, "loss": 0.34826465, "memory(GiB)": 15.04, "step": 8605, "train_speed(iter/s)": 0.334597 }, { "acc": 0.9512723, "epoch": 1.1595959595959595, "grad_norm": 11.9375, "learning_rate": 8.198250001460867e-06, "loss": 0.18209766, "memory(GiB)": 15.04, "step": 8610, "train_speed(iter/s)": 0.334635 }, { "acc": 0.91602087, "epoch": 1.1602693602693603, "grad_norm": 21.0, "learning_rate": 8.187298476095308e-06, "loss": 0.38796551, "memory(GiB)": 15.04, "step": 8615, "train_speed(iter/s)": 0.334674 }, { "acc": 0.92153492, "epoch": 1.160942760942761, "grad_norm": 5.25, "learning_rate": 8.1763491982141e-06, "loss": 0.25587325, "memory(GiB)": 15.04, "step": 8620, "train_speed(iter/s)": 0.334709 }, { "acc": 0.89310303, "epoch": 1.1616161616161615, "grad_norm": 6.34375, "learning_rate": 8.165402181392748e-06, "loss": 0.3570641, "memory(GiB)": 15.04, "step": 8625, "train_speed(iter/s)": 0.334702 }, { "acc": 0.91990938, "epoch": 1.1622895622895624, "grad_norm": 4.65625, "learning_rate": 8.154457439203937e-06, "loss": 0.33330026, "memory(GiB)": 15.04, "step": 8630, "train_speed(iter/s)": 0.334722 }, { "acc": 0.88193007, "epoch": 1.162962962962963, "grad_norm": 11.75, "learning_rate": 8.14351498521756e-06, "loss": 0.21499176, "memory(GiB)": 15.04, "step": 8635, "train_speed(iter/s)": 0.334763 }, { "acc": 0.94585333, "epoch": 1.1636363636363636, "grad_norm": 4.71875, "learning_rate": 8.132574833000642e-06, "loss": 0.16870201, "memory(GiB)": 15.04, "step": 8640, "train_speed(iter/s)": 0.334807 }, { "acc": 0.89892464, "epoch": 1.1643097643097644, "grad_norm": 8.9375, "learning_rate": 8.121636996117377e-06, "loss": 0.3604188, "memory(GiB)": 15.04, "step": 8645, "train_speed(iter/s)": 0.334838 }, { "acc": 0.85911007, "epoch": 1.164983164983165, "grad_norm": 6.25, "learning_rate": 8.11070148812908e-06, "loss": 0.51609712, "memory(GiB)": 15.04, "step": 8650, "train_speed(iter/s)": 0.334873 }, { "acc": 0.91215897, "epoch": 1.1656565656565656, "grad_norm": 9.5625, "learning_rate": 8.099768322594178e-06, "loss": 0.24298859, "memory(GiB)": 15.04, "step": 8655, "train_speed(iter/s)": 0.334911 }, { "acc": 0.86823225, "epoch": 1.1663299663299664, "grad_norm": 4.1875, "learning_rate": 8.088837513068192e-06, "loss": 0.30021667, "memory(GiB)": 15.04, "step": 8660, "train_speed(iter/s)": 0.334917 }, { "acc": 0.93826561, "epoch": 1.167003367003367, "grad_norm": 15.75, "learning_rate": 8.07790907310373e-06, "loss": 0.22655346, "memory(GiB)": 15.04, "step": 8665, "train_speed(iter/s)": 0.334947 }, { "acc": 0.9213007, "epoch": 1.1676767676767676, "grad_norm": 12.625, "learning_rate": 8.06698301625045e-06, "loss": 0.25660739, "memory(GiB)": 15.04, "step": 8670, "train_speed(iter/s)": 0.334975 }, { "acc": 0.90065212, "epoch": 1.1683501683501682, "grad_norm": 8.3125, "learning_rate": 8.056059356055072e-06, "loss": 0.36666379, "memory(GiB)": 15.04, "step": 8675, "train_speed(iter/s)": 0.335 }, { "acc": 0.90083942, "epoch": 1.169023569023569, "grad_norm": 9.6875, "learning_rate": 8.045138106061323e-06, "loss": 0.32792065, "memory(GiB)": 15.04, "step": 8680, "train_speed(iter/s)": 0.335016 }, { "acc": 0.89946604, "epoch": 1.1696969696969697, "grad_norm": 13.8125, "learning_rate": 8.034219279809959e-06, "loss": 0.51767335, "memory(GiB)": 15.04, "step": 8685, "train_speed(iter/s)": 0.334977 }, { "acc": 0.88581705, "epoch": 1.1703703703703703, "grad_norm": 13.4375, "learning_rate": 8.023302890838729e-06, "loss": 0.39377327, "memory(GiB)": 15.04, "step": 8690, "train_speed(iter/s)": 0.335001 }, { "acc": 0.89919348, "epoch": 1.171043771043771, "grad_norm": 10.25, "learning_rate": 8.012388952682345e-06, "loss": 0.40560446, "memory(GiB)": 15.04, "step": 8695, "train_speed(iter/s)": 0.335028 }, { "acc": 0.94321012, "epoch": 1.1717171717171717, "grad_norm": 5.84375, "learning_rate": 8.001477478872504e-06, "loss": 0.20610557, "memory(GiB)": 15.04, "step": 8700, "train_speed(iter/s)": 0.335056 }, { "epoch": 1.1717171717171717, "eval_acc": 0.8943338839508115, "eval_loss": 0.41293954849243164, "eval_runtime": 109.9265, "eval_samples_per_second": 1.365, "eval_steps_per_second": 1.365, "step": 8700 }, { "acc": 0.91216431, "epoch": 1.1723905723905723, "grad_norm": 7.59375, "learning_rate": 7.990568482937826e-06, "loss": 0.28307076, "memory(GiB)": 15.04, "step": 8705, "train_speed(iter/s)": 0.333645 }, { "acc": 0.84530878, "epoch": 1.1730639730639731, "grad_norm": 12.8125, "learning_rate": 7.97966197840387e-06, "loss": 0.58879704, "memory(GiB)": 15.04, "step": 8710, "train_speed(iter/s)": 0.333696 }, { "acc": 0.87714481, "epoch": 1.1737373737373737, "grad_norm": 13.5, "learning_rate": 7.968757978793111e-06, "loss": 0.53552775, "memory(GiB)": 15.04, "step": 8715, "train_speed(iter/s)": 0.333733 }, { "acc": 0.92908802, "epoch": 1.1744107744107743, "grad_norm": 8.9375, "learning_rate": 7.9578564976249e-06, "loss": 0.23767047, "memory(GiB)": 15.04, "step": 8720, "train_speed(iter/s)": 0.33376 }, { "acc": 0.88402081, "epoch": 1.1750841750841752, "grad_norm": 20.375, "learning_rate": 7.946957548415488e-06, "loss": 0.44677601, "memory(GiB)": 15.04, "step": 8725, "train_speed(iter/s)": 0.33379 }, { "acc": 0.91932278, "epoch": 1.1757575757575758, "grad_norm": 12.375, "learning_rate": 7.936061144677964e-06, "loss": 0.32225776, "memory(GiB)": 15.04, "step": 8730, "train_speed(iter/s)": 0.333818 }, { "acc": 0.91353588, "epoch": 1.1764309764309764, "grad_norm": 9.1875, "learning_rate": 7.92516729992228e-06, "loss": 0.29596422, "memory(GiB)": 15.04, "step": 8735, "train_speed(iter/s)": 0.33384 }, { "acc": 0.92825842, "epoch": 1.177104377104377, "grad_norm": 8.9375, "learning_rate": 7.914276027655208e-06, "loss": 0.27144384, "memory(GiB)": 15.04, "step": 8740, "train_speed(iter/s)": 0.333883 }, { "acc": 0.94485922, "epoch": 1.1777777777777778, "grad_norm": 6.03125, "learning_rate": 7.903387341380325e-06, "loss": 0.25880239, "memory(GiB)": 15.04, "step": 8745, "train_speed(iter/s)": 0.333909 }, { "acc": 0.86168137, "epoch": 1.1784511784511784, "grad_norm": 8.75, "learning_rate": 7.892501254598011e-06, "loss": 0.31010644, "memory(GiB)": 15.04, "step": 8750, "train_speed(iter/s)": 0.33393 }, { "acc": 0.85700951, "epoch": 1.1791245791245792, "grad_norm": 9.5625, "learning_rate": 7.881617780805419e-06, "loss": 0.78340473, "memory(GiB)": 15.04, "step": 8755, "train_speed(iter/s)": 0.333984 }, { "acc": 0.9334198, "epoch": 1.1797979797979798, "grad_norm": 11.875, "learning_rate": 7.870736933496457e-06, "loss": 0.29740202, "memory(GiB)": 15.04, "step": 8760, "train_speed(iter/s)": 0.33403 }, { "acc": 0.89856739, "epoch": 1.1804713804713804, "grad_norm": 14.625, "learning_rate": 7.85985872616179e-06, "loss": 0.28176606, "memory(GiB)": 15.04, "step": 8765, "train_speed(iter/s)": 0.334055 }, { "acc": 0.89273281, "epoch": 1.181144781144781, "grad_norm": 6.4375, "learning_rate": 7.848983172288796e-06, "loss": 0.36913018, "memory(GiB)": 15.04, "step": 8770, "train_speed(iter/s)": 0.334086 }, { "acc": 0.9282836, "epoch": 1.1818181818181819, "grad_norm": 8.0, "learning_rate": 7.83811028536157e-06, "loss": 0.26287441, "memory(GiB)": 15.04, "step": 8775, "train_speed(iter/s)": 0.334108 }, { "acc": 0.8915103, "epoch": 1.1824915824915825, "grad_norm": 4.6875, "learning_rate": 7.827240078860898e-06, "loss": 0.23198943, "memory(GiB)": 15.04, "step": 8780, "train_speed(iter/s)": 0.334145 }, { "acc": 0.90011988, "epoch": 1.183164983164983, "grad_norm": 12.125, "learning_rate": 7.816372566264243e-06, "loss": 0.33637316, "memory(GiB)": 15.04, "step": 8785, "train_speed(iter/s)": 0.334143 }, { "acc": 0.91603937, "epoch": 1.183838383838384, "grad_norm": 5.96875, "learning_rate": 7.805507761045734e-06, "loss": 0.26426532, "memory(GiB)": 15.04, "step": 8790, "train_speed(iter/s)": 0.334172 }, { "acc": 0.89845304, "epoch": 1.1845117845117845, "grad_norm": 13.25, "learning_rate": 7.794645676676132e-06, "loss": 0.5090816, "memory(GiB)": 15.04, "step": 8795, "train_speed(iter/s)": 0.334204 }, { "acc": 0.86508207, "epoch": 1.1851851851851851, "grad_norm": 6.625, "learning_rate": 7.783786326622837e-06, "loss": 0.34729381, "memory(GiB)": 15.04, "step": 8800, "train_speed(iter/s)": 0.334233 }, { "acc": 0.87080355, "epoch": 1.185858585858586, "grad_norm": 12.8125, "learning_rate": 7.772929724349843e-06, "loss": 0.55419102, "memory(GiB)": 15.04, "step": 8805, "train_speed(iter/s)": 0.334252 }, { "acc": 0.80040865, "epoch": 1.1865319865319865, "grad_norm": 12.4375, "learning_rate": 7.762075883317753e-06, "loss": 1.01047974, "memory(GiB)": 15.04, "step": 8810, "train_speed(iter/s)": 0.334259 }, { "acc": 0.92868814, "epoch": 1.1872053872053872, "grad_norm": 8.25, "learning_rate": 7.751224816983737e-06, "loss": 0.24025538, "memory(GiB)": 15.04, "step": 8815, "train_speed(iter/s)": 0.334292 }, { "acc": 0.92831297, "epoch": 1.187878787878788, "grad_norm": 7.40625, "learning_rate": 7.740376538801533e-06, "loss": 0.2216753, "memory(GiB)": 15.04, "step": 8820, "train_speed(iter/s)": 0.334326 }, { "acc": 0.93432398, "epoch": 1.1885521885521886, "grad_norm": 4.34375, "learning_rate": 7.72953106222141e-06, "loss": 0.25570393, "memory(GiB)": 15.04, "step": 8825, "train_speed(iter/s)": 0.334316 }, { "acc": 0.87599707, "epoch": 1.1892255892255892, "grad_norm": 8.875, "learning_rate": 7.718688400690174e-06, "loss": 0.35427902, "memory(GiB)": 15.04, "step": 8830, "train_speed(iter/s)": 0.334355 }, { "acc": 0.91540041, "epoch": 1.1898989898989898, "grad_norm": 5.40625, "learning_rate": 7.707848567651134e-06, "loss": 0.35271568, "memory(GiB)": 15.04, "step": 8835, "train_speed(iter/s)": 0.334376 }, { "acc": 0.86296587, "epoch": 1.1905723905723906, "grad_norm": 6.21875, "learning_rate": 7.697011576544102e-06, "loss": 0.34237313, "memory(GiB)": 15.04, "step": 8840, "train_speed(iter/s)": 0.334363 }, { "acc": 0.87622728, "epoch": 1.1912457912457912, "grad_norm": 6.09375, "learning_rate": 7.68617744080535e-06, "loss": 0.37369969, "memory(GiB)": 15.04, "step": 8845, "train_speed(iter/s)": 0.334401 }, { "acc": 0.92973967, "epoch": 1.1919191919191918, "grad_norm": 8.4375, "learning_rate": 7.675346173867627e-06, "loss": 0.31743107, "memory(GiB)": 15.04, "step": 8850, "train_speed(iter/s)": 0.334417 }, { "acc": 0.91796322, "epoch": 1.1925925925925926, "grad_norm": 4.25, "learning_rate": 7.664517789160111e-06, "loss": 0.30589209, "memory(GiB)": 15.04, "step": 8855, "train_speed(iter/s)": 0.334446 }, { "acc": 0.87161016, "epoch": 1.1932659932659933, "grad_norm": 5.03125, "learning_rate": 7.653692300108416e-06, "loss": 0.46378088, "memory(GiB)": 15.04, "step": 8860, "train_speed(iter/s)": 0.334489 }, { "acc": 0.93547935, "epoch": 1.1939393939393939, "grad_norm": 5.96875, "learning_rate": 7.642869720134567e-06, "loss": 0.26447537, "memory(GiB)": 15.04, "step": 8865, "train_speed(iter/s)": 0.33453 }, { "acc": 0.88174496, "epoch": 1.1946127946127947, "grad_norm": 8.75, "learning_rate": 7.63205006265697e-06, "loss": 0.54033194, "memory(GiB)": 15.04, "step": 8870, "train_speed(iter/s)": 0.334539 }, { "acc": 0.8227273, "epoch": 1.1952861952861953, "grad_norm": 8.8125, "learning_rate": 7.621233341090421e-06, "loss": 0.38368597, "memory(GiB)": 15.04, "step": 8875, "train_speed(iter/s)": 0.334566 }, { "acc": 0.91086254, "epoch": 1.195959595959596, "grad_norm": 7.1875, "learning_rate": 7.6104195688460655e-06, "loss": 0.27185309, "memory(GiB)": 15.04, "step": 8880, "train_speed(iter/s)": 0.334595 }, { "acc": 0.83424644, "epoch": 1.1966329966329967, "grad_norm": 9.75, "learning_rate": 7.599608759331398e-06, "loss": 0.70110936, "memory(GiB)": 15.04, "step": 8885, "train_speed(iter/s)": 0.334626 }, { "acc": 0.87613516, "epoch": 1.1973063973063973, "grad_norm": 10.75, "learning_rate": 7.588800925950246e-06, "loss": 0.35930502, "memory(GiB)": 15.04, "step": 8890, "train_speed(iter/s)": 0.334654 }, { "acc": 0.92512188, "epoch": 1.197979797979798, "grad_norm": 10.125, "learning_rate": 7.577996082102729e-06, "loss": 0.25872157, "memory(GiB)": 15.04, "step": 8895, "train_speed(iter/s)": 0.334691 }, { "acc": 0.92029266, "epoch": 1.1986531986531987, "grad_norm": 5.375, "learning_rate": 7.567194241185279e-06, "loss": 0.26382964, "memory(GiB)": 15.04, "step": 8900, "train_speed(iter/s)": 0.334705 }, { "acc": 0.88651953, "epoch": 1.1993265993265994, "grad_norm": 15.8125, "learning_rate": 7.556395416590589e-06, "loss": 0.59845929, "memory(GiB)": 15.04, "step": 8905, "train_speed(iter/s)": 0.334739 }, { "acc": 0.89042358, "epoch": 1.2, "grad_norm": 12.5625, "learning_rate": 7.545599621707625e-06, "loss": 0.36238532, "memory(GiB)": 15.04, "step": 8910, "train_speed(iter/s)": 0.334779 }, { "acc": 0.9247798, "epoch": 1.2006734006734008, "grad_norm": 11.1875, "learning_rate": 7.534806869921592e-06, "loss": 0.25445011, "memory(GiB)": 15.04, "step": 8915, "train_speed(iter/s)": 0.334808 }, { "acc": 0.9430151, "epoch": 1.2013468013468014, "grad_norm": 10.5625, "learning_rate": 7.524017174613916e-06, "loss": 0.32521975, "memory(GiB)": 15.04, "step": 8920, "train_speed(iter/s)": 0.334847 }, { "acc": 0.91420298, "epoch": 1.202020202020202, "grad_norm": 9.3125, "learning_rate": 7.5132305491622425e-06, "loss": 0.3544997, "memory(GiB)": 15.04, "step": 8925, "train_speed(iter/s)": 0.334837 }, { "acc": 0.78509068, "epoch": 1.2026936026936026, "grad_norm": 6.3125, "learning_rate": 7.502447006940406e-06, "loss": 1.27279234, "memory(GiB)": 15.04, "step": 8930, "train_speed(iter/s)": 0.334862 }, { "acc": 0.94311543, "epoch": 1.2033670033670034, "grad_norm": 9.125, "learning_rate": 7.491666561318416e-06, "loss": 0.19692328, "memory(GiB)": 15.04, "step": 8935, "train_speed(iter/s)": 0.334891 }, { "acc": 0.90329046, "epoch": 1.204040404040404, "grad_norm": 6.0625, "learning_rate": 7.480889225662454e-06, "loss": 0.26504488, "memory(GiB)": 15.04, "step": 8940, "train_speed(iter/s)": 0.334916 }, { "acc": 0.82259665, "epoch": 1.2047138047138046, "grad_norm": 9.4375, "learning_rate": 7.470115013334829e-06, "loss": 0.56307988, "memory(GiB)": 15.04, "step": 8945, "train_speed(iter/s)": 0.334962 }, { "acc": 0.90707951, "epoch": 1.2053872053872055, "grad_norm": 18.75, "learning_rate": 7.459343937693992e-06, "loss": 0.33230534, "memory(GiB)": 15.04, "step": 8950, "train_speed(iter/s)": 0.334993 }, { "acc": 0.8344964, "epoch": 1.206060606060606, "grad_norm": 9.0, "learning_rate": 7.448576012094492e-06, "loss": 0.43879514, "memory(GiB)": 15.04, "step": 8955, "train_speed(iter/s)": 0.335031 }, { "acc": 0.90590734, "epoch": 1.2067340067340067, "grad_norm": 5.875, "learning_rate": 7.437811249886985e-06, "loss": 0.35281451, "memory(GiB)": 15.04, "step": 8960, "train_speed(iter/s)": 0.335057 }, { "acc": 0.82382822, "epoch": 1.2074074074074075, "grad_norm": 12.4375, "learning_rate": 7.427049664418202e-06, "loss": 0.63607116, "memory(GiB)": 15.04, "step": 8965, "train_speed(iter/s)": 0.33509 }, { "acc": 0.87669516, "epoch": 1.208080808080808, "grad_norm": 8.5, "learning_rate": 7.416291269030923e-06, "loss": 0.59069467, "memory(GiB)": 15.04, "step": 8970, "train_speed(iter/s)": 0.335114 }, { "acc": 0.8820632, "epoch": 1.2087542087542087, "grad_norm": 7.84375, "learning_rate": 7.4055360770639925e-06, "loss": 0.71371379, "memory(GiB)": 15.04, "step": 8975, "train_speed(iter/s)": 0.335138 }, { "acc": 0.9173686, "epoch": 1.2094276094276095, "grad_norm": 4.1875, "learning_rate": 7.394784101852265e-06, "loss": 0.26022398, "memory(GiB)": 15.04, "step": 8980, "train_speed(iter/s)": 0.335154 }, { "acc": 0.8300209, "epoch": 1.2101010101010101, "grad_norm": 15.4375, "learning_rate": 7.384035356726618e-06, "loss": 0.51802835, "memory(GiB)": 15.04, "step": 8985, "train_speed(iter/s)": 0.335202 }, { "acc": 0.89961033, "epoch": 1.2107744107744107, "grad_norm": 7.15625, "learning_rate": 7.373289855013924e-06, "loss": 0.29598281, "memory(GiB)": 15.04, "step": 8990, "train_speed(iter/s)": 0.335226 }, { "acc": 0.84554043, "epoch": 1.2114478114478113, "grad_norm": 7.84375, "learning_rate": 7.3625476100370254e-06, "loss": 0.50715199, "memory(GiB)": 15.04, "step": 8995, "train_speed(iter/s)": 0.335254 }, { "acc": 0.87353363, "epoch": 1.2121212121212122, "grad_norm": 8.625, "learning_rate": 7.351808635114736e-06, "loss": 0.51893387, "memory(GiB)": 15.04, "step": 9000, "train_speed(iter/s)": 0.335262 }, { "epoch": 1.2121212121212122, "eval_acc": 0.894475415466723, "eval_loss": 0.41569873690605164, "eval_runtime": 109.7912, "eval_samples_per_second": 1.366, "eval_steps_per_second": 1.366, "step": 9000 }, { "acc": 0.8929966, "epoch": 1.2127946127946128, "grad_norm": 14.375, "learning_rate": 7.341072943561811e-06, "loss": 0.47138219, "memory(GiB)": 15.04, "step": 9005, "train_speed(iter/s)": 0.333891 }, { "acc": 0.87038364, "epoch": 1.2134680134680136, "grad_norm": 43.0, "learning_rate": 7.330340548688933e-06, "loss": 0.37270429, "memory(GiB)": 15.04, "step": 9010, "train_speed(iter/s)": 0.333881 }, { "acc": 0.8217186, "epoch": 1.2141414141414142, "grad_norm": 12.4375, "learning_rate": 7.319611463802705e-06, "loss": 0.64520493, "memory(GiB)": 15.04, "step": 9015, "train_speed(iter/s)": 0.333923 }, { "acc": 0.93706741, "epoch": 1.2148148148148148, "grad_norm": 4.53125, "learning_rate": 7.308885702205612e-06, "loss": 0.16838611, "memory(GiB)": 15.04, "step": 9020, "train_speed(iter/s)": 0.333962 }, { "acc": 0.92356606, "epoch": 1.2154882154882154, "grad_norm": 5.53125, "learning_rate": 7.298163277196035e-06, "loss": 0.29079893, "memory(GiB)": 15.04, "step": 9025, "train_speed(iter/s)": 0.333967 }, { "acc": 0.89644585, "epoch": 1.2161616161616162, "grad_norm": 8.375, "learning_rate": 7.2874442020682056e-06, "loss": 0.38219862, "memory(GiB)": 15.04, "step": 9030, "train_speed(iter/s)": 0.333971 }, { "acc": 0.93895531, "epoch": 1.2168350168350168, "grad_norm": 5.59375, "learning_rate": 7.276728490112208e-06, "loss": 0.22326462, "memory(GiB)": 15.04, "step": 9035, "train_speed(iter/s)": 0.33401 }, { "acc": 0.9340889, "epoch": 1.2175084175084174, "grad_norm": 12.0625, "learning_rate": 7.266016154613959e-06, "loss": 0.24871545, "memory(GiB)": 15.04, "step": 9040, "train_speed(iter/s)": 0.33405 }, { "acc": 0.90456057, "epoch": 1.2181818181818183, "grad_norm": 25.875, "learning_rate": 7.255307208855178e-06, "loss": 0.44414439, "memory(GiB)": 15.04, "step": 9045, "train_speed(iter/s)": 0.334094 }, { "acc": 0.90100832, "epoch": 1.2188552188552189, "grad_norm": 8.375, "learning_rate": 7.244601666113397e-06, "loss": 0.33543525, "memory(GiB)": 15.04, "step": 9050, "train_speed(iter/s)": 0.334091 }, { "acc": 0.86368971, "epoch": 1.2195286195286195, "grad_norm": 14.75, "learning_rate": 7.2338995396619135e-06, "loss": 0.34672532, "memory(GiB)": 15.04, "step": 9055, "train_speed(iter/s)": 0.33408 }, { "acc": 0.82249479, "epoch": 1.2202020202020203, "grad_norm": 5.53125, "learning_rate": 7.2232008427698e-06, "loss": 0.53450623, "memory(GiB)": 15.04, "step": 9060, "train_speed(iter/s)": 0.334108 }, { "acc": 0.91870232, "epoch": 1.220875420875421, "grad_norm": 11.4375, "learning_rate": 7.212505588701877e-06, "loss": 0.3474076, "memory(GiB)": 15.04, "step": 9065, "train_speed(iter/s)": 0.334139 }, { "acc": 0.90108252, "epoch": 1.2215488215488215, "grad_norm": 6.40625, "learning_rate": 7.201813790718686e-06, "loss": 0.35729029, "memory(GiB)": 15.04, "step": 9070, "train_speed(iter/s)": 0.334176 }, { "acc": 0.94556217, "epoch": 1.2222222222222223, "grad_norm": 4.875, "learning_rate": 7.191125462076497e-06, "loss": 0.21146717, "memory(GiB)": 15.04, "step": 9075, "train_speed(iter/s)": 0.334183 }, { "acc": 0.90873528, "epoch": 1.222895622895623, "grad_norm": 5.09375, "learning_rate": 7.180440616027264e-06, "loss": 0.34912663, "memory(GiB)": 15.04, "step": 9080, "train_speed(iter/s)": 0.334191 }, { "acc": 0.90023823, "epoch": 1.2235690235690235, "grad_norm": 9.375, "learning_rate": 7.169759265818637e-06, "loss": 0.34831755, "memory(GiB)": 15.04, "step": 9085, "train_speed(iter/s)": 0.334208 }, { "acc": 0.89683199, "epoch": 1.2242424242424241, "grad_norm": 15.75, "learning_rate": 7.159081424693925e-06, "loss": 0.35634911, "memory(GiB)": 15.04, "step": 9090, "train_speed(iter/s)": 0.334224 }, { "acc": 0.93440924, "epoch": 1.224915824915825, "grad_norm": 11.3125, "learning_rate": 7.148407105892085e-06, "loss": 0.20357089, "memory(GiB)": 15.04, "step": 9095, "train_speed(iter/s)": 0.334247 }, { "acc": 0.93140469, "epoch": 1.2255892255892256, "grad_norm": 9.5, "learning_rate": 7.137736322647708e-06, "loss": 0.2412384, "memory(GiB)": 15.04, "step": 9100, "train_speed(iter/s)": 0.334278 }, { "acc": 0.88848457, "epoch": 1.2262626262626264, "grad_norm": 6.09375, "learning_rate": 7.1270690881910055e-06, "loss": 0.47499456, "memory(GiB)": 15.04, "step": 9105, "train_speed(iter/s)": 0.334313 }, { "acc": 0.93150959, "epoch": 1.226936026936027, "grad_norm": 12.125, "learning_rate": 7.116405415747779e-06, "loss": 0.24199398, "memory(GiB)": 15.04, "step": 9110, "train_speed(iter/s)": 0.334351 }, { "acc": 0.92064953, "epoch": 1.2276094276094276, "grad_norm": 9.3125, "learning_rate": 7.10574531853943e-06, "loss": 0.39521904, "memory(GiB)": 15.04, "step": 9115, "train_speed(iter/s)": 0.334367 }, { "acc": 0.8341259, "epoch": 1.2282828282828282, "grad_norm": 22.5, "learning_rate": 7.095088809782909e-06, "loss": 0.61440983, "memory(GiB)": 15.04, "step": 9120, "train_speed(iter/s)": 0.334394 }, { "acc": 0.88455887, "epoch": 1.228956228956229, "grad_norm": 8.0, "learning_rate": 7.084435902690727e-06, "loss": 0.35118048, "memory(GiB)": 15.04, "step": 9125, "train_speed(iter/s)": 0.334398 }, { "acc": 0.91548948, "epoch": 1.2296296296296296, "grad_norm": 9.1875, "learning_rate": 7.073786610470935e-06, "loss": 0.32967341, "memory(GiB)": 15.04, "step": 9130, "train_speed(iter/s)": 0.334435 }, { "acc": 0.88200417, "epoch": 1.2303030303030302, "grad_norm": 5.96875, "learning_rate": 7.063140946327086e-06, "loss": 0.36016295, "memory(GiB)": 15.04, "step": 9135, "train_speed(iter/s)": 0.334469 }, { "acc": 0.90623474, "epoch": 1.230976430976431, "grad_norm": 11.25, "learning_rate": 7.052498923458253e-06, "loss": 0.31132498, "memory(GiB)": 15.04, "step": 9140, "train_speed(iter/s)": 0.334475 }, { "acc": 0.85107536, "epoch": 1.2316498316498317, "grad_norm": 10.6875, "learning_rate": 7.041860555058977e-06, "loss": 0.74530754, "memory(GiB)": 15.04, "step": 9145, "train_speed(iter/s)": 0.334513 }, { "acc": 0.90378151, "epoch": 1.2323232323232323, "grad_norm": 5.65625, "learning_rate": 7.031225854319281e-06, "loss": 0.35486798, "memory(GiB)": 15.04, "step": 9150, "train_speed(iter/s)": 0.334511 }, { "acc": 0.94854698, "epoch": 1.232996632996633, "grad_norm": 5.125, "learning_rate": 7.020594834424639e-06, "loss": 0.23054857, "memory(GiB)": 15.04, "step": 9155, "train_speed(iter/s)": 0.334522 }, { "acc": 0.88515387, "epoch": 1.2336700336700337, "grad_norm": 5.96875, "learning_rate": 7.009967508555952e-06, "loss": 0.41824684, "memory(GiB)": 15.04, "step": 9160, "train_speed(iter/s)": 0.334524 }, { "acc": 0.91940241, "epoch": 1.2343434343434343, "grad_norm": 8.3125, "learning_rate": 6.999343889889553e-06, "loss": 0.2349196, "memory(GiB)": 15.04, "step": 9165, "train_speed(iter/s)": 0.334557 }, { "acc": 0.85173903, "epoch": 1.2350168350168351, "grad_norm": 6.8125, "learning_rate": 6.988723991597166e-06, "loss": 0.71610355, "memory(GiB)": 15.04, "step": 9170, "train_speed(iter/s)": 0.334571 }, { "acc": 0.85665131, "epoch": 1.2356902356902357, "grad_norm": 10.8125, "learning_rate": 6.978107826845914e-06, "loss": 0.59850006, "memory(GiB)": 15.04, "step": 9175, "train_speed(iter/s)": 0.334547 }, { "acc": 0.91978998, "epoch": 1.2363636363636363, "grad_norm": 6.5, "learning_rate": 6.967495408798288e-06, "loss": 0.28697846, "memory(GiB)": 15.04, "step": 9180, "train_speed(iter/s)": 0.334573 }, { "acc": 0.88025265, "epoch": 1.237037037037037, "grad_norm": 7.65625, "learning_rate": 6.9568867506121285e-06, "loss": 0.73406439, "memory(GiB)": 15.04, "step": 9185, "train_speed(iter/s)": 0.334608 }, { "acc": 0.91176662, "epoch": 1.2377104377104378, "grad_norm": 6.65625, "learning_rate": 6.94628186544062e-06, "loss": 0.37211435, "memory(GiB)": 15.04, "step": 9190, "train_speed(iter/s)": 0.334617 }, { "acc": 0.90800314, "epoch": 1.2383838383838384, "grad_norm": 9.0625, "learning_rate": 6.93568076643226e-06, "loss": 0.28418851, "memory(GiB)": 15.04, "step": 9195, "train_speed(iter/s)": 0.334651 }, { "acc": 0.8702795, "epoch": 1.239057239057239, "grad_norm": 16.25, "learning_rate": 6.925083466730864e-06, "loss": 0.3472441, "memory(GiB)": 15.04, "step": 9200, "train_speed(iter/s)": 0.334689 }, { "acc": 0.90860109, "epoch": 1.2397306397306398, "grad_norm": 12.9375, "learning_rate": 6.914489979475536e-06, "loss": 0.32622931, "memory(GiB)": 15.04, "step": 9205, "train_speed(iter/s)": 0.334713 }, { "acc": 0.92050362, "epoch": 1.2404040404040404, "grad_norm": 12.5, "learning_rate": 6.903900317800637e-06, "loss": 0.30244985, "memory(GiB)": 15.04, "step": 9210, "train_speed(iter/s)": 0.334741 }, { "acc": 0.89314461, "epoch": 1.241077441077441, "grad_norm": 6.6875, "learning_rate": 6.893314494835806e-06, "loss": 0.4402411, "memory(GiB)": 15.04, "step": 9215, "train_speed(iter/s)": 0.334738 }, { "acc": 0.92904959, "epoch": 1.2417508417508418, "grad_norm": 12.1875, "learning_rate": 6.882732523705906e-06, "loss": 0.31815796, "memory(GiB)": 15.04, "step": 9220, "train_speed(iter/s)": 0.334752 }, { "acc": 0.91005831, "epoch": 1.2424242424242424, "grad_norm": 11.625, "learning_rate": 6.872154417531034e-06, "loss": 0.41316104, "memory(GiB)": 15.04, "step": 9225, "train_speed(iter/s)": 0.334771 }, { "acc": 0.92818918, "epoch": 1.243097643097643, "grad_norm": 10.25, "learning_rate": 6.861580189426495e-06, "loss": 0.35050559, "memory(GiB)": 15.04, "step": 9230, "train_speed(iter/s)": 0.334801 }, { "acc": 0.86370831, "epoch": 1.2437710437710439, "grad_norm": 15.8125, "learning_rate": 6.851009852502777e-06, "loss": 0.67675142, "memory(GiB)": 15.04, "step": 9235, "train_speed(iter/s)": 0.334819 }, { "acc": 0.85959864, "epoch": 1.2444444444444445, "grad_norm": 20.875, "learning_rate": 6.840443419865556e-06, "loss": 0.58502169, "memory(GiB)": 15.04, "step": 9240, "train_speed(iter/s)": 0.334826 }, { "acc": 0.8739397, "epoch": 1.245117845117845, "grad_norm": 8.0625, "learning_rate": 6.829880904615652e-06, "loss": 0.45994511, "memory(GiB)": 15.04, "step": 9245, "train_speed(iter/s)": 0.33481 }, { "acc": 0.93040657, "epoch": 1.2457912457912457, "grad_norm": 6.34375, "learning_rate": 6.819322319849044e-06, "loss": 0.24262767, "memory(GiB)": 15.04, "step": 9250, "train_speed(iter/s)": 0.334831 }, { "acc": 0.91367722, "epoch": 1.2464646464646465, "grad_norm": 10.3125, "learning_rate": 6.808767678656829e-06, "loss": 0.32125883, "memory(GiB)": 15.04, "step": 9255, "train_speed(iter/s)": 0.33482 }, { "acc": 0.92794933, "epoch": 1.247138047138047, "grad_norm": 16.5, "learning_rate": 6.798216994125213e-06, "loss": 0.23239794, "memory(GiB)": 15.04, "step": 9260, "train_speed(iter/s)": 0.33486 }, { "acc": 0.89250135, "epoch": 1.247811447811448, "grad_norm": 14.8125, "learning_rate": 6.7876702793355035e-06, "loss": 0.43735785, "memory(GiB)": 15.04, "step": 9265, "train_speed(iter/s)": 0.334857 }, { "acc": 0.86579876, "epoch": 1.2484848484848485, "grad_norm": 18.625, "learning_rate": 6.777127547364078e-06, "loss": 0.36714532, "memory(GiB)": 15.04, "step": 9270, "train_speed(iter/s)": 0.334887 }, { "acc": 0.91549988, "epoch": 1.2491582491582491, "grad_norm": 8.1875, "learning_rate": 6.766588811282379e-06, "loss": 0.27293491, "memory(GiB)": 15.04, "step": 9275, "train_speed(iter/s)": 0.334891 }, { "acc": 0.90694761, "epoch": 1.2498316498316497, "grad_norm": 7.375, "learning_rate": 6.756054084156902e-06, "loss": 0.31676784, "memory(GiB)": 15.04, "step": 9280, "train_speed(iter/s)": 0.33492 }, { "acc": 0.89656353, "epoch": 1.2505050505050506, "grad_norm": 8.875, "learning_rate": 6.745523379049157e-06, "loss": 0.33908391, "memory(GiB)": 15.04, "step": 9285, "train_speed(iter/s)": 0.334946 }, { "acc": 0.91946945, "epoch": 1.2511784511784512, "grad_norm": 6.4375, "learning_rate": 6.734996709015684e-06, "loss": 0.32658114, "memory(GiB)": 15.04, "step": 9290, "train_speed(iter/s)": 0.334981 }, { "acc": 0.81229038, "epoch": 1.2518518518518518, "grad_norm": 8.3125, "learning_rate": 6.724474087108004e-06, "loss": 0.68991122, "memory(GiB)": 15.04, "step": 9295, "train_speed(iter/s)": 0.334984 }, { "acc": 0.92849064, "epoch": 1.2525252525252526, "grad_norm": 8.5, "learning_rate": 6.713955526372629e-06, "loss": 0.24314618, "memory(GiB)": 15.04, "step": 9300, "train_speed(iter/s)": 0.335028 }, { "epoch": 1.2525252525252526, "eval_acc": 0.8943658280922432, "eval_loss": 0.4082954525947571, "eval_runtime": 109.9044, "eval_samples_per_second": 1.365, "eval_steps_per_second": 1.365, "step": 9300 }, { "acc": 0.8597661, "epoch": 1.2531986531986532, "grad_norm": 13.375, "learning_rate": 6.70344103985104e-06, "loss": 0.40775919, "memory(GiB)": 15.04, "step": 9305, "train_speed(iter/s)": 0.333702 }, { "acc": 0.86505661, "epoch": 1.2538720538720538, "grad_norm": 11.0, "learning_rate": 6.692930640579651e-06, "loss": 0.22979717, "memory(GiB)": 15.04, "step": 9310, "train_speed(iter/s)": 0.333749 }, { "acc": 0.92401609, "epoch": 1.2545454545454544, "grad_norm": 5.15625, "learning_rate": 6.682424341589824e-06, "loss": 0.32938907, "memory(GiB)": 15.04, "step": 9315, "train_speed(iter/s)": 0.33378 }, { "acc": 0.92805328, "epoch": 1.2552188552188552, "grad_norm": 8.8125, "learning_rate": 6.671922155907826e-06, "loss": 0.26388872, "memory(GiB)": 15.04, "step": 9320, "train_speed(iter/s)": 0.333811 }, { "acc": 0.91945896, "epoch": 1.2558922558922558, "grad_norm": 10.0625, "learning_rate": 6.661424096554829e-06, "loss": 0.20681672, "memory(GiB)": 15.04, "step": 9325, "train_speed(iter/s)": 0.33384 }, { "acc": 0.83211021, "epoch": 1.2565656565656567, "grad_norm": 8.9375, "learning_rate": 6.650930176546896e-06, "loss": 0.64398093, "memory(GiB)": 15.04, "step": 9330, "train_speed(iter/s)": 0.333839 }, { "acc": 0.88890476, "epoch": 1.2572390572390573, "grad_norm": 11.6875, "learning_rate": 6.64044040889494e-06, "loss": 0.34757118, "memory(GiB)": 15.04, "step": 9335, "train_speed(iter/s)": 0.333826 }, { "acc": 0.89659986, "epoch": 1.2579124579124579, "grad_norm": 31.25, "learning_rate": 6.629954806604746e-06, "loss": 0.6473115, "memory(GiB)": 15.04, "step": 9340, "train_speed(iter/s)": 0.333846 }, { "acc": 0.91689882, "epoch": 1.2585858585858585, "grad_norm": 25.625, "learning_rate": 6.619473382676917e-06, "loss": 0.26747901, "memory(GiB)": 15.04, "step": 9345, "train_speed(iter/s)": 0.333884 }, { "acc": 0.90223913, "epoch": 1.2592592592592593, "grad_norm": 6.5, "learning_rate": 6.6089961501068875e-06, "loss": 0.36214557, "memory(GiB)": 15.04, "step": 9350, "train_speed(iter/s)": 0.333903 }, { "acc": 0.81011677, "epoch": 1.25993265993266, "grad_norm": 18.5, "learning_rate": 6.59852312188489e-06, "loss": 0.58135796, "memory(GiB)": 15.04, "step": 9355, "train_speed(iter/s)": 0.333939 }, { "acc": 0.91339312, "epoch": 1.2606060606060607, "grad_norm": 6.21875, "learning_rate": 6.588054310995946e-06, "loss": 0.28903656, "memory(GiB)": 15.04, "step": 9360, "train_speed(iter/s)": 0.33397 }, { "acc": 0.93569059, "epoch": 1.2612794612794613, "grad_norm": 11.125, "learning_rate": 6.5775897304198464e-06, "loss": 0.24312882, "memory(GiB)": 15.04, "step": 9365, "train_speed(iter/s)": 0.333995 }, { "acc": 0.89483786, "epoch": 1.261952861952862, "grad_norm": 28.875, "learning_rate": 6.567129393131139e-06, "loss": 0.37268546, "memory(GiB)": 15.04, "step": 9370, "train_speed(iter/s)": 0.334042 }, { "acc": 0.90341072, "epoch": 1.2626262626262625, "grad_norm": 8.5625, "learning_rate": 6.556673312099106e-06, "loss": 0.32093859, "memory(GiB)": 15.04, "step": 9375, "train_speed(iter/s)": 0.334058 }, { "acc": 0.90460091, "epoch": 1.2632996632996634, "grad_norm": 8.0625, "learning_rate": 6.546221500287766e-06, "loss": 0.36141028, "memory(GiB)": 15.04, "step": 9380, "train_speed(iter/s)": 0.33409 }, { "acc": 0.89143772, "epoch": 1.263973063973064, "grad_norm": 5.28125, "learning_rate": 6.535773970655823e-06, "loss": 0.3237045, "memory(GiB)": 15.04, "step": 9385, "train_speed(iter/s)": 0.334088 }, { "acc": 0.91589622, "epoch": 1.2646464646464646, "grad_norm": 5.28125, "learning_rate": 6.525330736156692e-06, "loss": 0.2698863, "memory(GiB)": 15.04, "step": 9390, "train_speed(iter/s)": 0.334122 }, { "acc": 0.91980467, "epoch": 1.2653198653198654, "grad_norm": 5.53125, "learning_rate": 6.514891809738446e-06, "loss": 0.27117088, "memory(GiB)": 15.04, "step": 9395, "train_speed(iter/s)": 0.334162 }, { "acc": 0.9274991, "epoch": 1.265993265993266, "grad_norm": 19.375, "learning_rate": 6.5044572043438305e-06, "loss": 0.2792767, "memory(GiB)": 15.04, "step": 9400, "train_speed(iter/s)": 0.334172 }, { "acc": 0.89860916, "epoch": 1.2666666666666666, "grad_norm": 6.5, "learning_rate": 6.494026932910229e-06, "loss": 0.23947976, "memory(GiB)": 15.04, "step": 9405, "train_speed(iter/s)": 0.334204 }, { "acc": 0.89169331, "epoch": 1.2673400673400672, "grad_norm": 12.3125, "learning_rate": 6.483601008369645e-06, "loss": 0.32476013, "memory(GiB)": 15.04, "step": 9410, "train_speed(iter/s)": 0.334253 }, { "acc": 0.91733112, "epoch": 1.268013468013468, "grad_norm": 5.59375, "learning_rate": 6.473179443648703e-06, "loss": 0.29361489, "memory(GiB)": 15.04, "step": 9415, "train_speed(iter/s)": 0.334258 }, { "acc": 0.8154501, "epoch": 1.2686868686868686, "grad_norm": 7.0, "learning_rate": 6.462762251668609e-06, "loss": 0.85370541, "memory(GiB)": 15.04, "step": 9420, "train_speed(iter/s)": 0.334288 }, { "acc": 0.89090977, "epoch": 1.2693602693602695, "grad_norm": 12.5625, "learning_rate": 6.452349445345159e-06, "loss": 0.42459707, "memory(GiB)": 15.04, "step": 9425, "train_speed(iter/s)": 0.334292 }, { "acc": 0.87531862, "epoch": 1.27003367003367, "grad_norm": 4.25, "learning_rate": 6.441941037588712e-06, "loss": 0.36733196, "memory(GiB)": 15.04, "step": 9430, "train_speed(iter/s)": 0.334323 }, { "acc": 0.89692774, "epoch": 1.2707070707070707, "grad_norm": 8.4375, "learning_rate": 6.4315370413041655e-06, "loss": 0.43209648, "memory(GiB)": 15.04, "step": 9435, "train_speed(iter/s)": 0.334348 }, { "acc": 0.82759171, "epoch": 1.2713804713804713, "grad_norm": 17.75, "learning_rate": 6.421137469390949e-06, "loss": 0.4820406, "memory(GiB)": 15.04, "step": 9440, "train_speed(iter/s)": 0.334392 }, { "acc": 0.90950403, "epoch": 1.272053872053872, "grad_norm": 3.78125, "learning_rate": 6.41074233474301e-06, "loss": 0.2940321, "memory(GiB)": 15.04, "step": 9445, "train_speed(iter/s)": 0.334429 }, { "acc": 0.91077566, "epoch": 1.2727272727272727, "grad_norm": 6.90625, "learning_rate": 6.400351650248788e-06, "loss": 0.40922956, "memory(GiB)": 15.04, "step": 9450, "train_speed(iter/s)": 0.334387 }, { "acc": 0.89681053, "epoch": 1.2734006734006735, "grad_norm": 6.90625, "learning_rate": 6.3899654287912204e-06, "loss": 0.40920839, "memory(GiB)": 15.04, "step": 9455, "train_speed(iter/s)": 0.334419 }, { "acc": 0.91836929, "epoch": 1.2740740740740741, "grad_norm": 15.0, "learning_rate": 6.3795836832476895e-06, "loss": 0.33368034, "memory(GiB)": 15.04, "step": 9460, "train_speed(iter/s)": 0.334458 }, { "acc": 0.86629801, "epoch": 1.2747474747474747, "grad_norm": 11.8125, "learning_rate": 6.369206426490048e-06, "loss": 0.32990625, "memory(GiB)": 15.04, "step": 9465, "train_speed(iter/s)": 0.334479 }, { "acc": 0.89950666, "epoch": 1.2754208754208753, "grad_norm": 8.0625, "learning_rate": 6.358833671384565e-06, "loss": 0.39896271, "memory(GiB)": 15.04, "step": 9470, "train_speed(iter/s)": 0.334459 }, { "acc": 0.93626051, "epoch": 1.2760942760942762, "grad_norm": 5.28125, "learning_rate": 6.3484654307919415e-06, "loss": 0.246946, "memory(GiB)": 15.04, "step": 9475, "train_speed(iter/s)": 0.334492 }, { "acc": 0.91166868, "epoch": 1.2767676767676768, "grad_norm": 8.75, "learning_rate": 6.338101717567282e-06, "loss": 0.31343174, "memory(GiB)": 15.04, "step": 9480, "train_speed(iter/s)": 0.3345 }, { "acc": 0.88177662, "epoch": 1.2774410774410774, "grad_norm": 16.875, "learning_rate": 6.327742544560063e-06, "loss": 0.36417689, "memory(GiB)": 15.04, "step": 9485, "train_speed(iter/s)": 0.334495 }, { "acc": 0.93200769, "epoch": 1.2781144781144782, "grad_norm": 5.53125, "learning_rate": 6.317387924614151e-06, "loss": 0.20921798, "memory(GiB)": 15.04, "step": 9490, "train_speed(iter/s)": 0.334517 }, { "acc": 0.93198013, "epoch": 1.2787878787878788, "grad_norm": 7.34375, "learning_rate": 6.307037870567751e-06, "loss": 0.23144667, "memory(GiB)": 15.04, "step": 9495, "train_speed(iter/s)": 0.334545 }, { "acc": 0.92326565, "epoch": 1.2794612794612794, "grad_norm": 8.4375, "learning_rate": 6.296692395253415e-06, "loss": 0.29435198, "memory(GiB)": 15.04, "step": 9500, "train_speed(iter/s)": 0.334583 }, { "acc": 0.92107267, "epoch": 1.28013468013468, "grad_norm": 3.265625, "learning_rate": 6.286351511498024e-06, "loss": 0.26065884, "memory(GiB)": 15.04, "step": 9505, "train_speed(iter/s)": 0.334575 }, { "acc": 0.90917654, "epoch": 1.2808080808080808, "grad_norm": 7.5625, "learning_rate": 6.276015232122748e-06, "loss": 0.29355597, "memory(GiB)": 15.04, "step": 9510, "train_speed(iter/s)": 0.334582 }, { "acc": 0.89617958, "epoch": 1.2814814814814814, "grad_norm": 8.625, "learning_rate": 6.265683569943069e-06, "loss": 0.26834872, "memory(GiB)": 15.04, "step": 9515, "train_speed(iter/s)": 0.334588 }, { "acc": 0.8536458, "epoch": 1.2821548821548823, "grad_norm": 6.03125, "learning_rate": 6.255356537768725e-06, "loss": 0.42065563, "memory(GiB)": 15.04, "step": 9520, "train_speed(iter/s)": 0.334618 }, { "acc": 0.93801365, "epoch": 1.2828282828282829, "grad_norm": 6.1875, "learning_rate": 6.2450341484037325e-06, "loss": 0.23328383, "memory(GiB)": 15.04, "step": 9525, "train_speed(iter/s)": 0.334643 }, { "acc": 0.87468853, "epoch": 1.2835016835016835, "grad_norm": 8.4375, "learning_rate": 6.2347164146463355e-06, "loss": 0.35560603, "memory(GiB)": 15.04, "step": 9530, "train_speed(iter/s)": 0.334664 }, { "acc": 0.91180153, "epoch": 1.284175084175084, "grad_norm": 7.28125, "learning_rate": 6.224403349289018e-06, "loss": 0.27251148, "memory(GiB)": 15.04, "step": 9535, "train_speed(iter/s)": 0.334702 }, { "acc": 0.8807745, "epoch": 1.284848484848485, "grad_norm": 7.46875, "learning_rate": 6.214094965118466e-06, "loss": 0.33747103, "memory(GiB)": 15.04, "step": 9540, "train_speed(iter/s)": 0.334738 }, { "acc": 0.88320541, "epoch": 1.2855218855218855, "grad_norm": 19.75, "learning_rate": 6.203791274915567e-06, "loss": 0.54292784, "memory(GiB)": 15.04, "step": 9545, "train_speed(iter/s)": 0.334771 }, { "acc": 0.89294491, "epoch": 1.2861952861952861, "grad_norm": 7.8125, "learning_rate": 6.193492291455385e-06, "loss": 0.48767056, "memory(GiB)": 15.04, "step": 9550, "train_speed(iter/s)": 0.334779 }, { "acc": 0.92093458, "epoch": 1.286868686868687, "grad_norm": 5.90625, "learning_rate": 6.183198027507158e-06, "loss": 0.26070592, "memory(GiB)": 15.04, "step": 9555, "train_speed(iter/s)": 0.334805 }, { "acc": 0.88449621, "epoch": 1.2875420875420875, "grad_norm": 7.34375, "learning_rate": 6.172908495834258e-06, "loss": 0.49047241, "memory(GiB)": 15.04, "step": 9560, "train_speed(iter/s)": 0.334811 }, { "acc": 0.89118357, "epoch": 1.2882154882154881, "grad_norm": 6.71875, "learning_rate": 6.162623709194202e-06, "loss": 0.42326612, "memory(GiB)": 15.04, "step": 9565, "train_speed(iter/s)": 0.334809 }, { "acc": 0.92075138, "epoch": 1.2888888888888888, "grad_norm": 4.71875, "learning_rate": 6.152343680338614e-06, "loss": 0.29220011, "memory(GiB)": 15.04, "step": 9570, "train_speed(iter/s)": 0.334812 }, { "acc": 0.88515062, "epoch": 1.2895622895622896, "grad_norm": 22.875, "learning_rate": 6.142068422013226e-06, "loss": 0.42076855, "memory(GiB)": 15.04, "step": 9575, "train_speed(iter/s)": 0.334828 }, { "acc": 0.8957715, "epoch": 1.2902356902356902, "grad_norm": 8.375, "learning_rate": 6.131797946957857e-06, "loss": 0.28677437, "memory(GiB)": 15.04, "step": 9580, "train_speed(iter/s)": 0.334851 }, { "acc": 0.86107922, "epoch": 1.290909090909091, "grad_norm": 8.375, "learning_rate": 6.1215322679063846e-06, "loss": 0.2829767, "memory(GiB)": 15.04, "step": 9585, "train_speed(iter/s)": 0.334872 }, { "acc": 0.92166348, "epoch": 1.2915824915824916, "grad_norm": 6.40625, "learning_rate": 6.111271397586751e-06, "loss": 0.27973387, "memory(GiB)": 15.04, "step": 9590, "train_speed(iter/s)": 0.334893 }, { "acc": 0.85127792, "epoch": 1.2922558922558922, "grad_norm": 26.125, "learning_rate": 6.101015348720934e-06, "loss": 0.44502549, "memory(GiB)": 15.04, "step": 9595, "train_speed(iter/s)": 0.334932 }, { "acc": 0.90770893, "epoch": 1.2929292929292928, "grad_norm": 4.96875, "learning_rate": 6.090764134024927e-06, "loss": 0.42872896, "memory(GiB)": 15.04, "step": 9600, "train_speed(iter/s)": 0.334918 }, { "epoch": 1.2929292929292928, "eval_acc": 0.8960018608016954, "eval_loss": 0.4059368968009949, "eval_runtime": 110.1772, "eval_samples_per_second": 1.361, "eval_steps_per_second": 1.361, "step": 9600 }, { "acc": 0.8971241, "epoch": 1.2936026936026936, "grad_norm": 8.5, "learning_rate": 6.080517766208742e-06, "loss": 0.31131194, "memory(GiB)": 15.04, "step": 9605, "train_speed(iter/s)": 0.333671 }, { "acc": 0.86708736, "epoch": 1.2942760942760942, "grad_norm": 12.25, "learning_rate": 6.070276257976364e-06, "loss": 0.60133257, "memory(GiB)": 15.04, "step": 9610, "train_speed(iter/s)": 0.333707 }, { "acc": 0.89065332, "epoch": 1.294949494949495, "grad_norm": 9.125, "learning_rate": 6.0600396220257705e-06, "loss": 0.33532963, "memory(GiB)": 15.04, "step": 9615, "train_speed(iter/s)": 0.333742 }, { "acc": 0.90499363, "epoch": 1.2956228956228957, "grad_norm": 9.0, "learning_rate": 6.049807871048889e-06, "loss": 0.27627466, "memory(GiB)": 15.04, "step": 9620, "train_speed(iter/s)": 0.333763 }, { "acc": 0.88430939, "epoch": 1.2962962962962963, "grad_norm": 12.4375, "learning_rate": 6.039581017731591e-06, "loss": 0.63434253, "memory(GiB)": 15.04, "step": 9625, "train_speed(iter/s)": 0.333803 }, { "acc": 0.93028517, "epoch": 1.2969696969696969, "grad_norm": 4.875, "learning_rate": 6.029359074753679e-06, "loss": 0.24705272, "memory(GiB)": 15.04, "step": 9630, "train_speed(iter/s)": 0.333814 }, { "acc": 0.93611622, "epoch": 1.2976430976430977, "grad_norm": 5.40625, "learning_rate": 6.019142054788858e-06, "loss": 0.24927502, "memory(GiB)": 15.04, "step": 9635, "train_speed(iter/s)": 0.333856 }, { "acc": 0.92476377, "epoch": 1.2983164983164983, "grad_norm": 7.625, "learning_rate": 6.00892997050474e-06, "loss": 0.38260887, "memory(GiB)": 15.04, "step": 9640, "train_speed(iter/s)": 0.333882 }, { "acc": 0.93132019, "epoch": 1.298989898989899, "grad_norm": 20.25, "learning_rate": 5.99872283456282e-06, "loss": 0.26264632, "memory(GiB)": 15.04, "step": 9645, "train_speed(iter/s)": 0.333908 }, { "acc": 0.912959, "epoch": 1.2996632996632997, "grad_norm": 6.96875, "learning_rate": 5.98852065961844e-06, "loss": 0.28866189, "memory(GiB)": 15.04, "step": 9650, "train_speed(iter/s)": 0.333922 }, { "acc": 0.9070096, "epoch": 1.3003367003367003, "grad_norm": 8.75, "learning_rate": 5.978323458320814e-06, "loss": 0.46759634, "memory(GiB)": 15.04, "step": 9655, "train_speed(iter/s)": 0.333965 }, { "acc": 0.8527648, "epoch": 1.301010101010101, "grad_norm": 10.0625, "learning_rate": 5.9681312433129656e-06, "loss": 0.49304399, "memory(GiB)": 15.04, "step": 9660, "train_speed(iter/s)": 0.33399 }, { "acc": 0.94254284, "epoch": 1.3016835016835016, "grad_norm": 7.34375, "learning_rate": 5.957944027231756e-06, "loss": 0.26958907, "memory(GiB)": 15.04, "step": 9665, "train_speed(iter/s)": 0.333995 }, { "acc": 0.91601257, "epoch": 1.3023569023569024, "grad_norm": 9.1875, "learning_rate": 5.947761822707842e-06, "loss": 0.28514488, "memory(GiB)": 15.04, "step": 9670, "train_speed(iter/s)": 0.334035 }, { "acc": 0.83065281, "epoch": 1.303030303030303, "grad_norm": 5.53125, "learning_rate": 5.937584642365661e-06, "loss": 0.4698401, "memory(GiB)": 15.04, "step": 9675, "train_speed(iter/s)": 0.334057 }, { "acc": 0.88610077, "epoch": 1.3037037037037038, "grad_norm": 7.34375, "learning_rate": 5.927412498823431e-06, "loss": 0.39201999, "memory(GiB)": 15.04, "step": 9680, "train_speed(iter/s)": 0.334072 }, { "acc": 0.89949703, "epoch": 1.3043771043771044, "grad_norm": 14.4375, "learning_rate": 5.9172454046931125e-06, "loss": 0.34470611, "memory(GiB)": 15.04, "step": 9685, "train_speed(iter/s)": 0.334092 }, { "acc": 0.91302719, "epoch": 1.305050505050505, "grad_norm": 5.125, "learning_rate": 5.90708337258042e-06, "loss": 0.26164844, "memory(GiB)": 15.04, "step": 9690, "train_speed(iter/s)": 0.334101 }, { "acc": 0.93276243, "epoch": 1.3057239057239056, "grad_norm": 7.25, "learning_rate": 5.89692641508478e-06, "loss": 0.23191741, "memory(GiB)": 15.04, "step": 9695, "train_speed(iter/s)": 0.334123 }, { "acc": 0.92170534, "epoch": 1.3063973063973064, "grad_norm": 4.1875, "learning_rate": 5.886774544799337e-06, "loss": 0.20918903, "memory(GiB)": 15.04, "step": 9700, "train_speed(iter/s)": 0.334138 }, { "acc": 0.9261858, "epoch": 1.307070707070707, "grad_norm": 11.5, "learning_rate": 5.876627774310917e-06, "loss": 0.26228995, "memory(GiB)": 15.04, "step": 9705, "train_speed(iter/s)": 0.334169 }, { "acc": 0.92261524, "epoch": 1.3077441077441079, "grad_norm": 5.8125, "learning_rate": 5.866486116200033e-06, "loss": 0.22712448, "memory(GiB)": 15.04, "step": 9710, "train_speed(iter/s)": 0.334203 }, { "acc": 0.8621439, "epoch": 1.3084175084175085, "grad_norm": 8.8125, "learning_rate": 5.8563495830408525e-06, "loss": 0.42797208, "memory(GiB)": 15.04, "step": 9715, "train_speed(iter/s)": 0.334241 }, { "acc": 0.87616835, "epoch": 1.309090909090909, "grad_norm": 5.84375, "learning_rate": 5.8462181874011955e-06, "loss": 0.46978016, "memory(GiB)": 15.04, "step": 9720, "train_speed(iter/s)": 0.334275 }, { "acc": 0.87344656, "epoch": 1.3097643097643097, "grad_norm": 10.3125, "learning_rate": 5.836091941842506e-06, "loss": 0.44679465, "memory(GiB)": 15.04, "step": 9725, "train_speed(iter/s)": 0.334299 }, { "acc": 0.91853428, "epoch": 1.3104377104377105, "grad_norm": 6.34375, "learning_rate": 5.825970858919847e-06, "loss": 0.45350027, "memory(GiB)": 15.04, "step": 9730, "train_speed(iter/s)": 0.334331 }, { "acc": 0.82459059, "epoch": 1.3111111111111111, "grad_norm": 8.875, "learning_rate": 5.815854951181874e-06, "loss": 0.64473267, "memory(GiB)": 15.04, "step": 9735, "train_speed(iter/s)": 0.334364 }, { "acc": 0.85284615, "epoch": 1.3117845117845117, "grad_norm": 5.59375, "learning_rate": 5.805744231170833e-06, "loss": 0.53016238, "memory(GiB)": 15.04, "step": 9740, "train_speed(iter/s)": 0.334396 }, { "acc": 0.90661678, "epoch": 1.3124579124579125, "grad_norm": 12.0, "learning_rate": 5.795638711422542e-06, "loss": 0.32342672, "memory(GiB)": 15.04, "step": 9745, "train_speed(iter/s)": 0.334419 }, { "acc": 0.89141331, "epoch": 1.3131313131313131, "grad_norm": 6.1875, "learning_rate": 5.785538404466355e-06, "loss": 0.31523666, "memory(GiB)": 15.04, "step": 9750, "train_speed(iter/s)": 0.334442 }, { "acc": 0.88405666, "epoch": 1.3138047138047138, "grad_norm": 8.0, "learning_rate": 5.775443322825183e-06, "loss": 0.36176951, "memory(GiB)": 15.04, "step": 9755, "train_speed(iter/s)": 0.334482 }, { "acc": 0.91502209, "epoch": 1.3144781144781144, "grad_norm": 8.6875, "learning_rate": 5.765353479015438e-06, "loss": 0.29535065, "memory(GiB)": 15.04, "step": 9760, "train_speed(iter/s)": 0.334499 }, { "acc": 0.90932894, "epoch": 1.3151515151515152, "grad_norm": 4.15625, "learning_rate": 5.755268885547054e-06, "loss": 0.45657849, "memory(GiB)": 15.04, "step": 9765, "train_speed(iter/s)": 0.334504 }, { "acc": 0.92367249, "epoch": 1.3158249158249158, "grad_norm": 4.5, "learning_rate": 5.745189554923454e-06, "loss": 0.2430377, "memory(GiB)": 15.04, "step": 9770, "train_speed(iter/s)": 0.334532 }, { "acc": 0.90884066, "epoch": 1.3164983164983166, "grad_norm": 7.59375, "learning_rate": 5.7351154996415215e-06, "loss": 0.34123187, "memory(GiB)": 15.04, "step": 9775, "train_speed(iter/s)": 0.334537 }, { "acc": 0.80925026, "epoch": 1.3171717171717172, "grad_norm": 13.0625, "learning_rate": 5.725046732191619e-06, "loss": 0.4762979, "memory(GiB)": 15.04, "step": 9780, "train_speed(iter/s)": 0.334572 }, { "acc": 0.88695049, "epoch": 1.3178451178451178, "grad_norm": 7.78125, "learning_rate": 5.7149832650575365e-06, "loss": 0.51415014, "memory(GiB)": 15.04, "step": 9785, "train_speed(iter/s)": 0.334594 }, { "acc": 0.9239048, "epoch": 1.3185185185185184, "grad_norm": 8.5625, "learning_rate": 5.704925110716499e-06, "loss": 0.31108632, "memory(GiB)": 15.04, "step": 9790, "train_speed(iter/s)": 0.33462 }, { "acc": 0.90207672, "epoch": 1.3191919191919192, "grad_norm": 6.4375, "learning_rate": 5.6948722816391525e-06, "loss": 0.27707739, "memory(GiB)": 15.04, "step": 9795, "train_speed(iter/s)": 0.334658 }, { "acc": 0.903545, "epoch": 1.3198653198653199, "grad_norm": 11.625, "learning_rate": 5.6848247902895215e-06, "loss": 0.34115701, "memory(GiB)": 15.04, "step": 9800, "train_speed(iter/s)": 0.334671 }, { "acc": 0.86289349, "epoch": 1.3205387205387205, "grad_norm": 12.0, "learning_rate": 5.6747826491250326e-06, "loss": 0.51363149, "memory(GiB)": 15.04, "step": 9805, "train_speed(iter/s)": 0.334693 }, { "acc": 0.88835621, "epoch": 1.3212121212121213, "grad_norm": 6.4375, "learning_rate": 5.664745870596462e-06, "loss": 0.48059182, "memory(GiB)": 15.04, "step": 9810, "train_speed(iter/s)": 0.334691 }, { "acc": 0.82850885, "epoch": 1.3218855218855219, "grad_norm": 9.75, "learning_rate": 5.654714467147951e-06, "loss": 0.50740252, "memory(GiB)": 15.04, "step": 9815, "train_speed(iter/s)": 0.334718 }, { "acc": 0.84253283, "epoch": 1.3225589225589225, "grad_norm": 14.0, "learning_rate": 5.644688451216968e-06, "loss": 0.34741228, "memory(GiB)": 15.04, "step": 9820, "train_speed(iter/s)": 0.33476 }, { "acc": 0.88747406, "epoch": 1.3232323232323233, "grad_norm": 10.875, "learning_rate": 5.634667835234302e-06, "loss": 0.36861217, "memory(GiB)": 15.04, "step": 9825, "train_speed(iter/s)": 0.3348 }, { "acc": 0.947155, "epoch": 1.323905723905724, "grad_norm": 5.59375, "learning_rate": 5.624652631624056e-06, "loss": 0.20252576, "memory(GiB)": 15.04, "step": 9830, "train_speed(iter/s)": 0.334773 }, { "acc": 0.89678516, "epoch": 1.3245791245791245, "grad_norm": 10.375, "learning_rate": 5.614642852803604e-06, "loss": 0.38581629, "memory(GiB)": 15.04, "step": 9835, "train_speed(iter/s)": 0.334802 }, { "acc": 0.89338531, "epoch": 1.3252525252525253, "grad_norm": 8.25, "learning_rate": 5.604638511183619e-06, "loss": 0.32135746, "memory(GiB)": 15.04, "step": 9840, "train_speed(iter/s)": 0.334833 }, { "acc": 0.92641726, "epoch": 1.325925925925926, "grad_norm": 7.34375, "learning_rate": 5.594639619168005e-06, "loss": 0.25842469, "memory(GiB)": 15.04, "step": 9845, "train_speed(iter/s)": 0.334855 }, { "acc": 0.92931995, "epoch": 1.3265993265993266, "grad_norm": 5.0, "learning_rate": 5.584646189153937e-06, "loss": 0.21179109, "memory(GiB)": 15.04, "step": 9850, "train_speed(iter/s)": 0.334867 }, { "acc": 0.8109643, "epoch": 1.3272727272727272, "grad_norm": 6.5625, "learning_rate": 5.574658233531796e-06, "loss": 0.35859571, "memory(GiB)": 15.04, "step": 9855, "train_speed(iter/s)": 0.334897 }, { "acc": 0.85045424, "epoch": 1.327946127946128, "grad_norm": 8.0625, "learning_rate": 5.564675764685181e-06, "loss": 0.75530548, "memory(GiB)": 15.04, "step": 9860, "train_speed(iter/s)": 0.334895 }, { "acc": 0.93578234, "epoch": 1.3286195286195286, "grad_norm": 6.5625, "learning_rate": 5.554698794990896e-06, "loss": 0.21489677, "memory(GiB)": 15.04, "step": 9865, "train_speed(iter/s)": 0.334915 }, { "acc": 0.89079962, "epoch": 1.3292929292929294, "grad_norm": 5.53125, "learning_rate": 5.5447273368189255e-06, "loss": 0.39067354, "memory(GiB)": 15.04, "step": 9870, "train_speed(iter/s)": 0.334916 }, { "acc": 0.91762409, "epoch": 1.32996632996633, "grad_norm": 8.125, "learning_rate": 5.53476140253241e-06, "loss": 0.26952195, "memory(GiB)": 15.04, "step": 9875, "train_speed(iter/s)": 0.334939 }, { "acc": 0.90775633, "epoch": 1.3306397306397306, "grad_norm": 6.65625, "learning_rate": 5.524801004487652e-06, "loss": 0.34524853, "memory(GiB)": 15.04, "step": 9880, "train_speed(iter/s)": 0.334968 }, { "acc": 0.87291451, "epoch": 1.3313131313131312, "grad_norm": 11.125, "learning_rate": 5.514846155034084e-06, "loss": 0.40953698, "memory(GiB)": 15.04, "step": 9885, "train_speed(iter/s)": 0.334983 }, { "acc": 0.92717562, "epoch": 1.331986531986532, "grad_norm": 7.1875, "learning_rate": 5.50489686651426e-06, "loss": 0.31727536, "memory(GiB)": 15.04, "step": 9890, "train_speed(iter/s)": 0.334996 }, { "acc": 0.93518858, "epoch": 1.3326599326599327, "grad_norm": 7.03125, "learning_rate": 5.494953151263847e-06, "loss": 0.28765833, "memory(GiB)": 15.04, "step": 9895, "train_speed(iter/s)": 0.335006 }, { "acc": 0.87127934, "epoch": 1.3333333333333333, "grad_norm": 20.25, "learning_rate": 5.485015021611587e-06, "loss": 0.47570858, "memory(GiB)": 15.04, "step": 9900, "train_speed(iter/s)": 0.335046 }, { "epoch": 1.3333333333333333, "eval_acc": 0.8956226336808257, "eval_loss": 0.4088384509086609, "eval_runtime": 110.0565, "eval_samples_per_second": 1.363, "eval_steps_per_second": 1.363, "step": 9900 }, { "acc": 0.93745594, "epoch": 1.334006734006734, "grad_norm": 4.65625, "learning_rate": 5.475082489879313e-06, "loss": 0.38016248, "memory(GiB)": 15.04, "step": 9905, "train_speed(iter/s)": 0.333844 }, { "acc": 0.94206533, "epoch": 1.3346801346801347, "grad_norm": 8.6875, "learning_rate": 5.465155568381899e-06, "loss": 0.20986974, "memory(GiB)": 15.04, "step": 9910, "train_speed(iter/s)": 0.33388 }, { "acc": 0.90396404, "epoch": 1.3353535353535353, "grad_norm": 8.5625, "learning_rate": 5.455234269427281e-06, "loss": 0.36379542, "memory(GiB)": 15.04, "step": 9915, "train_speed(iter/s)": 0.333903 }, { "acc": 0.8936451, "epoch": 1.336026936026936, "grad_norm": 5.5625, "learning_rate": 5.445318605316418e-06, "loss": 0.28865125, "memory(GiB)": 15.04, "step": 9920, "train_speed(iter/s)": 0.33391 }, { "acc": 0.93893976, "epoch": 1.3367003367003367, "grad_norm": 3.90625, "learning_rate": 5.4354085883432736e-06, "loss": 0.24402006, "memory(GiB)": 15.04, "step": 9925, "train_speed(iter/s)": 0.333893 }, { "acc": 0.88604975, "epoch": 1.3373737373737373, "grad_norm": 9.8125, "learning_rate": 5.425504230794827e-06, "loss": 0.35277283, "memory(GiB)": 15.04, "step": 9930, "train_speed(iter/s)": 0.333924 }, { "acc": 0.85546255, "epoch": 1.3380471380471382, "grad_norm": 16.625, "learning_rate": 5.415605544951019e-06, "loss": 0.55175953, "memory(GiB)": 15.04, "step": 9935, "train_speed(iter/s)": 0.333952 }, { "acc": 0.91053429, "epoch": 1.3387205387205388, "grad_norm": 8.9375, "learning_rate": 5.405712543084777e-06, "loss": 0.35091743, "memory(GiB)": 15.04, "step": 9940, "train_speed(iter/s)": 0.333969 }, { "acc": 0.86309891, "epoch": 1.3393939393939394, "grad_norm": 5.90625, "learning_rate": 5.395825237461976e-06, "loss": 0.644346, "memory(GiB)": 15.04, "step": 9945, "train_speed(iter/s)": 0.333983 }, { "acc": 0.88862133, "epoch": 1.34006734006734, "grad_norm": 8.5625, "learning_rate": 5.38594364034142e-06, "loss": 0.30490239, "memory(GiB)": 15.04, "step": 9950, "train_speed(iter/s)": 0.334023 }, { "acc": 0.93491707, "epoch": 1.3407407407407408, "grad_norm": 6.59375, "learning_rate": 5.37606776397485e-06, "loss": 0.25334756, "memory(GiB)": 15.04, "step": 9955, "train_speed(iter/s)": 0.334038 }, { "acc": 0.91741714, "epoch": 1.3414141414141414, "grad_norm": 8.0, "learning_rate": 5.366197620606899e-06, "loss": 0.24480448, "memory(GiB)": 15.04, "step": 9960, "train_speed(iter/s)": 0.334041 }, { "acc": 0.81197901, "epoch": 1.3420875420875422, "grad_norm": 17.125, "learning_rate": 5.3563332224750985e-06, "loss": 0.49897423, "memory(GiB)": 15.04, "step": 9965, "train_speed(iter/s)": 0.33405 }, { "acc": 0.87177372, "epoch": 1.3427609427609428, "grad_norm": 9.3125, "learning_rate": 5.346474581809866e-06, "loss": 0.380778, "memory(GiB)": 15.04, "step": 9970, "train_speed(iter/s)": 0.334081 }, { "acc": 0.93990059, "epoch": 1.3434343434343434, "grad_norm": 6.78125, "learning_rate": 5.336621710834462e-06, "loss": 0.22352836, "memory(GiB)": 15.04, "step": 9975, "train_speed(iter/s)": 0.334107 }, { "acc": 0.84761782, "epoch": 1.344107744107744, "grad_norm": 14.6875, "learning_rate": 5.326774621765009e-06, "loss": 0.48472433, "memory(GiB)": 15.04, "step": 9980, "train_speed(iter/s)": 0.334112 }, { "acc": 0.92594032, "epoch": 1.3447811447811449, "grad_norm": 6.125, "learning_rate": 5.316933326810452e-06, "loss": 0.30096884, "memory(GiB)": 15.04, "step": 9985, "train_speed(iter/s)": 0.334142 }, { "acc": 0.897118, "epoch": 1.3454545454545455, "grad_norm": 5.8125, "learning_rate": 5.3070978381725546e-06, "loss": 0.64908366, "memory(GiB)": 15.04, "step": 9990, "train_speed(iter/s)": 0.334175 }, { "acc": 0.90277023, "epoch": 1.346127946127946, "grad_norm": 5.96875, "learning_rate": 5.297268168045887e-06, "loss": 0.35519619, "memory(GiB)": 15.04, "step": 9995, "train_speed(iter/s)": 0.334188 }, { "acc": 0.87940149, "epoch": 1.3468013468013469, "grad_norm": 10.625, "learning_rate": 5.287444328617793e-06, "loss": 0.47951393, "memory(GiB)": 15.04, "step": 10000, "train_speed(iter/s)": 0.334201 }, { "acc": 0.82602615, "epoch": 1.3474747474747475, "grad_norm": 17.875, "learning_rate": 5.277626332068402e-06, "loss": 0.52059345, "memory(GiB)": 15.04, "step": 10005, "train_speed(iter/s)": 0.334192 }, { "acc": 0.91368418, "epoch": 1.348148148148148, "grad_norm": 6.59375, "learning_rate": 5.267814190570584e-06, "loss": 0.21461604, "memory(GiB)": 15.04, "step": 10010, "train_speed(iter/s)": 0.334219 }, { "acc": 0.91373196, "epoch": 1.3488215488215487, "grad_norm": 7.5625, "learning_rate": 5.258007916289965e-06, "loss": 0.26179779, "memory(GiB)": 15.04, "step": 10015, "train_speed(iter/s)": 0.334237 }, { "acc": 0.91855373, "epoch": 1.3494949494949495, "grad_norm": 9.125, "learning_rate": 5.24820752138488e-06, "loss": 0.31510684, "memory(GiB)": 15.04, "step": 10020, "train_speed(iter/s)": 0.334273 }, { "acc": 0.91080656, "epoch": 1.3501683501683501, "grad_norm": 9.9375, "learning_rate": 5.238413018006392e-06, "loss": 0.2690109, "memory(GiB)": 15.04, "step": 10025, "train_speed(iter/s)": 0.334306 }, { "acc": 0.93179283, "epoch": 1.350841750841751, "grad_norm": 5.34375, "learning_rate": 5.228624418298241e-06, "loss": 0.2307549, "memory(GiB)": 15.04, "step": 10030, "train_speed(iter/s)": 0.334341 }, { "acc": 0.86719303, "epoch": 1.3515151515151516, "grad_norm": 9.375, "learning_rate": 5.2188417343968645e-06, "loss": 0.48214526, "memory(GiB)": 15.04, "step": 10035, "train_speed(iter/s)": 0.334341 }, { "acc": 0.8814992, "epoch": 1.3521885521885522, "grad_norm": 12.9375, "learning_rate": 5.209064978431353e-06, "loss": 0.29228759, "memory(GiB)": 15.04, "step": 10040, "train_speed(iter/s)": 0.334386 }, { "acc": 0.8772913, "epoch": 1.3528619528619528, "grad_norm": 8.6875, "learning_rate": 5.199294162523455e-06, "loss": 0.52333031, "memory(GiB)": 15.04, "step": 10045, "train_speed(iter/s)": 0.334412 }, { "acc": 0.83181, "epoch": 1.3535353535353536, "grad_norm": 11.0625, "learning_rate": 5.189529298787546e-06, "loss": 0.71318178, "memory(GiB)": 15.04, "step": 10050, "train_speed(iter/s)": 0.334427 }, { "acc": 0.92688646, "epoch": 1.3542087542087542, "grad_norm": 7.3125, "learning_rate": 5.179770399330629e-06, "loss": 0.30033717, "memory(GiB)": 15.04, "step": 10055, "train_speed(iter/s)": 0.334453 }, { "acc": 0.88847084, "epoch": 1.354882154882155, "grad_norm": 5.71875, "learning_rate": 5.170017476252316e-06, "loss": 0.57049932, "memory(GiB)": 15.04, "step": 10060, "train_speed(iter/s)": 0.334476 }, { "acc": 0.83316708, "epoch": 1.3555555555555556, "grad_norm": 12.0, "learning_rate": 5.160270541644792e-06, "loss": 0.25934565, "memory(GiB)": 15.04, "step": 10065, "train_speed(iter/s)": 0.334505 }, { "acc": 0.94990816, "epoch": 1.3562289562289562, "grad_norm": 10.3125, "learning_rate": 5.150529607592838e-06, "loss": 0.20809155, "memory(GiB)": 15.04, "step": 10070, "train_speed(iter/s)": 0.334522 }, { "acc": 0.89981232, "epoch": 1.3569023569023568, "grad_norm": 5.4375, "learning_rate": 5.140794686173777e-06, "loss": 0.40964913, "memory(GiB)": 15.04, "step": 10075, "train_speed(iter/s)": 0.334545 }, { "acc": 0.93371658, "epoch": 1.3575757575757577, "grad_norm": 11.0, "learning_rate": 5.131065789457489e-06, "loss": 0.24967754, "memory(GiB)": 15.04, "step": 10080, "train_speed(iter/s)": 0.334573 }, { "acc": 0.8034173, "epoch": 1.3582491582491583, "grad_norm": 17.875, "learning_rate": 5.121342929506386e-06, "loss": 0.67588377, "memory(GiB)": 15.04, "step": 10085, "train_speed(iter/s)": 0.334605 }, { "acc": 0.84909592, "epoch": 1.3589225589225589, "grad_norm": 11.375, "learning_rate": 5.111626118375379e-06, "loss": 0.5578403, "memory(GiB)": 15.04, "step": 10090, "train_speed(iter/s)": 0.334616 }, { "acc": 0.88609409, "epoch": 1.3595959595959597, "grad_norm": 7.84375, "learning_rate": 5.1019153681119024e-06, "loss": 0.41217747, "memory(GiB)": 15.04, "step": 10095, "train_speed(iter/s)": 0.334635 }, { "acc": 0.90607328, "epoch": 1.3602693602693603, "grad_norm": 7.90625, "learning_rate": 5.092210690755853e-06, "loss": 0.50391126, "memory(GiB)": 15.04, "step": 10100, "train_speed(iter/s)": 0.334646 }, { "acc": 0.89073992, "epoch": 1.360942760942761, "grad_norm": 8.3125, "learning_rate": 5.082512098339616e-06, "loss": 0.36505196, "memory(GiB)": 15.04, "step": 10105, "train_speed(iter/s)": 0.33467 }, { "acc": 0.89976683, "epoch": 1.3616161616161615, "grad_norm": 5.78125, "learning_rate": 5.0728196028880265e-06, "loss": 0.70888257, "memory(GiB)": 15.04, "step": 10110, "train_speed(iter/s)": 0.334702 }, { "acc": 0.92454653, "epoch": 1.3622895622895623, "grad_norm": 5.0, "learning_rate": 5.063133216418351e-06, "loss": 0.2980185, "memory(GiB)": 15.04, "step": 10115, "train_speed(iter/s)": 0.334702 }, { "acc": 0.89257593, "epoch": 1.362962962962963, "grad_norm": 6.53125, "learning_rate": 5.0534529509402995e-06, "loss": 0.31065969, "memory(GiB)": 15.04, "step": 10120, "train_speed(iter/s)": 0.334706 }, { "acc": 0.91431408, "epoch": 1.3636363636363638, "grad_norm": 6.53125, "learning_rate": 5.0437788184559755e-06, "loss": 0.39485724, "memory(GiB)": 15.04, "step": 10125, "train_speed(iter/s)": 0.334735 }, { "acc": 0.88906364, "epoch": 1.3643097643097644, "grad_norm": 7.71875, "learning_rate": 5.0341108309598886e-06, "loss": 0.26602957, "memory(GiB)": 15.04, "step": 10130, "train_speed(iter/s)": 0.334744 }, { "acc": 0.90695925, "epoch": 1.364983164983165, "grad_norm": 6.4375, "learning_rate": 5.024449000438931e-06, "loss": 0.36310508, "memory(GiB)": 15.04, "step": 10135, "train_speed(iter/s)": 0.334748 }, { "acc": 0.91908731, "epoch": 1.3656565656565656, "grad_norm": 8.9375, "learning_rate": 5.01479333887235e-06, "loss": 0.37564602, "memory(GiB)": 15.04, "step": 10140, "train_speed(iter/s)": 0.334768 }, { "acc": 0.9059762, "epoch": 1.3663299663299664, "grad_norm": 5.96875, "learning_rate": 5.0051438582317594e-06, "loss": 0.18879628, "memory(GiB)": 15.04, "step": 10145, "train_speed(iter/s)": 0.334798 }, { "acc": 0.92819109, "epoch": 1.367003367003367, "grad_norm": 5.96875, "learning_rate": 4.9955005704810936e-06, "loss": 0.30936062, "memory(GiB)": 15.04, "step": 10150, "train_speed(iter/s)": 0.334822 }, { "acc": 0.93115044, "epoch": 1.3676767676767676, "grad_norm": 6.71875, "learning_rate": 4.9858634875766196e-06, "loss": 0.32543521, "memory(GiB)": 15.04, "step": 10155, "train_speed(iter/s)": 0.334836 }, { "acc": 0.8822731, "epoch": 1.3683501683501684, "grad_norm": 5.0, "learning_rate": 4.9762326214669154e-06, "loss": 0.6356657, "memory(GiB)": 15.04, "step": 10160, "train_speed(iter/s)": 0.334851 }, { "acc": 0.86856585, "epoch": 1.369023569023569, "grad_norm": 11.375, "learning_rate": 4.966607984092834e-06, "loss": 0.33516977, "memory(GiB)": 15.04, "step": 10165, "train_speed(iter/s)": 0.334886 }, { "acc": 0.89640121, "epoch": 1.3696969696969696, "grad_norm": 8.9375, "learning_rate": 4.956989587387523e-06, "loss": 0.41605682, "memory(GiB)": 15.04, "step": 10170, "train_speed(iter/s)": 0.33489 }, { "acc": 0.93657713, "epoch": 1.3703703703703702, "grad_norm": 4.875, "learning_rate": 4.94737744327638e-06, "loss": 0.24267459, "memory(GiB)": 15.04, "step": 10175, "train_speed(iter/s)": 0.334899 }, { "acc": 0.88825378, "epoch": 1.371043771043771, "grad_norm": 6.5625, "learning_rate": 4.93777156367706e-06, "loss": 0.40265498, "memory(GiB)": 15.04, "step": 10180, "train_speed(iter/s)": 0.334921 }, { "acc": 0.91289072, "epoch": 1.3717171717171717, "grad_norm": 10.5, "learning_rate": 4.928171960499442e-06, "loss": 0.30160131, "memory(GiB)": 15.04, "step": 10185, "train_speed(iter/s)": 0.334931 }, { "acc": 0.92027521, "epoch": 1.3723905723905725, "grad_norm": 9.0625, "learning_rate": 4.918578645645635e-06, "loss": 0.40555534, "memory(GiB)": 15.04, "step": 10190, "train_speed(iter/s)": 0.334964 }, { "acc": 0.85583553, "epoch": 1.373063973063973, "grad_norm": 15.0625, "learning_rate": 4.908991631009936e-06, "loss": 0.65150752, "memory(GiB)": 15.04, "step": 10195, "train_speed(iter/s)": 0.334969 }, { "acc": 0.94926624, "epoch": 1.3737373737373737, "grad_norm": 5.0, "learning_rate": 4.8994109284788445e-06, "loss": 0.22038443, "memory(GiB)": 15.04, "step": 10200, "train_speed(iter/s)": 0.334989 }, { "epoch": 1.3737373737373737, "eval_acc": 0.8955845525240543, "eval_loss": 0.40956243872642517, "eval_runtime": 109.7705, "eval_samples_per_second": 1.366, "eval_steps_per_second": 1.366, "step": 10200 }, { "acc": 0.77951794, "epoch": 1.3744107744107743, "grad_norm": 15.0625, "learning_rate": 4.889836549931024e-06, "loss": 0.83544111, "memory(GiB)": 15.04, "step": 10205, "train_speed(iter/s)": 0.33383 }, { "acc": 0.88952808, "epoch": 1.3750841750841751, "grad_norm": 6.125, "learning_rate": 4.880268507237307e-06, "loss": 0.37691703, "memory(GiB)": 15.04, "step": 10210, "train_speed(iter/s)": 0.333843 }, { "acc": 0.89511099, "epoch": 1.3757575757575757, "grad_norm": 9.625, "learning_rate": 4.870706812260656e-06, "loss": 0.32926075, "memory(GiB)": 15.04, "step": 10215, "train_speed(iter/s)": 0.333872 }, { "acc": 0.90368662, "epoch": 1.3764309764309766, "grad_norm": 14.75, "learning_rate": 4.861151476856182e-06, "loss": 0.44513268, "memory(GiB)": 15.04, "step": 10220, "train_speed(iter/s)": 0.333897 }, { "acc": 0.87467747, "epoch": 1.3771043771043772, "grad_norm": 13.9375, "learning_rate": 4.851602512871092e-06, "loss": 0.49827414, "memory(GiB)": 15.04, "step": 10225, "train_speed(iter/s)": 0.333921 }, { "acc": 0.93380136, "epoch": 1.3777777777777778, "grad_norm": 6.1875, "learning_rate": 4.8420599321447085e-06, "loss": 0.2167762, "memory(GiB)": 15.04, "step": 10230, "train_speed(iter/s)": 0.333941 }, { "acc": 0.90687685, "epoch": 1.3784511784511784, "grad_norm": 6.34375, "learning_rate": 4.832523746508434e-06, "loss": 0.38546033, "memory(GiB)": 15.04, "step": 10235, "train_speed(iter/s)": 0.333938 }, { "acc": 0.89047594, "epoch": 1.3791245791245792, "grad_norm": 9.8125, "learning_rate": 4.8229939677857375e-06, "loss": 0.41071882, "memory(GiB)": 15.04, "step": 10240, "train_speed(iter/s)": 0.333965 }, { "acc": 0.90846529, "epoch": 1.3797979797979798, "grad_norm": 9.9375, "learning_rate": 4.813470607792154e-06, "loss": 0.27075646, "memory(GiB)": 15.04, "step": 10245, "train_speed(iter/s)": 0.333999 }, { "acc": 0.92954416, "epoch": 1.3804713804713804, "grad_norm": 6.4375, "learning_rate": 4.803953678335249e-06, "loss": 0.2295686, "memory(GiB)": 15.04, "step": 10250, "train_speed(iter/s)": 0.334027 }, { "acc": 0.89961386, "epoch": 1.3811447811447812, "grad_norm": 9.75, "learning_rate": 4.794443191214624e-06, "loss": 0.46945553, "memory(GiB)": 15.04, "step": 10255, "train_speed(iter/s)": 0.334043 }, { "acc": 0.89444189, "epoch": 1.3818181818181818, "grad_norm": 11.25, "learning_rate": 4.784939158221893e-06, "loss": 0.41053882, "memory(GiB)": 15.04, "step": 10260, "train_speed(iter/s)": 0.334076 }, { "acc": 0.91152878, "epoch": 1.3824915824915824, "grad_norm": 6.0625, "learning_rate": 4.775441591140657e-06, "loss": 0.3230912, "memory(GiB)": 15.04, "step": 10265, "train_speed(iter/s)": 0.334084 }, { "acc": 0.9085844, "epoch": 1.383164983164983, "grad_norm": 5.0625, "learning_rate": 4.765950501746517e-06, "loss": 0.41453176, "memory(GiB)": 15.04, "step": 10270, "train_speed(iter/s)": 0.334075 }, { "acc": 0.84861412, "epoch": 1.3838383838383839, "grad_norm": 5.3125, "learning_rate": 4.756465901807025e-06, "loss": 0.51905255, "memory(GiB)": 15.04, "step": 10275, "train_speed(iter/s)": 0.334048 }, { "acc": 0.89370041, "epoch": 1.3845117845117845, "grad_norm": 7.875, "learning_rate": 4.746987803081698e-06, "loss": 0.34876211, "memory(GiB)": 15.04, "step": 10280, "train_speed(iter/s)": 0.33408 }, { "acc": 0.88962431, "epoch": 1.3851851851851853, "grad_norm": 4.21875, "learning_rate": 4.737516217321996e-06, "loss": 0.44635606, "memory(GiB)": 15.04, "step": 10285, "train_speed(iter/s)": 0.334096 }, { "acc": 0.89850054, "epoch": 1.385858585858586, "grad_norm": 6.09375, "learning_rate": 4.728051156271289e-06, "loss": 0.4509078, "memory(GiB)": 15.04, "step": 10290, "train_speed(iter/s)": 0.334114 }, { "acc": 0.89144163, "epoch": 1.3865319865319865, "grad_norm": 4.375, "learning_rate": 4.718592631664875e-06, "loss": 0.38697112, "memory(GiB)": 15.04, "step": 10295, "train_speed(iter/s)": 0.334129 }, { "acc": 0.9227663, "epoch": 1.387205387205387, "grad_norm": 5.9375, "learning_rate": 4.70914065522993e-06, "loss": 0.42397604, "memory(GiB)": 15.04, "step": 10300, "train_speed(iter/s)": 0.334125 }, { "acc": 0.93395185, "epoch": 1.387878787878788, "grad_norm": 23.25, "learning_rate": 4.699695238685526e-06, "loss": 0.24768918, "memory(GiB)": 15.04, "step": 10305, "train_speed(iter/s)": 0.334162 }, { "acc": 0.89229746, "epoch": 1.3885521885521885, "grad_norm": 15.25, "learning_rate": 4.690256393742596e-06, "loss": 0.32988002, "memory(GiB)": 15.04, "step": 10310, "train_speed(iter/s)": 0.334197 }, { "acc": 0.834126, "epoch": 1.3892255892255894, "grad_norm": 42.25, "learning_rate": 4.680824132103921e-06, "loss": 0.7447185, "memory(GiB)": 15.04, "step": 10315, "train_speed(iter/s)": 0.334218 }, { "acc": 0.86799088, "epoch": 1.38989898989899, "grad_norm": 13.3125, "learning_rate": 4.671398465464129e-06, "loss": 0.33674641, "memory(GiB)": 15.04, "step": 10320, "train_speed(iter/s)": 0.334257 }, { "acc": 0.94394722, "epoch": 1.3905723905723906, "grad_norm": 5.03125, "learning_rate": 4.661979405509659e-06, "loss": 0.22380381, "memory(GiB)": 15.04, "step": 10325, "train_speed(iter/s)": 0.334271 }, { "acc": 0.88122826, "epoch": 1.3912457912457912, "grad_norm": 19.625, "learning_rate": 4.6525669639187705e-06, "loss": 0.29948568, "memory(GiB)": 15.04, "step": 10330, "train_speed(iter/s)": 0.334284 }, { "acc": 0.93242474, "epoch": 1.391919191919192, "grad_norm": 12.375, "learning_rate": 4.643161152361515e-06, "loss": 0.23160937, "memory(GiB)": 15.04, "step": 10335, "train_speed(iter/s)": 0.334312 }, { "acc": 0.84878254, "epoch": 1.3925925925925926, "grad_norm": 28.5, "learning_rate": 4.633761982499713e-06, "loss": 0.44875698, "memory(GiB)": 15.04, "step": 10340, "train_speed(iter/s)": 0.334354 }, { "acc": 0.89076071, "epoch": 1.3932659932659932, "grad_norm": 5.28125, "learning_rate": 4.624369465986967e-06, "loss": 0.34800448, "memory(GiB)": 15.04, "step": 10345, "train_speed(iter/s)": 0.334366 }, { "acc": 0.90187988, "epoch": 1.393939393939394, "grad_norm": 5.0625, "learning_rate": 4.614983614468613e-06, "loss": 0.25995982, "memory(GiB)": 15.04, "step": 10350, "train_speed(iter/s)": 0.334383 }, { "acc": 0.86971121, "epoch": 1.3946127946127946, "grad_norm": 11.0625, "learning_rate": 4.60560443958174e-06, "loss": 0.32057152, "memory(GiB)": 15.04, "step": 10355, "train_speed(iter/s)": 0.334386 }, { "acc": 0.88873186, "epoch": 1.3952861952861952, "grad_norm": 7.5, "learning_rate": 4.596231952955143e-06, "loss": 0.4387907, "memory(GiB)": 15.04, "step": 10360, "train_speed(iter/s)": 0.334393 }, { "acc": 0.89455509, "epoch": 1.3959595959595958, "grad_norm": 6.0, "learning_rate": 4.586866166209342e-06, "loss": 0.25350516, "memory(GiB)": 15.04, "step": 10365, "train_speed(iter/s)": 0.334425 }, { "acc": 0.90793009, "epoch": 1.3966329966329967, "grad_norm": 5.4375, "learning_rate": 4.577507090956529e-06, "loss": 0.38416338, "memory(GiB)": 15.04, "step": 10370, "train_speed(iter/s)": 0.33443 }, { "acc": 0.91300716, "epoch": 1.3973063973063973, "grad_norm": 7.96875, "learning_rate": 4.568154738800597e-06, "loss": 0.30698023, "memory(GiB)": 15.04, "step": 10375, "train_speed(iter/s)": 0.334457 }, { "acc": 0.94259071, "epoch": 1.397979797979798, "grad_norm": 9.125, "learning_rate": 4.558809121337086e-06, "loss": 0.24739785, "memory(GiB)": 15.04, "step": 10380, "train_speed(iter/s)": 0.334486 }, { "acc": 0.9485342, "epoch": 1.3986531986531987, "grad_norm": 7.09375, "learning_rate": 4.549470250153197e-06, "loss": 0.18721933, "memory(GiB)": 15.04, "step": 10385, "train_speed(iter/s)": 0.334502 }, { "acc": 0.92590742, "epoch": 1.3993265993265993, "grad_norm": 3.765625, "learning_rate": 4.5401381368277555e-06, "loss": 0.24307759, "memory(GiB)": 15.04, "step": 10390, "train_speed(iter/s)": 0.334516 }, { "acc": 0.92164106, "epoch": 1.4, "grad_norm": 6.8125, "learning_rate": 4.530812792931224e-06, "loss": 0.25890124, "memory(GiB)": 15.04, "step": 10395, "train_speed(iter/s)": 0.334527 }, { "acc": 0.92346478, "epoch": 1.4006734006734007, "grad_norm": 5.78125, "learning_rate": 4.521494230025655e-06, "loss": 0.33697472, "memory(GiB)": 15.04, "step": 10400, "train_speed(iter/s)": 0.334535 }, { "acc": 0.8685358, "epoch": 1.4013468013468013, "grad_norm": 7.28125, "learning_rate": 4.512182459664705e-06, "loss": 0.38889794, "memory(GiB)": 15.04, "step": 10405, "train_speed(iter/s)": 0.334567 }, { "acc": 0.88733273, "epoch": 1.402020202020202, "grad_norm": 8.5, "learning_rate": 4.502877493393607e-06, "loss": 0.56911922, "memory(GiB)": 15.04, "step": 10410, "train_speed(iter/s)": 0.334578 }, { "acc": 0.93346405, "epoch": 1.4026936026936028, "grad_norm": 9.3125, "learning_rate": 4.493579342749152e-06, "loss": 0.3124732, "memory(GiB)": 15.04, "step": 10415, "train_speed(iter/s)": 0.334597 }, { "acc": 0.89845037, "epoch": 1.4033670033670034, "grad_norm": 7.71875, "learning_rate": 4.4842880192596896e-06, "loss": 0.439781, "memory(GiB)": 15.04, "step": 10420, "train_speed(iter/s)": 0.334633 }, { "acc": 0.89592113, "epoch": 1.404040404040404, "grad_norm": 11.75, "learning_rate": 4.475003534445094e-06, "loss": 0.48291736, "memory(GiB)": 15.04, "step": 10425, "train_speed(iter/s)": 0.334639 }, { "acc": 0.8618639, "epoch": 1.4047138047138046, "grad_norm": 7.1875, "learning_rate": 4.46572589981677e-06, "loss": 0.4195827, "memory(GiB)": 15.04, "step": 10430, "train_speed(iter/s)": 0.334653 }, { "acc": 0.84246922, "epoch": 1.4053872053872054, "grad_norm": 6.5625, "learning_rate": 4.456455126877627e-06, "loss": 0.3907258, "memory(GiB)": 15.04, "step": 10435, "train_speed(iter/s)": 0.334658 }, { "acc": 0.88151245, "epoch": 1.406060606060606, "grad_norm": 26.0, "learning_rate": 4.44719122712206e-06, "loss": 0.45118403, "memory(GiB)": 15.04, "step": 10440, "train_speed(iter/s)": 0.334663 }, { "acc": 0.8924264, "epoch": 1.4067340067340068, "grad_norm": 14.1875, "learning_rate": 4.437934212035954e-06, "loss": 0.33682299, "memory(GiB)": 15.04, "step": 10445, "train_speed(iter/s)": 0.334696 }, { "acc": 0.91919479, "epoch": 1.4074074074074074, "grad_norm": 6.625, "learning_rate": 4.428684093096647e-06, "loss": 0.24935861, "memory(GiB)": 15.04, "step": 10450, "train_speed(iter/s)": 0.33472 }, { "acc": 0.90029221, "epoch": 1.408080808080808, "grad_norm": 8.25, "learning_rate": 4.41944088177293e-06, "loss": 0.52882147, "memory(GiB)": 15.04, "step": 10455, "train_speed(iter/s)": 0.334719 }, { "acc": 0.88726606, "epoch": 1.4087542087542086, "grad_norm": 16.75, "learning_rate": 4.41020458952504e-06, "loss": 0.64942045, "memory(GiB)": 15.04, "step": 10460, "train_speed(iter/s)": 0.334708 }, { "acc": 0.93211842, "epoch": 1.4094276094276095, "grad_norm": 6.3125, "learning_rate": 4.400975227804616e-06, "loss": 0.21920092, "memory(GiB)": 15.04, "step": 10465, "train_speed(iter/s)": 0.334722 }, { "acc": 0.85095205, "epoch": 1.41010101010101, "grad_norm": 24.125, "learning_rate": 4.3917528080547225e-06, "loss": 0.56060896, "memory(GiB)": 15.04, "step": 10470, "train_speed(iter/s)": 0.334743 }, { "acc": 0.93012409, "epoch": 1.410774410774411, "grad_norm": 8.625, "learning_rate": 4.3825373417098015e-06, "loss": 0.25061631, "memory(GiB)": 15.04, "step": 10475, "train_speed(iter/s)": 0.334761 }, { "acc": 0.93014574, "epoch": 1.4114478114478115, "grad_norm": 12.5625, "learning_rate": 4.373328840195686e-06, "loss": 0.2959131, "memory(GiB)": 15.04, "step": 10480, "train_speed(iter/s)": 0.334742 }, { "acc": 0.91991816, "epoch": 1.412121212121212, "grad_norm": 4.65625, "learning_rate": 4.364127314929571e-06, "loss": 0.31254485, "memory(GiB)": 15.04, "step": 10485, "train_speed(iter/s)": 0.334709 }, { "acc": 0.88106947, "epoch": 1.4127946127946127, "grad_norm": 6.3125, "learning_rate": 4.354932777319995e-06, "loss": 0.40515866, "memory(GiB)": 15.04, "step": 10490, "train_speed(iter/s)": 0.334747 }, { "acc": 0.84238834, "epoch": 1.4134680134680135, "grad_norm": 8.5625, "learning_rate": 4.345745238766842e-06, "loss": 0.61503758, "memory(GiB)": 15.04, "step": 10495, "train_speed(iter/s)": 0.334777 }, { "acc": 0.90629997, "epoch": 1.4141414141414141, "grad_norm": 6.53125, "learning_rate": 4.3365647106613085e-06, "loss": 0.35899134, "memory(GiB)": 15.04, "step": 10500, "train_speed(iter/s)": 0.334812 }, { "epoch": 1.4141414141414141, "eval_acc": 0.8955231477946604, "eval_loss": 0.4071575999259949, "eval_runtime": 110.4916, "eval_samples_per_second": 1.358, "eval_steps_per_second": 1.358, "step": 10500 } ], "logging_steps": 5, "max_steps": 14850, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.608415718624e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }