diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21358 @@ +{ + "best_metric": 0.4071576, + "best_model_checkpoint": "/home/adithya/workspace/Nayana/data/hindi_got_model_full_ft/got-ocr2/v2-20241103-190944/checkpoint-10500", + "epoch": 1.4141414141414141, + "eval_steps": 300, + "global_step": 10500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "acc": 0.51908016, + "epoch": 0.00013468013468013467, + "grad_norm": 23.125, + "learning_rate": 2.6917900403768507e-08, + "loss": 1.52505195, + "memory(GiB)": 5.54, + "step": 1, + "train_speed(iter/s)": 0.240681 + }, + { + "acc": 0.63399017, + "epoch": 0.0006734006734006734, + "grad_norm": 12.4375, + "learning_rate": 1.3458950201884255e-07, + "loss": 1.57149458, + "memory(GiB)": 9.53, + "step": 5, + "train_speed(iter/s)": 0.3205 + }, + { + "acc": 0.62002406, + "epoch": 0.0013468013468013469, + "grad_norm": 27.25, + "learning_rate": 2.691790040376851e-07, + "loss": 1.54218817, + "memory(GiB)": 9.53, + "step": 10, + "train_speed(iter/s)": 0.366563 + }, + { + "acc": 0.55254116, + "epoch": 0.00202020202020202, + "grad_norm": 18.375, + "learning_rate": 4.037685060565276e-07, + "loss": 1.88158722, + "memory(GiB)": 9.53, + "step": 15, + "train_speed(iter/s)": 0.384698 + }, + { + "acc": 0.57307739, + "epoch": 0.0026936026936026937, + "grad_norm": 22.25, + "learning_rate": 5.383580080753702e-07, + "loss": 1.70565033, + "memory(GiB)": 9.53, + "step": 20, + "train_speed(iter/s)": 0.396148 + }, + { + "acc": 0.58070536, + "epoch": 0.003367003367003367, + "grad_norm": 23.125, + "learning_rate": 6.729475100942127e-07, + "loss": 1.41031113, + "memory(GiB)": 9.53, + "step": 25, + "train_speed(iter/s)": 0.392144 + }, + { + "acc": 0.55927949, + "epoch": 0.00404040404040404, + "grad_norm": 13.8125, + "learning_rate": 8.075370121130552e-07, + "loss": 1.77414036, + "memory(GiB)": 12.13, + "step": 30, + "train_speed(iter/s)": 0.378235 + }, + { + "acc": 0.55564704, + "epoch": 0.0047138047138047135, + "grad_norm": 12.0625, + "learning_rate": 9.421265141318977e-07, + "loss": 1.64451008, + "memory(GiB)": 14.36, + "step": 35, + "train_speed(iter/s)": 0.359814 + }, + { + "acc": 0.51643333, + "epoch": 0.0053872053872053875, + "grad_norm": 18.625, + "learning_rate": 1.0767160161507404e-06, + "loss": 1.69634914, + "memory(GiB)": 8.14, + "step": 40, + "train_speed(iter/s)": 0.366719 + }, + { + "acc": 0.59141669, + "epoch": 0.006060606060606061, + "grad_norm": 30.75, + "learning_rate": 1.2113055181695828e-06, + "loss": 1.78964634, + "memory(GiB)": 8.14, + "step": 45, + "train_speed(iter/s)": 0.371 + }, + { + "acc": 0.51927662, + "epoch": 0.006734006734006734, + "grad_norm": 27.25, + "learning_rate": 1.3458950201884255e-06, + "loss": 2.00049534, + "memory(GiB)": 8.14, + "step": 50, + "train_speed(iter/s)": 0.372852 + }, + { + "acc": 0.557618, + "epoch": 0.007407407407407408, + "grad_norm": 20.75, + "learning_rate": 1.4804845222072681e-06, + "loss": 1.64312649, + "memory(GiB)": 8.14, + "step": 55, + "train_speed(iter/s)": 0.373569 + }, + { + "acc": 0.53778725, + "epoch": 0.00808080808080808, + "grad_norm": 13.4375, + "learning_rate": 1.6150740242261104e-06, + "loss": 1.7603075, + "memory(GiB)": 11.57, + "step": 60, + "train_speed(iter/s)": 0.365224 + }, + { + "acc": 0.58266153, + "epoch": 0.008754208754208754, + "grad_norm": 17.625, + "learning_rate": 1.749663526244953e-06, + "loss": 1.49343386, + "memory(GiB)": 11.57, + "step": 65, + "train_speed(iter/s)": 0.363502 + }, + { + "acc": 0.57580504, + "epoch": 0.009427609427609427, + "grad_norm": 17.875, + "learning_rate": 1.8842530282637955e-06, + "loss": 1.66268654, + "memory(GiB)": 11.57, + "step": 70, + "train_speed(iter/s)": 0.36812 + }, + { + "acc": 0.61554484, + "epoch": 0.010101010101010102, + "grad_norm": 18.5, + "learning_rate": 2.018842530282638e-06, + "loss": 1.58854914, + "memory(GiB)": 11.57, + "step": 75, + "train_speed(iter/s)": 0.371307 + }, + { + "acc": 0.54652371, + "epoch": 0.010774410774410775, + "grad_norm": 17.125, + "learning_rate": 2.1534320323014808e-06, + "loss": 1.61172218, + "memory(GiB)": 11.57, + "step": 80, + "train_speed(iter/s)": 0.37045 + }, + { + "acc": 0.62052369, + "epoch": 0.011447811447811448, + "grad_norm": 17.625, + "learning_rate": 2.2880215343203232e-06, + "loss": 1.61053829, + "memory(GiB)": 11.57, + "step": 85, + "train_speed(iter/s)": 0.371281 + }, + { + "acc": 0.57062507, + "epoch": 0.012121212121212121, + "grad_norm": 24.0, + "learning_rate": 2.4226110363391657e-06, + "loss": 1.63032074, + "memory(GiB)": 11.57, + "step": 90, + "train_speed(iter/s)": 0.371793 + }, + { + "acc": 0.62314548, + "epoch": 0.012794612794612794, + "grad_norm": 19.75, + "learning_rate": 2.5572005383580085e-06, + "loss": 1.42481365, + "memory(GiB)": 11.57, + "step": 95, + "train_speed(iter/s)": 0.366657 + }, + { + "acc": 0.56463518, + "epoch": 0.013468013468013467, + "grad_norm": 14.625, + "learning_rate": 2.691790040376851e-06, + "loss": 1.66354046, + "memory(GiB)": 11.57, + "step": 100, + "train_speed(iter/s)": 0.365204 + }, + { + "acc": 0.67074585, + "epoch": 0.014141414141414142, + "grad_norm": 15.0, + "learning_rate": 2.8263795423956934e-06, + "loss": 1.34491758, + "memory(GiB)": 11.57, + "step": 105, + "train_speed(iter/s)": 0.367968 + }, + { + "acc": 0.54115891, + "epoch": 0.014814814814814815, + "grad_norm": 18.25, + "learning_rate": 2.9609690444145363e-06, + "loss": 1.61188202, + "memory(GiB)": 11.57, + "step": 110, + "train_speed(iter/s)": 0.368511 + }, + { + "acc": 0.55807858, + "epoch": 0.015488215488215488, + "grad_norm": 17.25, + "learning_rate": 3.0955585464333787e-06, + "loss": 1.61766186, + "memory(GiB)": 11.57, + "step": 115, + "train_speed(iter/s)": 0.369026 + }, + { + "acc": 0.56790361, + "epoch": 0.01616161616161616, + "grad_norm": 25.875, + "learning_rate": 3.2301480484522207e-06, + "loss": 1.44421263, + "memory(GiB)": 11.57, + "step": 120, + "train_speed(iter/s)": 0.369682 + }, + { + "acc": 0.62042007, + "epoch": 0.016835016835016835, + "grad_norm": 26.5, + "learning_rate": 3.364737550471063e-06, + "loss": 1.53551636, + "memory(GiB)": 11.57, + "step": 125, + "train_speed(iter/s)": 0.372003 + }, + { + "acc": 0.53192263, + "epoch": 0.017508417508417508, + "grad_norm": 33.75, + "learning_rate": 3.499327052489906e-06, + "loss": 1.86681862, + "memory(GiB)": 11.57, + "step": 130, + "train_speed(iter/s)": 0.372911 + }, + { + "acc": 0.62459035, + "epoch": 0.01818181818181818, + "grad_norm": 14.6875, + "learning_rate": 3.6339165545087485e-06, + "loss": 1.41363401, + "memory(GiB)": 11.57, + "step": 135, + "train_speed(iter/s)": 0.373871 + }, + { + "acc": 0.61042018, + "epoch": 0.018855218855218854, + "grad_norm": 14.75, + "learning_rate": 3.768506056527591e-06, + "loss": 1.44343262, + "memory(GiB)": 11.57, + "step": 140, + "train_speed(iter/s)": 0.375919 + }, + { + "acc": 0.63943481, + "epoch": 0.019528619528619527, + "grad_norm": 25.75, + "learning_rate": 3.903095558546434e-06, + "loss": 1.44404135, + "memory(GiB)": 11.57, + "step": 145, + "train_speed(iter/s)": 0.377064 + }, + { + "acc": 0.66924458, + "epoch": 0.020202020202020204, + "grad_norm": 18.875, + "learning_rate": 4.037685060565276e-06, + "loss": 1.43287277, + "memory(GiB)": 11.57, + "step": 150, + "train_speed(iter/s)": 0.376558 + }, + { + "acc": 0.64015527, + "epoch": 0.020875420875420877, + "grad_norm": 15.75, + "learning_rate": 4.172274562584119e-06, + "loss": 1.31725788, + "memory(GiB)": 11.57, + "step": 155, + "train_speed(iter/s)": 0.377151 + }, + { + "acc": 0.56795197, + "epoch": 0.02154882154882155, + "grad_norm": 28.5, + "learning_rate": 4.3068640646029616e-06, + "loss": 1.58099527, + "memory(GiB)": 11.57, + "step": 160, + "train_speed(iter/s)": 0.37853 + }, + { + "acc": 0.56977224, + "epoch": 0.022222222222222223, + "grad_norm": 18.625, + "learning_rate": 4.4414535666218036e-06, + "loss": 1.74765873, + "memory(GiB)": 11.57, + "step": 165, + "train_speed(iter/s)": 0.37979 + }, + { + "acc": 0.57543573, + "epoch": 0.022895622895622896, + "grad_norm": 14.3125, + "learning_rate": 4.5760430686406464e-06, + "loss": 1.48664017, + "memory(GiB)": 11.57, + "step": 170, + "train_speed(iter/s)": 0.379818 + }, + { + "acc": 0.62976146, + "epoch": 0.02356902356902357, + "grad_norm": 13.375, + "learning_rate": 4.710632570659489e-06, + "loss": 1.4615386, + "memory(GiB)": 11.57, + "step": 175, + "train_speed(iter/s)": 0.379344 + }, + { + "acc": 0.61564579, + "epoch": 0.024242424242424242, + "grad_norm": 11.6875, + "learning_rate": 4.845222072678331e-06, + "loss": 1.33001451, + "memory(GiB)": 11.57, + "step": 180, + "train_speed(iter/s)": 0.37618 + }, + { + "acc": 0.59344044, + "epoch": 0.024915824915824916, + "grad_norm": 12.6875, + "learning_rate": 4.979811574697174e-06, + "loss": 1.36530914, + "memory(GiB)": 11.57, + "step": 185, + "train_speed(iter/s)": 0.374182 + }, + { + "acc": 0.60937552, + "epoch": 0.02558922558922559, + "grad_norm": 17.75, + "learning_rate": 5.114401076716017e-06, + "loss": 1.42839565, + "memory(GiB)": 11.57, + "step": 190, + "train_speed(iter/s)": 0.375383 + }, + { + "acc": 0.56700568, + "epoch": 0.026262626262626262, + "grad_norm": 23.125, + "learning_rate": 5.248990578734859e-06, + "loss": 1.72391968, + "memory(GiB)": 11.57, + "step": 195, + "train_speed(iter/s)": 0.376799 + }, + { + "acc": 0.64125962, + "epoch": 0.026936026936026935, + "grad_norm": 12.875, + "learning_rate": 5.383580080753702e-06, + "loss": 1.45175018, + "memory(GiB)": 11.57, + "step": 200, + "train_speed(iter/s)": 0.377518 + }, + { + "acc": 0.63175621, + "epoch": 0.027609427609427608, + "grad_norm": 18.75, + "learning_rate": 5.518169582772545e-06, + "loss": 1.22912884, + "memory(GiB)": 11.57, + "step": 205, + "train_speed(iter/s)": 0.378115 + }, + { + "acc": 0.54751568, + "epoch": 0.028282828282828285, + "grad_norm": 13.5, + "learning_rate": 5.652759084791387e-06, + "loss": 1.51581717, + "memory(GiB)": 11.57, + "step": 210, + "train_speed(iter/s)": 0.37873 + }, + { + "acc": 0.59878306, + "epoch": 0.028956228956228958, + "grad_norm": 25.25, + "learning_rate": 5.78734858681023e-06, + "loss": 1.66876678, + "memory(GiB)": 11.57, + "step": 215, + "train_speed(iter/s)": 0.379176 + }, + { + "acc": 0.68903089, + "epoch": 0.02962962962962963, + "grad_norm": 15.75, + "learning_rate": 5.9219380888290726e-06, + "loss": 1.21776981, + "memory(GiB)": 11.57, + "step": 220, + "train_speed(iter/s)": 0.380325 + }, + { + "acc": 0.65183606, + "epoch": 0.030303030303030304, + "grad_norm": 16.5, + "learning_rate": 6.056527590847915e-06, + "loss": 1.33960438, + "memory(GiB)": 11.57, + "step": 225, + "train_speed(iter/s)": 0.378646 + }, + { + "acc": 0.56697273, + "epoch": 0.030976430976430977, + "grad_norm": 20.25, + "learning_rate": 6.1911170928667574e-06, + "loss": 1.61164932, + "memory(GiB)": 11.57, + "step": 230, + "train_speed(iter/s)": 0.379002 + }, + { + "acc": 0.60521441, + "epoch": 0.03164983164983165, + "grad_norm": 14.125, + "learning_rate": 6.325706594885599e-06, + "loss": 1.62042446, + "memory(GiB)": 11.57, + "step": 235, + "train_speed(iter/s)": 0.379663 + }, + { + "acc": 0.57916322, + "epoch": 0.03232323232323232, + "grad_norm": 20.0, + "learning_rate": 6.4602960969044415e-06, + "loss": 1.60195808, + "memory(GiB)": 11.57, + "step": 240, + "train_speed(iter/s)": 0.380356 + }, + { + "acc": 0.63395228, + "epoch": 0.032996632996632996, + "grad_norm": 29.75, + "learning_rate": 6.594885598923284e-06, + "loss": 1.40564528, + "memory(GiB)": 11.57, + "step": 245, + "train_speed(iter/s)": 0.380995 + }, + { + "acc": 0.58923273, + "epoch": 0.03367003367003367, + "grad_norm": 25.125, + "learning_rate": 6.729475100942126e-06, + "loss": 1.4849618, + "memory(GiB)": 11.57, + "step": 250, + "train_speed(iter/s)": 0.381146 + }, + { + "acc": 0.57267365, + "epoch": 0.03434343434343434, + "grad_norm": 20.25, + "learning_rate": 6.864064602960969e-06, + "loss": 1.61652508, + "memory(GiB)": 11.57, + "step": 255, + "train_speed(iter/s)": 0.382063 + }, + { + "acc": 0.54358578, + "epoch": 0.035016835016835016, + "grad_norm": 13.875, + "learning_rate": 6.998654104979812e-06, + "loss": 1.63632908, + "memory(GiB)": 11.57, + "step": 260, + "train_speed(iter/s)": 0.38277 + }, + { + "acc": 0.60778151, + "epoch": 0.03569023569023569, + "grad_norm": 21.875, + "learning_rate": 7.133243606998654e-06, + "loss": 1.2964942, + "memory(GiB)": 11.57, + "step": 265, + "train_speed(iter/s)": 0.383463 + }, + { + "acc": 0.6651093, + "epoch": 0.03636363636363636, + "grad_norm": 11.9375, + "learning_rate": 7.267833109017497e-06, + "loss": 1.15631075, + "memory(GiB)": 11.57, + "step": 270, + "train_speed(iter/s)": 0.381222 + }, + { + "acc": 0.61537771, + "epoch": 0.037037037037037035, + "grad_norm": 19.625, + "learning_rate": 7.40242261103634e-06, + "loss": 1.52807426, + "memory(GiB)": 11.57, + "step": 275, + "train_speed(iter/s)": 0.381404 + }, + { + "acc": 0.69333591, + "epoch": 0.03771043771043771, + "grad_norm": 16.0, + "learning_rate": 7.537012113055182e-06, + "loss": 1.10553503, + "memory(GiB)": 11.57, + "step": 280, + "train_speed(iter/s)": 0.381658 + }, + { + "acc": 0.65817041, + "epoch": 0.03838383838383838, + "grad_norm": 21.125, + "learning_rate": 7.671601615074024e-06, + "loss": 1.39679871, + "memory(GiB)": 11.57, + "step": 285, + "train_speed(iter/s)": 0.381858 + }, + { + "acc": 0.65443778, + "epoch": 0.039057239057239054, + "grad_norm": 15.6875, + "learning_rate": 7.806191117092868e-06, + "loss": 1.19904938, + "memory(GiB)": 11.57, + "step": 290, + "train_speed(iter/s)": 0.382372 + }, + { + "acc": 0.5628746, + "epoch": 0.03973063973063973, + "grad_norm": 10.5, + "learning_rate": 7.94078061911171e-06, + "loss": 1.59957199, + "memory(GiB)": 11.57, + "step": 295, + "train_speed(iter/s)": 0.382493 + }, + { + "acc": 0.6009872, + "epoch": 0.04040404040404041, + "grad_norm": 17.0, + "learning_rate": 8.075370121130552e-06, + "loss": 1.64843807, + "memory(GiB)": 11.57, + "step": 300, + "train_speed(iter/s)": 0.383049 + }, + { + "epoch": 0.04040404040404041, + "eval_acc": 0.6508429803434077, + "eval_loss": 1.4724615812301636, + "eval_runtime": 109.8308, + "eval_samples_per_second": 1.366, + "eval_steps_per_second": 1.366, + "step": 300 + }, + { + "acc": 0.63098454, + "epoch": 0.04107744107744108, + "grad_norm": 14.3125, + "learning_rate": 8.209959623149395e-06, + "loss": 1.40613441, + "memory(GiB)": 11.57, + "step": 305, + "train_speed(iter/s)": 0.337115 + }, + { + "acc": 0.663204, + "epoch": 0.041750841750841754, + "grad_norm": 8.25, + "learning_rate": 8.344549125168237e-06, + "loss": 1.29444733, + "memory(GiB)": 11.57, + "step": 310, + "train_speed(iter/s)": 0.337523 + }, + { + "acc": 0.56645985, + "epoch": 0.04242424242424243, + "grad_norm": 12.875, + "learning_rate": 8.47913862718708e-06, + "loss": 1.68605423, + "memory(GiB)": 11.57, + "step": 315, + "train_speed(iter/s)": 0.338738 + }, + { + "acc": 0.67048483, + "epoch": 0.0430976430976431, + "grad_norm": 11.9375, + "learning_rate": 8.613728129205923e-06, + "loss": 1.39615965, + "memory(GiB)": 11.57, + "step": 320, + "train_speed(iter/s)": 0.338694 + }, + { + "acc": 0.72679968, + "epoch": 0.04377104377104377, + "grad_norm": 23.375, + "learning_rate": 8.748317631224765e-06, + "loss": 1.10950098, + "memory(GiB)": 11.57, + "step": 325, + "train_speed(iter/s)": 0.339094 + }, + { + "acc": 0.62103057, + "epoch": 0.044444444444444446, + "grad_norm": 18.375, + "learning_rate": 8.882907133243607e-06, + "loss": 1.33663902, + "memory(GiB)": 11.57, + "step": 330, + "train_speed(iter/s)": 0.339679 + }, + { + "acc": 0.71397562, + "epoch": 0.04511784511784512, + "grad_norm": 22.25, + "learning_rate": 9.017496635262451e-06, + "loss": 1.09442234, + "memory(GiB)": 11.57, + "step": 335, + "train_speed(iter/s)": 0.340369 + }, + { + "acc": 0.67727013, + "epoch": 0.04579124579124579, + "grad_norm": 16.125, + "learning_rate": 9.152086137281293e-06, + "loss": 1.37714205, + "memory(GiB)": 11.57, + "step": 340, + "train_speed(iter/s)": 0.340722 + }, + { + "acc": 0.56682105, + "epoch": 0.046464646464646465, + "grad_norm": 23.25, + "learning_rate": 9.286675639300135e-06, + "loss": 1.47191801, + "memory(GiB)": 11.57, + "step": 345, + "train_speed(iter/s)": 0.341435 + }, + { + "acc": 0.64976182, + "epoch": 0.04713804713804714, + "grad_norm": 19.625, + "learning_rate": 9.421265141318979e-06, + "loss": 1.43252811, + "memory(GiB)": 11.57, + "step": 350, + "train_speed(iter/s)": 0.342317 + }, + { + "acc": 0.73611102, + "epoch": 0.04781144781144781, + "grad_norm": 9.125, + "learning_rate": 9.55585464333782e-06, + "loss": 1.09405451, + "memory(GiB)": 11.57, + "step": 355, + "train_speed(iter/s)": 0.342304 + }, + { + "acc": 0.62947536, + "epoch": 0.048484848484848485, + "grad_norm": 17.0, + "learning_rate": 9.690444145356663e-06, + "loss": 1.33388605, + "memory(GiB)": 11.57, + "step": 360, + "train_speed(iter/s)": 0.343008 + }, + { + "acc": 0.61660504, + "epoch": 0.04915824915824916, + "grad_norm": 14.0, + "learning_rate": 9.825033647375506e-06, + "loss": 1.26432142, + "memory(GiB)": 11.57, + "step": 365, + "train_speed(iter/s)": 0.343933 + }, + { + "acc": 0.6861485, + "epoch": 0.04983164983164983, + "grad_norm": 13.125, + "learning_rate": 9.959623149394348e-06, + "loss": 1.17055111, + "memory(GiB)": 11.57, + "step": 370, + "train_speed(iter/s)": 0.344701 + }, + { + "acc": 0.68641739, + "epoch": 0.050505050505050504, + "grad_norm": 10.75, + "learning_rate": 1.009421265141319e-05, + "loss": 1.19751577, + "memory(GiB)": 11.57, + "step": 375, + "train_speed(iter/s)": 0.344801 + }, + { + "acc": 0.58911633, + "epoch": 0.05117845117845118, + "grad_norm": 12.875, + "learning_rate": 1.0228802153432034e-05, + "loss": 1.57844391, + "memory(GiB)": 11.57, + "step": 380, + "train_speed(iter/s)": 0.343816 + }, + { + "acc": 0.66787591, + "epoch": 0.05185185185185185, + "grad_norm": 9.375, + "learning_rate": 1.0363391655450876e-05, + "loss": 1.22666464, + "memory(GiB)": 11.57, + "step": 385, + "train_speed(iter/s)": 0.344265 + }, + { + "acc": 0.72761278, + "epoch": 0.052525252525252523, + "grad_norm": 13.5, + "learning_rate": 1.0497981157469718e-05, + "loss": 1.11251936, + "memory(GiB)": 11.57, + "step": 390, + "train_speed(iter/s)": 0.344006 + }, + { + "acc": 0.63036947, + "epoch": 0.0531986531986532, + "grad_norm": 14.75, + "learning_rate": 1.0632570659488562e-05, + "loss": 1.37007179, + "memory(GiB)": 11.57, + "step": 395, + "train_speed(iter/s)": 0.344776 + }, + { + "acc": 0.7391459, + "epoch": 0.05387205387205387, + "grad_norm": 9.0, + "learning_rate": 1.0767160161507404e-05, + "loss": 1.07246151, + "memory(GiB)": 11.57, + "step": 400, + "train_speed(iter/s)": 0.345077 + }, + { + "acc": 0.69003735, + "epoch": 0.05454545454545454, + "grad_norm": 20.125, + "learning_rate": 1.0901749663526246e-05, + "loss": 1.18787746, + "memory(GiB)": 11.57, + "step": 405, + "train_speed(iter/s)": 0.34616 + }, + { + "acc": 0.75034552, + "epoch": 0.055218855218855216, + "grad_norm": 37.75, + "learning_rate": 1.103633916554509e-05, + "loss": 0.96831808, + "memory(GiB)": 11.57, + "step": 410, + "train_speed(iter/s)": 0.346841 + }, + { + "acc": 0.57640429, + "epoch": 0.05589225589225589, + "grad_norm": 39.0, + "learning_rate": 1.1170928667563932e-05, + "loss": 1.33788805, + "memory(GiB)": 11.57, + "step": 415, + "train_speed(iter/s)": 0.347735 + }, + { + "acc": 0.73999271, + "epoch": 0.05656565656565657, + "grad_norm": 9.375, + "learning_rate": 1.1305518169582774e-05, + "loss": 1.05237989, + "memory(GiB)": 11.57, + "step": 420, + "train_speed(iter/s)": 0.347691 + }, + { + "acc": 0.66486025, + "epoch": 0.05723905723905724, + "grad_norm": 13.8125, + "learning_rate": 1.1440107671601617e-05, + "loss": 1.10302191, + "memory(GiB)": 11.57, + "step": 425, + "train_speed(iter/s)": 0.3484 + }, + { + "acc": 0.6567131, + "epoch": 0.057912457912457915, + "grad_norm": 20.5, + "learning_rate": 1.157469717362046e-05, + "loss": 1.13996429, + "memory(GiB)": 11.57, + "step": 430, + "train_speed(iter/s)": 0.349331 + }, + { + "acc": 0.70993357, + "epoch": 0.05858585858585859, + "grad_norm": 12.5625, + "learning_rate": 1.1709286675639301e-05, + "loss": 1.02943163, + "memory(GiB)": 11.57, + "step": 435, + "train_speed(iter/s)": 0.349709 + }, + { + "acc": 0.75071635, + "epoch": 0.05925925925925926, + "grad_norm": 12.25, + "learning_rate": 1.1843876177658145e-05, + "loss": 0.89214468, + "memory(GiB)": 11.57, + "step": 440, + "train_speed(iter/s)": 0.349855 + }, + { + "acc": 0.67376766, + "epoch": 0.059932659932659935, + "grad_norm": 9.9375, + "learning_rate": 1.1978465679676987e-05, + "loss": 1.3653923, + "memory(GiB)": 11.57, + "step": 445, + "train_speed(iter/s)": 0.3505 + }, + { + "acc": 0.64020987, + "epoch": 0.06060606060606061, + "grad_norm": 18.25, + "learning_rate": 1.211305518169583e-05, + "loss": 1.10264673, + "memory(GiB)": 11.57, + "step": 450, + "train_speed(iter/s)": 0.351128 + }, + { + "acc": 0.70698514, + "epoch": 0.06127946127946128, + "grad_norm": 16.625, + "learning_rate": 1.2247644683714673e-05, + "loss": 0.94584637, + "memory(GiB)": 11.57, + "step": 455, + "train_speed(iter/s)": 0.351868 + }, + { + "acc": 0.70906076, + "epoch": 0.061952861952861954, + "grad_norm": 11.5625, + "learning_rate": 1.2382234185733515e-05, + "loss": 1.19026566, + "memory(GiB)": 11.57, + "step": 460, + "train_speed(iter/s)": 0.351803 + }, + { + "acc": 0.71455946, + "epoch": 0.06262626262626263, + "grad_norm": 13.9375, + "learning_rate": 1.2516823687752355e-05, + "loss": 1.10838032, + "memory(GiB)": 11.57, + "step": 465, + "train_speed(iter/s)": 0.352101 + }, + { + "acc": 0.65413985, + "epoch": 0.0632996632996633, + "grad_norm": 9.25, + "learning_rate": 1.2651413189771197e-05, + "loss": 1.20070848, + "memory(GiB)": 11.57, + "step": 470, + "train_speed(iter/s)": 0.352223 + }, + { + "acc": 0.66868377, + "epoch": 0.06397306397306397, + "grad_norm": 12.6875, + "learning_rate": 1.2786002691790041e-05, + "loss": 1.40500479, + "memory(GiB)": 11.57, + "step": 475, + "train_speed(iter/s)": 0.352577 + }, + { + "acc": 0.71377177, + "epoch": 0.06464646464646465, + "grad_norm": 9.75, + "learning_rate": 1.2920592193808883e-05, + "loss": 0.95710163, + "memory(GiB)": 11.57, + "step": 480, + "train_speed(iter/s)": 0.35326 + }, + { + "acc": 0.69024115, + "epoch": 0.06531986531986532, + "grad_norm": 9.625, + "learning_rate": 1.3055181695827725e-05, + "loss": 1.21541462, + "memory(GiB)": 11.57, + "step": 485, + "train_speed(iter/s)": 0.353899 + }, + { + "acc": 0.67221255, + "epoch": 0.06599326599326599, + "grad_norm": 15.1875, + "learning_rate": 1.3189771197846569e-05, + "loss": 1.12308636, + "memory(GiB)": 11.57, + "step": 490, + "train_speed(iter/s)": 0.354344 + }, + { + "acc": 0.72153354, + "epoch": 0.06666666666666667, + "grad_norm": 7.1875, + "learning_rate": 1.332436069986541e-05, + "loss": 1.1415411, + "memory(GiB)": 11.57, + "step": 495, + "train_speed(iter/s)": 0.354829 + }, + { + "acc": 0.65405703, + "epoch": 0.06734006734006734, + "grad_norm": 10.125, + "learning_rate": 1.3458950201884253e-05, + "loss": 1.10353146, + "memory(GiB)": 11.57, + "step": 500, + "train_speed(iter/s)": 0.355372 + }, + { + "acc": 0.71678686, + "epoch": 0.06801346801346801, + "grad_norm": 11.3125, + "learning_rate": 1.3593539703903096e-05, + "loss": 0.81731205, + "memory(GiB)": 11.57, + "step": 505, + "train_speed(iter/s)": 0.354671 + }, + { + "acc": 0.75929475, + "epoch": 0.06868686868686869, + "grad_norm": 10.75, + "learning_rate": 1.3728129205921938e-05, + "loss": 0.86307383, + "memory(GiB)": 11.57, + "step": 510, + "train_speed(iter/s)": 0.355125 + }, + { + "acc": 0.71500969, + "epoch": 0.06936026936026936, + "grad_norm": 14.9375, + "learning_rate": 1.386271870794078e-05, + "loss": 1.25699072, + "memory(GiB)": 11.57, + "step": 515, + "train_speed(iter/s)": 0.355201 + }, + { + "acc": 0.70714483, + "epoch": 0.07003367003367003, + "grad_norm": 10.0, + "learning_rate": 1.3997308209959624e-05, + "loss": 1.0081563, + "memory(GiB)": 11.57, + "step": 520, + "train_speed(iter/s)": 0.35544 + }, + { + "acc": 0.69690123, + "epoch": 0.0707070707070707, + "grad_norm": 8.5, + "learning_rate": 1.4131897711978466e-05, + "loss": 1.17593975, + "memory(GiB)": 11.57, + "step": 525, + "train_speed(iter/s)": 0.354865 + }, + { + "acc": 0.65076323, + "epoch": 0.07138047138047138, + "grad_norm": 12.625, + "learning_rate": 1.4266487213997308e-05, + "loss": 0.90211763, + "memory(GiB)": 11.57, + "step": 530, + "train_speed(iter/s)": 0.355576 + }, + { + "acc": 0.75683184, + "epoch": 0.07205387205387205, + "grad_norm": 10.9375, + "learning_rate": 1.4401076716016152e-05, + "loss": 0.78305464, + "memory(GiB)": 11.57, + "step": 535, + "train_speed(iter/s)": 0.355643 + }, + { + "acc": 0.63366609, + "epoch": 0.07272727272727272, + "grad_norm": 24.5, + "learning_rate": 1.4535666218034994e-05, + "loss": 1.13021212, + "memory(GiB)": 11.57, + "step": 540, + "train_speed(iter/s)": 0.355557 + }, + { + "acc": 0.79359851, + "epoch": 0.0734006734006734, + "grad_norm": 13.4375, + "learning_rate": 1.4670255720053836e-05, + "loss": 0.66878433, + "memory(GiB)": 11.57, + "step": 545, + "train_speed(iter/s)": 0.355904 + }, + { + "acc": 0.73218951, + "epoch": 0.07407407407407407, + "grad_norm": 13.1875, + "learning_rate": 1.480484522207268e-05, + "loss": 0.8534152, + "memory(GiB)": 11.57, + "step": 550, + "train_speed(iter/s)": 0.356506 + }, + { + "acc": 0.74032345, + "epoch": 0.07474747474747474, + "grad_norm": 18.125, + "learning_rate": 1.4939434724091522e-05, + "loss": 1.04831533, + "memory(GiB)": 11.57, + "step": 555, + "train_speed(iter/s)": 0.356691 + }, + { + "acc": 0.79713354, + "epoch": 0.07542087542087542, + "grad_norm": 9.8125, + "learning_rate": 1.5074024226110364e-05, + "loss": 0.72469769, + "memory(GiB)": 11.57, + "step": 560, + "train_speed(iter/s)": 0.357149 + }, + { + "acc": 0.73963046, + "epoch": 0.07609427609427609, + "grad_norm": 13.0625, + "learning_rate": 1.5208613728129207e-05, + "loss": 0.92076893, + "memory(GiB)": 11.57, + "step": 565, + "train_speed(iter/s)": 0.35763 + }, + { + "acc": 0.76453247, + "epoch": 0.07676767676767676, + "grad_norm": 17.75, + "learning_rate": 1.5343203230148048e-05, + "loss": 0.82259836, + "memory(GiB)": 11.57, + "step": 570, + "train_speed(iter/s)": 0.358102 + }, + { + "acc": 0.72729082, + "epoch": 0.07744107744107744, + "grad_norm": 10.375, + "learning_rate": 1.547779273216689e-05, + "loss": 0.81022062, + "memory(GiB)": 11.57, + "step": 575, + "train_speed(iter/s)": 0.358299 + }, + { + "acc": 0.67112513, + "epoch": 0.07811447811447811, + "grad_norm": 11.5625, + "learning_rate": 1.5612382234185735e-05, + "loss": 1.24032078, + "memory(GiB)": 11.57, + "step": 580, + "train_speed(iter/s)": 0.358747 + }, + { + "acc": 0.74445891, + "epoch": 0.07878787878787878, + "grad_norm": 9.25, + "learning_rate": 1.5746971736204576e-05, + "loss": 0.91865206, + "memory(GiB)": 11.57, + "step": 585, + "train_speed(iter/s)": 0.358349 + }, + { + "acc": 0.67735329, + "epoch": 0.07946127946127945, + "grad_norm": 15.75, + "learning_rate": 1.588156123822342e-05, + "loss": 1.07022667, + "memory(GiB)": 11.57, + "step": 590, + "train_speed(iter/s)": 0.358313 + }, + { + "acc": 0.65999546, + "epoch": 0.08013468013468013, + "grad_norm": 19.25, + "learning_rate": 1.6016150740242263e-05, + "loss": 1.37132463, + "memory(GiB)": 11.57, + "step": 595, + "train_speed(iter/s)": 0.358626 + }, + { + "acc": 0.72108564, + "epoch": 0.08080808080808081, + "grad_norm": 9.5, + "learning_rate": 1.6150740242261103e-05, + "loss": 0.84458551, + "memory(GiB)": 11.57, + "step": 600, + "train_speed(iter/s)": 0.358946 + }, + { + "epoch": 0.08080808080808081, + "eval_acc": 0.7462481867334828, + "eval_loss": 1.052156686782837, + "eval_runtime": 109.4723, + "eval_samples_per_second": 1.37, + "eval_steps_per_second": 1.37, + "step": 600 + }, + { + "acc": 0.81282282, + "epoch": 0.08148148148148149, + "grad_norm": 16.75, + "learning_rate": 1.6285329744279947e-05, + "loss": 0.7542716, + "memory(GiB)": 11.57, + "step": 605, + "train_speed(iter/s)": 0.336852 + }, + { + "acc": 0.70444484, + "epoch": 0.08215488215488216, + "grad_norm": 10.375, + "learning_rate": 1.641991924629879e-05, + "loss": 1.11807833, + "memory(GiB)": 11.57, + "step": 610, + "train_speed(iter/s)": 0.337448 + }, + { + "acc": 0.73636794, + "epoch": 0.08282828282828283, + "grad_norm": 20.125, + "learning_rate": 1.655450874831763e-05, + "loss": 0.9746563, + "memory(GiB)": 11.57, + "step": 615, + "train_speed(iter/s)": 0.33771 + }, + { + "acc": 0.69418383, + "epoch": 0.08350168350168351, + "grad_norm": 9.5625, + "learning_rate": 1.6689098250336475e-05, + "loss": 1.19150572, + "memory(GiB)": 11.57, + "step": 620, + "train_speed(iter/s)": 0.338338 + }, + { + "acc": 0.77662058, + "epoch": 0.08417508417508418, + "grad_norm": 15.4375, + "learning_rate": 1.682368775235532e-05, + "loss": 0.85510263, + "memory(GiB)": 11.57, + "step": 625, + "train_speed(iter/s)": 0.338681 + }, + { + "acc": 0.66876321, + "epoch": 0.08484848484848485, + "grad_norm": 10.75, + "learning_rate": 1.695827725437416e-05, + "loss": 1.03631792, + "memory(GiB)": 11.57, + "step": 630, + "train_speed(iter/s)": 0.339105 + }, + { + "acc": 0.72490911, + "epoch": 0.08552188552188553, + "grad_norm": 12.4375, + "learning_rate": 1.7092866756393003e-05, + "loss": 0.90714159, + "memory(GiB)": 11.57, + "step": 635, + "train_speed(iter/s)": 0.339573 + }, + { + "acc": 0.715308, + "epoch": 0.0861952861952862, + "grad_norm": 16.75, + "learning_rate": 1.7227456258411846e-05, + "loss": 0.83676376, + "memory(GiB)": 11.57, + "step": 640, + "train_speed(iter/s)": 0.339536 + }, + { + "acc": 0.72004409, + "epoch": 0.08686868686868687, + "grad_norm": 11.125, + "learning_rate": 1.7362045760430687e-05, + "loss": 0.99052076, + "memory(GiB)": 11.57, + "step": 645, + "train_speed(iter/s)": 0.340137 + }, + { + "acc": 0.77303476, + "epoch": 0.08754208754208755, + "grad_norm": 13.625, + "learning_rate": 1.749663526244953e-05, + "loss": 0.89626684, + "memory(GiB)": 11.57, + "step": 650, + "train_speed(iter/s)": 0.340687 + }, + { + "acc": 0.73230481, + "epoch": 0.08821548821548822, + "grad_norm": 16.5, + "learning_rate": 1.7631224764468374e-05, + "loss": 0.84235859, + "memory(GiB)": 11.57, + "step": 655, + "train_speed(iter/s)": 0.341039 + }, + { + "acc": 0.78219714, + "epoch": 0.08888888888888889, + "grad_norm": 6.90625, + "learning_rate": 1.7765814266487214e-05, + "loss": 0.67909622, + "memory(GiB)": 11.57, + "step": 660, + "train_speed(iter/s)": 0.341389 + }, + { + "acc": 0.76528678, + "epoch": 0.08956228956228957, + "grad_norm": 13.125, + "learning_rate": 1.7900403768506058e-05, + "loss": 0.81274815, + "memory(GiB)": 11.57, + "step": 665, + "train_speed(iter/s)": 0.341734 + }, + { + "acc": 0.7723166, + "epoch": 0.09023569023569024, + "grad_norm": 11.4375, + "learning_rate": 1.8034993270524902e-05, + "loss": 0.86763144, + "memory(GiB)": 11.57, + "step": 670, + "train_speed(iter/s)": 0.341873 + }, + { + "acc": 0.75492234, + "epoch": 0.09090909090909091, + "grad_norm": 13.3125, + "learning_rate": 1.8169582772543742e-05, + "loss": 0.96772375, + "memory(GiB)": 15.04, + "step": 675, + "train_speed(iter/s)": 0.341943 + }, + { + "acc": 0.77179217, + "epoch": 0.09158249158249158, + "grad_norm": 8.5625, + "learning_rate": 1.8304172274562586e-05, + "loss": 0.804498, + "memory(GiB)": 15.04, + "step": 680, + "train_speed(iter/s)": 0.342304 + }, + { + "acc": 0.7159348, + "epoch": 0.09225589225589226, + "grad_norm": 9.6875, + "learning_rate": 1.843876177658143e-05, + "loss": 0.99472179, + "memory(GiB)": 15.04, + "step": 685, + "train_speed(iter/s)": 0.342728 + }, + { + "acc": 0.69876709, + "epoch": 0.09292929292929293, + "grad_norm": 8.0625, + "learning_rate": 1.857335127860027e-05, + "loss": 1.07095146, + "memory(GiB)": 15.04, + "step": 690, + "train_speed(iter/s)": 0.343191 + }, + { + "acc": 0.80019283, + "epoch": 0.0936026936026936, + "grad_norm": 22.5, + "learning_rate": 1.8707940780619114e-05, + "loss": 0.71981606, + "memory(GiB)": 15.04, + "step": 695, + "train_speed(iter/s)": 0.343506 + }, + { + "acc": 0.80901213, + "epoch": 0.09427609427609428, + "grad_norm": 12.375, + "learning_rate": 1.8842530282637957e-05, + "loss": 0.70246754, + "memory(GiB)": 15.04, + "step": 700, + "train_speed(iter/s)": 0.343768 + }, + { + "acc": 0.71540055, + "epoch": 0.09494949494949495, + "grad_norm": 12.9375, + "learning_rate": 1.8977119784656798e-05, + "loss": 1.19597988, + "memory(GiB)": 15.04, + "step": 705, + "train_speed(iter/s)": 0.344148 + }, + { + "acc": 0.65902338, + "epoch": 0.09562289562289562, + "grad_norm": 9.25, + "learning_rate": 1.911170928667564e-05, + "loss": 1.11431704, + "memory(GiB)": 15.04, + "step": 710, + "train_speed(iter/s)": 0.34419 + }, + { + "acc": 0.8256176, + "epoch": 0.0962962962962963, + "grad_norm": 8.875, + "learning_rate": 1.9246298788694485e-05, + "loss": 0.58568578, + "memory(GiB)": 15.04, + "step": 715, + "train_speed(iter/s)": 0.344353 + }, + { + "acc": 0.79482641, + "epoch": 0.09696969696969697, + "grad_norm": 9.75, + "learning_rate": 1.9380888290713325e-05, + "loss": 0.8713213, + "memory(GiB)": 15.04, + "step": 720, + "train_speed(iter/s)": 0.344625 + }, + { + "acc": 0.74385633, + "epoch": 0.09764309764309764, + "grad_norm": 8.375, + "learning_rate": 1.951547779273217e-05, + "loss": 0.83829308, + "memory(GiB)": 15.04, + "step": 725, + "train_speed(iter/s)": 0.344891 + }, + { + "acc": 0.79673042, + "epoch": 0.09831649831649832, + "grad_norm": 16.25, + "learning_rate": 1.9650067294751013e-05, + "loss": 0.73992858, + "memory(GiB)": 15.04, + "step": 730, + "train_speed(iter/s)": 0.345234 + }, + { + "acc": 0.80602369, + "epoch": 0.09898989898989899, + "grad_norm": 10.8125, + "learning_rate": 1.9784656796769853e-05, + "loss": 0.68064899, + "memory(GiB)": 15.04, + "step": 735, + "train_speed(iter/s)": 0.34545 + }, + { + "acc": 0.77511039, + "epoch": 0.09966329966329966, + "grad_norm": 7.4375, + "learning_rate": 1.9919246298788697e-05, + "loss": 0.77711539, + "memory(GiB)": 15.04, + "step": 740, + "train_speed(iter/s)": 0.345826 + }, + { + "acc": 0.76451511, + "epoch": 0.10033670033670034, + "grad_norm": 8.125, + "learning_rate": 1.9999999008117105e-05, + "loss": 0.83295755, + "memory(GiB)": 15.04, + "step": 745, + "train_speed(iter/s)": 0.34599 + }, + { + "acc": 0.78624372, + "epoch": 0.10101010101010101, + "grad_norm": 7.84375, + "learning_rate": 1.999998784943679e-05, + "loss": 0.7498014, + "memory(GiB)": 15.04, + "step": 750, + "train_speed(iter/s)": 0.346026 + }, + { + "acc": 0.77836795, + "epoch": 0.10168350168350168, + "grad_norm": 15.75, + "learning_rate": 1.999996429223642e-05, + "loss": 0.76124926, + "memory(GiB)": 15.04, + "step": 755, + "train_speed(iter/s)": 0.346306 + }, + { + "acc": 0.84560328, + "epoch": 0.10235690235690235, + "grad_norm": 7.5625, + "learning_rate": 1.9999928336545205e-05, + "loss": 0.63052053, + "memory(GiB)": 15.04, + "step": 760, + "train_speed(iter/s)": 0.346406 + }, + { + "acc": 0.77344265, + "epoch": 0.10303030303030303, + "grad_norm": 16.5, + "learning_rate": 1.9999879982407722e-05, + "loss": 0.75681195, + "memory(GiB)": 15.04, + "step": 765, + "train_speed(iter/s)": 0.346668 + }, + { + "acc": 0.78544083, + "epoch": 0.1037037037037037, + "grad_norm": 14.5, + "learning_rate": 1.9999819229883925e-05, + "loss": 0.7890686, + "memory(GiB)": 15.04, + "step": 770, + "train_speed(iter/s)": 0.346842 + }, + { + "acc": 0.80033808, + "epoch": 0.10437710437710437, + "grad_norm": 12.5625, + "learning_rate": 1.9999746079049136e-05, + "loss": 0.72480288, + "memory(GiB)": 15.04, + "step": 775, + "train_speed(iter/s)": 0.346905 + }, + { + "acc": 0.81095028, + "epoch": 0.10505050505050505, + "grad_norm": 7.78125, + "learning_rate": 1.9999660529994056e-05, + "loss": 0.647194, + "memory(GiB)": 15.04, + "step": 780, + "train_speed(iter/s)": 0.347101 + }, + { + "acc": 0.83096733, + "epoch": 0.10572390572390572, + "grad_norm": 8.75, + "learning_rate": 1.9999562582824747e-05, + "loss": 0.67273521, + "memory(GiB)": 15.04, + "step": 785, + "train_speed(iter/s)": 0.347234 + }, + { + "acc": 0.74170909, + "epoch": 0.1063973063973064, + "grad_norm": 12.125, + "learning_rate": 1.9999452237662655e-05, + "loss": 0.9027566, + "memory(GiB)": 15.04, + "step": 790, + "train_speed(iter/s)": 0.34764 + }, + { + "acc": 0.78647127, + "epoch": 0.10707070707070707, + "grad_norm": 9.875, + "learning_rate": 1.9999329494644588e-05, + "loss": 0.85048122, + "memory(GiB)": 15.04, + "step": 795, + "train_speed(iter/s)": 0.347928 + }, + { + "acc": 0.81664906, + "epoch": 0.10774410774410774, + "grad_norm": 25.125, + "learning_rate": 1.9999194353922732e-05, + "loss": 0.60916672, + "memory(GiB)": 15.04, + "step": 800, + "train_speed(iter/s)": 0.348221 + }, + { + "acc": 0.76929884, + "epoch": 0.10841750841750841, + "grad_norm": 15.0625, + "learning_rate": 1.999904681566464e-05, + "loss": 1.00046215, + "memory(GiB)": 15.04, + "step": 805, + "train_speed(iter/s)": 0.348048 + }, + { + "acc": 0.79135976, + "epoch": 0.10909090909090909, + "grad_norm": 11.8125, + "learning_rate": 1.9998886880053233e-05, + "loss": 0.7122694, + "memory(GiB)": 15.04, + "step": 810, + "train_speed(iter/s)": 0.348227 + }, + { + "acc": 0.79001317, + "epoch": 0.10976430976430976, + "grad_norm": 16.25, + "learning_rate": 1.9998714547286816e-05, + "loss": 0.7056983, + "memory(GiB)": 15.04, + "step": 815, + "train_speed(iter/s)": 0.348282 + }, + { + "acc": 0.73441644, + "epoch": 0.11043771043771043, + "grad_norm": 28.5, + "learning_rate": 1.9998529817579055e-05, + "loss": 1.05510197, + "memory(GiB)": 15.04, + "step": 820, + "train_speed(iter/s)": 0.348166 + }, + { + "acc": 0.79821463, + "epoch": 0.1111111111111111, + "grad_norm": 9.3125, + "learning_rate": 1.9998332691158985e-05, + "loss": 0.60838032, + "memory(GiB)": 15.04, + "step": 825, + "train_speed(iter/s)": 0.348333 + }, + { + "acc": 0.72554183, + "epoch": 0.11178451178451178, + "grad_norm": 10.1875, + "learning_rate": 1.9998123168271017e-05, + "loss": 0.8306942, + "memory(GiB)": 15.04, + "step": 830, + "train_speed(iter/s)": 0.348743 + }, + { + "acc": 0.86007366, + "epoch": 0.11245791245791245, + "grad_norm": 6.8125, + "learning_rate": 1.9997901249174924e-05, + "loss": 0.5353889, + "memory(GiB)": 15.04, + "step": 835, + "train_speed(iter/s)": 0.348942 + }, + { + "acc": 0.78542347, + "epoch": 0.11313131313131314, + "grad_norm": 11.5625, + "learning_rate": 1.9997666934145858e-05, + "loss": 0.6618031, + "memory(GiB)": 15.04, + "step": 840, + "train_speed(iter/s)": 0.349304 + }, + { + "acc": 0.76181016, + "epoch": 0.11380471380471381, + "grad_norm": 17.875, + "learning_rate": 1.999742022347433e-05, + "loss": 0.74508605, + "memory(GiB)": 15.04, + "step": 845, + "train_speed(iter/s)": 0.349287 + }, + { + "acc": 0.75039043, + "epoch": 0.11447811447811448, + "grad_norm": 7.4375, + "learning_rate": 1.999716111746623e-05, + "loss": 0.70882645, + "memory(GiB)": 15.04, + "step": 850, + "train_speed(iter/s)": 0.349239 + }, + { + "acc": 0.75623989, + "epoch": 0.11515151515151516, + "grad_norm": 19.25, + "learning_rate": 1.9996889616442808e-05, + "loss": 0.73483896, + "memory(GiB)": 15.04, + "step": 855, + "train_speed(iter/s)": 0.349329 + }, + { + "acc": 0.82510233, + "epoch": 0.11582491582491583, + "grad_norm": 18.75, + "learning_rate": 1.999660572074069e-05, + "loss": 0.57684283, + "memory(GiB)": 15.04, + "step": 860, + "train_speed(iter/s)": 0.349558 + }, + { + "acc": 0.76790061, + "epoch": 0.1164983164983165, + "grad_norm": 16.75, + "learning_rate": 1.999630943071186e-05, + "loss": 1.15222225, + "memory(GiB)": 15.04, + "step": 865, + "train_speed(iter/s)": 0.34962 + }, + { + "acc": 0.70848417, + "epoch": 0.11717171717171718, + "grad_norm": 15.625, + "learning_rate": 1.9996000746723677e-05, + "loss": 0.76727939, + "memory(GiB)": 15.04, + "step": 870, + "train_speed(iter/s)": 0.349986 + }, + { + "acc": 0.75316901, + "epoch": 0.11784511784511785, + "grad_norm": 14.5625, + "learning_rate": 1.999567966915886e-05, + "loss": 0.86907434, + "memory(GiB)": 15.04, + "step": 875, + "train_speed(iter/s)": 0.34999 + }, + { + "acc": 0.81012983, + "epoch": 0.11851851851851852, + "grad_norm": 8.5625, + "learning_rate": 1.9995346198415507e-05, + "loss": 0.69052753, + "memory(GiB)": 15.04, + "step": 880, + "train_speed(iter/s)": 0.350055 + }, + { + "acc": 0.72671976, + "epoch": 0.1191919191919192, + "grad_norm": 13.5625, + "learning_rate": 1.9995000334907067e-05, + "loss": 1.07237949, + "memory(GiB)": 15.04, + "step": 885, + "train_speed(iter/s)": 0.350041 + }, + { + "acc": 0.79563127, + "epoch": 0.11986531986531987, + "grad_norm": 6.34375, + "learning_rate": 1.9994642079062355e-05, + "loss": 0.71864276, + "memory(GiB)": 15.04, + "step": 890, + "train_speed(iter/s)": 0.35027 + }, + { + "acc": 0.80651369, + "epoch": 0.12053872053872054, + "grad_norm": 11.4375, + "learning_rate": 1.999427143132557e-05, + "loss": 0.55639653, + "memory(GiB)": 15.04, + "step": 895, + "train_speed(iter/s)": 0.350582 + }, + { + "acc": 0.80839796, + "epoch": 0.12121212121212122, + "grad_norm": 6.8125, + "learning_rate": 1.9993888392156243e-05, + "loss": 0.78113098, + "memory(GiB)": 15.04, + "step": 900, + "train_speed(iter/s)": 0.35086 + }, + { + "epoch": 0.12121212121212122, + "eval_acc": 0.7964180042418371, + "eval_loss": 0.8123638033866882, + "eval_runtime": 109.7552, + "eval_samples_per_second": 1.367, + "eval_steps_per_second": 1.367, + "step": 900 + }, + { + "acc": 0.82671185, + "epoch": 0.12188552188552189, + "grad_norm": 8.9375, + "learning_rate": 1.99934929620293e-05, + "loss": 0.67872958, + "memory(GiB)": 15.04, + "step": 905, + "train_speed(iter/s)": 0.336644 + }, + { + "acc": 0.78594241, + "epoch": 0.12255892255892256, + "grad_norm": 13.0, + "learning_rate": 1.9993085141435013e-05, + "loss": 0.72096338, + "memory(GiB)": 15.04, + "step": 910, + "train_speed(iter/s)": 0.336923 + }, + { + "acc": 0.80444469, + "epoch": 0.12323232323232323, + "grad_norm": 8.4375, + "learning_rate": 1.9992664930879018e-05, + "loss": 0.80335388, + "memory(GiB)": 15.04, + "step": 915, + "train_speed(iter/s)": 0.337332 + }, + { + "acc": 0.87560616, + "epoch": 0.12390572390572391, + "grad_norm": 8.6875, + "learning_rate": 1.9992232330882314e-05, + "loss": 0.45894027, + "memory(GiB)": 15.04, + "step": 920, + "train_speed(iter/s)": 0.337763 + }, + { + "acc": 0.73397541, + "epoch": 0.12457912457912458, + "grad_norm": 16.75, + "learning_rate": 1.9991787341981263e-05, + "loss": 1.12627192, + "memory(GiB)": 15.04, + "step": 925, + "train_speed(iter/s)": 0.338106 + }, + { + "acc": 0.80420237, + "epoch": 0.12525252525252525, + "grad_norm": 17.875, + "learning_rate": 1.9991329964727585e-05, + "loss": 0.65375113, + "memory(GiB)": 15.04, + "step": 930, + "train_speed(iter/s)": 0.338396 + }, + { + "acc": 0.76427999, + "epoch": 0.1259259259259259, + "grad_norm": 12.375, + "learning_rate": 1.9990860199688366e-05, + "loss": 0.82494192, + "memory(GiB)": 15.04, + "step": 935, + "train_speed(iter/s)": 0.33873 + }, + { + "acc": 0.6646564, + "epoch": 0.1265993265993266, + "grad_norm": 9.6875, + "learning_rate": 1.999037804744604e-05, + "loss": 0.83276958, + "memory(GiB)": 15.04, + "step": 940, + "train_speed(iter/s)": 0.338974 + }, + { + "acc": 0.81569929, + "epoch": 0.12727272727272726, + "grad_norm": 9.4375, + "learning_rate": 1.9989883508598406e-05, + "loss": 0.73578134, + "memory(GiB)": 15.04, + "step": 945, + "train_speed(iter/s)": 0.339056 + }, + { + "acc": 0.7477592, + "epoch": 0.12794612794612795, + "grad_norm": 8.625, + "learning_rate": 1.998937658375862e-05, + "loss": 0.58547649, + "memory(GiB)": 15.04, + "step": 950, + "train_speed(iter/s)": 0.339272 + }, + { + "acc": 0.79679656, + "epoch": 0.1286195286195286, + "grad_norm": 6.9375, + "learning_rate": 1.9988857273555196e-05, + "loss": 0.64831743, + "memory(GiB)": 15.04, + "step": 955, + "train_speed(iter/s)": 0.339339 + }, + { + "acc": 0.83822346, + "epoch": 0.1292929292929293, + "grad_norm": 6.71875, + "learning_rate": 1.9988325578632003e-05, + "loss": 0.66460371, + "memory(GiB)": 15.04, + "step": 960, + "train_speed(iter/s)": 0.339392 + }, + { + "acc": 0.78960447, + "epoch": 0.12996632996632998, + "grad_norm": 8.0625, + "learning_rate": 1.9987781499648262e-05, + "loss": 0.60568805, + "memory(GiB)": 15.04, + "step": 965, + "train_speed(iter/s)": 0.339771 + }, + { + "acc": 0.79516697, + "epoch": 0.13063973063973064, + "grad_norm": 10.5625, + "learning_rate": 1.9987225037278553e-05, + "loss": 0.79415574, + "memory(GiB)": 15.04, + "step": 970, + "train_speed(iter/s)": 0.33991 + }, + { + "acc": 0.80595303, + "epoch": 0.13131313131313133, + "grad_norm": 11.25, + "learning_rate": 1.9986656192212805e-05, + "loss": 0.64286175, + "memory(GiB)": 15.04, + "step": 975, + "train_speed(iter/s)": 0.339948 + }, + { + "acc": 0.80993471, + "epoch": 0.13198653198653199, + "grad_norm": 7.1875, + "learning_rate": 1.9986074965156307e-05, + "loss": 0.70274277, + "memory(GiB)": 15.04, + "step": 980, + "train_speed(iter/s)": 0.340129 + }, + { + "acc": 0.77113862, + "epoch": 0.13265993265993267, + "grad_norm": 9.375, + "learning_rate": 1.9985481356829693e-05, + "loss": 0.59495711, + "memory(GiB)": 15.04, + "step": 985, + "train_speed(iter/s)": 0.340343 + }, + { + "acc": 0.84521904, + "epoch": 0.13333333333333333, + "grad_norm": 7.75, + "learning_rate": 1.9984875367968955e-05, + "loss": 0.57561235, + "memory(GiB)": 15.04, + "step": 990, + "train_speed(iter/s)": 0.340386 + }, + { + "acc": 0.80663528, + "epoch": 0.13400673400673402, + "grad_norm": 13.0625, + "learning_rate": 1.9984256999325423e-05, + "loss": 0.90646124, + "memory(GiB)": 15.04, + "step": 995, + "train_speed(iter/s)": 0.340563 + }, + { + "acc": 0.74123559, + "epoch": 0.13468013468013468, + "grad_norm": 11.25, + "learning_rate": 1.9983626251665788e-05, + "loss": 0.90148401, + "memory(GiB)": 15.04, + "step": 1000, + "train_speed(iter/s)": 0.340293 + }, + { + "acc": 0.85422077, + "epoch": 0.13535353535353536, + "grad_norm": 7.03125, + "learning_rate": 1.9982983125772082e-05, + "loss": 0.51592302, + "memory(GiB)": 15.04, + "step": 1005, + "train_speed(iter/s)": 0.339951 + }, + { + "acc": 0.84535894, + "epoch": 0.13602693602693602, + "grad_norm": 13.1875, + "learning_rate": 1.9982327622441688e-05, + "loss": 0.53820696, + "memory(GiB)": 15.04, + "step": 1010, + "train_speed(iter/s)": 0.340195 + }, + { + "acc": 0.83110466, + "epoch": 0.1367003367003367, + "grad_norm": 10.75, + "learning_rate": 1.9981659742487337e-05, + "loss": 0.56353827, + "memory(GiB)": 15.04, + "step": 1015, + "train_speed(iter/s)": 0.340461 + }, + { + "acc": 0.87078714, + "epoch": 0.13737373737373737, + "grad_norm": 7.6875, + "learning_rate": 1.99809794867371e-05, + "loss": 0.47190948, + "memory(GiB)": 15.04, + "step": 1020, + "train_speed(iter/s)": 0.340756 + }, + { + "acc": 0.86921206, + "epoch": 0.13804713804713806, + "grad_norm": 12.5, + "learning_rate": 1.998028685603439e-05, + "loss": 0.47288909, + "memory(GiB)": 15.04, + "step": 1025, + "train_speed(iter/s)": 0.34102 + }, + { + "acc": 0.74840388, + "epoch": 0.13872053872053872, + "grad_norm": 10.6875, + "learning_rate": 1.9979581851237974e-05, + "loss": 0.69720249, + "memory(GiB)": 15.04, + "step": 1030, + "train_speed(iter/s)": 0.341312 + }, + { + "acc": 0.81624384, + "epoch": 0.1393939393939394, + "grad_norm": 7.9375, + "learning_rate": 1.9978864473221954e-05, + "loss": 0.59940052, + "memory(GiB)": 15.04, + "step": 1035, + "train_speed(iter/s)": 0.341474 + }, + { + "acc": 0.84893894, + "epoch": 0.14006734006734006, + "grad_norm": 9.125, + "learning_rate": 1.997813472287577e-05, + "loss": 0.53334293, + "memory(GiB)": 15.04, + "step": 1040, + "train_speed(iter/s)": 0.341755 + }, + { + "acc": 0.81686001, + "epoch": 0.14074074074074075, + "grad_norm": 10.0, + "learning_rate": 1.997739260110421e-05, + "loss": 0.70683684, + "memory(GiB)": 15.04, + "step": 1045, + "train_speed(iter/s)": 0.341467 + }, + { + "acc": 0.73511386, + "epoch": 0.1414141414141414, + "grad_norm": 12.4375, + "learning_rate": 1.9976638108827395e-05, + "loss": 0.8299778, + "memory(GiB)": 15.04, + "step": 1050, + "train_speed(iter/s)": 0.341592 + }, + { + "acc": 0.84761677, + "epoch": 0.1420875420875421, + "grad_norm": 7.34375, + "learning_rate": 1.997587124698078e-05, + "loss": 0.52306743, + "memory(GiB)": 15.04, + "step": 1055, + "train_speed(iter/s)": 0.341691 + }, + { + "acc": 0.80669956, + "epoch": 0.14276094276094276, + "grad_norm": 5.75, + "learning_rate": 1.997509201651517e-05, + "loss": 0.57092929, + "memory(GiB)": 15.04, + "step": 1060, + "train_speed(iter/s)": 0.341936 + }, + { + "acc": 0.79653311, + "epoch": 0.14343434343434344, + "grad_norm": 9.125, + "learning_rate": 1.9974300418396688e-05, + "loss": 0.84826164, + "memory(GiB)": 15.04, + "step": 1065, + "train_speed(iter/s)": 0.342245 + }, + { + "acc": 0.84764261, + "epoch": 0.1441077441077441, + "grad_norm": 7.9375, + "learning_rate": 1.9973496453606808e-05, + "loss": 0.48173232, + "memory(GiB)": 15.04, + "step": 1070, + "train_speed(iter/s)": 0.34231 + }, + { + "acc": 0.76810174, + "epoch": 0.1447811447811448, + "grad_norm": 11.25, + "learning_rate": 1.9972680123142322e-05, + "loss": 0.87655268, + "memory(GiB)": 15.04, + "step": 1075, + "train_speed(iter/s)": 0.342497 + }, + { + "acc": 0.83370619, + "epoch": 0.14545454545454545, + "grad_norm": 7.25, + "learning_rate": 1.997185142801536e-05, + "loss": 0.69885473, + "memory(GiB)": 15.04, + "step": 1080, + "train_speed(iter/s)": 0.342508 + }, + { + "acc": 0.83789968, + "epoch": 0.14612794612794613, + "grad_norm": 7.3125, + "learning_rate": 1.9971010369253388e-05, + "loss": 0.6340858, + "memory(GiB)": 15.04, + "step": 1085, + "train_speed(iter/s)": 0.342112 + }, + { + "acc": 0.80213232, + "epoch": 0.1468013468013468, + "grad_norm": 23.5, + "learning_rate": 1.997015694789919e-05, + "loss": 0.65443606, + "memory(GiB)": 15.04, + "step": 1090, + "train_speed(iter/s)": 0.342229 + }, + { + "acc": 0.74166551, + "epoch": 0.14747474747474748, + "grad_norm": 9.4375, + "learning_rate": 1.9969291165010886e-05, + "loss": 1.01165895, + "memory(GiB)": 15.04, + "step": 1095, + "train_speed(iter/s)": 0.342523 + }, + { + "acc": 0.8297987, + "epoch": 0.14814814814814814, + "grad_norm": 8.9375, + "learning_rate": 1.9968413021661925e-05, + "loss": 0.64009571, + "memory(GiB)": 15.04, + "step": 1100, + "train_speed(iter/s)": 0.342779 + }, + { + "acc": 0.83918276, + "epoch": 0.14882154882154883, + "grad_norm": 10.5625, + "learning_rate": 1.9967522518941066e-05, + "loss": 0.45333061, + "memory(GiB)": 15.04, + "step": 1105, + "train_speed(iter/s)": 0.343074 + }, + { + "acc": 0.78889961, + "epoch": 0.1494949494949495, + "grad_norm": 12.25, + "learning_rate": 1.996661965795241e-05, + "loss": 0.70123796, + "memory(GiB)": 15.04, + "step": 1110, + "train_speed(iter/s)": 0.343202 + }, + { + "acc": 0.70950618, + "epoch": 0.15016835016835017, + "grad_norm": 15.8125, + "learning_rate": 1.9965704439815368e-05, + "loss": 1.0612565, + "memory(GiB)": 15.04, + "step": 1115, + "train_speed(iter/s)": 0.343061 + }, + { + "acc": 0.71501498, + "epoch": 0.15084175084175083, + "grad_norm": 6.875, + "learning_rate": 1.996477686566468e-05, + "loss": 1.3117177, + "memory(GiB)": 15.04, + "step": 1120, + "train_speed(iter/s)": 0.343087 + }, + { + "acc": 0.85039539, + "epoch": 0.15151515151515152, + "grad_norm": 10.1875, + "learning_rate": 1.9963836936650397e-05, + "loss": 0.52983537, + "memory(GiB)": 15.04, + "step": 1125, + "train_speed(iter/s)": 0.34331 + }, + { + "acc": 0.85561619, + "epoch": 0.15218855218855218, + "grad_norm": 15.375, + "learning_rate": 1.9962884653937897e-05, + "loss": 0.4804214, + "memory(GiB)": 15.04, + "step": 1130, + "train_speed(iter/s)": 0.343552 + }, + { + "acc": 0.780967, + "epoch": 0.15286195286195287, + "grad_norm": 7.75, + "learning_rate": 1.996192001870787e-05, + "loss": 0.67414761, + "memory(GiB)": 15.04, + "step": 1135, + "train_speed(iter/s)": 0.343508 + }, + { + "acc": 0.78523622, + "epoch": 0.15353535353535352, + "grad_norm": 10.875, + "learning_rate": 1.9960943032156327e-05, + "loss": 0.79191904, + "memory(GiB)": 15.04, + "step": 1140, + "train_speed(iter/s)": 0.343405 + }, + { + "acc": 0.81021652, + "epoch": 0.1542087542087542, + "grad_norm": 18.0, + "learning_rate": 1.995995369549458e-05, + "loss": 0.63394156, + "memory(GiB)": 15.04, + "step": 1145, + "train_speed(iter/s)": 0.343739 + }, + { + "acc": 0.84955807, + "epoch": 0.15488215488215487, + "grad_norm": 6.8125, + "learning_rate": 1.9958952009949264e-05, + "loss": 0.53475342, + "memory(GiB)": 15.04, + "step": 1150, + "train_speed(iter/s)": 0.343725 + }, + { + "acc": 0.78412313, + "epoch": 0.15555555555555556, + "grad_norm": 40.5, + "learning_rate": 1.9957937976762327e-05, + "loss": 0.79959445, + "memory(GiB)": 15.04, + "step": 1155, + "train_speed(iter/s)": 0.344017 + }, + { + "acc": 0.7680315, + "epoch": 0.15622895622895622, + "grad_norm": 10.1875, + "learning_rate": 1.9956911597191017e-05, + "loss": 0.60923586, + "memory(GiB)": 15.04, + "step": 1160, + "train_speed(iter/s)": 0.344356 + }, + { + "acc": 0.83539982, + "epoch": 0.1569023569023569, + "grad_norm": 7.9375, + "learning_rate": 1.9955872872507897e-05, + "loss": 0.63824601, + "memory(GiB)": 15.04, + "step": 1165, + "train_speed(iter/s)": 0.34463 + }, + { + "acc": 0.79414153, + "epoch": 0.15757575757575756, + "grad_norm": 6.21875, + "learning_rate": 1.995482180400083e-05, + "loss": 0.73920679, + "memory(GiB)": 15.04, + "step": 1170, + "train_speed(iter/s)": 0.344532 + }, + { + "acc": 0.87649174, + "epoch": 0.15824915824915825, + "grad_norm": 5.78125, + "learning_rate": 1.9953758392972988e-05, + "loss": 0.52685447, + "memory(GiB)": 15.04, + "step": 1175, + "train_speed(iter/s)": 0.344496 + }, + { + "acc": 0.74325509, + "epoch": 0.1589225589225589, + "grad_norm": 36.25, + "learning_rate": 1.9952682640742847e-05, + "loss": 0.96344852, + "memory(GiB)": 15.04, + "step": 1180, + "train_speed(iter/s)": 0.344618 + }, + { + "acc": 0.8145812, + "epoch": 0.1595959595959596, + "grad_norm": 7.4375, + "learning_rate": 1.9951594548644183e-05, + "loss": 0.48234911, + "memory(GiB)": 15.04, + "step": 1185, + "train_speed(iter/s)": 0.344703 + }, + { + "acc": 0.81938677, + "epoch": 0.16026936026936026, + "grad_norm": 7.59375, + "learning_rate": 1.995049411802607e-05, + "loss": 0.47550235, + "memory(GiB)": 15.04, + "step": 1190, + "train_speed(iter/s)": 0.344847 + }, + { + "acc": 0.79189601, + "epoch": 0.16094276094276094, + "grad_norm": 18.5, + "learning_rate": 1.9949381350252878e-05, + "loss": 0.77639685, + "memory(GiB)": 15.04, + "step": 1195, + "train_speed(iter/s)": 0.345159 + }, + { + "acc": 0.86258478, + "epoch": 0.16161616161616163, + "grad_norm": 8.0, + "learning_rate": 1.9948256246704275e-05, + "loss": 0.60039167, + "memory(GiB)": 15.04, + "step": 1200, + "train_speed(iter/s)": 0.345081 + }, + { + "epoch": 0.16161616161616163, + "eval_acc": 0.8230703414659442, + "eval_loss": 0.7040194869041443, + "eval_runtime": 109.8343, + "eval_samples_per_second": 1.366, + "eval_steps_per_second": 1.366, + "step": 1200 + }, + { + "acc": 0.82389164, + "epoch": 0.1622895622895623, + "grad_norm": 19.625, + "learning_rate": 1.994711880877523e-05, + "loss": 0.63736305, + "memory(GiB)": 15.04, + "step": 1205, + "train_speed(iter/s)": 0.334716 + }, + { + "acc": 0.82516699, + "epoch": 0.16296296296296298, + "grad_norm": 7.71875, + "learning_rate": 1.9945969037876e-05, + "loss": 0.62200007, + "memory(GiB)": 15.04, + "step": 1210, + "train_speed(iter/s)": 0.334904 + }, + { + "acc": 0.8145359, + "epoch": 0.16363636363636364, + "grad_norm": 16.125, + "learning_rate": 1.9944806935432127e-05, + "loss": 0.56476302, + "memory(GiB)": 15.04, + "step": 1215, + "train_speed(iter/s)": 0.335158 + }, + { + "acc": 0.76596346, + "epoch": 0.16430976430976432, + "grad_norm": 9.8125, + "learning_rate": 1.9943632502884448e-05, + "loss": 0.91625757, + "memory(GiB)": 15.04, + "step": 1220, + "train_speed(iter/s)": 0.335401 + }, + { + "acc": 0.80096502, + "epoch": 0.16498316498316498, + "grad_norm": 7.65625, + "learning_rate": 1.9942445741689093e-05, + "loss": 0.54522581, + "memory(GiB)": 15.04, + "step": 1225, + "train_speed(iter/s)": 0.335478 + }, + { + "acc": 0.82449436, + "epoch": 0.16565656565656567, + "grad_norm": 7.53125, + "learning_rate": 1.9941246653317465e-05, + "loss": 0.61662765, + "memory(GiB)": 15.04, + "step": 1230, + "train_speed(iter/s)": 0.335694 + }, + { + "acc": 0.78079591, + "epoch": 0.16632996632996633, + "grad_norm": 8.6875, + "learning_rate": 1.9940035239256265e-05, + "loss": 0.77522221, + "memory(GiB)": 15.04, + "step": 1235, + "train_speed(iter/s)": 0.335946 + }, + { + "acc": 0.78646274, + "epoch": 0.16700336700336701, + "grad_norm": 21.375, + "learning_rate": 1.9938811501007462e-05, + "loss": 0.62522068, + "memory(GiB)": 15.04, + "step": 1240, + "train_speed(iter/s)": 0.336263 + }, + { + "acc": 0.7506434, + "epoch": 0.16767676767676767, + "grad_norm": 7.5, + "learning_rate": 1.9937575440088316e-05, + "loss": 0.62290888, + "memory(GiB)": 15.04, + "step": 1245, + "train_speed(iter/s)": 0.336456 + }, + { + "acc": 0.84636002, + "epoch": 0.16835016835016836, + "grad_norm": 8.875, + "learning_rate": 1.993632705803136e-05, + "loss": 0.60505009, + "memory(GiB)": 15.04, + "step": 1250, + "train_speed(iter/s)": 0.336644 + }, + { + "acc": 0.80847054, + "epoch": 0.16902356902356902, + "grad_norm": 14.875, + "learning_rate": 1.993506635638441e-05, + "loss": 0.63604989, + "memory(GiB)": 15.04, + "step": 1255, + "train_speed(iter/s)": 0.336913 + }, + { + "acc": 0.87230434, + "epoch": 0.1696969696969697, + "grad_norm": 8.375, + "learning_rate": 1.9933793336710545e-05, + "loss": 0.50676417, + "memory(GiB)": 15.04, + "step": 1260, + "train_speed(iter/s)": 0.337162 + }, + { + "acc": 0.80247259, + "epoch": 0.17037037037037037, + "grad_norm": 18.25, + "learning_rate": 1.9932508000588123e-05, + "loss": 0.73981152, + "memory(GiB)": 15.04, + "step": 1265, + "train_speed(iter/s)": 0.337447 + }, + { + "acc": 0.82226982, + "epoch": 0.17104377104377105, + "grad_norm": 6.5, + "learning_rate": 1.9931210349610776e-05, + "loss": 0.9066226, + "memory(GiB)": 15.04, + "step": 1270, + "train_speed(iter/s)": 0.337438 + }, + { + "acc": 0.78620706, + "epoch": 0.1717171717171717, + "grad_norm": 5.5, + "learning_rate": 1.99299003853874e-05, + "loss": 0.70232997, + "memory(GiB)": 15.04, + "step": 1275, + "train_speed(iter/s)": 0.337262 + }, + { + "acc": 0.78566852, + "epoch": 0.1723905723905724, + "grad_norm": 13.375, + "learning_rate": 1.992857810954216e-05, + "loss": 0.53193336, + "memory(GiB)": 15.04, + "step": 1280, + "train_speed(iter/s)": 0.337281 + }, + { + "acc": 0.79574518, + "epoch": 0.17306397306397306, + "grad_norm": 11.625, + "learning_rate": 1.992724352371448e-05, + "loss": 0.50628576, + "memory(GiB)": 15.04, + "step": 1285, + "train_speed(iter/s)": 0.337486 + }, + { + "acc": 0.79572105, + "epoch": 0.17373737373737375, + "grad_norm": 7.8125, + "learning_rate": 1.9925896629559058e-05, + "loss": 0.89431658, + "memory(GiB)": 15.04, + "step": 1290, + "train_speed(iter/s)": 0.337792 + }, + { + "acc": 0.76915941, + "epoch": 0.1744107744107744, + "grad_norm": 14.1875, + "learning_rate": 1.9924537428745838e-05, + "loss": 0.71194468, + "memory(GiB)": 15.04, + "step": 1295, + "train_speed(iter/s)": 0.337887 + }, + { + "acc": 0.84247351, + "epoch": 0.1750841750841751, + "grad_norm": 28.5, + "learning_rate": 1.9923165922960036e-05, + "loss": 0.52920589, + "memory(GiB)": 15.04, + "step": 1300, + "train_speed(iter/s)": 0.337901 + }, + { + "acc": 0.80220499, + "epoch": 0.17575757575757575, + "grad_norm": 7.9375, + "learning_rate": 1.9921782113902113e-05, + "loss": 0.52305183, + "memory(GiB)": 15.04, + "step": 1305, + "train_speed(iter/s)": 0.338015 + }, + { + "acc": 0.79102888, + "epoch": 0.17643097643097644, + "grad_norm": 8.375, + "learning_rate": 1.992038600328779e-05, + "loss": 0.77678123, + "memory(GiB)": 15.04, + "step": 1310, + "train_speed(iter/s)": 0.33802 + }, + { + "acc": 0.84969645, + "epoch": 0.1771043771043771, + "grad_norm": 7.25, + "learning_rate": 1.9918977592848044e-05, + "loss": 0.68033156, + "memory(GiB)": 15.04, + "step": 1315, + "train_speed(iter/s)": 0.33813 + }, + { + "acc": 0.84281616, + "epoch": 0.17777777777777778, + "grad_norm": 13.0625, + "learning_rate": 1.9917556884329096e-05, + "loss": 0.55574412, + "memory(GiB)": 15.04, + "step": 1320, + "train_speed(iter/s)": 0.33842 + }, + { + "acc": 0.82773514, + "epoch": 0.17845117845117844, + "grad_norm": 9.75, + "learning_rate": 1.9916123879492416e-05, + "loss": 0.61481857, + "memory(GiB)": 15.04, + "step": 1325, + "train_speed(iter/s)": 0.338574 + }, + { + "acc": 0.8670886, + "epoch": 0.17912457912457913, + "grad_norm": 7.375, + "learning_rate": 1.9914678580114716e-05, + "loss": 0.52466178, + "memory(GiB)": 15.04, + "step": 1330, + "train_speed(iter/s)": 0.338585 + }, + { + "acc": 0.81682062, + "epoch": 0.1797979797979798, + "grad_norm": 19.375, + "learning_rate": 1.9913220987987963e-05, + "loss": 0.61289821, + "memory(GiB)": 15.04, + "step": 1335, + "train_speed(iter/s)": 0.338825 + }, + { + "acc": 0.85524416, + "epoch": 0.18047138047138048, + "grad_norm": 12.5, + "learning_rate": 1.9911751104919353e-05, + "loss": 0.5054049, + "memory(GiB)": 15.04, + "step": 1340, + "train_speed(iter/s)": 0.339015 + }, + { + "acc": 0.82324381, + "epoch": 0.18114478114478114, + "grad_norm": 8.875, + "learning_rate": 1.9910268932731327e-05, + "loss": 0.76316633, + "memory(GiB)": 15.04, + "step": 1345, + "train_speed(iter/s)": 0.339023 + }, + { + "acc": 0.83471432, + "epoch": 0.18181818181818182, + "grad_norm": 12.5625, + "learning_rate": 1.9908774473261557e-05, + "loss": 0.56146412, + "memory(GiB)": 15.04, + "step": 1350, + "train_speed(iter/s)": 0.33925 + }, + { + "acc": 0.81195011, + "epoch": 0.18249158249158248, + "grad_norm": 18.625, + "learning_rate": 1.9907267728362962e-05, + "loss": 0.54229565, + "memory(GiB)": 15.04, + "step": 1355, + "train_speed(iter/s)": 0.339518 + }, + { + "acc": 0.78173838, + "epoch": 0.18316498316498317, + "grad_norm": 13.8125, + "learning_rate": 1.990574869990368e-05, + "loss": 0.72042937, + "memory(GiB)": 15.04, + "step": 1360, + "train_speed(iter/s)": 0.339795 + }, + { + "acc": 0.89754705, + "epoch": 0.18383838383838383, + "grad_norm": 7.96875, + "learning_rate": 1.9904217389767084e-05, + "loss": 0.35997989, + "memory(GiB)": 15.04, + "step": 1365, + "train_speed(iter/s)": 0.339999 + }, + { + "acc": 0.80328083, + "epoch": 0.18451178451178452, + "grad_norm": 9.4375, + "learning_rate": 1.9902673799851777e-05, + "loss": 0.64457412, + "memory(GiB)": 15.04, + "step": 1370, + "train_speed(iter/s)": 0.340028 + }, + { + "acc": 0.75070305, + "epoch": 0.18518518518518517, + "grad_norm": 15.25, + "learning_rate": 1.990111793207158e-05, + "loss": 0.99465656, + "memory(GiB)": 15.04, + "step": 1375, + "train_speed(iter/s)": 0.3402 + }, + { + "acc": 0.86041613, + "epoch": 0.18585858585858586, + "grad_norm": 7.4375, + "learning_rate": 1.9899549788355545e-05, + "loss": 0.52056928, + "memory(GiB)": 15.04, + "step": 1380, + "train_speed(iter/s)": 0.34023 + }, + { + "acc": 0.85896616, + "epoch": 0.18653198653198652, + "grad_norm": 7.4375, + "learning_rate": 1.9897969370647937e-05, + "loss": 0.47336903, + "memory(GiB)": 15.04, + "step": 1385, + "train_speed(iter/s)": 0.340349 + }, + { + "acc": 0.8049366, + "epoch": 0.1872053872053872, + "grad_norm": 8.625, + "learning_rate": 1.9896376680908244e-05, + "loss": 0.92537689, + "memory(GiB)": 15.04, + "step": 1390, + "train_speed(iter/s)": 0.340255 + }, + { + "acc": 0.82606783, + "epoch": 0.18787878787878787, + "grad_norm": 7.0625, + "learning_rate": 1.989477172111117e-05, + "loss": 0.65767555, + "memory(GiB)": 15.04, + "step": 1395, + "train_speed(iter/s)": 0.340219 + }, + { + "acc": 0.84840879, + "epoch": 0.18855218855218855, + "grad_norm": 9.625, + "learning_rate": 1.989315449324663e-05, + "loss": 0.50497999, + "memory(GiB)": 15.04, + "step": 1400, + "train_speed(iter/s)": 0.340368 + }, + { + "acc": 0.83024111, + "epoch": 0.1892255892255892, + "grad_norm": 7.8125, + "learning_rate": 1.9891524999319744e-05, + "loss": 0.57054515, + "memory(GiB)": 15.04, + "step": 1405, + "train_speed(iter/s)": 0.34047 + }, + { + "acc": 0.75613337, + "epoch": 0.1898989898989899, + "grad_norm": 7.84375, + "learning_rate": 1.988988324135085e-05, + "loss": 0.86667471, + "memory(GiB)": 15.04, + "step": 1410, + "train_speed(iter/s)": 0.340671 + }, + { + "acc": 0.8119998, + "epoch": 0.19057239057239056, + "grad_norm": 8.3125, + "learning_rate": 1.988822922137549e-05, + "loss": 0.8280241, + "memory(GiB)": 15.04, + "step": 1415, + "train_speed(iter/s)": 0.340947 + }, + { + "acc": 0.73748841, + "epoch": 0.19124579124579125, + "grad_norm": 14.0, + "learning_rate": 1.98865629414444e-05, + "loss": 0.74178867, + "memory(GiB)": 15.04, + "step": 1420, + "train_speed(iter/s)": 0.341137 + }, + { + "acc": 0.85077839, + "epoch": 0.1919191919191919, + "grad_norm": 10.1875, + "learning_rate": 1.988488440362353e-05, + "loss": 0.60300379, + "memory(GiB)": 15.04, + "step": 1425, + "train_speed(iter/s)": 0.341382 + }, + { + "acc": 0.79046402, + "epoch": 0.1925925925925926, + "grad_norm": 6.34375, + "learning_rate": 1.9883193609994013e-05, + "loss": 0.4608397, + "memory(GiB)": 15.04, + "step": 1430, + "train_speed(iter/s)": 0.341308 + }, + { + "acc": 0.85704985, + "epoch": 0.19326599326599325, + "grad_norm": 8.5625, + "learning_rate": 1.9881490562652195e-05, + "loss": 0.49345632, + "memory(GiB)": 15.04, + "step": 1435, + "train_speed(iter/s)": 0.341365 + }, + { + "acc": 0.80162678, + "epoch": 0.19393939393939394, + "grad_norm": 11.25, + "learning_rate": 1.9879775263709597e-05, + "loss": 0.75367384, + "memory(GiB)": 15.04, + "step": 1440, + "train_speed(iter/s)": 0.341417 + }, + { + "acc": 0.86569061, + "epoch": 0.19461279461279463, + "grad_norm": 24.875, + "learning_rate": 1.9878047715292944e-05, + "loss": 0.47033658, + "memory(GiB)": 15.04, + "step": 1445, + "train_speed(iter/s)": 0.341514 + }, + { + "acc": 0.89666128, + "epoch": 0.19528619528619529, + "grad_norm": 13.9375, + "learning_rate": 1.987630791954414e-05, + "loss": 0.36658845, + "memory(GiB)": 15.04, + "step": 1450, + "train_speed(iter/s)": 0.34175 + }, + { + "acc": 0.86566019, + "epoch": 0.19595959595959597, + "grad_norm": 10.375, + "learning_rate": 1.9874555878620278e-05, + "loss": 0.47194514, + "memory(GiB)": 15.04, + "step": 1455, + "train_speed(iter/s)": 0.341849 + }, + { + "acc": 0.85536938, + "epoch": 0.19663299663299663, + "grad_norm": 6.75, + "learning_rate": 1.987279159469363e-05, + "loss": 0.54388247, + "memory(GiB)": 15.04, + "step": 1460, + "train_speed(iter/s)": 0.341891 + }, + { + "acc": 0.83794804, + "epoch": 0.19730639730639732, + "grad_norm": 7.125, + "learning_rate": 1.987101506995165e-05, + "loss": 0.55969214, + "memory(GiB)": 15.04, + "step": 1465, + "train_speed(iter/s)": 0.341893 + }, + { + "acc": 0.85852814, + "epoch": 0.19797979797979798, + "grad_norm": 11.6875, + "learning_rate": 1.9869226306596973e-05, + "loss": 0.45950704, + "memory(GiB)": 15.04, + "step": 1470, + "train_speed(iter/s)": 0.342045 + }, + { + "acc": 0.88174915, + "epoch": 0.19865319865319866, + "grad_norm": 11.1875, + "learning_rate": 1.98674253068474e-05, + "loss": 0.37424619, + "memory(GiB)": 15.04, + "step": 1475, + "train_speed(iter/s)": 0.342135 + }, + { + "acc": 0.83266773, + "epoch": 0.19932659932659932, + "grad_norm": 7.40625, + "learning_rate": 1.9865612072935904e-05, + "loss": 0.83164015, + "memory(GiB)": 15.04, + "step": 1480, + "train_speed(iter/s)": 0.342252 + }, + { + "acc": 0.8816473, + "epoch": 0.2, + "grad_norm": 8.0, + "learning_rate": 1.9863786607110634e-05, + "loss": 0.41165295, + "memory(GiB)": 15.04, + "step": 1485, + "train_speed(iter/s)": 0.34244 + }, + { + "acc": 0.85913954, + "epoch": 0.20067340067340067, + "grad_norm": 17.625, + "learning_rate": 1.98619489116349e-05, + "loss": 0.44735775, + "memory(GiB)": 15.04, + "step": 1490, + "train_speed(iter/s)": 0.342647 + }, + { + "acc": 0.87764893, + "epoch": 0.20134680134680136, + "grad_norm": 6.96875, + "learning_rate": 1.9860098988787175e-05, + "loss": 0.3400228, + "memory(GiB)": 15.04, + "step": 1495, + "train_speed(iter/s)": 0.342664 + }, + { + "acc": 0.8648241, + "epoch": 0.20202020202020202, + "grad_norm": 13.9375, + "learning_rate": 1.9858236840861087e-05, + "loss": 0.48049917, + "memory(GiB)": 15.04, + "step": 1500, + "train_speed(iter/s)": 0.342613 + }, + { + "epoch": 0.20202020202020202, + "eval_acc": 0.8393517425851624, + "eval_loss": 0.6326767206192017, + "eval_runtime": 109.81, + "eval_samples_per_second": 1.366, + "eval_steps_per_second": 1.366, + "step": 1500 + }, + { + "acc": 0.83591461, + "epoch": 0.2026936026936027, + "grad_norm": 9.5, + "learning_rate": 1.9856362470165432e-05, + "loss": 0.62246451, + "memory(GiB)": 15.04, + "step": 1505, + "train_speed(iter/s)": 0.33423 + }, + { + "acc": 0.81393814, + "epoch": 0.20336700336700336, + "grad_norm": 10.8125, + "learning_rate": 1.9854475879024155e-05, + "loss": 0.71856928, + "memory(GiB)": 15.04, + "step": 1510, + "train_speed(iter/s)": 0.334249 + }, + { + "acc": 0.85712175, + "epoch": 0.20404040404040405, + "grad_norm": 8.0, + "learning_rate": 1.9852577069776352e-05, + "loss": 0.50289125, + "memory(GiB)": 15.04, + "step": 1515, + "train_speed(iter/s)": 0.334309 + }, + { + "acc": 0.74970808, + "epoch": 0.2047138047138047, + "grad_norm": 11.9375, + "learning_rate": 1.985066604477627e-05, + "loss": 1.00005846, + "memory(GiB)": 15.04, + "step": 1520, + "train_speed(iter/s)": 0.334471 + }, + { + "acc": 0.71272602, + "epoch": 0.2053872053872054, + "grad_norm": 23.0, + "learning_rate": 1.9848742806393293e-05, + "loss": 0.90867653, + "memory(GiB)": 15.04, + "step": 1525, + "train_speed(iter/s)": 0.334625 + }, + { + "acc": 0.83619108, + "epoch": 0.20606060606060606, + "grad_norm": 7.46875, + "learning_rate": 1.984680735701196e-05, + "loss": 0.78301525, + "memory(GiB)": 15.04, + "step": 1530, + "train_speed(iter/s)": 0.334686 + }, + { + "acc": 0.86082382, + "epoch": 0.20673400673400674, + "grad_norm": 12.5, + "learning_rate": 1.984485969903195e-05, + "loss": 0.48262935, + "memory(GiB)": 15.04, + "step": 1535, + "train_speed(iter/s)": 0.334732 + }, + { + "acc": 0.86204128, + "epoch": 0.2074074074074074, + "grad_norm": 8.375, + "learning_rate": 1.9842899834868063e-05, + "loss": 0.51524258, + "memory(GiB)": 15.04, + "step": 1540, + "train_speed(iter/s)": 0.334906 + }, + { + "acc": 0.79670696, + "epoch": 0.2080808080808081, + "grad_norm": 9.25, + "learning_rate": 1.9840927766950253e-05, + "loss": 0.64929891, + "memory(GiB)": 15.04, + "step": 1545, + "train_speed(iter/s)": 0.335055 + }, + { + "acc": 0.85708027, + "epoch": 0.20875420875420875, + "grad_norm": 13.0625, + "learning_rate": 1.9838943497723585e-05, + "loss": 0.54325323, + "memory(GiB)": 15.04, + "step": 1550, + "train_speed(iter/s)": 0.33524 + }, + { + "acc": 0.89223938, + "epoch": 0.20942760942760943, + "grad_norm": 9.625, + "learning_rate": 1.9836947029648276e-05, + "loss": 0.36963723, + "memory(GiB)": 15.04, + "step": 1555, + "train_speed(iter/s)": 0.335507 + }, + { + "acc": 0.84713097, + "epoch": 0.2101010101010101, + "grad_norm": 13.25, + "learning_rate": 1.9834938365199637e-05, + "loss": 0.49888825, + "memory(GiB)": 15.04, + "step": 1560, + "train_speed(iter/s)": 0.335758 + }, + { + "acc": 0.79915905, + "epoch": 0.21077441077441078, + "grad_norm": 17.25, + "learning_rate": 1.9832917506868135e-05, + "loss": 0.7733633, + "memory(GiB)": 15.04, + "step": 1565, + "train_speed(iter/s)": 0.33599 + }, + { + "acc": 0.80804539, + "epoch": 0.21144781144781144, + "grad_norm": 10.875, + "learning_rate": 1.9830884457159328e-05, + "loss": 0.7196496, + "memory(GiB)": 15.04, + "step": 1570, + "train_speed(iter/s)": 0.336234 + }, + { + "acc": 0.82204647, + "epoch": 0.21212121212121213, + "grad_norm": 12.0625, + "learning_rate": 1.9828839218593897e-05, + "loss": 0.66851048, + "memory(GiB)": 15.04, + "step": 1575, + "train_speed(iter/s)": 0.336436 + }, + { + "acc": 0.84909801, + "epoch": 0.2127946127946128, + "grad_norm": 6.90625, + "learning_rate": 1.982678179370765e-05, + "loss": 0.54903288, + "memory(GiB)": 15.04, + "step": 1580, + "train_speed(iter/s)": 0.33664 + }, + { + "acc": 0.849652, + "epoch": 0.21346801346801347, + "grad_norm": 14.1875, + "learning_rate": 1.982471218505148e-05, + "loss": 0.5503304, + "memory(GiB)": 15.04, + "step": 1585, + "train_speed(iter/s)": 0.336776 + }, + { + "acc": 0.8291213, + "epoch": 0.21414141414141413, + "grad_norm": 8.4375, + "learning_rate": 1.9822630395191408e-05, + "loss": 0.49029202, + "memory(GiB)": 15.04, + "step": 1590, + "train_speed(iter/s)": 0.337014 + }, + { + "acc": 0.88485823, + "epoch": 0.21481481481481482, + "grad_norm": 7.84375, + "learning_rate": 1.982053642670854e-05, + "loss": 0.45210991, + "memory(GiB)": 15.04, + "step": 1595, + "train_speed(iter/s)": 0.337155 + }, + { + "acc": 0.82303982, + "epoch": 0.21548821548821548, + "grad_norm": 7.15625, + "learning_rate": 1.9818430282199098e-05, + "loss": 0.44473071, + "memory(GiB)": 15.04, + "step": 1600, + "train_speed(iter/s)": 0.337222 + }, + { + "acc": 0.82110701, + "epoch": 0.21616161616161617, + "grad_norm": 10.6875, + "learning_rate": 1.981631196427439e-05, + "loss": 0.57491765, + "memory(GiB)": 15.04, + "step": 1605, + "train_speed(iter/s)": 0.337458 + }, + { + "acc": 0.82914457, + "epoch": 0.21683501683501682, + "grad_norm": 6.625, + "learning_rate": 1.981418147556082e-05, + "loss": 0.72364321, + "memory(GiB)": 15.04, + "step": 1610, + "train_speed(iter/s)": 0.337569 + }, + { + "acc": 0.8326498, + "epoch": 0.2175084175084175, + "grad_norm": 13.125, + "learning_rate": 1.9812038818699878e-05, + "loss": 0.56981144, + "memory(GiB)": 15.04, + "step": 1615, + "train_speed(iter/s)": 0.337762 + }, + { + "acc": 0.82243462, + "epoch": 0.21818181818181817, + "grad_norm": 6.625, + "learning_rate": 1.980988399634815e-05, + "loss": 0.48505726, + "memory(GiB)": 15.04, + "step": 1620, + "train_speed(iter/s)": 0.337939 + }, + { + "acc": 0.85339508, + "epoch": 0.21885521885521886, + "grad_norm": 11.3125, + "learning_rate": 1.9807717011177298e-05, + "loss": 0.59794049, + "memory(GiB)": 15.04, + "step": 1625, + "train_speed(iter/s)": 0.338067 + }, + { + "acc": 0.85775013, + "epoch": 0.21952861952861952, + "grad_norm": 11.6875, + "learning_rate": 1.9805537865874063e-05, + "loss": 0.59125137, + "memory(GiB)": 15.04, + "step": 1630, + "train_speed(iter/s)": 0.338083 + }, + { + "acc": 0.88099813, + "epoch": 0.2202020202020202, + "grad_norm": 10.6875, + "learning_rate": 1.9803346563140273e-05, + "loss": 0.37204971, + "memory(GiB)": 15.04, + "step": 1635, + "train_speed(iter/s)": 0.338111 + }, + { + "acc": 0.84456615, + "epoch": 0.22087542087542086, + "grad_norm": 11.4375, + "learning_rate": 1.9801143105692815e-05, + "loss": 0.65295229, + "memory(GiB)": 15.04, + "step": 1640, + "train_speed(iter/s)": 0.338167 + }, + { + "acc": 0.87735825, + "epoch": 0.22154882154882155, + "grad_norm": 8.125, + "learning_rate": 1.979892749626366e-05, + "loss": 0.41986647, + "memory(GiB)": 15.04, + "step": 1645, + "train_speed(iter/s)": 0.338334 + }, + { + "acc": 0.81775703, + "epoch": 0.2222222222222222, + "grad_norm": 15.3125, + "learning_rate": 1.9796699737599835e-05, + "loss": 0.67741585, + "memory(GiB)": 15.04, + "step": 1650, + "train_speed(iter/s)": 0.338542 + }, + { + "acc": 0.8514535, + "epoch": 0.2228956228956229, + "grad_norm": 9.0625, + "learning_rate": 1.9794459832463438e-05, + "loss": 0.49367156, + "memory(GiB)": 15.04, + "step": 1655, + "train_speed(iter/s)": 0.338657 + }, + { + "acc": 0.83197441, + "epoch": 0.22356902356902356, + "grad_norm": 8.5625, + "learning_rate": 1.9792207783631615e-05, + "loss": 0.57320757, + "memory(GiB)": 15.04, + "step": 1660, + "train_speed(iter/s)": 0.338744 + }, + { + "acc": 0.83286524, + "epoch": 0.22424242424242424, + "grad_norm": 7.34375, + "learning_rate": 1.9789943593896588e-05, + "loss": 0.63577981, + "memory(GiB)": 15.04, + "step": 1665, + "train_speed(iter/s)": 0.338755 + }, + { + "acc": 0.80521784, + "epoch": 0.2249158249158249, + "grad_norm": 15.625, + "learning_rate": 1.9787667266065612e-05, + "loss": 0.64762154, + "memory(GiB)": 15.04, + "step": 1670, + "train_speed(iter/s)": 0.338956 + }, + { + "acc": 0.79904494, + "epoch": 0.2255892255892256, + "grad_norm": 9.375, + "learning_rate": 1.9785378802961005e-05, + "loss": 0.43870797, + "memory(GiB)": 15.04, + "step": 1675, + "train_speed(iter/s)": 0.339061 + }, + { + "acc": 0.89039049, + "epoch": 0.22626262626262628, + "grad_norm": 12.875, + "learning_rate": 1.978307820742012e-05, + "loss": 0.4549356, + "memory(GiB)": 15.04, + "step": 1680, + "train_speed(iter/s)": 0.339174 + }, + { + "acc": 0.87218695, + "epoch": 0.22693602693602694, + "grad_norm": 6.5, + "learning_rate": 1.9780765482295366e-05, + "loss": 0.50175328, + "memory(GiB)": 15.04, + "step": 1685, + "train_speed(iter/s)": 0.339312 + }, + { + "acc": 0.88136101, + "epoch": 0.22760942760942762, + "grad_norm": 9.25, + "learning_rate": 1.9778440630454178e-05, + "loss": 0.48342233, + "memory(GiB)": 15.04, + "step": 1690, + "train_speed(iter/s)": 0.339531 + }, + { + "acc": 0.80393095, + "epoch": 0.22828282828282828, + "grad_norm": 10.4375, + "learning_rate": 1.9776103654779037e-05, + "loss": 0.93885641, + "memory(GiB)": 15.04, + "step": 1695, + "train_speed(iter/s)": 0.339721 + }, + { + "acc": 0.85020695, + "epoch": 0.22895622895622897, + "grad_norm": 20.125, + "learning_rate": 1.9773754558167442e-05, + "loss": 0.71851912, + "memory(GiB)": 15.04, + "step": 1700, + "train_speed(iter/s)": 0.339822 + }, + { + "acc": 0.81808643, + "epoch": 0.22962962962962963, + "grad_norm": 9.625, + "learning_rate": 1.9771393343531938e-05, + "loss": 0.7361897, + "memory(GiB)": 15.04, + "step": 1705, + "train_speed(iter/s)": 0.339862 + }, + { + "acc": 0.7765965, + "epoch": 0.23030303030303031, + "grad_norm": 11.4375, + "learning_rate": 1.976902001380008e-05, + "loss": 0.95105314, + "memory(GiB)": 15.04, + "step": 1710, + "train_speed(iter/s)": 0.340035 + }, + { + "acc": 0.86265593, + "epoch": 0.23097643097643097, + "grad_norm": 8.9375, + "learning_rate": 1.9766634571914448e-05, + "loss": 0.62800045, + "memory(GiB)": 15.04, + "step": 1715, + "train_speed(iter/s)": 0.339976 + }, + { + "acc": 0.89045258, + "epoch": 0.23164983164983166, + "grad_norm": 11.875, + "learning_rate": 1.9764237020832644e-05, + "loss": 0.42381425, + "memory(GiB)": 15.04, + "step": 1720, + "train_speed(iter/s)": 0.340081 + }, + { + "acc": 0.8708931, + "epoch": 0.23232323232323232, + "grad_norm": 16.375, + "learning_rate": 1.976182736352728e-05, + "loss": 0.57231526, + "memory(GiB)": 15.04, + "step": 1725, + "train_speed(iter/s)": 0.340121 + }, + { + "acc": 0.78722067, + "epoch": 0.232996632996633, + "grad_norm": 21.875, + "learning_rate": 1.9759405602985973e-05, + "loss": 0.74272079, + "memory(GiB)": 15.04, + "step": 1730, + "train_speed(iter/s)": 0.340333 + }, + { + "acc": 0.91115456, + "epoch": 0.23367003367003367, + "grad_norm": 10.5, + "learning_rate": 1.975697174221136e-05, + "loss": 0.33090246, + "memory(GiB)": 15.04, + "step": 1735, + "train_speed(iter/s)": 0.340501 + }, + { + "acc": 0.79775496, + "epoch": 0.23434343434343435, + "grad_norm": 15.8125, + "learning_rate": 1.9754525784221067e-05, + "loss": 0.71790228, + "memory(GiB)": 15.04, + "step": 1740, + "train_speed(iter/s)": 0.340611 + }, + { + "acc": 0.83832827, + "epoch": 0.235016835016835, + "grad_norm": 13.875, + "learning_rate": 1.975206773204772e-05, + "loss": 0.47882934, + "memory(GiB)": 15.04, + "step": 1745, + "train_speed(iter/s)": 0.340706 + }, + { + "acc": 0.81263514, + "epoch": 0.2356902356902357, + "grad_norm": 9.25, + "learning_rate": 1.974959758873895e-05, + "loss": 0.63873296, + "memory(GiB)": 15.04, + "step": 1750, + "train_speed(iter/s)": 0.340898 + }, + { + "acc": 0.8787571, + "epoch": 0.23636363636363636, + "grad_norm": 11.1875, + "learning_rate": 1.974711535735737e-05, + "loss": 0.52029638, + "memory(GiB)": 15.04, + "step": 1755, + "train_speed(iter/s)": 0.341015 + }, + { + "acc": 0.80551605, + "epoch": 0.23703703703703705, + "grad_norm": 7.125, + "learning_rate": 1.9744621040980584e-05, + "loss": 0.53825655, + "memory(GiB)": 15.04, + "step": 1760, + "train_speed(iter/s)": 0.341192 + }, + { + "acc": 0.87776775, + "epoch": 0.2377104377104377, + "grad_norm": 16.0, + "learning_rate": 1.9742114642701177e-05, + "loss": 0.42201047, + "memory(GiB)": 15.04, + "step": 1765, + "train_speed(iter/s)": 0.341277 + }, + { + "acc": 0.88196945, + "epoch": 0.2383838383838384, + "grad_norm": 9.5, + "learning_rate": 1.9739596165626714e-05, + "loss": 0.42846918, + "memory(GiB)": 15.04, + "step": 1770, + "train_speed(iter/s)": 0.341506 + }, + { + "acc": 0.83472624, + "epoch": 0.23905723905723905, + "grad_norm": 21.625, + "learning_rate": 1.9737065612879748e-05, + "loss": 0.5340138, + "memory(GiB)": 15.04, + "step": 1775, + "train_speed(iter/s)": 0.341626 + }, + { + "acc": 0.82384624, + "epoch": 0.23973063973063974, + "grad_norm": 9.375, + "learning_rate": 1.973452298759778e-05, + "loss": 0.65406966, + "memory(GiB)": 15.04, + "step": 1780, + "train_speed(iter/s)": 0.341785 + }, + { + "acc": 0.88805351, + "epoch": 0.2404040404040404, + "grad_norm": 13.5625, + "learning_rate": 1.9731968292933303e-05, + "loss": 0.42406859, + "memory(GiB)": 15.04, + "step": 1785, + "train_speed(iter/s)": 0.341915 + }, + { + "acc": 0.86141911, + "epoch": 0.24107744107744108, + "grad_norm": 6.75, + "learning_rate": 1.972940153205376e-05, + "loss": 0.44310484, + "memory(GiB)": 15.04, + "step": 1790, + "train_speed(iter/s)": 0.342011 + }, + { + "acc": 0.80693083, + "epoch": 0.24175084175084174, + "grad_norm": 27.0, + "learning_rate": 1.972682270814156e-05, + "loss": 0.82178726, + "memory(GiB)": 15.04, + "step": 1795, + "train_speed(iter/s)": 0.342255 + }, + { + "acc": 0.86837473, + "epoch": 0.24242424242424243, + "grad_norm": 7.5625, + "learning_rate": 1.972423182439406e-05, + "loss": 0.47514634, + "memory(GiB)": 15.04, + "step": 1800, + "train_speed(iter/s)": 0.342382 + }, + { + "epoch": 0.24242424242424243, + "eval_acc": 0.843474600794465, + "eval_loss": 0.6183801889419556, + "eval_runtime": 109.7755, + "eval_samples_per_second": 1.366, + "eval_steps_per_second": 1.366, + "step": 1800 + }, + { + "acc": 0.89186497, + "epoch": 0.2430976430976431, + "grad_norm": 11.1875, + "learning_rate": 1.972162888402359e-05, + "loss": 0.45051031, + "memory(GiB)": 15.04, + "step": 1805, + "train_speed(iter/s)": 0.335557 + }, + { + "acc": 0.78250394, + "epoch": 0.24377104377104378, + "grad_norm": 9.0, + "learning_rate": 1.9719013890257402e-05, + "loss": 0.64437308, + "memory(GiB)": 15.04, + "step": 1810, + "train_speed(iter/s)": 0.335702 + }, + { + "acc": 0.80951223, + "epoch": 0.24444444444444444, + "grad_norm": 8.9375, + "learning_rate": 1.971638684633771e-05, + "loss": 0.83825159, + "memory(GiB)": 15.04, + "step": 1815, + "train_speed(iter/s)": 0.335812 + }, + { + "acc": 0.86868896, + "epoch": 0.24511784511784512, + "grad_norm": 12.9375, + "learning_rate": 1.9713747755521665e-05, + "loss": 0.52982354, + "memory(GiB)": 15.04, + "step": 1820, + "train_speed(iter/s)": 0.335908 + }, + { + "acc": 0.90424519, + "epoch": 0.24579124579124578, + "grad_norm": 12.1875, + "learning_rate": 1.9711096621081353e-05, + "loss": 0.34413009, + "memory(GiB)": 15.04, + "step": 1825, + "train_speed(iter/s)": 0.336133 + }, + { + "acc": 0.73576164, + "epoch": 0.24646464646464647, + "grad_norm": 6.96875, + "learning_rate": 1.970843344630379e-05, + "loss": 1.21805191, + "memory(GiB)": 15.04, + "step": 1830, + "train_speed(iter/s)": 0.336158 + }, + { + "acc": 0.77103219, + "epoch": 0.24713804713804713, + "grad_norm": 13.5, + "learning_rate": 1.9705758234490923e-05, + "loss": 0.7155683, + "memory(GiB)": 15.04, + "step": 1835, + "train_speed(iter/s)": 0.336201 + }, + { + "acc": 0.8558444, + "epoch": 0.24781144781144782, + "grad_norm": 9.5625, + "learning_rate": 1.9703070988959622e-05, + "loss": 0.49693632, + "memory(GiB)": 15.04, + "step": 1840, + "train_speed(iter/s)": 0.336255 + }, + { + "acc": 0.86143408, + "epoch": 0.24848484848484848, + "grad_norm": 9.5625, + "learning_rate": 1.9700371713041682e-05, + "loss": 0.53240633, + "memory(GiB)": 15.04, + "step": 1845, + "train_speed(iter/s)": 0.336364 + }, + { + "acc": 0.88647976, + "epoch": 0.24915824915824916, + "grad_norm": 8.9375, + "learning_rate": 1.969766041008381e-05, + "loss": 0.44659019, + "memory(GiB)": 15.04, + "step": 1850, + "train_speed(iter/s)": 0.336546 + }, + { + "acc": 0.89583845, + "epoch": 0.24983164983164982, + "grad_norm": 7.28125, + "learning_rate": 1.9694937083447614e-05, + "loss": 0.3929671, + "memory(GiB)": 15.04, + "step": 1855, + "train_speed(iter/s)": 0.336773 + }, + { + "acc": 0.84875755, + "epoch": 0.2505050505050505, + "grad_norm": 10.0625, + "learning_rate": 1.9692201736509632e-05, + "loss": 0.55382719, + "memory(GiB)": 15.04, + "step": 1860, + "train_speed(iter/s)": 0.336874 + }, + { + "acc": 0.86458168, + "epoch": 0.25117845117845117, + "grad_norm": 10.6875, + "learning_rate": 1.968945437266129e-05, + "loss": 0.53511877, + "memory(GiB)": 15.04, + "step": 1865, + "train_speed(iter/s)": 0.336995 + }, + { + "acc": 0.85841675, + "epoch": 0.2518518518518518, + "grad_norm": 14.125, + "learning_rate": 1.9686694995308913e-05, + "loss": 0.58149495, + "memory(GiB)": 15.04, + "step": 1870, + "train_speed(iter/s)": 0.337087 + }, + { + "acc": 0.81333094, + "epoch": 0.25252525252525254, + "grad_norm": 21.0, + "learning_rate": 1.9683923607873726e-05, + "loss": 0.51532841, + "memory(GiB)": 15.04, + "step": 1875, + "train_speed(iter/s)": 0.337319 + }, + { + "acc": 0.81919813, + "epoch": 0.2531986531986532, + "grad_norm": 9.625, + "learning_rate": 1.968114021379185e-05, + "loss": 0.72619085, + "memory(GiB)": 15.04, + "step": 1880, + "train_speed(iter/s)": 0.337285 + }, + { + "acc": 0.79009395, + "epoch": 0.25387205387205386, + "grad_norm": 6.71875, + "learning_rate": 1.967834481651428e-05, + "loss": 0.59885988, + "memory(GiB)": 15.04, + "step": 1885, + "train_speed(iter/s)": 0.33742 + }, + { + "acc": 0.82559881, + "epoch": 0.2545454545454545, + "grad_norm": 11.125, + "learning_rate": 1.9675537419506897e-05, + "loss": 0.73017178, + "memory(GiB)": 15.04, + "step": 1890, + "train_speed(iter/s)": 0.337499 + }, + { + "acc": 0.84612713, + "epoch": 0.25521885521885523, + "grad_norm": 8.8125, + "learning_rate": 1.9672718026250467e-05, + "loss": 0.56751752, + "memory(GiB)": 15.04, + "step": 1895, + "train_speed(iter/s)": 0.337682 + }, + { + "acc": 0.82871618, + "epoch": 0.2558922558922559, + "grad_norm": 9.375, + "learning_rate": 1.9669886640240622e-05, + "loss": 0.53650327, + "memory(GiB)": 15.04, + "step": 1900, + "train_speed(iter/s)": 0.337762 + }, + { + "acc": 0.88074055, + "epoch": 0.25656565656565655, + "grad_norm": 9.0625, + "learning_rate": 1.966704326498787e-05, + "loss": 0.37428265, + "memory(GiB)": 15.04, + "step": 1905, + "train_speed(iter/s)": 0.337908 + }, + { + "acc": 0.74937229, + "epoch": 0.2572390572390572, + "grad_norm": 7.34375, + "learning_rate": 1.966418790401757e-05, + "loss": 0.48255777, + "memory(GiB)": 15.04, + "step": 1910, + "train_speed(iter/s)": 0.337963 + }, + { + "acc": 0.91495829, + "epoch": 0.2579124579124579, + "grad_norm": 5.0625, + "learning_rate": 1.966132056086996e-05, + "loss": 0.29679358, + "memory(GiB)": 15.04, + "step": 1915, + "train_speed(iter/s)": 0.337877 + }, + { + "acc": 0.82159767, + "epoch": 0.2585858585858586, + "grad_norm": 7.09375, + "learning_rate": 1.9658441239100125e-05, + "loss": 0.50666471, + "memory(GiB)": 15.04, + "step": 1920, + "train_speed(iter/s)": 0.337989 + }, + { + "acc": 0.83229828, + "epoch": 0.25925925925925924, + "grad_norm": 5.34375, + "learning_rate": 1.9655549942278e-05, + "loss": 0.54949713, + "memory(GiB)": 15.04, + "step": 1925, + "train_speed(iter/s)": 0.33792 + }, + { + "acc": 0.85097256, + "epoch": 0.25993265993265996, + "grad_norm": 6.28125, + "learning_rate": 1.9652646673988373e-05, + "loss": 0.48549409, + "memory(GiB)": 15.04, + "step": 1930, + "train_speed(iter/s)": 0.337977 + }, + { + "acc": 0.84481211, + "epoch": 0.2606060606060606, + "grad_norm": 9.4375, + "learning_rate": 1.964973143783086e-05, + "loss": 0.67328267, + "memory(GiB)": 15.04, + "step": 1935, + "train_speed(iter/s)": 0.338016 + }, + { + "acc": 0.8501667, + "epoch": 0.2612794612794613, + "grad_norm": 19.125, + "learning_rate": 1.964680423741994e-05, + "loss": 0.43254037, + "memory(GiB)": 15.04, + "step": 1940, + "train_speed(iter/s)": 0.338206 + }, + { + "acc": 0.86048279, + "epoch": 0.26195286195286194, + "grad_norm": 32.75, + "learning_rate": 1.964386507638491e-05, + "loss": 0.59896231, + "memory(GiB)": 15.04, + "step": 1945, + "train_speed(iter/s)": 0.338329 + }, + { + "acc": 0.88457479, + "epoch": 0.26262626262626265, + "grad_norm": 8.25, + "learning_rate": 1.9640913958369895e-05, + "loss": 0.43649497, + "memory(GiB)": 15.04, + "step": 1950, + "train_speed(iter/s)": 0.338487 + }, + { + "acc": 0.82918968, + "epoch": 0.2632996632996633, + "grad_norm": 8.375, + "learning_rate": 1.963795088703385e-05, + "loss": 0.50043716, + "memory(GiB)": 15.04, + "step": 1955, + "train_speed(iter/s)": 0.338408 + }, + { + "acc": 0.77567244, + "epoch": 0.26397306397306397, + "grad_norm": 19.125, + "learning_rate": 1.963497586605055e-05, + "loss": 0.88877125, + "memory(GiB)": 15.04, + "step": 1960, + "train_speed(iter/s)": 0.338565 + }, + { + "acc": 0.87920694, + "epoch": 0.26464646464646463, + "grad_norm": 6.65625, + "learning_rate": 1.963198889910859e-05, + "loss": 0.4500257, + "memory(GiB)": 15.04, + "step": 1965, + "train_speed(iter/s)": 0.338666 + }, + { + "acc": 0.86709795, + "epoch": 0.26531986531986534, + "grad_norm": 11.4375, + "learning_rate": 1.962898998991136e-05, + "loss": 0.35767465, + "memory(GiB)": 15.04, + "step": 1970, + "train_speed(iter/s)": 0.33889 + }, + { + "acc": 0.87004099, + "epoch": 0.265993265993266, + "grad_norm": 6.96875, + "learning_rate": 1.962597914217708e-05, + "loss": 0.44529519, + "memory(GiB)": 15.04, + "step": 1975, + "train_speed(iter/s)": 0.338998 + }, + { + "acc": 0.88034582, + "epoch": 0.26666666666666666, + "grad_norm": 6.40625, + "learning_rate": 1.9622956359638752e-05, + "loss": 0.40983272, + "memory(GiB)": 15.04, + "step": 1980, + "train_speed(iter/s)": 0.339066 + }, + { + "acc": 0.85966377, + "epoch": 0.2673400673400673, + "grad_norm": 18.875, + "learning_rate": 1.9619921646044188e-05, + "loss": 0.57280025, + "memory(GiB)": 15.04, + "step": 1985, + "train_speed(iter/s)": 0.339158 + }, + { + "acc": 0.87830267, + "epoch": 0.26801346801346804, + "grad_norm": 7.875, + "learning_rate": 1.9616875005155988e-05, + "loss": 0.5095448, + "memory(GiB)": 15.04, + "step": 1990, + "train_speed(iter/s)": 0.339323 + }, + { + "acc": 0.86530752, + "epoch": 0.2686868686868687, + "grad_norm": 6.78125, + "learning_rate": 1.961381644075154e-05, + "loss": 0.45437994, + "memory(GiB)": 15.04, + "step": 1995, + "train_speed(iter/s)": 0.339361 + }, + { + "acc": 0.84290581, + "epoch": 0.26936026936026936, + "grad_norm": 9.6875, + "learning_rate": 1.9610745956623013e-05, + "loss": 0.48863568, + "memory(GiB)": 15.04, + "step": 2000, + "train_speed(iter/s)": 0.339462 + }, + { + "acc": 0.88849602, + "epoch": 0.27003367003367, + "grad_norm": 6.96875, + "learning_rate": 1.9607663556577365e-05, + "loss": 0.45863638, + "memory(GiB)": 15.04, + "step": 2005, + "train_speed(iter/s)": 0.339325 + }, + { + "acc": 0.87045746, + "epoch": 0.27070707070707073, + "grad_norm": 11.5625, + "learning_rate": 1.9604569244436308e-05, + "loss": 0.44171586, + "memory(GiB)": 15.04, + "step": 2010, + "train_speed(iter/s)": 0.339288 + }, + { + "acc": 0.84935579, + "epoch": 0.2713804713804714, + "grad_norm": 10.1875, + "learning_rate": 1.9601463024036346e-05, + "loss": 0.50538421, + "memory(GiB)": 15.04, + "step": 2015, + "train_speed(iter/s)": 0.339413 + }, + { + "acc": 0.90092773, + "epoch": 0.27205387205387205, + "grad_norm": 8.75, + "learning_rate": 1.959834489922874e-05, + "loss": 0.38511703, + "memory(GiB)": 15.04, + "step": 2020, + "train_speed(iter/s)": 0.339554 + }, + { + "acc": 0.87212915, + "epoch": 0.2727272727272727, + "grad_norm": 6.96875, + "learning_rate": 1.9595214873879494e-05, + "loss": 0.35981777, + "memory(GiB)": 15.04, + "step": 2025, + "train_speed(iter/s)": 0.339741 + }, + { + "acc": 0.86869631, + "epoch": 0.2734006734006734, + "grad_norm": 11.25, + "learning_rate": 1.9592072951869394e-05, + "loss": 0.53693933, + "memory(GiB)": 15.04, + "step": 2030, + "train_speed(iter/s)": 0.339871 + }, + { + "acc": 0.81653395, + "epoch": 0.2740740740740741, + "grad_norm": 7.5, + "learning_rate": 1.9588919137093956e-05, + "loss": 0.51912298, + "memory(GiB)": 15.04, + "step": 2035, + "train_speed(iter/s)": 0.339811 + }, + { + "acc": 0.79491487, + "epoch": 0.27474747474747474, + "grad_norm": 6.625, + "learning_rate": 1.9585753433463452e-05, + "loss": 0.64108443, + "memory(GiB)": 15.04, + "step": 2040, + "train_speed(iter/s)": 0.339931 + }, + { + "acc": 0.80765038, + "epoch": 0.2754208754208754, + "grad_norm": 9.6875, + "learning_rate": 1.958257584490289e-05, + "loss": 1.07152681, + "memory(GiB)": 15.04, + "step": 2045, + "train_speed(iter/s)": 0.339958 + }, + { + "acc": 0.79323549, + "epoch": 0.2760942760942761, + "grad_norm": 7.34375, + "learning_rate": 1.9579386375352015e-05, + "loss": 0.79024658, + "memory(GiB)": 15.04, + "step": 2050, + "train_speed(iter/s)": 0.339931 + }, + { + "acc": 0.78419557, + "epoch": 0.2767676767676768, + "grad_norm": 6.21875, + "learning_rate": 1.9576185028765296e-05, + "loss": 0.72721648, + "memory(GiB)": 15.04, + "step": 2055, + "train_speed(iter/s)": 0.340049 + }, + { + "acc": 0.86066732, + "epoch": 0.27744107744107743, + "grad_norm": 7.3125, + "learning_rate": 1.9572971809111944e-05, + "loss": 0.53243432, + "memory(GiB)": 15.04, + "step": 2060, + "train_speed(iter/s)": 0.340117 + }, + { + "acc": 0.87064056, + "epoch": 0.2781144781144781, + "grad_norm": 8.3125, + "learning_rate": 1.9569746720375873e-05, + "loss": 0.63533983, + "memory(GiB)": 15.04, + "step": 2065, + "train_speed(iter/s)": 0.340126 + }, + { + "acc": 0.87459402, + "epoch": 0.2787878787878788, + "grad_norm": 13.5, + "learning_rate": 1.9566509766555725e-05, + "loss": 0.5083847, + "memory(GiB)": 15.04, + "step": 2070, + "train_speed(iter/s)": 0.340285 + }, + { + "acc": 0.86485825, + "epoch": 0.27946127946127947, + "grad_norm": 6.09375, + "learning_rate": 1.9563260951664844e-05, + "loss": 0.46479993, + "memory(GiB)": 15.04, + "step": 2075, + "train_speed(iter/s)": 0.340277 + }, + { + "acc": 0.87869692, + "epoch": 0.2801346801346801, + "grad_norm": 16.625, + "learning_rate": 1.9560000279731285e-05, + "loss": 0.40669374, + "memory(GiB)": 15.04, + "step": 2080, + "train_speed(iter/s)": 0.340421 + }, + { + "acc": 0.87450075, + "epoch": 0.2808080808080808, + "grad_norm": 8.0, + "learning_rate": 1.9556727754797808e-05, + "loss": 0.41202154, + "memory(GiB)": 15.04, + "step": 2085, + "train_speed(iter/s)": 0.340408 + }, + { + "acc": 0.86928625, + "epoch": 0.2814814814814815, + "grad_norm": 9.8125, + "learning_rate": 1.9553443380921862e-05, + "loss": 0.4687561, + "memory(GiB)": 15.04, + "step": 2090, + "train_speed(iter/s)": 0.340497 + }, + { + "acc": 0.82436953, + "epoch": 0.28215488215488216, + "grad_norm": 10.5625, + "learning_rate": 1.955014716217559e-05, + "loss": 0.53379784, + "memory(GiB)": 15.04, + "step": 2095, + "train_speed(iter/s)": 0.34057 + }, + { + "acc": 0.86324949, + "epoch": 0.2828282828282828, + "grad_norm": 13.25, + "learning_rate": 1.954683910264582e-05, + "loss": 0.47917733, + "memory(GiB)": 15.04, + "step": 2100, + "train_speed(iter/s)": 0.34075 + }, + { + "epoch": 0.2828282828282828, + "eval_acc": 0.8546162196510365, + "eval_loss": 0.5698094964027405, + "eval_runtime": 109.4795, + "eval_samples_per_second": 1.37, + "eval_steps_per_second": 1.37, + "step": 2100 + }, + { + "acc": 0.8573041, + "epoch": 0.2835016835016835, + "grad_norm": 68.5, + "learning_rate": 1.954351920643406e-05, + "loss": 0.45866795, + "memory(GiB)": 15.04, + "step": 2105, + "train_speed(iter/s)": 0.335005 + }, + { + "acc": 0.88993645, + "epoch": 0.2841750841750842, + "grad_norm": 8.0, + "learning_rate": 1.95401874776565e-05, + "loss": 0.43657699, + "memory(GiB)": 15.04, + "step": 2110, + "train_speed(iter/s)": 0.335161 + }, + { + "acc": 0.85032578, + "epoch": 0.28484848484848485, + "grad_norm": 9.125, + "learning_rate": 1.953684392044399e-05, + "loss": 0.49008636, + "memory(GiB)": 15.04, + "step": 2115, + "train_speed(iter/s)": 0.335288 + }, + { + "acc": 0.88042259, + "epoch": 0.2855218855218855, + "grad_norm": 9.0625, + "learning_rate": 1.953348853894205e-05, + "loss": 0.46665258, + "memory(GiB)": 15.04, + "step": 2120, + "train_speed(iter/s)": 0.335299 + }, + { + "acc": 0.84420929, + "epoch": 0.28619528619528617, + "grad_norm": 6.03125, + "learning_rate": 1.9530121337310866e-05, + "loss": 0.4866931, + "memory(GiB)": 15.04, + "step": 2125, + "train_speed(iter/s)": 0.335485 + }, + { + "acc": 0.75258598, + "epoch": 0.2868686868686869, + "grad_norm": 10.0625, + "learning_rate": 1.952674231972527e-05, + "loss": 0.78863392, + "memory(GiB)": 15.04, + "step": 2130, + "train_speed(iter/s)": 0.335674 + }, + { + "acc": 0.80859814, + "epoch": 0.28754208754208754, + "grad_norm": 17.375, + "learning_rate": 1.952335149037476e-05, + "loss": 0.73442564, + "memory(GiB)": 15.04, + "step": 2135, + "train_speed(iter/s)": 0.335804 + }, + { + "acc": 0.82581291, + "epoch": 0.2882154882154882, + "grad_norm": 30.125, + "learning_rate": 1.9519948853463453e-05, + "loss": 0.47450933, + "memory(GiB)": 15.04, + "step": 2140, + "train_speed(iter/s)": 0.33591 + }, + { + "acc": 0.84496117, + "epoch": 0.28888888888888886, + "grad_norm": 11.625, + "learning_rate": 1.951653441321013e-05, + "loss": 0.50630031, + "memory(GiB)": 15.04, + "step": 2145, + "train_speed(iter/s)": 0.336086 + }, + { + "acc": 0.90782948, + "epoch": 0.2895622895622896, + "grad_norm": 14.5625, + "learning_rate": 1.9513108173848193e-05, + "loss": 0.36844997, + "memory(GiB)": 15.04, + "step": 2150, + "train_speed(iter/s)": 0.336242 + }, + { + "acc": 0.83160648, + "epoch": 0.29023569023569024, + "grad_norm": 17.5, + "learning_rate": 1.950967013962568e-05, + "loss": 0.74357166, + "memory(GiB)": 15.04, + "step": 2155, + "train_speed(iter/s)": 0.336361 + }, + { + "acc": 0.86514874, + "epoch": 0.2909090909090909, + "grad_norm": 11.5, + "learning_rate": 1.950622031480524e-05, + "loss": 0.4806139, + "memory(GiB)": 15.04, + "step": 2160, + "train_speed(iter/s)": 0.336509 + }, + { + "acc": 0.88152399, + "epoch": 0.2915824915824916, + "grad_norm": 15.5625, + "learning_rate": 1.950275870366417e-05, + "loss": 0.42793164, + "memory(GiB)": 15.04, + "step": 2165, + "train_speed(iter/s)": 0.336669 + }, + { + "acc": 0.81066751, + "epoch": 0.29225589225589227, + "grad_norm": 13.9375, + "learning_rate": 1.9499285310494337e-05, + "loss": 0.62704086, + "memory(GiB)": 15.04, + "step": 2170, + "train_speed(iter/s)": 0.336743 + }, + { + "acc": 0.84785643, + "epoch": 0.29292929292929293, + "grad_norm": 13.5, + "learning_rate": 1.949580013960226e-05, + "loss": 0.52745781, + "memory(GiB)": 15.04, + "step": 2175, + "train_speed(iter/s)": 0.33674 + }, + { + "acc": 0.82926636, + "epoch": 0.2936026936026936, + "grad_norm": 19.125, + "learning_rate": 1.9492303195309028e-05, + "loss": 0.65729084, + "memory(GiB)": 15.04, + "step": 2180, + "train_speed(iter/s)": 0.336937 + }, + { + "acc": 0.72600865, + "epoch": 0.2942760942760943, + "grad_norm": 8.1875, + "learning_rate": 1.9488794481950345e-05, + "loss": 0.94059401, + "memory(GiB)": 15.04, + "step": 2185, + "train_speed(iter/s)": 0.337106 + }, + { + "acc": 0.85088053, + "epoch": 0.29494949494949496, + "grad_norm": 9.8125, + "learning_rate": 1.9485274003876497e-05, + "loss": 0.35568237, + "memory(GiB)": 15.04, + "step": 2190, + "train_speed(iter/s)": 0.337205 + }, + { + "acc": 0.88804274, + "epoch": 0.2956228956228956, + "grad_norm": 14.8125, + "learning_rate": 1.9481741765452364e-05, + "loss": 0.49214945, + "memory(GiB)": 15.04, + "step": 2195, + "train_speed(iter/s)": 0.337299 + }, + { + "acc": 0.88171883, + "epoch": 0.2962962962962963, + "grad_norm": 7.8125, + "learning_rate": 1.9478197771057407e-05, + "loss": 0.3708432, + "memory(GiB)": 15.04, + "step": 2200, + "train_speed(iter/s)": 0.337492 + }, + { + "acc": 0.83816452, + "epoch": 0.296969696969697, + "grad_norm": 13.4375, + "learning_rate": 1.9474642025085656e-05, + "loss": 0.63066597, + "memory(GiB)": 15.04, + "step": 2205, + "train_speed(iter/s)": 0.337539 + }, + { + "acc": 0.87264824, + "epoch": 0.29764309764309765, + "grad_norm": 12.375, + "learning_rate": 1.9471074531945716e-05, + "loss": 0.42701688, + "memory(GiB)": 15.04, + "step": 2210, + "train_speed(iter/s)": 0.337641 + }, + { + "acc": 0.86768637, + "epoch": 0.2983164983164983, + "grad_norm": 14.875, + "learning_rate": 1.9467495296060755e-05, + "loss": 0.48693333, + "memory(GiB)": 15.04, + "step": 2215, + "train_speed(iter/s)": 0.337826 + }, + { + "acc": 0.85808105, + "epoch": 0.298989898989899, + "grad_norm": 17.125, + "learning_rate": 1.9463904321868508e-05, + "loss": 0.51649294, + "memory(GiB)": 15.04, + "step": 2220, + "train_speed(iter/s)": 0.337937 + }, + { + "acc": 0.82949419, + "epoch": 0.2996632996632997, + "grad_norm": 17.625, + "learning_rate": 1.9460301613821246e-05, + "loss": 0.4572835, + "memory(GiB)": 15.04, + "step": 2225, + "train_speed(iter/s)": 0.338094 + }, + { + "acc": 0.84240913, + "epoch": 0.30033670033670035, + "grad_norm": 10.3125, + "learning_rate": 1.9456687176385806e-05, + "loss": 0.74288449, + "memory(GiB)": 15.04, + "step": 2230, + "train_speed(iter/s)": 0.338173 + }, + { + "acc": 0.86656504, + "epoch": 0.301010101010101, + "grad_norm": 6.5, + "learning_rate": 1.945306101404356e-05, + "loss": 0.38929241, + "memory(GiB)": 15.04, + "step": 2235, + "train_speed(iter/s)": 0.338236 + }, + { + "acc": 0.90445004, + "epoch": 0.30168350168350166, + "grad_norm": 7.875, + "learning_rate": 1.944942313129042e-05, + "loss": 0.40537362, + "memory(GiB)": 15.04, + "step": 2240, + "train_speed(iter/s)": 0.338265 + }, + { + "acc": 0.86881905, + "epoch": 0.3023569023569024, + "grad_norm": 13.375, + "learning_rate": 1.9445773532636823e-05, + "loss": 0.56779408, + "memory(GiB)": 15.04, + "step": 2245, + "train_speed(iter/s)": 0.338382 + }, + { + "acc": 0.84482965, + "epoch": 0.30303030303030304, + "grad_norm": 10.5625, + "learning_rate": 1.9442112222607737e-05, + "loss": 0.56202984, + "memory(GiB)": 15.04, + "step": 2250, + "train_speed(iter/s)": 0.338441 + }, + { + "acc": 0.88069286, + "epoch": 0.3037037037037037, + "grad_norm": 7.625, + "learning_rate": 1.9438439205742656e-05, + "loss": 0.26127765, + "memory(GiB)": 15.04, + "step": 2255, + "train_speed(iter/s)": 0.338643 + }, + { + "acc": 0.85242128, + "epoch": 0.30437710437710436, + "grad_norm": 7.03125, + "learning_rate": 1.9434754486595576e-05, + "loss": 0.53033781, + "memory(GiB)": 15.04, + "step": 2260, + "train_speed(iter/s)": 0.338764 + }, + { + "acc": 0.89758568, + "epoch": 0.30505050505050507, + "grad_norm": 13.375, + "learning_rate": 1.9431058069735016e-05, + "loss": 0.3443686, + "memory(GiB)": 15.04, + "step": 2265, + "train_speed(iter/s)": 0.338912 + }, + { + "acc": 0.86288328, + "epoch": 0.30572390572390573, + "grad_norm": 14.9375, + "learning_rate": 1.9427349959743983e-05, + "loss": 0.44742985, + "memory(GiB)": 15.04, + "step": 2270, + "train_speed(iter/s)": 0.339055 + }, + { + "acc": 0.7970829, + "epoch": 0.3063973063973064, + "grad_norm": 23.5, + "learning_rate": 1.9423630161219996e-05, + "loss": 0.73262329, + "memory(GiB)": 15.04, + "step": 2275, + "train_speed(iter/s)": 0.339145 + }, + { + "acc": 0.90343246, + "epoch": 0.30707070707070705, + "grad_norm": 10.3125, + "learning_rate": 1.941989867877506e-05, + "loss": 0.37732604, + "memory(GiB)": 15.04, + "step": 2280, + "train_speed(iter/s)": 0.33926 + }, + { + "acc": 0.89101963, + "epoch": 0.30774410774410776, + "grad_norm": 10.375, + "learning_rate": 1.9416155517035666e-05, + "loss": 0.42690949, + "memory(GiB)": 15.04, + "step": 2285, + "train_speed(iter/s)": 0.339246 + }, + { + "acc": 0.80757065, + "epoch": 0.3084175084175084, + "grad_norm": 22.375, + "learning_rate": 1.9412400680642785e-05, + "loss": 0.57092986, + "memory(GiB)": 15.04, + "step": 2290, + "train_speed(iter/s)": 0.339331 + }, + { + "acc": 0.84998417, + "epoch": 0.3090909090909091, + "grad_norm": 9.125, + "learning_rate": 1.9408634174251864e-05, + "loss": 0.52270007, + "memory(GiB)": 15.04, + "step": 2295, + "train_speed(iter/s)": 0.339404 + }, + { + "acc": 0.89776154, + "epoch": 0.30976430976430974, + "grad_norm": 6.90625, + "learning_rate": 1.9404856002532822e-05, + "loss": 0.34066579, + "memory(GiB)": 15.04, + "step": 2300, + "train_speed(iter/s)": 0.339485 + }, + { + "acc": 0.87052498, + "epoch": 0.31043771043771046, + "grad_norm": 7.1875, + "learning_rate": 1.9401066170170034e-05, + "loss": 0.279761, + "memory(GiB)": 15.04, + "step": 2305, + "train_speed(iter/s)": 0.339601 + }, + { + "acc": 0.8039012, + "epoch": 0.3111111111111111, + "grad_norm": 17.25, + "learning_rate": 1.939726468186234e-05, + "loss": 0.89665995, + "memory(GiB)": 15.04, + "step": 2310, + "train_speed(iter/s)": 0.339741 + }, + { + "acc": 0.87178888, + "epoch": 0.3117845117845118, + "grad_norm": 7.0625, + "learning_rate": 1.939345154232303e-05, + "loss": 0.46687875, + "memory(GiB)": 15.04, + "step": 2315, + "train_speed(iter/s)": 0.339855 + }, + { + "acc": 0.77452545, + "epoch": 0.31245791245791243, + "grad_norm": 7.15625, + "learning_rate": 1.9389626756279834e-05, + "loss": 0.58602438, + "memory(GiB)": 15.04, + "step": 2320, + "train_speed(iter/s)": 0.339912 + }, + { + "acc": 0.87413473, + "epoch": 0.31313131313131315, + "grad_norm": 11.3125, + "learning_rate": 1.938579032847493e-05, + "loss": 0.43738194, + "memory(GiB)": 15.04, + "step": 2325, + "train_speed(iter/s)": 0.339952 + }, + { + "acc": 0.91057472, + "epoch": 0.3138047138047138, + "grad_norm": 10.9375, + "learning_rate": 1.9381942263664927e-05, + "loss": 0.37268085, + "memory(GiB)": 15.04, + "step": 2330, + "train_speed(iter/s)": 0.340096 + }, + { + "acc": 0.87649097, + "epoch": 0.31447811447811447, + "grad_norm": 8.4375, + "learning_rate": 1.9378082566620854e-05, + "loss": 0.36977, + "memory(GiB)": 15.04, + "step": 2335, + "train_speed(iter/s)": 0.340248 + }, + { + "acc": 0.77074413, + "epoch": 0.3151515151515151, + "grad_norm": 6.125, + "learning_rate": 1.9374211242128185e-05, + "loss": 0.65000758, + "memory(GiB)": 15.04, + "step": 2340, + "train_speed(iter/s)": 0.340298 + }, + { + "acc": 0.88139248, + "epoch": 0.31582491582491584, + "grad_norm": 11.4375, + "learning_rate": 1.937032829498678e-05, + "loss": 0.48605113, + "memory(GiB)": 15.04, + "step": 2345, + "train_speed(iter/s)": 0.340348 + }, + { + "acc": 0.8511385, + "epoch": 0.3164983164983165, + "grad_norm": 9.9375, + "learning_rate": 1.9366433730010933e-05, + "loss": 0.60395231, + "memory(GiB)": 15.04, + "step": 2350, + "train_speed(iter/s)": 0.340361 + }, + { + "acc": 0.85783482, + "epoch": 0.31717171717171716, + "grad_norm": 6.59375, + "learning_rate": 1.9362527552029332e-05, + "loss": 0.5698195, + "memory(GiB)": 15.04, + "step": 2355, + "train_speed(iter/s)": 0.340345 + }, + { + "acc": 0.81080713, + "epoch": 0.3178451178451178, + "grad_norm": 7.21875, + "learning_rate": 1.9358609765885066e-05, + "loss": 0.66538157, + "memory(GiB)": 15.04, + "step": 2360, + "train_speed(iter/s)": 0.340421 + }, + { + "acc": 0.86503201, + "epoch": 0.31851851851851853, + "grad_norm": 8.125, + "learning_rate": 1.9354680376435616e-05, + "loss": 0.66491618, + "memory(GiB)": 15.04, + "step": 2365, + "train_speed(iter/s)": 0.340483 + }, + { + "acc": 0.87831783, + "epoch": 0.3191919191919192, + "grad_norm": 8.875, + "learning_rate": 1.9350739388552845e-05, + "loss": 0.43169312, + "memory(GiB)": 15.04, + "step": 2370, + "train_speed(iter/s)": 0.340562 + }, + { + "acc": 0.86912212, + "epoch": 0.31986531986531985, + "grad_norm": 10.5625, + "learning_rate": 1.934678680712301e-05, + "loss": 0.47349477, + "memory(GiB)": 15.04, + "step": 2375, + "train_speed(iter/s)": 0.340661 + }, + { + "acc": 0.88821764, + "epoch": 0.3205387205387205, + "grad_norm": 15.4375, + "learning_rate": 1.934282263704672e-05, + "loss": 0.46184831, + "memory(GiB)": 15.04, + "step": 2380, + "train_speed(iter/s)": 0.340781 + }, + { + "acc": 0.89957047, + "epoch": 0.3212121212121212, + "grad_norm": 8.5, + "learning_rate": 1.933884688323898e-05, + "loss": 0.38067825, + "memory(GiB)": 15.04, + "step": 2385, + "train_speed(iter/s)": 0.340804 + }, + { + "acc": 0.85859814, + "epoch": 0.3218855218855219, + "grad_norm": 21.375, + "learning_rate": 1.933485955062913e-05, + "loss": 0.39313557, + "memory(GiB)": 15.04, + "step": 2390, + "train_speed(iter/s)": 0.340922 + }, + { + "acc": 0.86764755, + "epoch": 0.32255892255892255, + "grad_norm": 6.9375, + "learning_rate": 1.9330860644160884e-05, + "loss": 0.39578037, + "memory(GiB)": 15.04, + "step": 2395, + "train_speed(iter/s)": 0.340853 + }, + { + "acc": 0.8775794, + "epoch": 0.32323232323232326, + "grad_norm": 16.0, + "learning_rate": 1.93268501687923e-05, + "loss": 0.35117748, + "memory(GiB)": 15.04, + "step": 2400, + "train_speed(iter/s)": 0.340913 + }, + { + "epoch": 0.32323232323232326, + "eval_acc": 0.8610003959350666, + "eval_loss": 0.5346771478652954, + "eval_runtime": 109.5685, + "eval_samples_per_second": 1.369, + "eval_steps_per_second": 1.369, + "step": 2400 + }, + { + "acc": 0.85465479, + "epoch": 0.3239057239057239, + "grad_norm": 9.75, + "learning_rate": 1.9322828129495783e-05, + "loss": 0.58138494, + "memory(GiB)": 15.04, + "step": 2405, + "train_speed(iter/s)": 0.335788 + }, + { + "acc": 0.89197845, + "epoch": 0.3245791245791246, + "grad_norm": 24.5, + "learning_rate": 1.9318794531258064e-05, + "loss": 0.41574764, + "memory(GiB)": 15.04, + "step": 2410, + "train_speed(iter/s)": 0.335922 + }, + { + "acc": 0.82217188, + "epoch": 0.32525252525252524, + "grad_norm": 9.375, + "learning_rate": 1.931474937908022e-05, + "loss": 0.63134084, + "memory(GiB)": 15.04, + "step": 2415, + "train_speed(iter/s)": 0.33601 + }, + { + "acc": 0.87278347, + "epoch": 0.32592592592592595, + "grad_norm": 10.1875, + "learning_rate": 1.9310692677977645e-05, + "loss": 0.56534538, + "memory(GiB)": 15.04, + "step": 2420, + "train_speed(iter/s)": 0.336135 + }, + { + "acc": 0.83628635, + "epoch": 0.3265993265993266, + "grad_norm": 7.0, + "learning_rate": 1.930662443298006e-05, + "loss": 0.56281152, + "memory(GiB)": 15.04, + "step": 2425, + "train_speed(iter/s)": 0.33613 + }, + { + "acc": 0.85470467, + "epoch": 0.32727272727272727, + "grad_norm": 7.5, + "learning_rate": 1.9302544649131482e-05, + "loss": 0.4939652, + "memory(GiB)": 15.04, + "step": 2430, + "train_speed(iter/s)": 0.336209 + }, + { + "acc": 0.89396553, + "epoch": 0.32794612794612793, + "grad_norm": 8.9375, + "learning_rate": 1.9298453331490257e-05, + "loss": 0.43196988, + "memory(GiB)": 15.04, + "step": 2435, + "train_speed(iter/s)": 0.336355 + }, + { + "acc": 0.82339792, + "epoch": 0.32861952861952864, + "grad_norm": 5.375, + "learning_rate": 1.929435048512901e-05, + "loss": 0.61394019, + "memory(GiB)": 15.04, + "step": 2440, + "train_speed(iter/s)": 0.336437 + }, + { + "acc": 0.83198853, + "epoch": 0.3292929292929293, + "grad_norm": 12.25, + "learning_rate": 1.9290236115134677e-05, + "loss": 0.47708797, + "memory(GiB)": 15.04, + "step": 2445, + "train_speed(iter/s)": 0.336573 + }, + { + "acc": 0.82138634, + "epoch": 0.32996632996632996, + "grad_norm": 14.0625, + "learning_rate": 1.9286110226608465e-05, + "loss": 0.55513005, + "memory(GiB)": 15.04, + "step": 2450, + "train_speed(iter/s)": 0.336678 + }, + { + "acc": 0.89609404, + "epoch": 0.3306397306397306, + "grad_norm": 11.3125, + "learning_rate": 1.928197282466588e-05, + "loss": 0.36580346, + "memory(GiB)": 15.04, + "step": 2455, + "train_speed(iter/s)": 0.336814 + }, + { + "acc": 0.78545809, + "epoch": 0.33131313131313134, + "grad_norm": 28.25, + "learning_rate": 1.9277823914436688e-05, + "loss": 0.89613228, + "memory(GiB)": 15.04, + "step": 2460, + "train_speed(iter/s)": 0.336865 + }, + { + "acc": 0.89025421, + "epoch": 0.331986531986532, + "grad_norm": 7.65625, + "learning_rate": 1.927366350106494e-05, + "loss": 0.36613114, + "memory(GiB)": 15.04, + "step": 2465, + "train_speed(iter/s)": 0.336846 + }, + { + "acc": 0.86367445, + "epoch": 0.33265993265993266, + "grad_norm": 8.8125, + "learning_rate": 1.9269491589708927e-05, + "loss": 0.4765162, + "memory(GiB)": 15.04, + "step": 2470, + "train_speed(iter/s)": 0.336932 + }, + { + "acc": 0.90018415, + "epoch": 0.3333333333333333, + "grad_norm": 7.75, + "learning_rate": 1.926530818554121e-05, + "loss": 0.37076304, + "memory(GiB)": 15.04, + "step": 2475, + "train_speed(iter/s)": 0.337088 + }, + { + "acc": 0.89008112, + "epoch": 0.33400673400673403, + "grad_norm": 9.0625, + "learning_rate": 1.9261113293748607e-05, + "loss": 0.35568271, + "memory(GiB)": 15.04, + "step": 2480, + "train_speed(iter/s)": 0.337163 + }, + { + "acc": 0.881637, + "epoch": 0.3346801346801347, + "grad_norm": 10.875, + "learning_rate": 1.9256906919532162e-05, + "loss": 0.39150298, + "memory(GiB)": 15.04, + "step": 2485, + "train_speed(iter/s)": 0.337288 + }, + { + "acc": 0.87954836, + "epoch": 0.33535353535353535, + "grad_norm": 6.28125, + "learning_rate": 1.925268906810716e-05, + "loss": 0.4459631, + "memory(GiB)": 15.04, + "step": 2490, + "train_speed(iter/s)": 0.337353 + }, + { + "acc": 0.86157665, + "epoch": 0.336026936026936, + "grad_norm": 7.96875, + "learning_rate": 1.9248459744703126e-05, + "loss": 0.53909798, + "memory(GiB)": 15.04, + "step": 2495, + "train_speed(iter/s)": 0.33748 + }, + { + "acc": 0.84323149, + "epoch": 0.3367003367003367, + "grad_norm": 7.53125, + "learning_rate": 1.9244218954563797e-05, + "loss": 0.4920475, + "memory(GiB)": 15.04, + "step": 2500, + "train_speed(iter/s)": 0.337538 + }, + { + "acc": 0.82223568, + "epoch": 0.3373737373737374, + "grad_norm": 20.25, + "learning_rate": 1.923996670294713e-05, + "loss": 0.52200842, + "memory(GiB)": 15.04, + "step": 2505, + "train_speed(iter/s)": 0.337522 + }, + { + "acc": 0.86578159, + "epoch": 0.33804713804713804, + "grad_norm": 14.3125, + "learning_rate": 1.92357029951253e-05, + "loss": 0.49261842, + "memory(GiB)": 15.04, + "step": 2510, + "train_speed(iter/s)": 0.337617 + }, + { + "acc": 0.89437923, + "epoch": 0.3387205387205387, + "grad_norm": 6.9375, + "learning_rate": 1.9231427836384673e-05, + "loss": 0.38732541, + "memory(GiB)": 15.04, + "step": 2515, + "train_speed(iter/s)": 0.337674 + }, + { + "acc": 0.88720303, + "epoch": 0.3393939393939394, + "grad_norm": 5.75, + "learning_rate": 1.9227141232025824e-05, + "loss": 0.41020632, + "memory(GiB)": 15.04, + "step": 2520, + "train_speed(iter/s)": 0.337773 + }, + { + "acc": 0.88686686, + "epoch": 0.3400673400673401, + "grad_norm": 5.8125, + "learning_rate": 1.9222843187363518e-05, + "loss": 0.40615201, + "memory(GiB)": 15.04, + "step": 2525, + "train_speed(iter/s)": 0.33776 + }, + { + "acc": 0.88341837, + "epoch": 0.34074074074074073, + "grad_norm": 4.71875, + "learning_rate": 1.9218533707726693e-05, + "loss": 0.43876266, + "memory(GiB)": 15.04, + "step": 2530, + "train_speed(iter/s)": 0.337763 + }, + { + "acc": 0.88013363, + "epoch": 0.3414141414141414, + "grad_norm": 6.28125, + "learning_rate": 1.9214212798458477e-05, + "loss": 0.45347929, + "memory(GiB)": 15.04, + "step": 2535, + "train_speed(iter/s)": 0.337818 + }, + { + "acc": 0.89623661, + "epoch": 0.3420875420875421, + "grad_norm": 7.0, + "learning_rate": 1.9209880464916163e-05, + "loss": 0.41357417, + "memory(GiB)": 15.04, + "step": 2540, + "train_speed(iter/s)": 0.337882 + }, + { + "acc": 0.85681429, + "epoch": 0.34276094276094277, + "grad_norm": 8.4375, + "learning_rate": 1.9205536712471212e-05, + "loss": 0.47629442, + "memory(GiB)": 15.04, + "step": 2545, + "train_speed(iter/s)": 0.337973 + }, + { + "acc": 0.85323639, + "epoch": 0.3434343434343434, + "grad_norm": 7.9375, + "learning_rate": 1.920118154650924e-05, + "loss": 0.42014799, + "memory(GiB)": 15.04, + "step": 2550, + "train_speed(iter/s)": 0.33803 + }, + { + "acc": 0.81921091, + "epoch": 0.3441077441077441, + "grad_norm": 7.34375, + "learning_rate": 1.9196814972430013e-05, + "loss": 0.67137895, + "memory(GiB)": 15.04, + "step": 2555, + "train_speed(iter/s)": 0.338099 + }, + { + "acc": 0.86797428, + "epoch": 0.3447811447811448, + "grad_norm": 23.75, + "learning_rate": 1.9192436995647444e-05, + "loss": 0.6300138, + "memory(GiB)": 15.04, + "step": 2560, + "train_speed(iter/s)": 0.338222 + }, + { + "acc": 0.86078186, + "epoch": 0.34545454545454546, + "grad_norm": 9.25, + "learning_rate": 1.918804762158958e-05, + "loss": 0.46266317, + "memory(GiB)": 15.04, + "step": 2565, + "train_speed(iter/s)": 0.338335 + }, + { + "acc": 0.90208426, + "epoch": 0.3461279461279461, + "grad_norm": 8.6875, + "learning_rate": 1.918364685569861e-05, + "loss": 0.348434, + "memory(GiB)": 15.04, + "step": 2570, + "train_speed(iter/s)": 0.33846 + }, + { + "acc": 0.87423429, + "epoch": 0.3468013468013468, + "grad_norm": 5.15625, + "learning_rate": 1.9179234703430834e-05, + "loss": 0.42522211, + "memory(GiB)": 15.04, + "step": 2575, + "train_speed(iter/s)": 0.3385 + }, + { + "acc": 0.83370371, + "epoch": 0.3474747474747475, + "grad_norm": 9.875, + "learning_rate": 1.917481117025667e-05, + "loss": 0.6267858, + "memory(GiB)": 15.04, + "step": 2580, + "train_speed(iter/s)": 0.338562 + }, + { + "acc": 0.89248533, + "epoch": 0.34814814814814815, + "grad_norm": 12.0625, + "learning_rate": 1.917037626166066e-05, + "loss": 0.38710203, + "memory(GiB)": 15.04, + "step": 2585, + "train_speed(iter/s)": 0.338704 + }, + { + "acc": 0.88555288, + "epoch": 0.3488215488215488, + "grad_norm": 10.375, + "learning_rate": 1.9165929983141436e-05, + "loss": 0.36460023, + "memory(GiB)": 15.04, + "step": 2590, + "train_speed(iter/s)": 0.338806 + }, + { + "acc": 0.85448437, + "epoch": 0.34949494949494947, + "grad_norm": 16.75, + "learning_rate": 1.916147234021173e-05, + "loss": 0.58085775, + "memory(GiB)": 15.04, + "step": 2595, + "train_speed(iter/s)": 0.338771 + }, + { + "acc": 0.90577812, + "epoch": 0.3501683501683502, + "grad_norm": 8.5, + "learning_rate": 1.915700333839837e-05, + "loss": 0.28706489, + "memory(GiB)": 15.04, + "step": 2600, + "train_speed(iter/s)": 0.338854 + }, + { + "acc": 0.83092384, + "epoch": 0.35084175084175084, + "grad_norm": 18.125, + "learning_rate": 1.9152522983242266e-05, + "loss": 0.59093304, + "memory(GiB)": 15.04, + "step": 2605, + "train_speed(iter/s)": 0.338916 + }, + { + "acc": 0.877106, + "epoch": 0.3515151515151515, + "grad_norm": 9.5625, + "learning_rate": 1.9148031280298393e-05, + "loss": 0.53991098, + "memory(GiB)": 15.04, + "step": 2610, + "train_speed(iter/s)": 0.338995 + }, + { + "acc": 0.84207191, + "epoch": 0.35218855218855216, + "grad_norm": 6.40625, + "learning_rate": 1.9143528235135815e-05, + "loss": 0.80373363, + "memory(GiB)": 15.04, + "step": 2615, + "train_speed(iter/s)": 0.338889 + }, + { + "acc": 0.88392582, + "epoch": 0.3528619528619529, + "grad_norm": 10.5, + "learning_rate": 1.9139013853337644e-05, + "loss": 0.37835231, + "memory(GiB)": 15.04, + "step": 2620, + "train_speed(iter/s)": 0.338966 + }, + { + "acc": 0.85071363, + "epoch": 0.35353535353535354, + "grad_norm": 5.71875, + "learning_rate": 1.9134488140501046e-05, + "loss": 0.42265129, + "memory(GiB)": 15.04, + "step": 2625, + "train_speed(iter/s)": 0.338968 + }, + { + "acc": 0.83898754, + "epoch": 0.3542087542087542, + "grad_norm": 15.8125, + "learning_rate": 1.9129951102237254e-05, + "loss": 0.74677463, + "memory(GiB)": 15.04, + "step": 2630, + "train_speed(iter/s)": 0.338976 + }, + { + "acc": 0.84760046, + "epoch": 0.3548821548821549, + "grad_norm": 8.9375, + "learning_rate": 1.9125402744171523e-05, + "loss": 0.4743371, + "memory(GiB)": 15.04, + "step": 2635, + "train_speed(iter/s)": 0.339044 + }, + { + "acc": 0.88525162, + "epoch": 0.35555555555555557, + "grad_norm": 8.375, + "learning_rate": 1.912084307194315e-05, + "loss": 0.39847014, + "memory(GiB)": 15.04, + "step": 2640, + "train_speed(iter/s)": 0.33912 + }, + { + "acc": 0.86372814, + "epoch": 0.35622895622895623, + "grad_norm": 12.5, + "learning_rate": 1.9116272091205464e-05, + "loss": 0.40252566, + "memory(GiB)": 15.04, + "step": 2645, + "train_speed(iter/s)": 0.339259 + }, + { + "acc": 0.88826094, + "epoch": 0.3569023569023569, + "grad_norm": 9.375, + "learning_rate": 1.9111689807625812e-05, + "loss": 0.47242732, + "memory(GiB)": 15.04, + "step": 2650, + "train_speed(iter/s)": 0.339291 + }, + { + "acc": 0.87926731, + "epoch": 0.3575757575757576, + "grad_norm": 6.84375, + "learning_rate": 1.910709622688555e-05, + "loss": 0.36133299, + "memory(GiB)": 15.04, + "step": 2655, + "train_speed(iter/s)": 0.339335 + }, + { + "acc": 0.76420794, + "epoch": 0.35824915824915826, + "grad_norm": 9.125, + "learning_rate": 1.9102491354680048e-05, + "loss": 0.98229446, + "memory(GiB)": 15.04, + "step": 2660, + "train_speed(iter/s)": 0.339389 + }, + { + "acc": 0.84489317, + "epoch": 0.3589225589225589, + "grad_norm": 5.875, + "learning_rate": 1.9097875196718676e-05, + "loss": 0.47089186, + "memory(GiB)": 15.04, + "step": 2665, + "train_speed(iter/s)": 0.339384 + }, + { + "acc": 0.9002903, + "epoch": 0.3595959595959596, + "grad_norm": 6.125, + "learning_rate": 1.9093247758724786e-05, + "loss": 0.29986346, + "memory(GiB)": 15.04, + "step": 2670, + "train_speed(iter/s)": 0.339496 + }, + { + "acc": 0.8404459, + "epoch": 0.3602693602693603, + "grad_norm": 13.8125, + "learning_rate": 1.9088609046435732e-05, + "loss": 0.56306767, + "memory(GiB)": 15.04, + "step": 2675, + "train_speed(iter/s)": 0.339618 + }, + { + "acc": 0.89491272, + "epoch": 0.36094276094276095, + "grad_norm": 12.4375, + "learning_rate": 1.9083959065602834e-05, + "loss": 0.30484023, + "memory(GiB)": 15.04, + "step": 2680, + "train_speed(iter/s)": 0.339702 + }, + { + "acc": 0.85790787, + "epoch": 0.3616161616161616, + "grad_norm": 7.65625, + "learning_rate": 1.9079297821991384e-05, + "loss": 0.60322633, + "memory(GiB)": 15.04, + "step": 2685, + "train_speed(iter/s)": 0.33974 + }, + { + "acc": 0.80215073, + "epoch": 0.3622895622895623, + "grad_norm": 9.1875, + "learning_rate": 1.9074625321380645e-05, + "loss": 0.72699332, + "memory(GiB)": 15.04, + "step": 2690, + "train_speed(iter/s)": 0.339882 + }, + { + "acc": 0.83883438, + "epoch": 0.362962962962963, + "grad_norm": 7.46875, + "learning_rate": 1.9069941569563833e-05, + "loss": 0.78730655, + "memory(GiB)": 15.04, + "step": 2695, + "train_speed(iter/s)": 0.339985 + }, + { + "acc": 0.89459028, + "epoch": 0.36363636363636365, + "grad_norm": 11.875, + "learning_rate": 1.9065246572348112e-05, + "loss": 0.40213513, + "memory(GiB)": 15.04, + "step": 2700, + "train_speed(iter/s)": 0.340122 + }, + { + "epoch": 0.36363636363636365, + "eval_acc": 0.8660754467240238, + "eval_loss": 0.5180864334106445, + "eval_runtime": 109.593, + "eval_samples_per_second": 1.369, + "eval_steps_per_second": 1.369, + "step": 2700 + }, + { + "acc": 0.81508904, + "epoch": 0.3643097643097643, + "grad_norm": 6.96875, + "learning_rate": 1.9060540335554597e-05, + "loss": 0.48082347, + "memory(GiB)": 15.04, + "step": 2705, + "train_speed(iter/s)": 0.33562 + }, + { + "acc": 0.85648174, + "epoch": 0.36498316498316496, + "grad_norm": 5.9375, + "learning_rate": 1.905582286501832e-05, + "loss": 0.4697526, + "memory(GiB)": 15.04, + "step": 2710, + "train_speed(iter/s)": 0.335652 + }, + { + "acc": 0.88261299, + "epoch": 0.3656565656565657, + "grad_norm": 13.75, + "learning_rate": 1.9051094166588265e-05, + "loss": 0.38674028, + "memory(GiB)": 15.04, + "step": 2715, + "train_speed(iter/s)": 0.335695 + }, + { + "acc": 0.86682205, + "epoch": 0.36632996632996634, + "grad_norm": 8.0625, + "learning_rate": 1.9046354246127322e-05, + "loss": 0.53947921, + "memory(GiB)": 15.04, + "step": 2720, + "train_speed(iter/s)": 0.335667 + }, + { + "acc": 0.8555212, + "epoch": 0.367003367003367, + "grad_norm": 7.25, + "learning_rate": 1.9041603109512296e-05, + "loss": 0.29316967, + "memory(GiB)": 15.04, + "step": 2725, + "train_speed(iter/s)": 0.335742 + }, + { + "acc": 0.89408617, + "epoch": 0.36767676767676766, + "grad_norm": 10.4375, + "learning_rate": 1.90368407626339e-05, + "loss": 0.3587872, + "memory(GiB)": 15.04, + "step": 2730, + "train_speed(iter/s)": 0.335816 + }, + { + "acc": 0.89987421, + "epoch": 0.36835016835016837, + "grad_norm": 8.6875, + "learning_rate": 1.9032067211396747e-05, + "loss": 0.40207324, + "memory(GiB)": 15.04, + "step": 2735, + "train_speed(iter/s)": 0.335891 + }, + { + "acc": 0.81146135, + "epoch": 0.36902356902356903, + "grad_norm": 13.5625, + "learning_rate": 1.9027282461719348e-05, + "loss": 0.69441767, + "memory(GiB)": 15.04, + "step": 2740, + "train_speed(iter/s)": 0.335968 + }, + { + "acc": 0.84760532, + "epoch": 0.3696969696969697, + "grad_norm": 10.375, + "learning_rate": 1.902248651953408e-05, + "loss": 0.47364888, + "memory(GiB)": 15.04, + "step": 2745, + "train_speed(iter/s)": 0.336091 + }, + { + "acc": 0.85408421, + "epoch": 0.37037037037037035, + "grad_norm": 6.28125, + "learning_rate": 1.901767939078722e-05, + "loss": 0.44126396, + "memory(GiB)": 15.04, + "step": 2750, + "train_speed(iter/s)": 0.336111 + }, + { + "acc": 0.86288681, + "epoch": 0.37104377104377106, + "grad_norm": 15.0, + "learning_rate": 1.9012861081438896e-05, + "loss": 0.63420038, + "memory(GiB)": 15.04, + "step": 2755, + "train_speed(iter/s)": 0.336166 + }, + { + "acc": 0.86562033, + "epoch": 0.3717171717171717, + "grad_norm": 12.125, + "learning_rate": 1.900803159746311e-05, + "loss": 0.36843266, + "memory(GiB)": 15.04, + "step": 2760, + "train_speed(iter/s)": 0.336268 + }, + { + "acc": 0.84120855, + "epoch": 0.3723905723905724, + "grad_norm": 11.5, + "learning_rate": 1.900319094484771e-05, + "loss": 0.62543631, + "memory(GiB)": 15.04, + "step": 2765, + "train_speed(iter/s)": 0.336305 + }, + { + "acc": 0.893818, + "epoch": 0.37306397306397304, + "grad_norm": 11.3125, + "learning_rate": 1.89983391295944e-05, + "loss": 0.41919742, + "memory(GiB)": 15.04, + "step": 2770, + "train_speed(iter/s)": 0.336404 + }, + { + "acc": 0.87164793, + "epoch": 0.37373737373737376, + "grad_norm": 5.3125, + "learning_rate": 1.8993476157718715e-05, + "loss": 0.35139127, + "memory(GiB)": 15.04, + "step": 2775, + "train_speed(iter/s)": 0.336479 + }, + { + "acc": 0.86639566, + "epoch": 0.3744107744107744, + "grad_norm": 8.3125, + "learning_rate": 1.8988602035250037e-05, + "loss": 0.49558616, + "memory(GiB)": 15.04, + "step": 2780, + "train_speed(iter/s)": 0.336593 + }, + { + "acc": 0.83523035, + "epoch": 0.3750841750841751, + "grad_norm": 27.75, + "learning_rate": 1.8983716768231554e-05, + "loss": 0.53672395, + "memory(GiB)": 15.04, + "step": 2785, + "train_speed(iter/s)": 0.336731 + }, + { + "acc": 0.87954035, + "epoch": 0.37575757575757573, + "grad_norm": 9.8125, + "learning_rate": 1.897882036272029e-05, + "loss": 0.48749771, + "memory(GiB)": 15.04, + "step": 2790, + "train_speed(iter/s)": 0.336859 + }, + { + "acc": 0.87321129, + "epoch": 0.37643097643097645, + "grad_norm": 6.1875, + "learning_rate": 1.8973912824787068e-05, + "loss": 0.44543729, + "memory(GiB)": 15.04, + "step": 2795, + "train_speed(iter/s)": 0.336957 + }, + { + "acc": 0.87619419, + "epoch": 0.3771043771043771, + "grad_norm": 6.5625, + "learning_rate": 1.8968994160516516e-05, + "loss": 0.52440739, + "memory(GiB)": 15.04, + "step": 2800, + "train_speed(iter/s)": 0.336906 + }, + { + "acc": 0.83263006, + "epoch": 0.37777777777777777, + "grad_norm": 15.0, + "learning_rate": 1.896406437600705e-05, + "loss": 0.69280262, + "memory(GiB)": 15.04, + "step": 2805, + "train_speed(iter/s)": 0.336973 + }, + { + "acc": 0.82279167, + "epoch": 0.3784511784511784, + "grad_norm": 9.0, + "learning_rate": 1.895912347737089e-05, + "loss": 0.48401203, + "memory(GiB)": 15.04, + "step": 2810, + "train_speed(iter/s)": 0.337095 + }, + { + "acc": 0.86654186, + "epoch": 0.37912457912457914, + "grad_norm": 18.25, + "learning_rate": 1.8954171470734023e-05, + "loss": 0.41237874, + "memory(GiB)": 15.04, + "step": 2815, + "train_speed(iter/s)": 0.337226 + }, + { + "acc": 0.8949935, + "epoch": 0.3797979797979798, + "grad_norm": 10.625, + "learning_rate": 1.894920836223621e-05, + "loss": 0.3719039, + "memory(GiB)": 15.04, + "step": 2820, + "train_speed(iter/s)": 0.33733 + }, + { + "acc": 0.88594656, + "epoch": 0.38047138047138046, + "grad_norm": 9.0, + "learning_rate": 1.894423415803098e-05, + "loss": 0.44963932, + "memory(GiB)": 15.04, + "step": 2825, + "train_speed(iter/s)": 0.337397 + }, + { + "acc": 0.8899971, + "epoch": 0.3811447811447811, + "grad_norm": 14.125, + "learning_rate": 1.893924886428562e-05, + "loss": 0.37841399, + "memory(GiB)": 15.04, + "step": 2830, + "train_speed(iter/s)": 0.337479 + }, + { + "acc": 0.8806571, + "epoch": 0.38181818181818183, + "grad_norm": 10.625, + "learning_rate": 1.8934252487181165e-05, + "loss": 0.44682927, + "memory(GiB)": 15.04, + "step": 2835, + "train_speed(iter/s)": 0.337607 + }, + { + "acc": 0.85668678, + "epoch": 0.3824915824915825, + "grad_norm": 4.34375, + "learning_rate": 1.8929245032912385e-05, + "loss": 0.46805487, + "memory(GiB)": 15.04, + "step": 2840, + "train_speed(iter/s)": 0.337638 + }, + { + "acc": 0.90456429, + "epoch": 0.38316498316498315, + "grad_norm": 8.375, + "learning_rate": 1.8924226507687793e-05, + "loss": 0.43100266, + "memory(GiB)": 15.04, + "step": 2845, + "train_speed(iter/s)": 0.337693 + }, + { + "acc": 0.92130375, + "epoch": 0.3838383838383838, + "grad_norm": 8.6875, + "learning_rate": 1.8919196917729623e-05, + "loss": 0.34470038, + "memory(GiB)": 15.04, + "step": 2850, + "train_speed(iter/s)": 0.337775 + }, + { + "acc": 0.81275425, + "epoch": 0.3845117845117845, + "grad_norm": 8.1875, + "learning_rate": 1.8914156269273833e-05, + "loss": 0.79330454, + "memory(GiB)": 15.04, + "step": 2855, + "train_speed(iter/s)": 0.337919 + }, + { + "acc": 0.84950256, + "epoch": 0.3851851851851852, + "grad_norm": 19.25, + "learning_rate": 1.8909104568570086e-05, + "loss": 0.3624054, + "memory(GiB)": 15.04, + "step": 2860, + "train_speed(iter/s)": 0.338038 + }, + { + "acc": 0.91833963, + "epoch": 0.38585858585858585, + "grad_norm": 9.5625, + "learning_rate": 1.890404182188175e-05, + "loss": 0.30394173, + "memory(GiB)": 15.04, + "step": 2865, + "train_speed(iter/s)": 0.338122 + }, + { + "acc": 0.88120604, + "epoch": 0.3865319865319865, + "grad_norm": 8.5625, + "learning_rate": 1.8898968035485895e-05, + "loss": 0.44618464, + "memory(GiB)": 15.04, + "step": 2870, + "train_speed(iter/s)": 0.338145 + }, + { + "acc": 0.90447998, + "epoch": 0.3872053872053872, + "grad_norm": 8.25, + "learning_rate": 1.8893883215673266e-05, + "loss": 0.39719179, + "memory(GiB)": 15.04, + "step": 2875, + "train_speed(iter/s)": 0.338192 + }, + { + "acc": 0.81772985, + "epoch": 0.3878787878787879, + "grad_norm": 21.25, + "learning_rate": 1.88887873687483e-05, + "loss": 0.3955199, + "memory(GiB)": 15.04, + "step": 2880, + "train_speed(iter/s)": 0.338299 + }, + { + "acc": 0.85085459, + "epoch": 0.38855218855218854, + "grad_norm": 4.5625, + "learning_rate": 1.8883680501029098e-05, + "loss": 0.59997473, + "memory(GiB)": 15.04, + "step": 2885, + "train_speed(iter/s)": 0.338406 + }, + { + "acc": 0.83106422, + "epoch": 0.38922558922558925, + "grad_norm": 6.0, + "learning_rate": 1.887856261884743e-05, + "loss": 0.30067787, + "memory(GiB)": 15.04, + "step": 2890, + "train_speed(iter/s)": 0.338447 + }, + { + "acc": 0.92400789, + "epoch": 0.3898989898989899, + "grad_norm": 7.0, + "learning_rate": 1.8873433728548716e-05, + "loss": 0.23872533, + "memory(GiB)": 15.04, + "step": 2895, + "train_speed(iter/s)": 0.338594 + }, + { + "acc": 0.8933054, + "epoch": 0.39057239057239057, + "grad_norm": 9.0, + "learning_rate": 1.886829383649203e-05, + "loss": 0.32230327, + "memory(GiB)": 15.04, + "step": 2900, + "train_speed(iter/s)": 0.338727 + }, + { + "acc": 0.90259237, + "epoch": 0.39124579124579123, + "grad_norm": 12.75, + "learning_rate": 1.886314294905009e-05, + "loss": 0.30169015, + "memory(GiB)": 15.04, + "step": 2905, + "train_speed(iter/s)": 0.338845 + }, + { + "acc": 0.84158916, + "epoch": 0.39191919191919194, + "grad_norm": 6.96875, + "learning_rate": 1.8857981072609236e-05, + "loss": 0.45472388, + "memory(GiB)": 15.04, + "step": 2910, + "train_speed(iter/s)": 0.338896 + }, + { + "acc": 0.8605051, + "epoch": 0.3925925925925926, + "grad_norm": 6.3125, + "learning_rate": 1.8852808213569443e-05, + "loss": 0.54497418, + "memory(GiB)": 15.04, + "step": 2915, + "train_speed(iter/s)": 0.33896 + }, + { + "acc": 0.87266006, + "epoch": 0.39326599326599326, + "grad_norm": 7.78125, + "learning_rate": 1.8847624378344293e-05, + "loss": 0.48023129, + "memory(GiB)": 15.04, + "step": 2920, + "train_speed(iter/s)": 0.339032 + }, + { + "acc": 0.88253145, + "epoch": 0.3939393939393939, + "grad_norm": 6.875, + "learning_rate": 1.8842429573360987e-05, + "loss": 0.27666228, + "memory(GiB)": 15.04, + "step": 2925, + "train_speed(iter/s)": 0.339151 + }, + { + "acc": 0.88198681, + "epoch": 0.39461279461279464, + "grad_norm": 5.5625, + "learning_rate": 1.8837223805060323e-05, + "loss": 0.38521972, + "memory(GiB)": 15.04, + "step": 2930, + "train_speed(iter/s)": 0.339195 + }, + { + "acc": 0.85078106, + "epoch": 0.3952861952861953, + "grad_norm": 12.6875, + "learning_rate": 1.8832007079896685e-05, + "loss": 0.56066775, + "memory(GiB)": 15.04, + "step": 2935, + "train_speed(iter/s)": 0.339241 + }, + { + "acc": 0.86893778, + "epoch": 0.39595959595959596, + "grad_norm": 7.71875, + "learning_rate": 1.8826779404338055e-05, + "loss": 0.58338552, + "memory(GiB)": 15.04, + "step": 2940, + "train_speed(iter/s)": 0.33937 + }, + { + "acc": 0.87823391, + "epoch": 0.3966329966329966, + "grad_norm": 13.9375, + "learning_rate": 1.8821540784865983e-05, + "loss": 0.36951785, + "memory(GiB)": 15.04, + "step": 2945, + "train_speed(iter/s)": 0.339399 + }, + { + "acc": 0.81051741, + "epoch": 0.39730639730639733, + "grad_norm": 26.5, + "learning_rate": 1.8816291227975587e-05, + "loss": 0.58486147, + "memory(GiB)": 15.04, + "step": 2950, + "train_speed(iter/s)": 0.339504 + }, + { + "acc": 0.88986492, + "epoch": 0.397979797979798, + "grad_norm": 14.625, + "learning_rate": 1.881103074017555e-05, + "loss": 0.32249541, + "memory(GiB)": 15.04, + "step": 2955, + "train_speed(iter/s)": 0.339629 + }, + { + "acc": 0.77094898, + "epoch": 0.39865319865319865, + "grad_norm": 7.28125, + "learning_rate": 1.8805759327988108e-05, + "loss": 0.47732401, + "memory(GiB)": 15.04, + "step": 2960, + "train_speed(iter/s)": 0.3397 + }, + { + "acc": 0.87760706, + "epoch": 0.3993265993265993, + "grad_norm": 6.625, + "learning_rate": 1.8800476997949033e-05, + "loss": 0.39705057, + "memory(GiB)": 15.04, + "step": 2965, + "train_speed(iter/s)": 0.339765 + }, + { + "acc": 0.88057041, + "epoch": 0.4, + "grad_norm": 11.875, + "learning_rate": 1.879518375660765e-05, + "loss": 0.40552897, + "memory(GiB)": 15.04, + "step": 2970, + "train_speed(iter/s)": 0.339757 + }, + { + "acc": 0.90826578, + "epoch": 0.4006734006734007, + "grad_norm": 7.6875, + "learning_rate": 1.87898796105268e-05, + "loss": 0.29796383, + "memory(GiB)": 15.04, + "step": 2975, + "train_speed(iter/s)": 0.33984 + }, + { + "acc": 0.88463964, + "epoch": 0.40134680134680134, + "grad_norm": 12.5, + "learning_rate": 1.8784564566282845e-05, + "loss": 0.41117978, + "memory(GiB)": 15.04, + "step": 2980, + "train_speed(iter/s)": 0.33991 + }, + { + "acc": 0.87788677, + "epoch": 0.402020202020202, + "grad_norm": 6.78125, + "learning_rate": 1.877923863046566e-05, + "loss": 0.63874054, + "memory(GiB)": 15.04, + "step": 2985, + "train_speed(iter/s)": 0.339976 + }, + { + "acc": 0.90607147, + "epoch": 0.4026936026936027, + "grad_norm": 6.8125, + "learning_rate": 1.877390180967863e-05, + "loss": 0.33274183, + "memory(GiB)": 15.04, + "step": 2990, + "train_speed(iter/s)": 0.340038 + }, + { + "acc": 0.88634825, + "epoch": 0.4033670033670034, + "grad_norm": 7.03125, + "learning_rate": 1.8768554110538626e-05, + "loss": 0.34868686, + "memory(GiB)": 15.04, + "step": 2995, + "train_speed(iter/s)": 0.340082 + }, + { + "acc": 0.83088036, + "epoch": 0.40404040404040403, + "grad_norm": 5.78125, + "learning_rate": 1.8763195539676017e-05, + "loss": 0.54208879, + "memory(GiB)": 15.04, + "step": 3000, + "train_speed(iter/s)": 0.340137 + }, + { + "epoch": 0.40404040404040403, + "eval_acc": 0.8710384184245389, + "eval_loss": 0.5025013089179993, + "eval_runtime": 110.2728, + "eval_samples_per_second": 1.36, + "eval_steps_per_second": 1.36, + "step": 3000 + }, + { + "acc": 0.82111616, + "epoch": 0.4047138047138047, + "grad_norm": 6.46875, + "learning_rate": 1.875782610373464e-05, + "loss": 0.78256674, + "memory(GiB)": 15.04, + "step": 3005, + "train_speed(iter/s)": 0.335798 + }, + { + "acc": 0.84464054, + "epoch": 0.4053872053872054, + "grad_norm": 20.75, + "learning_rate": 1.8752445809371813e-05, + "loss": 0.53980637, + "memory(GiB)": 15.04, + "step": 3010, + "train_speed(iter/s)": 0.335805 + }, + { + "acc": 0.83855562, + "epoch": 0.40606060606060607, + "grad_norm": 8.75, + "learning_rate": 1.874705466325831e-05, + "loss": 0.60607486, + "memory(GiB)": 15.04, + "step": 3015, + "train_speed(iter/s)": 0.335864 + }, + { + "acc": 0.90274582, + "epoch": 0.4067340067340067, + "grad_norm": 14.3125, + "learning_rate": 1.8741652672078366e-05, + "loss": 0.41548071, + "memory(GiB)": 15.04, + "step": 3020, + "train_speed(iter/s)": 0.335997 + }, + { + "acc": 0.92024384, + "epoch": 0.4074074074074074, + "grad_norm": 7.21875, + "learning_rate": 1.8736239842529658e-05, + "loss": 0.25771976, + "memory(GiB)": 15.04, + "step": 3025, + "train_speed(iter/s)": 0.336048 + }, + { + "acc": 0.89563293, + "epoch": 0.4080808080808081, + "grad_norm": 5.09375, + "learning_rate": 1.8730816181323297e-05, + "loss": 0.40562844, + "memory(GiB)": 15.04, + "step": 3030, + "train_speed(iter/s)": 0.336048 + }, + { + "acc": 0.81184473, + "epoch": 0.40875420875420876, + "grad_norm": 7.25, + "learning_rate": 1.8725381695183836e-05, + "loss": 0.62321434, + "memory(GiB)": 15.04, + "step": 3035, + "train_speed(iter/s)": 0.336055 + }, + { + "acc": 0.89695549, + "epoch": 0.4094276094276094, + "grad_norm": 5.0625, + "learning_rate": 1.8719936390849234e-05, + "loss": 0.37249773, + "memory(GiB)": 15.04, + "step": 3040, + "train_speed(iter/s)": 0.336156 + }, + { + "acc": 0.89363928, + "epoch": 0.4101010101010101, + "grad_norm": 10.0625, + "learning_rate": 1.8714480275070874e-05, + "loss": 0.37879419, + "memory(GiB)": 15.04, + "step": 3045, + "train_speed(iter/s)": 0.336237 + }, + { + "acc": 0.84344769, + "epoch": 0.4107744107744108, + "grad_norm": 9.1875, + "learning_rate": 1.8709013354613544e-05, + "loss": 0.48255649, + "memory(GiB)": 15.04, + "step": 3050, + "train_speed(iter/s)": 0.336308 + }, + { + "acc": 0.83006134, + "epoch": 0.41144781144781145, + "grad_norm": 13.6875, + "learning_rate": 1.8703535636255423e-05, + "loss": 0.52381067, + "memory(GiB)": 15.04, + "step": 3055, + "train_speed(iter/s)": 0.336377 + }, + { + "acc": 0.82723989, + "epoch": 0.4121212121212121, + "grad_norm": 17.25, + "learning_rate": 1.869804712678807e-05, + "loss": 0.5395534, + "memory(GiB)": 15.04, + "step": 3060, + "train_speed(iter/s)": 0.33647 + }, + { + "acc": 0.888585, + "epoch": 0.41279461279461277, + "grad_norm": 7.125, + "learning_rate": 1.8692547833016446e-05, + "loss": 0.33099582, + "memory(GiB)": 15.04, + "step": 3065, + "train_speed(iter/s)": 0.336561 + }, + { + "acc": 0.87514992, + "epoch": 0.4134680134680135, + "grad_norm": 6.5, + "learning_rate": 1.8687037761758864e-05, + "loss": 0.51219778, + "memory(GiB)": 15.04, + "step": 3070, + "train_speed(iter/s)": 0.336619 + }, + { + "acc": 0.89671497, + "epoch": 0.41414141414141414, + "grad_norm": 4.96875, + "learning_rate": 1.8681516919847004e-05, + "loss": 0.33684578, + "memory(GiB)": 15.04, + "step": 3075, + "train_speed(iter/s)": 0.336663 + }, + { + "acc": 0.89340353, + "epoch": 0.4148148148148148, + "grad_norm": 12.5, + "learning_rate": 1.8675985314125903e-05, + "loss": 0.43263216, + "memory(GiB)": 15.04, + "step": 3080, + "train_speed(iter/s)": 0.336757 + }, + { + "acc": 0.9157217, + "epoch": 0.41548821548821546, + "grad_norm": 10.0, + "learning_rate": 1.867044295145394e-05, + "loss": 0.36201947, + "memory(GiB)": 15.04, + "step": 3085, + "train_speed(iter/s)": 0.336882 + }, + { + "acc": 0.8681529, + "epoch": 0.4161616161616162, + "grad_norm": 8.25, + "learning_rate": 1.8664889838702837e-05, + "loss": 0.51161928, + "memory(GiB)": 15.04, + "step": 3090, + "train_speed(iter/s)": 0.33699 + }, + { + "acc": 0.88494596, + "epoch": 0.41683501683501684, + "grad_norm": 5.65625, + "learning_rate": 1.8659325982757632e-05, + "loss": 0.27869239, + "memory(GiB)": 15.04, + "step": 3095, + "train_speed(iter/s)": 0.337046 + }, + { + "acc": 0.92889404, + "epoch": 0.4175084175084175, + "grad_norm": 12.3125, + "learning_rate": 1.86537513905167e-05, + "loss": 0.27401457, + "memory(GiB)": 15.04, + "step": 3100, + "train_speed(iter/s)": 0.337156 + }, + { + "acc": 0.85353842, + "epoch": 0.41818181818181815, + "grad_norm": 7.625, + "learning_rate": 1.8648166068891716e-05, + "loss": 0.45773873, + "memory(GiB)": 15.04, + "step": 3105, + "train_speed(iter/s)": 0.33723 + }, + { + "acc": 0.8940958, + "epoch": 0.41885521885521887, + "grad_norm": 6.90625, + "learning_rate": 1.864257002480766e-05, + "loss": 0.50468621, + "memory(GiB)": 15.04, + "step": 3110, + "train_speed(iter/s)": 0.33724 + }, + { + "acc": 0.85336952, + "epoch": 0.41952861952861953, + "grad_norm": 5.5625, + "learning_rate": 1.8636963265202804e-05, + "loss": 0.6110342, + "memory(GiB)": 15.04, + "step": 3115, + "train_speed(iter/s)": 0.337191 + }, + { + "acc": 0.8752511, + "epoch": 0.4202020202020202, + "grad_norm": 6.125, + "learning_rate": 1.863134579702872e-05, + "loss": 0.36342602, + "memory(GiB)": 15.04, + "step": 3120, + "train_speed(iter/s)": 0.337222 + }, + { + "acc": 0.80075426, + "epoch": 0.4208754208754209, + "grad_norm": 14.4375, + "learning_rate": 1.8625717627250225e-05, + "loss": 0.64978418, + "memory(GiB)": 15.04, + "step": 3125, + "train_speed(iter/s)": 0.337323 + }, + { + "acc": 0.89596958, + "epoch": 0.42154882154882156, + "grad_norm": 5.6875, + "learning_rate": 1.8620078762845443e-05, + "loss": 0.40613952, + "memory(GiB)": 15.04, + "step": 3130, + "train_speed(iter/s)": 0.337369 + }, + { + "acc": 0.87584219, + "epoch": 0.4222222222222222, + "grad_norm": 7.1875, + "learning_rate": 1.8614429210805737e-05, + "loss": 0.47951093, + "memory(GiB)": 15.04, + "step": 3135, + "train_speed(iter/s)": 0.337441 + }, + { + "acc": 0.84029493, + "epoch": 0.4228956228956229, + "grad_norm": 7.875, + "learning_rate": 1.8608768978135717e-05, + "loss": 0.49031301, + "memory(GiB)": 15.04, + "step": 3140, + "train_speed(iter/s)": 0.337518 + }, + { + "acc": 0.92243776, + "epoch": 0.4235690235690236, + "grad_norm": 8.1875, + "learning_rate": 1.8603098071853252e-05, + "loss": 0.30771151, + "memory(GiB)": 15.04, + "step": 3145, + "train_speed(iter/s)": 0.337583 + }, + { + "acc": 0.89681549, + "epoch": 0.42424242424242425, + "grad_norm": 12.875, + "learning_rate": 1.8597416498989423e-05, + "loss": 0.38468418, + "memory(GiB)": 15.04, + "step": 3150, + "train_speed(iter/s)": 0.337667 + }, + { + "acc": 0.86587696, + "epoch": 0.4249158249158249, + "grad_norm": 5.6875, + "learning_rate": 1.859172426658856e-05, + "loss": 0.42589006, + "memory(GiB)": 15.04, + "step": 3155, + "train_speed(iter/s)": 0.337743 + }, + { + "acc": 0.84840202, + "epoch": 0.4255892255892256, + "grad_norm": 8.125, + "learning_rate": 1.8586021381708186e-05, + "loss": 0.58262167, + "memory(GiB)": 15.04, + "step": 3160, + "train_speed(iter/s)": 0.337864 + }, + { + "acc": 0.89175978, + "epoch": 0.4262626262626263, + "grad_norm": 8.0, + "learning_rate": 1.8580307851419055e-05, + "loss": 0.41947713, + "memory(GiB)": 15.04, + "step": 3165, + "train_speed(iter/s)": 0.337862 + }, + { + "acc": 0.88823881, + "epoch": 0.42693602693602695, + "grad_norm": 6.28125, + "learning_rate": 1.85745836828051e-05, + "loss": 0.42632914, + "memory(GiB)": 15.04, + "step": 3170, + "train_speed(iter/s)": 0.337978 + }, + { + "acc": 0.80676479, + "epoch": 0.4276094276094276, + "grad_norm": 7.59375, + "learning_rate": 1.856884888296345e-05, + "loss": 0.53672976, + "memory(GiB)": 15.04, + "step": 3175, + "train_speed(iter/s)": 0.338031 + }, + { + "acc": 0.91209641, + "epoch": 0.42828282828282827, + "grad_norm": 5.21875, + "learning_rate": 1.8563103459004423e-05, + "loss": 0.37504165, + "memory(GiB)": 15.04, + "step": 3180, + "train_speed(iter/s)": 0.337931 + }, + { + "acc": 0.87650108, + "epoch": 0.428956228956229, + "grad_norm": 5.3125, + "learning_rate": 1.85573474180515e-05, + "loss": 0.45593104, + "memory(GiB)": 15.04, + "step": 3185, + "train_speed(iter/s)": 0.338001 + }, + { + "acc": 0.89606991, + "epoch": 0.42962962962962964, + "grad_norm": 8.5, + "learning_rate": 1.8551580767241325e-05, + "loss": 0.49555717, + "memory(GiB)": 15.04, + "step": 3190, + "train_speed(iter/s)": 0.338088 + }, + { + "acc": 0.92269087, + "epoch": 0.4303030303030303, + "grad_norm": 8.3125, + "learning_rate": 1.8545803513723703e-05, + "loss": 0.31717582, + "memory(GiB)": 15.04, + "step": 3195, + "train_speed(iter/s)": 0.338184 + }, + { + "acc": 0.90833187, + "epoch": 0.43097643097643096, + "grad_norm": 7.21875, + "learning_rate": 1.8540015664661583e-05, + "loss": 0.3351985, + "memory(GiB)": 15.04, + "step": 3200, + "train_speed(iter/s)": 0.338254 + }, + { + "acc": 0.90001535, + "epoch": 0.4316498316498317, + "grad_norm": 7.0625, + "learning_rate": 1.853421722723105e-05, + "loss": 0.36360321, + "memory(GiB)": 15.04, + "step": 3205, + "train_speed(iter/s)": 0.338311 + }, + { + "acc": 0.85820007, + "epoch": 0.43232323232323233, + "grad_norm": 10.0625, + "learning_rate": 1.8528408208621324e-05, + "loss": 0.50676789, + "memory(GiB)": 15.04, + "step": 3210, + "train_speed(iter/s)": 0.338329 + }, + { + "acc": 0.7928196, + "epoch": 0.432996632996633, + "grad_norm": 7.46875, + "learning_rate": 1.852258861603472e-05, + "loss": 0.57429237, + "memory(GiB)": 15.04, + "step": 3215, + "train_speed(iter/s)": 0.338351 + }, + { + "acc": 0.87294598, + "epoch": 0.43367003367003365, + "grad_norm": 9.3125, + "learning_rate": 1.8516758456686694e-05, + "loss": 0.41843138, + "memory(GiB)": 15.04, + "step": 3220, + "train_speed(iter/s)": 0.338414 + }, + { + "acc": 0.90121832, + "epoch": 0.43434343434343436, + "grad_norm": 7.875, + "learning_rate": 1.8510917737805785e-05, + "loss": 0.22768776, + "memory(GiB)": 15.04, + "step": 3225, + "train_speed(iter/s)": 0.338506 + }, + { + "acc": 0.88239965, + "epoch": 0.435016835016835, + "grad_norm": 9.9375, + "learning_rate": 1.850506646663363e-05, + "loss": 0.4163702, + "memory(GiB)": 15.04, + "step": 3230, + "train_speed(iter/s)": 0.338511 + }, + { + "acc": 0.89738407, + "epoch": 0.4356902356902357, + "grad_norm": 16.875, + "learning_rate": 1.8499204650424947e-05, + "loss": 0.44875789, + "memory(GiB)": 15.04, + "step": 3235, + "train_speed(iter/s)": 0.338602 + }, + { + "acc": 0.86667624, + "epoch": 0.43636363636363634, + "grad_norm": 11.5, + "learning_rate": 1.849333229644753e-05, + "loss": 0.46161666, + "memory(GiB)": 15.04, + "step": 3240, + "train_speed(iter/s)": 0.33861 + }, + { + "acc": 0.83652563, + "epoch": 0.43703703703703706, + "grad_norm": 8.25, + "learning_rate": 1.848744941198224e-05, + "loss": 0.3679383, + "memory(GiB)": 15.04, + "step": 3245, + "train_speed(iter/s)": 0.338589 + }, + { + "acc": 0.88514147, + "epoch": 0.4377104377104377, + "grad_norm": 5.625, + "learning_rate": 1.8481556004322984e-05, + "loss": 0.39434249, + "memory(GiB)": 15.04, + "step": 3250, + "train_speed(iter/s)": 0.338664 + }, + { + "acc": 0.74523182, + "epoch": 0.4383838383838384, + "grad_norm": 5.96875, + "learning_rate": 1.8475652080776733e-05, + "loss": 0.65559001, + "memory(GiB)": 15.04, + "step": 3255, + "train_speed(iter/s)": 0.338744 + }, + { + "acc": 0.8988143, + "epoch": 0.43905723905723903, + "grad_norm": 6.3125, + "learning_rate": 1.8469737648663487e-05, + "loss": 0.503934, + "memory(GiB)": 15.04, + "step": 3260, + "train_speed(iter/s)": 0.338723 + }, + { + "acc": 0.86166887, + "epoch": 0.43973063973063975, + "grad_norm": 6.9375, + "learning_rate": 1.846381271531627e-05, + "loss": 0.40841799, + "memory(GiB)": 15.04, + "step": 3265, + "train_speed(iter/s)": 0.338795 + }, + { + "acc": 0.90724373, + "epoch": 0.4404040404040404, + "grad_norm": 6.9375, + "learning_rate": 1.8457877288081132e-05, + "loss": 0.3223772, + "memory(GiB)": 15.04, + "step": 3270, + "train_speed(iter/s)": 0.338811 + }, + { + "acc": 0.89002905, + "epoch": 0.44107744107744107, + "grad_norm": 9.75, + "learning_rate": 1.8451931374317138e-05, + "loss": 0.38144407, + "memory(GiB)": 15.04, + "step": 3275, + "train_speed(iter/s)": 0.338923 + }, + { + "acc": 0.8057189, + "epoch": 0.4417508417508417, + "grad_norm": 7.375, + "learning_rate": 1.8445974981396345e-05, + "loss": 0.50009413, + "memory(GiB)": 15.04, + "step": 3280, + "train_speed(iter/s)": 0.338962 + }, + { + "acc": 0.83952885, + "epoch": 0.44242424242424244, + "grad_norm": 10.1875, + "learning_rate": 1.844000811670381e-05, + "loss": 0.64937525, + "memory(GiB)": 15.04, + "step": 3285, + "train_speed(iter/s)": 0.339017 + }, + { + "acc": 0.90349655, + "epoch": 0.4430976430976431, + "grad_norm": 19.25, + "learning_rate": 1.8434030787637576e-05, + "loss": 0.38233004, + "memory(GiB)": 15.04, + "step": 3290, + "train_speed(iter/s)": 0.339117 + }, + { + "acc": 0.92429094, + "epoch": 0.44377104377104376, + "grad_norm": 7.40625, + "learning_rate": 1.8428043001608646e-05, + "loss": 0.28647325, + "memory(GiB)": 15.04, + "step": 3295, + "train_speed(iter/s)": 0.339195 + }, + { + "acc": 0.83496065, + "epoch": 0.4444444444444444, + "grad_norm": 16.75, + "learning_rate": 1.8422044766041007e-05, + "loss": 0.3958189, + "memory(GiB)": 15.04, + "step": 3300, + "train_speed(iter/s)": 0.339266 + }, + { + "epoch": 0.4444444444444444, + "eval_acc": 0.8735746939204456, + "eval_loss": 0.4853774905204773, + "eval_runtime": 109.648, + "eval_samples_per_second": 1.368, + "eval_steps_per_second": 1.368, + "step": 3300 + }, + { + "acc": 0.87697134, + "epoch": 0.44511784511784513, + "grad_norm": 23.375, + "learning_rate": 1.8416036088371584e-05, + "loss": 0.56972561, + "memory(GiB)": 15.04, + "step": 3305, + "train_speed(iter/s)": 0.335535 + }, + { + "acc": 0.85543346, + "epoch": 0.4457912457912458, + "grad_norm": 27.625, + "learning_rate": 1.8410016976050257e-05, + "loss": 0.55093555, + "memory(GiB)": 15.04, + "step": 3310, + "train_speed(iter/s)": 0.33554 + }, + { + "acc": 0.92177601, + "epoch": 0.44646464646464645, + "grad_norm": 14.25, + "learning_rate": 1.8403987436539852e-05, + "loss": 0.26810853, + "memory(GiB)": 15.04, + "step": 3315, + "train_speed(iter/s)": 0.335662 + }, + { + "acc": 0.84944754, + "epoch": 0.4471380471380471, + "grad_norm": 11.9375, + "learning_rate": 1.839794747731611e-05, + "loss": 0.59606938, + "memory(GiB)": 15.04, + "step": 3320, + "train_speed(iter/s)": 0.335694 + }, + { + "acc": 0.80258579, + "epoch": 0.4478114478114478, + "grad_norm": 15.5, + "learning_rate": 1.8391897105867695e-05, + "loss": 0.65352468, + "memory(GiB)": 15.04, + "step": 3325, + "train_speed(iter/s)": 0.335752 + }, + { + "acc": 0.84866686, + "epoch": 0.4484848484848485, + "grad_norm": 7.71875, + "learning_rate": 1.838583632969618e-05, + "loss": 0.53425221, + "memory(GiB)": 15.04, + "step": 3330, + "train_speed(iter/s)": 0.33579 + }, + { + "acc": 0.88721209, + "epoch": 0.44915824915824915, + "grad_norm": 8.1875, + "learning_rate": 1.837976515631604e-05, + "loss": 0.50930309, + "memory(GiB)": 15.04, + "step": 3335, + "train_speed(iter/s)": 0.335762 + }, + { + "acc": 0.82474318, + "epoch": 0.4498316498316498, + "grad_norm": 12.9375, + "learning_rate": 1.8373683593254646e-05, + "loss": 0.47711983, + "memory(GiB)": 15.04, + "step": 3340, + "train_speed(iter/s)": 0.335867 + }, + { + "acc": 0.90135469, + "epoch": 0.4505050505050505, + "grad_norm": 13.0625, + "learning_rate": 1.8367591648052242e-05, + "loss": 0.33111401, + "memory(GiB)": 15.04, + "step": 3345, + "train_speed(iter/s)": 0.335958 + }, + { + "acc": 0.88928232, + "epoch": 0.4511784511784512, + "grad_norm": 5.4375, + "learning_rate": 1.8361489328261947e-05, + "loss": 0.34135718, + "memory(GiB)": 15.04, + "step": 3350, + "train_speed(iter/s)": 0.336026 + }, + { + "acc": 0.83216152, + "epoch": 0.45185185185185184, + "grad_norm": 10.0625, + "learning_rate": 1.835537664144974e-05, + "loss": 0.46434398, + "memory(GiB)": 15.04, + "step": 3355, + "train_speed(iter/s)": 0.336126 + }, + { + "acc": 0.87009315, + "epoch": 0.45252525252525255, + "grad_norm": 16.5, + "learning_rate": 1.8349253595194465e-05, + "loss": 0.44999747, + "memory(GiB)": 15.04, + "step": 3360, + "train_speed(iter/s)": 0.336234 + }, + { + "acc": 0.77190375, + "epoch": 0.4531986531986532, + "grad_norm": 6.25, + "learning_rate": 1.8343120197087798e-05, + "loss": 0.6485055, + "memory(GiB)": 15.04, + "step": 3365, + "train_speed(iter/s)": 0.33628 + }, + { + "acc": 0.89529934, + "epoch": 0.45387205387205387, + "grad_norm": 9.5, + "learning_rate": 1.8336976454734254e-05, + "loss": 0.42717724, + "memory(GiB)": 15.04, + "step": 3370, + "train_speed(iter/s)": 0.336309 + }, + { + "acc": 0.8757637, + "epoch": 0.45454545454545453, + "grad_norm": 8.625, + "learning_rate": 1.8330822375751172e-05, + "loss": 0.390833, + "memory(GiB)": 15.04, + "step": 3375, + "train_speed(iter/s)": 0.336342 + }, + { + "acc": 0.86099453, + "epoch": 0.45521885521885525, + "grad_norm": 11.125, + "learning_rate": 1.8324657967768712e-05, + "loss": 0.55917716, + "memory(GiB)": 15.04, + "step": 3380, + "train_speed(iter/s)": 0.336374 + }, + { + "acc": 0.88323507, + "epoch": 0.4558922558922559, + "grad_norm": 13.9375, + "learning_rate": 1.8318483238429835e-05, + "loss": 0.32566054, + "memory(GiB)": 15.04, + "step": 3385, + "train_speed(iter/s)": 0.336483 + }, + { + "acc": 0.89567146, + "epoch": 0.45656565656565656, + "grad_norm": 7.8125, + "learning_rate": 1.8312298195390303e-05, + "loss": 0.35909457, + "memory(GiB)": 15.04, + "step": 3390, + "train_speed(iter/s)": 0.33651 + }, + { + "acc": 0.92252321, + "epoch": 0.4572390572390572, + "grad_norm": 5.875, + "learning_rate": 1.8306102846318664e-05, + "loss": 0.28103931, + "memory(GiB)": 15.04, + "step": 3395, + "train_speed(iter/s)": 0.33656 + }, + { + "acc": 0.88294439, + "epoch": 0.45791245791245794, + "grad_norm": 9.25, + "learning_rate": 1.8299897198896234e-05, + "loss": 0.58328576, + "memory(GiB)": 15.04, + "step": 3400, + "train_speed(iter/s)": 0.336662 + }, + { + "acc": 0.89842358, + "epoch": 0.4585858585858586, + "grad_norm": 9.6875, + "learning_rate": 1.829368126081712e-05, + "loss": 0.32496204, + "memory(GiB)": 15.04, + "step": 3405, + "train_speed(iter/s)": 0.336753 + }, + { + "acc": 0.88026924, + "epoch": 0.45925925925925926, + "grad_norm": 8.25, + "learning_rate": 1.828745503978816e-05, + "loss": 0.48563657, + "memory(GiB)": 15.04, + "step": 3410, + "train_speed(iter/s)": 0.336855 + }, + { + "acc": 0.8944335, + "epoch": 0.4599326599326599, + "grad_norm": 4.40625, + "learning_rate": 1.8281218543528973e-05, + "loss": 0.30072145, + "memory(GiB)": 15.04, + "step": 3415, + "train_speed(iter/s)": 0.336902 + }, + { + "acc": 0.84510555, + "epoch": 0.46060606060606063, + "grad_norm": 14.125, + "learning_rate": 1.8274971779771888e-05, + "loss": 0.46908517, + "memory(GiB)": 15.04, + "step": 3420, + "train_speed(iter/s)": 0.336992 + }, + { + "acc": 0.89817324, + "epoch": 0.4612794612794613, + "grad_norm": 10.6875, + "learning_rate": 1.826871475626198e-05, + "loss": 0.32642584, + "memory(GiB)": 15.04, + "step": 3425, + "train_speed(iter/s)": 0.337076 + }, + { + "acc": 0.88077135, + "epoch": 0.46195286195286195, + "grad_norm": 7.09375, + "learning_rate": 1.8262447480757048e-05, + "loss": 0.32293191, + "memory(GiB)": 15.04, + "step": 3430, + "train_speed(iter/s)": 0.337165 + }, + { + "acc": 0.89517422, + "epoch": 0.4626262626262626, + "grad_norm": 10.5625, + "learning_rate": 1.8256169961027588e-05, + "loss": 0.33811147, + "memory(GiB)": 15.04, + "step": 3435, + "train_speed(iter/s)": 0.337221 + }, + { + "acc": 0.75822539, + "epoch": 0.4632996632996633, + "grad_norm": 11.5, + "learning_rate": 1.8249882204856802e-05, + "loss": 0.66904917, + "memory(GiB)": 15.04, + "step": 3440, + "train_speed(iter/s)": 0.337323 + }, + { + "acc": 0.89435711, + "epoch": 0.463973063973064, + "grad_norm": 6.5, + "learning_rate": 1.82435842200406e-05, + "loss": 0.27627466, + "memory(GiB)": 15.04, + "step": 3445, + "train_speed(iter/s)": 0.33741 + }, + { + "acc": 0.9228344, + "epoch": 0.46464646464646464, + "grad_norm": 8.6875, + "learning_rate": 1.823727601438755e-05, + "loss": 0.33515551, + "memory(GiB)": 15.04, + "step": 3450, + "train_speed(iter/s)": 0.337459 + }, + { + "acc": 0.8139946, + "epoch": 0.4653198653198653, + "grad_norm": 7.59375, + "learning_rate": 1.82309575957189e-05, + "loss": 0.40257473, + "memory(GiB)": 15.04, + "step": 3455, + "train_speed(iter/s)": 0.337525 + }, + { + "acc": 0.90620852, + "epoch": 0.465993265993266, + "grad_norm": 14.5, + "learning_rate": 1.8224628971868573e-05, + "loss": 0.33643632, + "memory(GiB)": 15.04, + "step": 3460, + "train_speed(iter/s)": 0.337598 + }, + { + "acc": 0.87207336, + "epoch": 0.4666666666666667, + "grad_norm": 21.125, + "learning_rate": 1.821829015068313e-05, + "loss": 0.50049963, + "memory(GiB)": 15.04, + "step": 3465, + "train_speed(iter/s)": 0.337654 + }, + { + "acc": 0.91413269, + "epoch": 0.46734006734006733, + "grad_norm": 7.4375, + "learning_rate": 1.821194114002178e-05, + "loss": 0.29297369, + "memory(GiB)": 15.04, + "step": 3470, + "train_speed(iter/s)": 0.337695 + }, + { + "acc": 0.82051849, + "epoch": 0.468013468013468, + "grad_norm": 9.375, + "learning_rate": 1.820558194775637e-05, + "loss": 0.56539397, + "memory(GiB)": 15.04, + "step": 3475, + "train_speed(iter/s)": 0.33781 + }, + { + "acc": 0.89184208, + "epoch": 0.4686868686868687, + "grad_norm": 6.15625, + "learning_rate": 1.8199212581771366e-05, + "loss": 0.40346231, + "memory(GiB)": 15.04, + "step": 3480, + "train_speed(iter/s)": 0.33786 + }, + { + "acc": 0.88671398, + "epoch": 0.46936026936026937, + "grad_norm": 7.0, + "learning_rate": 1.8192833049963848e-05, + "loss": 0.49412446, + "memory(GiB)": 15.04, + "step": 3485, + "train_speed(iter/s)": 0.337936 + }, + { + "acc": 0.83467436, + "epoch": 0.47003367003367, + "grad_norm": 9.3125, + "learning_rate": 1.8186443360243502e-05, + "loss": 0.43304119, + "memory(GiB)": 15.04, + "step": 3490, + "train_speed(iter/s)": 0.337992 + }, + { + "acc": 0.78681512, + "epoch": 0.4707070707070707, + "grad_norm": 9.9375, + "learning_rate": 1.81800435205326e-05, + "loss": 1.1245203, + "memory(GiB)": 15.04, + "step": 3495, + "train_speed(iter/s)": 0.338099 + }, + { + "acc": 0.87781572, + "epoch": 0.4713804713804714, + "grad_norm": 9.25, + "learning_rate": 1.8173633538766018e-05, + "loss": 0.43102632, + "memory(GiB)": 15.04, + "step": 3500, + "train_speed(iter/s)": 0.338126 + }, + { + "acc": 0.85797892, + "epoch": 0.47205387205387206, + "grad_norm": 14.1875, + "learning_rate": 1.8167213422891187e-05, + "loss": 0.49299965, + "memory(GiB)": 15.04, + "step": 3505, + "train_speed(iter/s)": 0.338127 + }, + { + "acc": 0.79441729, + "epoch": 0.4727272727272727, + "grad_norm": 16.125, + "learning_rate": 1.8160783180868108e-05, + "loss": 0.60936694, + "memory(GiB)": 15.04, + "step": 3510, + "train_speed(iter/s)": 0.338192 + }, + { + "acc": 0.89987812, + "epoch": 0.4734006734006734, + "grad_norm": 10.6875, + "learning_rate": 1.8154342820669346e-05, + "loss": 0.31670492, + "memory(GiB)": 15.04, + "step": 3515, + "train_speed(iter/s)": 0.338282 + }, + { + "acc": 0.84160137, + "epoch": 0.4740740740740741, + "grad_norm": 11.375, + "learning_rate": 1.8147892350279997e-05, + "loss": 0.65960035, + "memory(GiB)": 15.04, + "step": 3520, + "train_speed(iter/s)": 0.338413 + }, + { + "acc": 0.86717596, + "epoch": 0.47474747474747475, + "grad_norm": 11.5, + "learning_rate": 1.8141431777697707e-05, + "loss": 0.46436634, + "memory(GiB)": 15.04, + "step": 3525, + "train_speed(iter/s)": 0.338476 + }, + { + "acc": 0.83819246, + "epoch": 0.4754208754208754, + "grad_norm": 22.375, + "learning_rate": 1.8134961110932634e-05, + "loss": 0.65677876, + "memory(GiB)": 15.04, + "step": 3530, + "train_speed(iter/s)": 0.338566 + }, + { + "acc": 0.91159954, + "epoch": 0.47609427609427607, + "grad_norm": 5.96875, + "learning_rate": 1.812848035800746e-05, + "loss": 0.29325655, + "memory(GiB)": 15.04, + "step": 3535, + "train_speed(iter/s)": 0.338581 + }, + { + "acc": 0.87480431, + "epoch": 0.4767676767676768, + "grad_norm": 13.0625, + "learning_rate": 1.8121989526957364e-05, + "loss": 0.50847125, + "memory(GiB)": 15.04, + "step": 3540, + "train_speed(iter/s)": 0.338624 + }, + { + "acc": 0.88437653, + "epoch": 0.47744107744107744, + "grad_norm": 7.40625, + "learning_rate": 1.8115488625830032e-05, + "loss": 0.41104035, + "memory(GiB)": 15.04, + "step": 3545, + "train_speed(iter/s)": 0.338657 + }, + { + "acc": 0.91306934, + "epoch": 0.4781144781144781, + "grad_norm": 8.9375, + "learning_rate": 1.8108977662685628e-05, + "loss": 0.32102196, + "memory(GiB)": 15.04, + "step": 3550, + "train_speed(iter/s)": 0.338739 + }, + { + "acc": 0.81103878, + "epoch": 0.47878787878787876, + "grad_norm": 20.125, + "learning_rate": 1.8102456645596787e-05, + "loss": 0.53531909, + "memory(GiB)": 15.04, + "step": 3555, + "train_speed(iter/s)": 0.338796 + }, + { + "acc": 0.88863211, + "epoch": 0.4794612794612795, + "grad_norm": 12.75, + "learning_rate": 1.8095925582648624e-05, + "loss": 0.33367224, + "memory(GiB)": 15.04, + "step": 3560, + "train_speed(iter/s)": 0.338885 + }, + { + "acc": 0.82931652, + "epoch": 0.48013468013468014, + "grad_norm": 10.25, + "learning_rate": 1.8089384481938694e-05, + "loss": 0.43920531, + "memory(GiB)": 15.04, + "step": 3565, + "train_speed(iter/s)": 0.33892 + }, + { + "acc": 0.90031691, + "epoch": 0.4808080808080808, + "grad_norm": 6.46875, + "learning_rate": 1.8082833351577003e-05, + "loss": 0.29520645, + "memory(GiB)": 15.04, + "step": 3570, + "train_speed(iter/s)": 0.338981 + }, + { + "acc": 0.86380301, + "epoch": 0.48148148148148145, + "grad_norm": 11.4375, + "learning_rate": 1.8076272199685996e-05, + "loss": 0.43042107, + "memory(GiB)": 15.04, + "step": 3575, + "train_speed(iter/s)": 0.339054 + }, + { + "acc": 0.89454012, + "epoch": 0.48215488215488217, + "grad_norm": 7.6875, + "learning_rate": 1.806970103440054e-05, + "loss": 0.21537628, + "memory(GiB)": 15.04, + "step": 3580, + "train_speed(iter/s)": 0.339125 + }, + { + "acc": 0.85797701, + "epoch": 0.48282828282828283, + "grad_norm": 7.28125, + "learning_rate": 1.8063119863867915e-05, + "loss": 0.60251961, + "memory(GiB)": 15.04, + "step": 3585, + "train_speed(iter/s)": 0.339012 + }, + { + "acc": 0.89630127, + "epoch": 0.4835016835016835, + "grad_norm": 6.625, + "learning_rate": 1.805652869624781e-05, + "loss": 0.27930336, + "memory(GiB)": 15.04, + "step": 3590, + "train_speed(iter/s)": 0.339137 + }, + { + "acc": 0.83644896, + "epoch": 0.4841750841750842, + "grad_norm": 15.875, + "learning_rate": 1.804992753971231e-05, + "loss": 0.58859396, + "memory(GiB)": 15.04, + "step": 3595, + "train_speed(iter/s)": 0.339189 + }, + { + "acc": 0.85748091, + "epoch": 0.48484848484848486, + "grad_norm": 6.625, + "learning_rate": 1.8043316402445876e-05, + "loss": 0.5261065, + "memory(GiB)": 15.04, + "step": 3600, + "train_speed(iter/s)": 0.33925 + }, + { + "epoch": 0.48484848484848486, + "eval_acc": 0.876711972764365, + "eval_loss": 0.4761183559894562, + "eval_runtime": 109.805, + "eval_samples_per_second": 1.366, + "eval_steps_per_second": 1.366, + "step": 3600 + }, + { + "acc": 0.83587589, + "epoch": 0.4855218855218855, + "grad_norm": 9.8125, + "learning_rate": 1.8036695292645356e-05, + "loss": 0.38770075, + "memory(GiB)": 15.04, + "step": 3605, + "train_speed(iter/s)": 0.33586 + }, + { + "acc": 0.90010643, + "epoch": 0.4861952861952862, + "grad_norm": 9.6875, + "learning_rate": 1.8030064218519952e-05, + "loss": 0.43593073, + "memory(GiB)": 15.04, + "step": 3610, + "train_speed(iter/s)": 0.335904 + }, + { + "acc": 0.88875904, + "epoch": 0.4868686868686869, + "grad_norm": 11.875, + "learning_rate": 1.8023423188291227e-05, + "loss": 0.30031426, + "memory(GiB)": 15.04, + "step": 3615, + "train_speed(iter/s)": 0.335967 + }, + { + "acc": 0.84387503, + "epoch": 0.48754208754208755, + "grad_norm": 9.25, + "learning_rate": 1.8016772210193086e-05, + "loss": 0.80649157, + "memory(GiB)": 15.04, + "step": 3620, + "train_speed(iter/s)": 0.336027 + }, + { + "acc": 0.83444624, + "epoch": 0.4882154882154882, + "grad_norm": 6.90625, + "learning_rate": 1.8010111292471765e-05, + "loss": 0.44340019, + "memory(GiB)": 15.04, + "step": 3625, + "train_speed(iter/s)": 0.336048 + }, + { + "acc": 0.89580994, + "epoch": 0.4888888888888889, + "grad_norm": 12.0625, + "learning_rate": 1.8003440443385827e-05, + "loss": 0.41821451, + "memory(GiB)": 15.04, + "step": 3630, + "train_speed(iter/s)": 0.336115 + }, + { + "acc": 0.85760431, + "epoch": 0.4895622895622896, + "grad_norm": 17.5, + "learning_rate": 1.7996759671206148e-05, + "loss": 0.43475323, + "memory(GiB)": 15.04, + "step": 3635, + "train_speed(iter/s)": 0.336135 + }, + { + "acc": 0.88512459, + "epoch": 0.49023569023569025, + "grad_norm": 8.0625, + "learning_rate": 1.7990068984215905e-05, + "loss": 0.2988543, + "memory(GiB)": 15.04, + "step": 3640, + "train_speed(iter/s)": 0.336223 + }, + { + "acc": 0.77889462, + "epoch": 0.4909090909090909, + "grad_norm": 7.34375, + "learning_rate": 1.7983368390710576e-05, + "loss": 0.62918062, + "memory(GiB)": 15.04, + "step": 3645, + "train_speed(iter/s)": 0.33624 + }, + { + "acc": 0.90301514, + "epoch": 0.49158249158249157, + "grad_norm": 5.71875, + "learning_rate": 1.797665789899791e-05, + "loss": 0.32716761, + "memory(GiB)": 15.04, + "step": 3650, + "train_speed(iter/s)": 0.33628 + }, + { + "acc": 0.89472647, + "epoch": 0.4922558922558923, + "grad_norm": 12.125, + "learning_rate": 1.796993751739793e-05, + "loss": 0.46433377, + "memory(GiB)": 15.04, + "step": 3655, + "train_speed(iter/s)": 0.336342 + }, + { + "acc": 0.84266872, + "epoch": 0.49292929292929294, + "grad_norm": 8.375, + "learning_rate": 1.7963207254242933e-05, + "loss": 0.54928746, + "memory(GiB)": 15.04, + "step": 3660, + "train_speed(iter/s)": 0.336405 + }, + { + "acc": 0.89203091, + "epoch": 0.4936026936026936, + "grad_norm": 4.21875, + "learning_rate": 1.795646711787746e-05, + "loss": 0.41538334, + "memory(GiB)": 15.04, + "step": 3665, + "train_speed(iter/s)": 0.336362 + }, + { + "acc": 0.89859972, + "epoch": 0.49427609427609426, + "grad_norm": 10.8125, + "learning_rate": 1.7949717116658282e-05, + "loss": 0.31999536, + "memory(GiB)": 15.04, + "step": 3670, + "train_speed(iter/s)": 0.336473 + }, + { + "acc": 0.91256189, + "epoch": 0.494949494949495, + "grad_norm": 5.90625, + "learning_rate": 1.7942957258954425e-05, + "loss": 0.34010236, + "memory(GiB)": 15.04, + "step": 3675, + "train_speed(iter/s)": 0.33655 + }, + { + "acc": 0.88669271, + "epoch": 0.49562289562289563, + "grad_norm": 17.625, + "learning_rate": 1.7936187553147108e-05, + "loss": 0.49786587, + "memory(GiB)": 15.04, + "step": 3680, + "train_speed(iter/s)": 0.33657 + }, + { + "acc": 0.88359947, + "epoch": 0.4962962962962963, + "grad_norm": 9.1875, + "learning_rate": 1.7929408007629788e-05, + "loss": 0.48834329, + "memory(GiB)": 15.04, + "step": 3685, + "train_speed(iter/s)": 0.336623 + }, + { + "acc": 0.9091341, + "epoch": 0.49696969696969695, + "grad_norm": 9.5625, + "learning_rate": 1.79226186308081e-05, + "loss": 0.29978616, + "memory(GiB)": 15.04, + "step": 3690, + "train_speed(iter/s)": 0.336687 + }, + { + "acc": 0.87099781, + "epoch": 0.49764309764309766, + "grad_norm": 11.0625, + "learning_rate": 1.7915819431099882e-05, + "loss": 0.41647654, + "memory(GiB)": 15.04, + "step": 3695, + "train_speed(iter/s)": 0.336752 + }, + { + "acc": 0.88838787, + "epoch": 0.4983164983164983, + "grad_norm": 7.875, + "learning_rate": 1.790901041693514e-05, + "loss": 0.45208883, + "memory(GiB)": 15.04, + "step": 3700, + "train_speed(iter/s)": 0.336698 + }, + { + "acc": 0.88066473, + "epoch": 0.498989898989899, + "grad_norm": 7.0625, + "learning_rate": 1.7902191596756058e-05, + "loss": 0.35533628, + "memory(GiB)": 15.04, + "step": 3705, + "train_speed(iter/s)": 0.336749 + }, + { + "acc": 0.85620632, + "epoch": 0.49966329966329964, + "grad_norm": 7.59375, + "learning_rate": 1.7895362979016975e-05, + "loss": 0.51151724, + "memory(GiB)": 15.04, + "step": 3710, + "train_speed(iter/s)": 0.336765 + }, + { + "acc": 0.89235287, + "epoch": 0.5003367003367003, + "grad_norm": 7.90625, + "learning_rate": 1.7888524572184375e-05, + "loss": 0.42579594, + "memory(GiB)": 15.04, + "step": 3715, + "train_speed(iter/s)": 0.336862 + }, + { + "acc": 0.85100842, + "epoch": 0.501010101010101, + "grad_norm": 13.25, + "learning_rate": 1.7881676384736876e-05, + "loss": 0.5550694, + "memory(GiB)": 15.04, + "step": 3720, + "train_speed(iter/s)": 0.336955 + }, + { + "acc": 0.84736996, + "epoch": 0.5016835016835017, + "grad_norm": 7.34375, + "learning_rate": 1.7874818425165233e-05, + "loss": 0.57301402, + "memory(GiB)": 15.04, + "step": 3725, + "train_speed(iter/s)": 0.337066 + }, + { + "acc": 0.90914621, + "epoch": 0.5023569023569023, + "grad_norm": 7.90625, + "learning_rate": 1.7867950701972313e-05, + "loss": 0.30549459, + "memory(GiB)": 15.04, + "step": 3730, + "train_speed(iter/s)": 0.337125 + }, + { + "acc": 0.91866875, + "epoch": 0.503030303030303, + "grad_norm": 8.1875, + "learning_rate": 1.7861073223673084e-05, + "loss": 0.28113177, + "memory(GiB)": 15.04, + "step": 3735, + "train_speed(iter/s)": 0.3372 + }, + { + "acc": 0.90182066, + "epoch": 0.5037037037037037, + "grad_norm": 16.5, + "learning_rate": 1.785418599879461e-05, + "loss": 0.36212847, + "memory(GiB)": 15.04, + "step": 3740, + "train_speed(iter/s)": 0.337301 + }, + { + "acc": 0.79256129, + "epoch": 0.5043771043771044, + "grad_norm": 24.25, + "learning_rate": 1.7847289035876044e-05, + "loss": 0.93178387, + "memory(GiB)": 15.04, + "step": 3745, + "train_speed(iter/s)": 0.337414 + }, + { + "acc": 0.83150692, + "epoch": 0.5050505050505051, + "grad_norm": 14.25, + "learning_rate": 1.7840382343468604e-05, + "loss": 0.67089224, + "memory(GiB)": 15.04, + "step": 3750, + "train_speed(iter/s)": 0.337449 + }, + { + "acc": 0.89720764, + "epoch": 0.5057239057239057, + "grad_norm": 6.84375, + "learning_rate": 1.7833465930135586e-05, + "loss": 0.37244151, + "memory(GiB)": 15.04, + "step": 3755, + "train_speed(iter/s)": 0.337492 + }, + { + "acc": 0.86719933, + "epoch": 0.5063973063973064, + "grad_norm": 9.5, + "learning_rate": 1.782653980445232e-05, + "loss": 0.59741712, + "memory(GiB)": 15.04, + "step": 3760, + "train_speed(iter/s)": 0.33755 + }, + { + "acc": 0.8698164, + "epoch": 0.5070707070707071, + "grad_norm": 10.4375, + "learning_rate": 1.7819603975006195e-05, + "loss": 0.38258035, + "memory(GiB)": 15.04, + "step": 3765, + "train_speed(iter/s)": 0.337623 + }, + { + "acc": 0.90045404, + "epoch": 0.5077441077441077, + "grad_norm": 5.84375, + "learning_rate": 1.781265845039662e-05, + "loss": 0.42641959, + "memory(GiB)": 15.04, + "step": 3770, + "train_speed(iter/s)": 0.337552 + }, + { + "acc": 0.86632509, + "epoch": 0.5084175084175084, + "grad_norm": 7.6875, + "learning_rate": 1.7805703239235023e-05, + "loss": 0.51779404, + "memory(GiB)": 15.04, + "step": 3775, + "train_speed(iter/s)": 0.337629 + }, + { + "acc": 0.82737207, + "epoch": 0.509090909090909, + "grad_norm": 11.4375, + "learning_rate": 1.7798738350144854e-05, + "loss": 0.61180573, + "memory(GiB)": 15.04, + "step": 3780, + "train_speed(iter/s)": 0.337738 + }, + { + "acc": 0.91728687, + "epoch": 0.5097643097643098, + "grad_norm": 6.0625, + "learning_rate": 1.7791763791761557e-05, + "loss": 0.25166643, + "memory(GiB)": 15.04, + "step": 3785, + "train_speed(iter/s)": 0.337811 + }, + { + "acc": 0.88268843, + "epoch": 0.5104377104377105, + "grad_norm": 4.59375, + "learning_rate": 1.7784779572732558e-05, + "loss": 0.43862414, + "memory(GiB)": 15.04, + "step": 3790, + "train_speed(iter/s)": 0.337802 + }, + { + "acc": 0.86389875, + "epoch": 0.5111111111111111, + "grad_norm": 10.4375, + "learning_rate": 1.7777785701717266e-05, + "loss": 0.41195416, + "memory(GiB)": 15.04, + "step": 3795, + "train_speed(iter/s)": 0.337886 + }, + { + "acc": 0.8191576, + "epoch": 0.5117845117845118, + "grad_norm": 7.40625, + "learning_rate": 1.7770782187387056e-05, + "loss": 0.74752345, + "memory(GiB)": 15.04, + "step": 3800, + "train_speed(iter/s)": 0.337868 + }, + { + "acc": 0.86887932, + "epoch": 0.5124579124579125, + "grad_norm": 7.03125, + "learning_rate": 1.776376903842526e-05, + "loss": 0.45682001, + "memory(GiB)": 15.04, + "step": 3805, + "train_speed(iter/s)": 0.337834 + }, + { + "acc": 0.92683773, + "epoch": 0.5131313131313131, + "grad_norm": 12.4375, + "learning_rate": 1.7756746263527157e-05, + "loss": 0.31054833, + "memory(GiB)": 15.04, + "step": 3810, + "train_speed(iter/s)": 0.337921 + }, + { + "acc": 0.89547215, + "epoch": 0.5138047138047138, + "grad_norm": 5.875, + "learning_rate": 1.774971387139996e-05, + "loss": 0.45491533, + "memory(GiB)": 15.04, + "step": 3815, + "train_speed(iter/s)": 0.337943 + }, + { + "acc": 0.88838444, + "epoch": 0.5144781144781144, + "grad_norm": 12.3125, + "learning_rate": 1.7742671870762805e-05, + "loss": 0.34393697, + "memory(GiB)": 15.04, + "step": 3820, + "train_speed(iter/s)": 0.337996 + }, + { + "acc": 0.85375404, + "epoch": 0.5151515151515151, + "grad_norm": 9.3125, + "learning_rate": 1.7735620270346735e-05, + "loss": 0.64072185, + "memory(GiB)": 15.04, + "step": 3825, + "train_speed(iter/s)": 0.338096 + }, + { + "acc": 0.89439125, + "epoch": 0.5158249158249159, + "grad_norm": 7.84375, + "learning_rate": 1.7728559078894708e-05, + "loss": 0.34980464, + "memory(GiB)": 15.04, + "step": 3830, + "train_speed(iter/s)": 0.33818 + }, + { + "acc": 0.89482927, + "epoch": 0.5164983164983165, + "grad_norm": 15.375, + "learning_rate": 1.7721488305161566e-05, + "loss": 0.42905107, + "memory(GiB)": 15.04, + "step": 3835, + "train_speed(iter/s)": 0.33823 + }, + { + "acc": 0.78987341, + "epoch": 0.5171717171717172, + "grad_norm": 19.125, + "learning_rate": 1.7714407957914033e-05, + "loss": 0.84220772, + "memory(GiB)": 15.04, + "step": 3840, + "train_speed(iter/s)": 0.338284 + }, + { + "acc": 0.8020134, + "epoch": 0.5178451178451179, + "grad_norm": 18.0, + "learning_rate": 1.77073180459307e-05, + "loss": 0.80393257, + "memory(GiB)": 15.04, + "step": 3845, + "train_speed(iter/s)": 0.338378 + }, + { + "acc": 0.85574732, + "epoch": 0.5185185185185185, + "grad_norm": 13.0625, + "learning_rate": 1.7700218578002018e-05, + "loss": 0.60742431, + "memory(GiB)": 15.04, + "step": 3850, + "train_speed(iter/s)": 0.338399 + }, + { + "acc": 0.90230808, + "epoch": 0.5191919191919192, + "grad_norm": 8.625, + "learning_rate": 1.7693109562930294e-05, + "loss": 0.48803911, + "memory(GiB)": 15.04, + "step": 3855, + "train_speed(iter/s)": 0.338495 + }, + { + "acc": 0.9076643, + "epoch": 0.5198653198653199, + "grad_norm": 11.25, + "learning_rate": 1.7685991009529658e-05, + "loss": 0.30903282, + "memory(GiB)": 15.04, + "step": 3860, + "train_speed(iter/s)": 0.338563 + }, + { + "acc": 0.71694846, + "epoch": 0.5205387205387205, + "grad_norm": 10.5, + "learning_rate": 1.7678862926626076e-05, + "loss": 0.96086855, + "memory(GiB)": 15.04, + "step": 3865, + "train_speed(iter/s)": 0.338651 + }, + { + "acc": 0.91138239, + "epoch": 0.5212121212121212, + "grad_norm": 5.46875, + "learning_rate": 1.767172532305733e-05, + "loss": 0.37817876, + "memory(GiB)": 15.04, + "step": 3870, + "train_speed(iter/s)": 0.338704 + }, + { + "acc": 0.91213226, + "epoch": 0.5218855218855218, + "grad_norm": 7.5625, + "learning_rate": 1.7664578207672997e-05, + "loss": 0.33159304, + "memory(GiB)": 15.04, + "step": 3875, + "train_speed(iter/s)": 0.338766 + }, + { + "acc": 0.94331083, + "epoch": 0.5225589225589226, + "grad_norm": 6.875, + "learning_rate": 1.765742158933446e-05, + "loss": 0.23964734, + "memory(GiB)": 15.04, + "step": 3880, + "train_speed(iter/s)": 0.338776 + }, + { + "acc": 0.88471804, + "epoch": 0.5232323232323233, + "grad_norm": 7.25, + "learning_rate": 1.765025547691487e-05, + "loss": 0.45875759, + "memory(GiB)": 15.04, + "step": 3885, + "train_speed(iter/s)": 0.338707 + }, + { + "acc": 0.89637489, + "epoch": 0.5239057239057239, + "grad_norm": 11.0, + "learning_rate": 1.7643079879299163e-05, + "loss": 0.44263835, + "memory(GiB)": 15.04, + "step": 3890, + "train_speed(iter/s)": 0.338725 + }, + { + "acc": 0.86330814, + "epoch": 0.5245791245791246, + "grad_norm": 8.875, + "learning_rate": 1.7635894805384024e-05, + "loss": 0.54586911, + "memory(GiB)": 15.04, + "step": 3895, + "train_speed(iter/s)": 0.338773 + }, + { + "acc": 0.82347183, + "epoch": 0.5252525252525253, + "grad_norm": 10.5625, + "learning_rate": 1.7628700264077893e-05, + "loss": 0.66672564, + "memory(GiB)": 15.04, + "step": 3900, + "train_speed(iter/s)": 0.338762 + }, + { + "epoch": 0.5252525252525253, + "eval_acc": 0.8781347962382445, + "eval_loss": 0.4679408669471741, + "eval_runtime": 109.8842, + "eval_samples_per_second": 1.365, + "eval_steps_per_second": 1.365, + "step": 3900 + }, + { + "acc": 0.77695065, + "epoch": 0.5259259259259259, + "grad_norm": 18.625, + "learning_rate": 1.7621496264300954e-05, + "loss": 0.54761453, + "memory(GiB)": 15.04, + "step": 3905, + "train_speed(iter/s)": 0.335549 + }, + { + "acc": 0.86843939, + "epoch": 0.5265993265993266, + "grad_norm": 8.5625, + "learning_rate": 1.76142828149851e-05, + "loss": 0.45215411, + "memory(GiB)": 15.04, + "step": 3910, + "train_speed(iter/s)": 0.335647 + }, + { + "acc": 0.86568508, + "epoch": 0.5272727272727272, + "grad_norm": 7.59375, + "learning_rate": 1.760705992507396e-05, + "loss": 0.52686167, + "memory(GiB)": 15.04, + "step": 3915, + "train_speed(iter/s)": 0.335723 + }, + { + "acc": 0.84859962, + "epoch": 0.5279461279461279, + "grad_norm": 11.875, + "learning_rate": 1.7599827603522858e-05, + "loss": 0.56251554, + "memory(GiB)": 15.04, + "step": 3920, + "train_speed(iter/s)": 0.335692 + }, + { + "acc": 0.84980879, + "epoch": 0.5286195286195287, + "grad_norm": 8.3125, + "learning_rate": 1.7592585859298808e-05, + "loss": 0.41706219, + "memory(GiB)": 15.04, + "step": 3925, + "train_speed(iter/s)": 0.335597 + }, + { + "acc": 0.82573633, + "epoch": 0.5292929292929293, + "grad_norm": 5.4375, + "learning_rate": 1.7585334701380518e-05, + "loss": 0.57327018, + "memory(GiB)": 15.04, + "step": 3930, + "train_speed(iter/s)": 0.335688 + }, + { + "acc": 0.80837812, + "epoch": 0.52996632996633, + "grad_norm": 9.0, + "learning_rate": 1.757807413875836e-05, + "loss": 0.77750101, + "memory(GiB)": 15.04, + "step": 3935, + "train_speed(iter/s)": 0.335716 + }, + { + "acc": 0.92627468, + "epoch": 0.5306397306397307, + "grad_norm": 5.53125, + "learning_rate": 1.7570804180434368e-05, + "loss": 0.32449107, + "memory(GiB)": 15.04, + "step": 3940, + "train_speed(iter/s)": 0.335731 + }, + { + "acc": 0.9044364, + "epoch": 0.5313131313131313, + "grad_norm": 7.15625, + "learning_rate": 1.7563524835422224e-05, + "loss": 0.4426527, + "memory(GiB)": 15.04, + "step": 3945, + "train_speed(iter/s)": 0.335705 + }, + { + "acc": 0.87809496, + "epoch": 0.531986531986532, + "grad_norm": 10.0, + "learning_rate": 1.7556236112747253e-05, + "loss": 0.47075071, + "memory(GiB)": 15.04, + "step": 3950, + "train_speed(iter/s)": 0.335751 + }, + { + "acc": 0.92612886, + "epoch": 0.5326599326599326, + "grad_norm": 11.5625, + "learning_rate": 1.7548938021446398e-05, + "loss": 0.27105238, + "memory(GiB)": 15.04, + "step": 3955, + "train_speed(iter/s)": 0.335827 + }, + { + "acc": 0.91240883, + "epoch": 0.5333333333333333, + "grad_norm": 6.09375, + "learning_rate": 1.7541630570568227e-05, + "loss": 0.27689917, + "memory(GiB)": 15.04, + "step": 3960, + "train_speed(iter/s)": 0.335874 + }, + { + "acc": 0.87582722, + "epoch": 0.534006734006734, + "grad_norm": 9.3125, + "learning_rate": 1.7534313769172908e-05, + "loss": 0.48547306, + "memory(GiB)": 15.04, + "step": 3965, + "train_speed(iter/s)": 0.335879 + }, + { + "acc": 0.88297787, + "epoch": 0.5346801346801346, + "grad_norm": 14.125, + "learning_rate": 1.7526987626332202e-05, + "loss": 0.45881667, + "memory(GiB)": 15.04, + "step": 3970, + "train_speed(iter/s)": 0.335924 + }, + { + "acc": 0.81192646, + "epoch": 0.5353535353535354, + "grad_norm": 11.0, + "learning_rate": 1.7519652151129458e-05, + "loss": 0.8739994, + "memory(GiB)": 15.04, + "step": 3975, + "train_speed(iter/s)": 0.335987 + }, + { + "acc": 0.88407602, + "epoch": 0.5360269360269361, + "grad_norm": 10.5, + "learning_rate": 1.7512307352659583e-05, + "loss": 0.27493479, + "memory(GiB)": 15.04, + "step": 3980, + "train_speed(iter/s)": 0.336073 + }, + { + "acc": 0.90319691, + "epoch": 0.5367003367003367, + "grad_norm": 7.25, + "learning_rate": 1.7504953240029053e-05, + "loss": 0.3337369, + "memory(GiB)": 15.04, + "step": 3985, + "train_speed(iter/s)": 0.336143 + }, + { + "acc": 0.914468, + "epoch": 0.5373737373737374, + "grad_norm": 7.15625, + "learning_rate": 1.7497589822355892e-05, + "loss": 0.30370355, + "memory(GiB)": 15.04, + "step": 3990, + "train_speed(iter/s)": 0.336166 + }, + { + "acc": 0.8577981, + "epoch": 0.538047138047138, + "grad_norm": 8.875, + "learning_rate": 1.7490217108769663e-05, + "loss": 0.5094347, + "memory(GiB)": 15.04, + "step": 3995, + "train_speed(iter/s)": 0.33625 + }, + { + "acc": 0.90495567, + "epoch": 0.5387205387205387, + "grad_norm": 7.59375, + "learning_rate": 1.7482835108411442e-05, + "loss": 0.36858811, + "memory(GiB)": 15.04, + "step": 4000, + "train_speed(iter/s)": 0.336304 + }, + { + "acc": 0.89041739, + "epoch": 0.5393939393939394, + "grad_norm": 4.84375, + "learning_rate": 1.7475443830433835e-05, + "loss": 0.32242258, + "memory(GiB)": 15.04, + "step": 4005, + "train_speed(iter/s)": 0.336248 + }, + { + "acc": 0.89687119, + "epoch": 0.54006734006734, + "grad_norm": 6.5625, + "learning_rate": 1.7468043284000945e-05, + "loss": 0.29208932, + "memory(GiB)": 15.04, + "step": 4010, + "train_speed(iter/s)": 0.336335 + }, + { + "acc": 0.82246227, + "epoch": 0.5407407407407407, + "grad_norm": 10.8125, + "learning_rate": 1.746063347828836e-05, + "loss": 0.56478887, + "memory(GiB)": 15.04, + "step": 4015, + "train_speed(iter/s)": 0.336412 + }, + { + "acc": 0.87356806, + "epoch": 0.5414141414141415, + "grad_norm": 7.90625, + "learning_rate": 1.7453214422483154e-05, + "loss": 0.48184762, + "memory(GiB)": 15.04, + "step": 4020, + "train_speed(iter/s)": 0.336486 + }, + { + "acc": 0.84914217, + "epoch": 0.5420875420875421, + "grad_norm": 11.625, + "learning_rate": 1.744578612578387e-05, + "loss": 0.42026148, + "memory(GiB)": 15.04, + "step": 4025, + "train_speed(iter/s)": 0.336521 + }, + { + "acc": 0.8880044, + "epoch": 0.5427609427609428, + "grad_norm": 8.5625, + "learning_rate": 1.7438348597400513e-05, + "loss": 0.34287093, + "memory(GiB)": 15.04, + "step": 4030, + "train_speed(iter/s)": 0.336597 + }, + { + "acc": 0.89508591, + "epoch": 0.5434343434343434, + "grad_norm": 11.375, + "learning_rate": 1.7430901846554525e-05, + "loss": 0.33280551, + "memory(GiB)": 15.04, + "step": 4035, + "train_speed(iter/s)": 0.336685 + }, + { + "acc": 0.88267012, + "epoch": 0.5441077441077441, + "grad_norm": 7.46875, + "learning_rate": 1.7423445882478785e-05, + "loss": 0.36910105, + "memory(GiB)": 15.04, + "step": 4040, + "train_speed(iter/s)": 0.336762 + }, + { + "acc": 0.88442879, + "epoch": 0.5447811447811448, + "grad_norm": 7.53125, + "learning_rate": 1.74159807144176e-05, + "loss": 0.42848067, + "memory(GiB)": 15.04, + "step": 4045, + "train_speed(iter/s)": 0.336796 + }, + { + "acc": 0.90293417, + "epoch": 0.5454545454545454, + "grad_norm": 10.8125, + "learning_rate": 1.7408506351626677e-05, + "loss": 0.37314644, + "memory(GiB)": 15.04, + "step": 4050, + "train_speed(iter/s)": 0.336881 + }, + { + "acc": 0.94608707, + "epoch": 0.5461279461279461, + "grad_norm": 10.75, + "learning_rate": 1.740102280337314e-05, + "loss": 0.19708145, + "memory(GiB)": 15.04, + "step": 4055, + "train_speed(iter/s)": 0.336948 + }, + { + "acc": 0.86210232, + "epoch": 0.5468013468013468, + "grad_norm": 14.625, + "learning_rate": 1.7393530078935486e-05, + "loss": 0.54546967, + "memory(GiB)": 15.04, + "step": 4060, + "train_speed(iter/s)": 0.337023 + }, + { + "acc": 0.8828249, + "epoch": 0.5474747474747474, + "grad_norm": 5.84375, + "learning_rate": 1.73860281876036e-05, + "loss": 0.53822279, + "memory(GiB)": 15.04, + "step": 4065, + "train_speed(iter/s)": 0.33703 + }, + { + "acc": 0.91557751, + "epoch": 0.5481481481481482, + "grad_norm": 11.8125, + "learning_rate": 1.7378517138678727e-05, + "loss": 0.40204964, + "memory(GiB)": 15.04, + "step": 4070, + "train_speed(iter/s)": 0.337059 + }, + { + "acc": 0.88881645, + "epoch": 0.5488215488215489, + "grad_norm": 9.625, + "learning_rate": 1.7370996941473464e-05, + "loss": 0.50532298, + "memory(GiB)": 15.04, + "step": 4075, + "train_speed(iter/s)": 0.337095 + }, + { + "acc": 0.8979125, + "epoch": 0.5494949494949495, + "grad_norm": 18.0, + "learning_rate": 1.736346760531176e-05, + "loss": 0.31720853, + "memory(GiB)": 15.04, + "step": 4080, + "train_speed(iter/s)": 0.337157 + }, + { + "acc": 0.84728556, + "epoch": 0.5501683501683502, + "grad_norm": 8.3125, + "learning_rate": 1.7355929139528888e-05, + "loss": 0.45086961, + "memory(GiB)": 15.04, + "step": 4085, + "train_speed(iter/s)": 0.33714 + }, + { + "acc": 0.92506981, + "epoch": 0.5508417508417508, + "grad_norm": 4.0625, + "learning_rate": 1.7348381553471436e-05, + "loss": 0.2887996, + "memory(GiB)": 15.04, + "step": 4090, + "train_speed(iter/s)": 0.337177 + }, + { + "acc": 0.91318417, + "epoch": 0.5515151515151515, + "grad_norm": 9.5625, + "learning_rate": 1.734082485649731e-05, + "loss": 0.40977402, + "memory(GiB)": 15.04, + "step": 4095, + "train_speed(iter/s)": 0.337245 + }, + { + "acc": 0.89119549, + "epoch": 0.5521885521885522, + "grad_norm": 8.0, + "learning_rate": 1.7333259057975705e-05, + "loss": 0.31618371, + "memory(GiB)": 15.04, + "step": 4100, + "train_speed(iter/s)": 0.337301 + }, + { + "acc": 0.91062832, + "epoch": 0.5528619528619528, + "grad_norm": 7.15625, + "learning_rate": 1.7325684167287105e-05, + "loss": 0.28954463, + "memory(GiB)": 15.04, + "step": 4105, + "train_speed(iter/s)": 0.337332 + }, + { + "acc": 0.92623892, + "epoch": 0.5535353535353535, + "grad_norm": 5.75, + "learning_rate": 1.731810019382326e-05, + "loss": 0.25716453, + "memory(GiB)": 15.04, + "step": 4110, + "train_speed(iter/s)": 0.337401 + }, + { + "acc": 0.92022734, + "epoch": 0.5542087542087543, + "grad_norm": 10.0, + "learning_rate": 1.731050714698719e-05, + "loss": 0.25363297, + "memory(GiB)": 15.04, + "step": 4115, + "train_speed(iter/s)": 0.337441 + }, + { + "acc": 0.8096302, + "epoch": 0.5548821548821549, + "grad_norm": 4.0625, + "learning_rate": 1.730290503619316e-05, + "loss": 0.65786781, + "memory(GiB)": 15.04, + "step": 4120, + "train_speed(iter/s)": 0.337443 + }, + { + "acc": 0.84931622, + "epoch": 0.5555555555555556, + "grad_norm": 12.0625, + "learning_rate": 1.7295293870866677e-05, + "loss": 0.58855181, + "memory(GiB)": 15.04, + "step": 4125, + "train_speed(iter/s)": 0.337539 + }, + { + "acc": 0.9130374, + "epoch": 0.5562289562289562, + "grad_norm": 14.4375, + "learning_rate": 1.7287673660444464e-05, + "loss": 0.28229191, + "memory(GiB)": 15.04, + "step": 4130, + "train_speed(iter/s)": 0.337626 + }, + { + "acc": 0.88506327, + "epoch": 0.5569023569023569, + "grad_norm": 11.1875, + "learning_rate": 1.728004441437447e-05, + "loss": 0.47019825, + "memory(GiB)": 15.04, + "step": 4135, + "train_speed(iter/s)": 0.337605 + }, + { + "acc": 0.84497614, + "epoch": 0.5575757575757576, + "grad_norm": 5.78125, + "learning_rate": 1.7272406142115846e-05, + "loss": 0.4880383, + "memory(GiB)": 15.04, + "step": 4140, + "train_speed(iter/s)": 0.337599 + }, + { + "acc": 0.89238911, + "epoch": 0.5582491582491582, + "grad_norm": 7.03125, + "learning_rate": 1.7264758853138923e-05, + "loss": 0.37703359, + "memory(GiB)": 15.04, + "step": 4145, + "train_speed(iter/s)": 0.337609 + }, + { + "acc": 0.89045725, + "epoch": 0.5589225589225589, + "grad_norm": 6.59375, + "learning_rate": 1.7257102556925227e-05, + "loss": 0.4140079, + "memory(GiB)": 15.04, + "step": 4150, + "train_speed(iter/s)": 0.337668 + }, + { + "acc": 0.91018286, + "epoch": 0.5595959595959596, + "grad_norm": 21.375, + "learning_rate": 1.7249437262967436e-05, + "loss": 0.40824814, + "memory(GiB)": 15.04, + "step": 4155, + "train_speed(iter/s)": 0.337687 + }, + { + "acc": 0.870082, + "epoch": 0.5602693602693603, + "grad_norm": 6.21875, + "learning_rate": 1.7241762980769398e-05, + "loss": 0.37111406, + "memory(GiB)": 15.04, + "step": 4160, + "train_speed(iter/s)": 0.337636 + }, + { + "acc": 0.86445503, + "epoch": 0.560942760942761, + "grad_norm": 16.625, + "learning_rate": 1.7234079719846092e-05, + "loss": 0.63412347, + "memory(GiB)": 15.04, + "step": 4165, + "train_speed(iter/s)": 0.337721 + }, + { + "acc": 0.91648846, + "epoch": 0.5616161616161616, + "grad_norm": 8.8125, + "learning_rate": 1.722638748972364e-05, + "loss": 0.32388, + "memory(GiB)": 15.04, + "step": 4170, + "train_speed(iter/s)": 0.3378 + }, + { + "acc": 0.89812613, + "epoch": 0.5622895622895623, + "grad_norm": 13.0625, + "learning_rate": 1.7218686299939286e-05, + "loss": 0.31773405, + "memory(GiB)": 15.04, + "step": 4175, + "train_speed(iter/s)": 0.337759 + }, + { + "acc": 0.90312166, + "epoch": 0.562962962962963, + "grad_norm": 6.125, + "learning_rate": 1.721097616004137e-05, + "loss": 0.44310713, + "memory(GiB)": 15.04, + "step": 4180, + "train_speed(iter/s)": 0.337812 + }, + { + "acc": 0.90160627, + "epoch": 0.5636363636363636, + "grad_norm": 4.4375, + "learning_rate": 1.7203257079589334e-05, + "loss": 0.44974608, + "memory(GiB)": 15.04, + "step": 4185, + "train_speed(iter/s)": 0.337836 + }, + { + "acc": 0.86169586, + "epoch": 0.5643097643097643, + "grad_norm": 10.125, + "learning_rate": 1.7195529068153715e-05, + "loss": 0.37513566, + "memory(GiB)": 15.04, + "step": 4190, + "train_speed(iter/s)": 0.337902 + }, + { + "acc": 0.88859873, + "epoch": 0.564983164983165, + "grad_norm": 11.75, + "learning_rate": 1.718779213531611e-05, + "loss": 0.4264184, + "memory(GiB)": 15.04, + "step": 4195, + "train_speed(iter/s)": 0.337944 + }, + { + "acc": 0.88831396, + "epoch": 0.5656565656565656, + "grad_norm": 8.25, + "learning_rate": 1.7180046290669182e-05, + "loss": 0.51441598, + "memory(GiB)": 15.04, + "step": 4200, + "train_speed(iter/s)": 0.338041 + }, + { + "epoch": 0.5656565656565656, + "eval_acc": 0.8814506728833316, + "eval_loss": 0.4589705467224121, + "eval_runtime": 109.7591, + "eval_samples_per_second": 1.367, + "eval_steps_per_second": 1.367, + "step": 4200 + }, + { + "acc": 0.86627464, + "epoch": 0.5663299663299664, + "grad_norm": 8.9375, + "learning_rate": 1.7172291543816647e-05, + "loss": 0.43737912, + "memory(GiB)": 15.04, + "step": 4205, + "train_speed(iter/s)": 0.335068 + }, + { + "acc": 0.84894361, + "epoch": 0.567003367003367, + "grad_norm": 13.75, + "learning_rate": 1.716452790437325e-05, + "loss": 0.45851641, + "memory(GiB)": 15.04, + "step": 4210, + "train_speed(iter/s)": 0.335161 + }, + { + "acc": 0.92653151, + "epoch": 0.5676767676767677, + "grad_norm": 9.9375, + "learning_rate": 1.7156755381964773e-05, + "loss": 0.23524013, + "memory(GiB)": 15.04, + "step": 4215, + "train_speed(iter/s)": 0.335236 + }, + { + "acc": 0.81595592, + "epoch": 0.5683501683501684, + "grad_norm": 26.125, + "learning_rate": 1.7148973986228e-05, + "loss": 0.46322184, + "memory(GiB)": 15.04, + "step": 4220, + "train_speed(iter/s)": 0.335287 + }, + { + "acc": 0.90123253, + "epoch": 0.569023569023569, + "grad_norm": 8.1875, + "learning_rate": 1.7141183726810727e-05, + "loss": 0.358708, + "memory(GiB)": 15.04, + "step": 4225, + "train_speed(iter/s)": 0.335338 + }, + { + "acc": 0.92097464, + "epoch": 0.5696969696969697, + "grad_norm": 8.1875, + "learning_rate": 1.713338461337173e-05, + "loss": 0.29511561, + "memory(GiB)": 15.04, + "step": 4230, + "train_speed(iter/s)": 0.335371 + }, + { + "acc": 0.87373762, + "epoch": 0.5703703703703704, + "grad_norm": 8.9375, + "learning_rate": 1.7125576655580765e-05, + "loss": 0.23357124, + "memory(GiB)": 15.04, + "step": 4235, + "train_speed(iter/s)": 0.33544 + }, + { + "acc": 0.84742498, + "epoch": 0.571043771043771, + "grad_norm": 5.65625, + "learning_rate": 1.7117759863118562e-05, + "loss": 0.40855536, + "memory(GiB)": 15.04, + "step": 4240, + "train_speed(iter/s)": 0.335503 + }, + { + "acc": 0.89106922, + "epoch": 0.5717171717171717, + "grad_norm": 9.1875, + "learning_rate": 1.7109934245676797e-05, + "loss": 0.44124908, + "memory(GiB)": 15.04, + "step": 4245, + "train_speed(iter/s)": 0.335585 + }, + { + "acc": 0.85798492, + "epoch": 0.5723905723905723, + "grad_norm": 14.8125, + "learning_rate": 1.7102099812958086e-05, + "loss": 0.56557817, + "memory(GiB)": 15.04, + "step": 4250, + "train_speed(iter/s)": 0.335641 + }, + { + "acc": 0.85664806, + "epoch": 0.573063973063973, + "grad_norm": 12.5, + "learning_rate": 1.7094256574675984e-05, + "loss": 0.44262295, + "memory(GiB)": 15.04, + "step": 4255, + "train_speed(iter/s)": 0.335669 + }, + { + "acc": 0.91525526, + "epoch": 0.5737373737373738, + "grad_norm": 6.71875, + "learning_rate": 1.7086404540554947e-05, + "loss": 0.27817523, + "memory(GiB)": 15.04, + "step": 4260, + "train_speed(iter/s)": 0.3357 + }, + { + "acc": 0.85219793, + "epoch": 0.5744107744107744, + "grad_norm": 6.78125, + "learning_rate": 1.7078543720330357e-05, + "loss": 0.6271781, + "memory(GiB)": 15.04, + "step": 4265, + "train_speed(iter/s)": 0.335788 + }, + { + "acc": 0.91735287, + "epoch": 0.5750841750841751, + "grad_norm": 10.875, + "learning_rate": 1.707067412374848e-05, + "loss": 0.35268691, + "memory(GiB)": 15.04, + "step": 4270, + "train_speed(iter/s)": 0.335868 + }, + { + "acc": 0.90828514, + "epoch": 0.5757575757575758, + "grad_norm": 6.96875, + "learning_rate": 1.7062795760566453e-05, + "loss": 0.30698793, + "memory(GiB)": 15.04, + "step": 4275, + "train_speed(iter/s)": 0.335937 + }, + { + "acc": 0.87601233, + "epoch": 0.5764309764309764, + "grad_norm": 6.125, + "learning_rate": 1.7054908640552302e-05, + "loss": 0.37530935, + "memory(GiB)": 15.04, + "step": 4280, + "train_speed(iter/s)": 0.336009 + }, + { + "acc": 0.87714987, + "epoch": 0.5771043771043771, + "grad_norm": 8.75, + "learning_rate": 1.7047012773484898e-05, + "loss": 0.33812811, + "memory(GiB)": 15.04, + "step": 4285, + "train_speed(iter/s)": 0.33605 + }, + { + "acc": 0.88150272, + "epoch": 0.5777777777777777, + "grad_norm": 8.375, + "learning_rate": 1.703910816915396e-05, + "loss": 0.36193902, + "memory(GiB)": 15.04, + "step": 4290, + "train_speed(iter/s)": 0.336074 + }, + { + "acc": 0.89709082, + "epoch": 0.5784511784511784, + "grad_norm": 7.46875, + "learning_rate": 1.7031194837360035e-05, + "loss": 0.27638795, + "memory(GiB)": 15.04, + "step": 4295, + "train_speed(iter/s)": 0.336157 + }, + { + "acc": 0.90212574, + "epoch": 0.5791245791245792, + "grad_norm": 11.5, + "learning_rate": 1.7023272787914496e-05, + "loss": 0.41099176, + "memory(GiB)": 15.04, + "step": 4300, + "train_speed(iter/s)": 0.336195 + }, + { + "acc": 0.91579752, + "epoch": 0.5797979797979798, + "grad_norm": 9.25, + "learning_rate": 1.701534203063953e-05, + "loss": 0.30125055, + "memory(GiB)": 15.04, + "step": 4305, + "train_speed(iter/s)": 0.336291 + }, + { + "acc": 0.92083387, + "epoch": 0.5804713804713805, + "grad_norm": 6.75, + "learning_rate": 1.7007402575368107e-05, + "loss": 0.3330127, + "memory(GiB)": 15.04, + "step": 4310, + "train_speed(iter/s)": 0.336333 + }, + { + "acc": 0.91086092, + "epoch": 0.5811447811447812, + "grad_norm": 5.4375, + "learning_rate": 1.6999454431943997e-05, + "loss": 0.31445036, + "memory(GiB)": 15.04, + "step": 4315, + "train_speed(iter/s)": 0.336361 + }, + { + "acc": 0.8398551, + "epoch": 0.5818181818181818, + "grad_norm": 6.40625, + "learning_rate": 1.6991497610221722e-05, + "loss": 0.51611238, + "memory(GiB)": 15.04, + "step": 4320, + "train_speed(iter/s)": 0.336398 + }, + { + "acc": 0.85949497, + "epoch": 0.5824915824915825, + "grad_norm": 7.28125, + "learning_rate": 1.6983532120066583e-05, + "loss": 0.61256056, + "memory(GiB)": 15.04, + "step": 4325, + "train_speed(iter/s)": 0.336384 + }, + { + "acc": 0.80934429, + "epoch": 0.5831649831649832, + "grad_norm": 6.875, + "learning_rate": 1.6975557971354622e-05, + "loss": 0.26548476, + "memory(GiB)": 15.04, + "step": 4330, + "train_speed(iter/s)": 0.336428 + }, + { + "acc": 0.90867815, + "epoch": 0.5838383838383838, + "grad_norm": 7.8125, + "learning_rate": 1.6967575173972614e-05, + "loss": 0.38837395, + "memory(GiB)": 15.04, + "step": 4335, + "train_speed(iter/s)": 0.336432 + }, + { + "acc": 0.88366899, + "epoch": 0.5845117845117845, + "grad_norm": 9.25, + "learning_rate": 1.6959583737818053e-05, + "loss": 0.42913079, + "memory(GiB)": 15.04, + "step": 4340, + "train_speed(iter/s)": 0.336457 + }, + { + "acc": 0.84328384, + "epoch": 0.5851851851851851, + "grad_norm": 15.4375, + "learning_rate": 1.6951583672799153e-05, + "loss": 0.54969959, + "memory(GiB)": 15.04, + "step": 4345, + "train_speed(iter/s)": 0.336537 + }, + { + "acc": 0.92732067, + "epoch": 0.5858585858585859, + "grad_norm": 10.875, + "learning_rate": 1.6943574988834828e-05, + "loss": 0.25192871, + "memory(GiB)": 15.04, + "step": 4350, + "train_speed(iter/s)": 0.336583 + }, + { + "acc": 0.8462472, + "epoch": 0.5865319865319866, + "grad_norm": 5.75, + "learning_rate": 1.6935557695854666e-05, + "loss": 0.50387387, + "memory(GiB)": 15.04, + "step": 4355, + "train_speed(iter/s)": 0.336632 + }, + { + "acc": 0.8854928, + "epoch": 0.5872053872053872, + "grad_norm": 11.1875, + "learning_rate": 1.6927531803798937e-05, + "loss": 0.41608453, + "memory(GiB)": 15.04, + "step": 4360, + "train_speed(iter/s)": 0.336677 + }, + { + "acc": 0.90479069, + "epoch": 0.5878787878787879, + "grad_norm": 7.34375, + "learning_rate": 1.691949732261857e-05, + "loss": 0.25519066, + "memory(GiB)": 15.04, + "step": 4365, + "train_speed(iter/s)": 0.336709 + }, + { + "acc": 0.88697033, + "epoch": 0.5885521885521886, + "grad_norm": 9.1875, + "learning_rate": 1.6911454262275153e-05, + "loss": 0.33217278, + "memory(GiB)": 15.04, + "step": 4370, + "train_speed(iter/s)": 0.336792 + }, + { + "acc": 0.83444138, + "epoch": 0.5892255892255892, + "grad_norm": 7.9375, + "learning_rate": 1.6903402632740893e-05, + "loss": 0.48537288, + "memory(GiB)": 15.04, + "step": 4375, + "train_speed(iter/s)": 0.336835 + }, + { + "acc": 0.88983879, + "epoch": 0.5898989898989899, + "grad_norm": 8.3125, + "learning_rate": 1.6895342443998637e-05, + "loss": 0.37899985, + "memory(GiB)": 15.04, + "step": 4380, + "train_speed(iter/s)": 0.336899 + }, + { + "acc": 0.91641159, + "epoch": 0.5905723905723905, + "grad_norm": 6.84375, + "learning_rate": 1.6887273706041833e-05, + "loss": 0.28896179, + "memory(GiB)": 15.04, + "step": 4385, + "train_speed(iter/s)": 0.33691 + }, + { + "acc": 0.87947559, + "epoch": 0.5912457912457912, + "grad_norm": 7.03125, + "learning_rate": 1.687919642887454e-05, + "loss": 0.53417592, + "memory(GiB)": 15.04, + "step": 4390, + "train_speed(iter/s)": 0.336984 + }, + { + "acc": 0.89173651, + "epoch": 0.591919191919192, + "grad_norm": 12.1875, + "learning_rate": 1.6871110622511394e-05, + "loss": 0.40582504, + "memory(GiB)": 15.04, + "step": 4395, + "train_speed(iter/s)": 0.337027 + }, + { + "acc": 0.88862801, + "epoch": 0.5925925925925926, + "grad_norm": 16.5, + "learning_rate": 1.6863016296977613e-05, + "loss": 0.40161347, + "memory(GiB)": 15.04, + "step": 4400, + "train_speed(iter/s)": 0.33707 + }, + { + "acc": 0.89179201, + "epoch": 0.5932659932659933, + "grad_norm": 8.0, + "learning_rate": 1.6854913462308972e-05, + "loss": 0.47911062, + "memory(GiB)": 15.04, + "step": 4405, + "train_speed(iter/s)": 0.337085 + }, + { + "acc": 0.88523207, + "epoch": 0.593939393939394, + "grad_norm": 10.75, + "learning_rate": 1.6846802128551803e-05, + "loss": 0.39476483, + "memory(GiB)": 15.04, + "step": 4410, + "train_speed(iter/s)": 0.337137 + }, + { + "acc": 0.89964447, + "epoch": 0.5946127946127946, + "grad_norm": 5.4375, + "learning_rate": 1.6838682305762972e-05, + "loss": 0.35869577, + "memory(GiB)": 15.04, + "step": 4415, + "train_speed(iter/s)": 0.337133 + }, + { + "acc": 0.91826773, + "epoch": 0.5952861952861953, + "grad_norm": 12.0625, + "learning_rate": 1.6830554004009863e-05, + "loss": 0.34219422, + "memory(GiB)": 15.04, + "step": 4420, + "train_speed(iter/s)": 0.337174 + }, + { + "acc": 0.89292841, + "epoch": 0.5959595959595959, + "grad_norm": 5.5625, + "learning_rate": 1.6822417233370387e-05, + "loss": 0.36768606, + "memory(GiB)": 15.04, + "step": 4425, + "train_speed(iter/s)": 0.33716 + }, + { + "acc": 0.87695522, + "epoch": 0.5966329966329966, + "grad_norm": 7.375, + "learning_rate": 1.6814272003932943e-05, + "loss": 0.36812396, + "memory(GiB)": 15.04, + "step": 4430, + "train_speed(iter/s)": 0.337195 + }, + { + "acc": 0.89027824, + "epoch": 0.5973063973063973, + "grad_norm": 9.4375, + "learning_rate": 1.6806118325796425e-05, + "loss": 0.34605751, + "memory(GiB)": 15.04, + "step": 4435, + "train_speed(iter/s)": 0.337258 + }, + { + "acc": 0.92887344, + "epoch": 0.597979797979798, + "grad_norm": 12.0, + "learning_rate": 1.67979562090702e-05, + "loss": 0.27661517, + "memory(GiB)": 15.04, + "step": 4440, + "train_speed(iter/s)": 0.337316 + }, + { + "acc": 0.88897724, + "epoch": 0.5986531986531987, + "grad_norm": 15.375, + "learning_rate": 1.6789785663874096e-05, + "loss": 0.42862206, + "memory(GiB)": 15.04, + "step": 4445, + "train_speed(iter/s)": 0.337418 + }, + { + "acc": 0.91717243, + "epoch": 0.5993265993265994, + "grad_norm": 15.6875, + "learning_rate": 1.6781606700338386e-05, + "loss": 0.31442499, + "memory(GiB)": 15.04, + "step": 4450, + "train_speed(iter/s)": 0.33748 + }, + { + "acc": 0.87508106, + "epoch": 0.6, + "grad_norm": 17.25, + "learning_rate": 1.6773419328603796e-05, + "loss": 0.5540029, + "memory(GiB)": 15.04, + "step": 4455, + "train_speed(iter/s)": 0.337426 + }, + { + "acc": 0.89905529, + "epoch": 0.6006734006734007, + "grad_norm": 6.21875, + "learning_rate": 1.6765223558821465e-05, + "loss": 0.30281894, + "memory(GiB)": 15.04, + "step": 4460, + "train_speed(iter/s)": 0.337459 + }, + { + "acc": 0.86200438, + "epoch": 0.6013468013468013, + "grad_norm": 13.0, + "learning_rate": 1.675701940115294e-05, + "loss": 0.52247534, + "memory(GiB)": 15.04, + "step": 4465, + "train_speed(iter/s)": 0.337513 + }, + { + "acc": 0.89549341, + "epoch": 0.602020202020202, + "grad_norm": 10.125, + "learning_rate": 1.6748806865770188e-05, + "loss": 0.39380765, + "memory(GiB)": 15.04, + "step": 4470, + "train_speed(iter/s)": 0.337485 + }, + { + "acc": 0.85047359, + "epoch": 0.6026936026936027, + "grad_norm": 8.3125, + "learning_rate": 1.674058596285554e-05, + "loss": 0.58578529, + "memory(GiB)": 15.04, + "step": 4475, + "train_speed(iter/s)": 0.337566 + }, + { + "acc": 0.88651133, + "epoch": 0.6033670033670033, + "grad_norm": 5.5625, + "learning_rate": 1.6732356702601716e-05, + "loss": 0.48212433, + "memory(GiB)": 15.04, + "step": 4480, + "train_speed(iter/s)": 0.337543 + }, + { + "acc": 0.84304943, + "epoch": 0.604040404040404, + "grad_norm": 7.15625, + "learning_rate": 1.672411909521179e-05, + "loss": 0.57582579, + "memory(GiB)": 15.04, + "step": 4485, + "train_speed(iter/s)": 0.337575 + }, + { + "acc": 0.94032316, + "epoch": 0.6047138047138048, + "grad_norm": 6.90625, + "learning_rate": 1.6715873150899184e-05, + "loss": 0.24680698, + "memory(GiB)": 15.04, + "step": 4490, + "train_speed(iter/s)": 0.337627 + }, + { + "acc": 0.92611914, + "epoch": 0.6053872053872054, + "grad_norm": 7.21875, + "learning_rate": 1.6707618879887673e-05, + "loss": 0.26302812, + "memory(GiB)": 15.04, + "step": 4495, + "train_speed(iter/s)": 0.337637 + }, + { + "acc": 0.88908119, + "epoch": 0.6060606060606061, + "grad_norm": 8.625, + "learning_rate": 1.6699356292411336e-05, + "loss": 0.33910556, + "memory(GiB)": 15.04, + "step": 4500, + "train_speed(iter/s)": 0.337705 + }, + { + "epoch": 0.6060606060606061, + "eval_acc": 0.8837320067739204, + "eval_loss": 0.44876089692115784, + "eval_runtime": 109.7203, + "eval_samples_per_second": 1.367, + "eval_steps_per_second": 1.367, + "step": 4500 + }, + { + "acc": 0.90021839, + "epoch": 0.6067340067340067, + "grad_norm": 6.375, + "learning_rate": 1.669108539871457e-05, + "loss": 0.28921244, + "memory(GiB)": 15.04, + "step": 4505, + "train_speed(iter/s)": 0.334934 + }, + { + "acc": 0.86491404, + "epoch": 0.6074074074074074, + "grad_norm": 7.375, + "learning_rate": 1.6682806209052077e-05, + "loss": 0.32596858, + "memory(GiB)": 15.04, + "step": 4510, + "train_speed(iter/s)": 0.334937 + }, + { + "acc": 0.90164213, + "epoch": 0.6080808080808081, + "grad_norm": 4.9375, + "learning_rate": 1.6674518733688833e-05, + "loss": 0.4585031, + "memory(GiB)": 15.04, + "step": 4515, + "train_speed(iter/s)": 0.334899 + }, + { + "acc": 0.91891603, + "epoch": 0.6087542087542087, + "grad_norm": 8.375, + "learning_rate": 1.6666222982900098e-05, + "loss": 0.27369349, + "memory(GiB)": 15.04, + "step": 4520, + "train_speed(iter/s)": 0.334966 + }, + { + "acc": 0.83691006, + "epoch": 0.6094276094276094, + "grad_norm": 6.09375, + "learning_rate": 1.665791896697139e-05, + "loss": 0.54756508, + "memory(GiB)": 15.04, + "step": 4525, + "train_speed(iter/s)": 0.334992 + }, + { + "acc": 0.88414259, + "epoch": 0.6101010101010101, + "grad_norm": 5.625, + "learning_rate": 1.6649606696198467e-05, + "loss": 0.54539962, + "memory(GiB)": 15.04, + "step": 4530, + "train_speed(iter/s)": 0.335015 + }, + { + "acc": 0.89783468, + "epoch": 0.6107744107744107, + "grad_norm": 17.5, + "learning_rate": 1.664128618088733e-05, + "loss": 0.43428464, + "memory(GiB)": 15.04, + "step": 4535, + "train_speed(iter/s)": 0.335089 + }, + { + "acc": 0.88710184, + "epoch": 0.6114478114478115, + "grad_norm": 19.375, + "learning_rate": 1.6632957431354192e-05, + "loss": 0.4390841, + "memory(GiB)": 15.04, + "step": 4540, + "train_speed(iter/s)": 0.335119 + }, + { + "acc": 0.92598915, + "epoch": 0.6121212121212121, + "grad_norm": 8.875, + "learning_rate": 1.6624620457925494e-05, + "loss": 0.25133462, + "memory(GiB)": 15.04, + "step": 4545, + "train_speed(iter/s)": 0.33516 + }, + { + "acc": 0.85548468, + "epoch": 0.6127946127946128, + "grad_norm": 6.03125, + "learning_rate": 1.6616275270937858e-05, + "loss": 0.60639415, + "memory(GiB)": 15.04, + "step": 4550, + "train_speed(iter/s)": 0.335206 + }, + { + "acc": 0.89241152, + "epoch": 0.6134680134680135, + "grad_norm": 5.8125, + "learning_rate": 1.660792188073809e-05, + "loss": 0.39169483, + "memory(GiB)": 15.04, + "step": 4555, + "train_speed(iter/s)": 0.33528 + }, + { + "acc": 0.91160355, + "epoch": 0.6141414141414141, + "grad_norm": 6.40625, + "learning_rate": 1.659956029768317e-05, + "loss": 0.3247937, + "memory(GiB)": 15.04, + "step": 4560, + "train_speed(iter/s)": 0.335304 + }, + { + "acc": 0.91380854, + "epoch": 0.6148148148148148, + "grad_norm": 9.75, + "learning_rate": 1.659119053214024e-05, + "loss": 0.32185378, + "memory(GiB)": 15.04, + "step": 4565, + "train_speed(iter/s)": 0.33535 + }, + { + "acc": 0.89268837, + "epoch": 0.6154882154882155, + "grad_norm": 8.4375, + "learning_rate": 1.658281259448658e-05, + "loss": 0.3313453, + "memory(GiB)": 15.04, + "step": 4570, + "train_speed(iter/s)": 0.33537 + }, + { + "acc": 0.84350071, + "epoch": 0.6161616161616161, + "grad_norm": 13.875, + "learning_rate": 1.657442649510961e-05, + "loss": 0.41612873, + "memory(GiB)": 15.04, + "step": 4575, + "train_speed(iter/s)": 0.335462 + }, + { + "acc": 0.90122662, + "epoch": 0.6168350168350168, + "grad_norm": 15.9375, + "learning_rate": 1.656603224440686e-05, + "loss": 0.30698125, + "memory(GiB)": 15.04, + "step": 4580, + "train_speed(iter/s)": 0.335542 + }, + { + "acc": 0.89745512, + "epoch": 0.6175084175084176, + "grad_norm": 18.625, + "learning_rate": 1.655762985278597e-05, + "loss": 0.37742386, + "memory(GiB)": 15.04, + "step": 4585, + "train_speed(iter/s)": 0.335545 + }, + { + "acc": 0.8741374, + "epoch": 0.6181818181818182, + "grad_norm": 8.0, + "learning_rate": 1.6549219330664677e-05, + "loss": 0.55189834, + "memory(GiB)": 15.04, + "step": 4590, + "train_speed(iter/s)": 0.335564 + }, + { + "acc": 0.86944952, + "epoch": 0.6188552188552189, + "grad_norm": 5.0, + "learning_rate": 1.6540800688470798e-05, + "loss": 0.54992094, + "memory(GiB)": 15.04, + "step": 4595, + "train_speed(iter/s)": 0.33558 + }, + { + "acc": 0.85860176, + "epoch": 0.6195286195286195, + "grad_norm": 10.6875, + "learning_rate": 1.6532373936642217e-05, + "loss": 0.42584996, + "memory(GiB)": 15.04, + "step": 4600, + "train_speed(iter/s)": 0.335598 + }, + { + "acc": 0.85607958, + "epoch": 0.6202020202020202, + "grad_norm": 9.5, + "learning_rate": 1.652393908562687e-05, + "loss": 0.66726885, + "memory(GiB)": 15.04, + "step": 4605, + "train_speed(iter/s)": 0.335608 + }, + { + "acc": 0.83417387, + "epoch": 0.6208754208754209, + "grad_norm": 12.0625, + "learning_rate": 1.6515496145882733e-05, + "loss": 0.44307995, + "memory(GiB)": 15.04, + "step": 4610, + "train_speed(iter/s)": 0.335684 + }, + { + "acc": 0.91452293, + "epoch": 0.6215488215488215, + "grad_norm": 10.125, + "learning_rate": 1.6507045127877817e-05, + "loss": 0.35714803, + "memory(GiB)": 15.04, + "step": 4615, + "train_speed(iter/s)": 0.335741 + }, + { + "acc": 0.91882572, + "epoch": 0.6222222222222222, + "grad_norm": 10.0625, + "learning_rate": 1.649858604209015e-05, + "loss": 0.3434411, + "memory(GiB)": 15.04, + "step": 4620, + "train_speed(iter/s)": 0.335791 + }, + { + "acc": 0.92277594, + "epoch": 0.622895622895623, + "grad_norm": 6.875, + "learning_rate": 1.6490118899007755e-05, + "loss": 0.23106558, + "memory(GiB)": 15.04, + "step": 4625, + "train_speed(iter/s)": 0.335855 + }, + { + "acc": 0.84693232, + "epoch": 0.6235690235690236, + "grad_norm": 11.25, + "learning_rate": 1.6481643709128654e-05, + "loss": 0.65914745, + "memory(GiB)": 15.04, + "step": 4630, + "train_speed(iter/s)": 0.33585 + }, + { + "acc": 0.89393358, + "epoch": 0.6242424242424243, + "grad_norm": 20.0, + "learning_rate": 1.6473160482960837e-05, + "loss": 0.43507428, + "memory(GiB)": 15.04, + "step": 4635, + "train_speed(iter/s)": 0.335843 + }, + { + "acc": 0.90842285, + "epoch": 0.6249158249158249, + "grad_norm": 5.90625, + "learning_rate": 1.6464669231022257e-05, + "loss": 0.38126285, + "memory(GiB)": 15.04, + "step": 4640, + "train_speed(iter/s)": 0.335899 + }, + { + "acc": 0.85284615, + "epoch": 0.6255892255892256, + "grad_norm": 6.625, + "learning_rate": 1.6456169963840832e-05, + "loss": 0.50575981, + "memory(GiB)": 15.04, + "step": 4645, + "train_speed(iter/s)": 0.335962 + }, + { + "acc": 0.91851702, + "epoch": 0.6262626262626263, + "grad_norm": 5.53125, + "learning_rate": 1.6447662691954402e-05, + "loss": 0.27984991, + "memory(GiB)": 15.04, + "step": 4650, + "train_speed(iter/s)": 0.335947 + }, + { + "acc": 0.89429655, + "epoch": 0.6269360269360269, + "grad_norm": 9.9375, + "learning_rate": 1.6439147425910743e-05, + "loss": 0.29749134, + "memory(GiB)": 15.04, + "step": 4655, + "train_speed(iter/s)": 0.336041 + }, + { + "acc": 0.83321552, + "epoch": 0.6276094276094276, + "grad_norm": 7.0625, + "learning_rate": 1.643062417626753e-05, + "loss": 0.48077607, + "memory(GiB)": 15.04, + "step": 4660, + "train_speed(iter/s)": 0.336087 + }, + { + "acc": 0.88771658, + "epoch": 0.6282828282828283, + "grad_norm": 5.875, + "learning_rate": 1.6422092953592353e-05, + "loss": 0.43149128, + "memory(GiB)": 15.04, + "step": 4665, + "train_speed(iter/s)": 0.336139 + }, + { + "acc": 0.89234009, + "epoch": 0.6289562289562289, + "grad_norm": 6.625, + "learning_rate": 1.6413553768462672e-05, + "loss": 0.33808124, + "memory(GiB)": 15.04, + "step": 4670, + "train_speed(iter/s)": 0.336178 + }, + { + "acc": 0.86847658, + "epoch": 0.6296296296296297, + "grad_norm": 7.8125, + "learning_rate": 1.6405006631465826e-05, + "loss": 0.50474515, + "memory(GiB)": 15.04, + "step": 4675, + "train_speed(iter/s)": 0.336129 + }, + { + "acc": 0.88128033, + "epoch": 0.6303030303030303, + "grad_norm": 7.0625, + "learning_rate": 1.6396451553199014e-05, + "loss": 0.29793146, + "memory(GiB)": 15.04, + "step": 4680, + "train_speed(iter/s)": 0.336178 + }, + { + "acc": 0.83775768, + "epoch": 0.630976430976431, + "grad_norm": 12.125, + "learning_rate": 1.638788854426928e-05, + "loss": 0.65453782, + "memory(GiB)": 15.04, + "step": 4685, + "train_speed(iter/s)": 0.33623 + }, + { + "acc": 0.88658571, + "epoch": 0.6316498316498317, + "grad_norm": 6.53125, + "learning_rate": 1.6379317615293505e-05, + "loss": 0.46322603, + "memory(GiB)": 15.04, + "step": 4690, + "train_speed(iter/s)": 0.336229 + }, + { + "acc": 0.88868914, + "epoch": 0.6323232323232323, + "grad_norm": 8.25, + "learning_rate": 1.6370738776898378e-05, + "loss": 0.36851487, + "memory(GiB)": 15.04, + "step": 4695, + "train_speed(iter/s)": 0.336241 + }, + { + "acc": 0.93904266, + "epoch": 0.632996632996633, + "grad_norm": 5.96875, + "learning_rate": 1.6362152039720407e-05, + "loss": 0.26145096, + "memory(GiB)": 15.04, + "step": 4700, + "train_speed(iter/s)": 0.336295 + }, + { + "acc": 0.88688774, + "epoch": 0.6336700336700337, + "grad_norm": 11.25, + "learning_rate": 1.6353557414405883e-05, + "loss": 0.32266824, + "memory(GiB)": 15.04, + "step": 4705, + "train_speed(iter/s)": 0.336379 + }, + { + "acc": 0.79992952, + "epoch": 0.6343434343434343, + "grad_norm": 21.25, + "learning_rate": 1.634495491161089e-05, + "loss": 0.50860252, + "memory(GiB)": 15.04, + "step": 4710, + "train_speed(iter/s)": 0.336427 + }, + { + "acc": 0.76752977, + "epoch": 0.635016835016835, + "grad_norm": 18.125, + "learning_rate": 1.6336344542001264e-05, + "loss": 0.54262133, + "memory(GiB)": 15.04, + "step": 4715, + "train_speed(iter/s)": 0.336448 + }, + { + "acc": 0.85343208, + "epoch": 0.6356902356902356, + "grad_norm": 6.3125, + "learning_rate": 1.632772631625261e-05, + "loss": 0.57416587, + "memory(GiB)": 15.04, + "step": 4720, + "train_speed(iter/s)": 0.336476 + }, + { + "acc": 0.92890425, + "epoch": 0.6363636363636364, + "grad_norm": 9.0625, + "learning_rate": 1.631910024505025e-05, + "loss": 0.29194543, + "memory(GiB)": 15.04, + "step": 4725, + "train_speed(iter/s)": 0.336545 + }, + { + "acc": 0.94128084, + "epoch": 0.6370370370370371, + "grad_norm": 14.4375, + "learning_rate": 1.631046633908927e-05, + "loss": 0.23905036, + "memory(GiB)": 15.04, + "step": 4730, + "train_speed(iter/s)": 0.336607 + }, + { + "acc": 0.87926884, + "epoch": 0.6377104377104377, + "grad_norm": 8.0625, + "learning_rate": 1.6301824609074432e-05, + "loss": 0.348334, + "memory(GiB)": 15.04, + "step": 4735, + "train_speed(iter/s)": 0.336634 + }, + { + "acc": 0.9110774, + "epoch": 0.6383838383838384, + "grad_norm": 8.3125, + "learning_rate": 1.6293175065720223e-05, + "loss": 0.22420795, + "memory(GiB)": 15.04, + "step": 4740, + "train_speed(iter/s)": 0.33669 + }, + { + "acc": 0.86740751, + "epoch": 0.6390572390572391, + "grad_norm": 14.25, + "learning_rate": 1.628451771975081e-05, + "loss": 0.43412094, + "memory(GiB)": 15.04, + "step": 4745, + "train_speed(iter/s)": 0.336757 + }, + { + "acc": 0.88887815, + "epoch": 0.6397306397306397, + "grad_norm": 6.65625, + "learning_rate": 1.627585258190003e-05, + "loss": 0.48215976, + "memory(GiB)": 15.04, + "step": 4750, + "train_speed(iter/s)": 0.336804 + }, + { + "acc": 0.85770273, + "epoch": 0.6404040404040404, + "grad_norm": 8.8125, + "learning_rate": 1.6267179662911385e-05, + "loss": 0.42769055, + "memory(GiB)": 15.04, + "step": 4755, + "train_speed(iter/s)": 0.336766 + }, + { + "acc": 0.90283947, + "epoch": 0.641077441077441, + "grad_norm": 7.625, + "learning_rate": 1.6258498973538028e-05, + "loss": 0.24866667, + "memory(GiB)": 15.04, + "step": 4760, + "train_speed(iter/s)": 0.336822 + }, + { + "acc": 0.89991875, + "epoch": 0.6417508417508417, + "grad_norm": 12.625, + "learning_rate": 1.6249810524542736e-05, + "loss": 0.35716062, + "memory(GiB)": 15.04, + "step": 4765, + "train_speed(iter/s)": 0.33688 + }, + { + "acc": 0.89296923, + "epoch": 0.6424242424242425, + "grad_norm": 10.25, + "learning_rate": 1.624111432669792e-05, + "loss": 0.35033152, + "memory(GiB)": 15.04, + "step": 4770, + "train_speed(iter/s)": 0.336927 + }, + { + "acc": 0.82698908, + "epoch": 0.6430976430976431, + "grad_norm": 6.0, + "learning_rate": 1.6232410390785584e-05, + "loss": 0.43431349, + "memory(GiB)": 15.04, + "step": 4775, + "train_speed(iter/s)": 0.336905 + }, + { + "acc": 0.89962473, + "epoch": 0.6437710437710438, + "grad_norm": 9.1875, + "learning_rate": 1.6223698727597337e-05, + "loss": 0.26351688, + "memory(GiB)": 15.04, + "step": 4780, + "train_speed(iter/s)": 0.33694 + }, + { + "acc": 0.94725761, + "epoch": 0.6444444444444445, + "grad_norm": 6.21875, + "learning_rate": 1.621497934793437e-05, + "loss": 0.21798928, + "memory(GiB)": 15.04, + "step": 4785, + "train_speed(iter/s)": 0.336996 + }, + { + "acc": 0.88771954, + "epoch": 0.6451178451178451, + "grad_norm": 5.96875, + "learning_rate": 1.620625226260743e-05, + "loss": 0.62674317, + "memory(GiB)": 15.04, + "step": 4790, + "train_speed(iter/s)": 0.337041 + }, + { + "acc": 0.86005669, + "epoch": 0.6457912457912458, + "grad_norm": 6.9375, + "learning_rate": 1.619751748243683e-05, + "loss": 0.55922494, + "memory(GiB)": 15.04, + "step": 4795, + "train_speed(iter/s)": 0.337096 + }, + { + "acc": 0.8449975, + "epoch": 0.6464646464646465, + "grad_norm": 9.4375, + "learning_rate": 1.618877501825241e-05, + "loss": 0.45132298, + "memory(GiB)": 15.04, + "step": 4800, + "train_speed(iter/s)": 0.337157 + }, + { + "epoch": 0.6464646464646465, + "eval_acc": 0.8843698897504879, + "eval_loss": 0.44451630115509033, + "eval_runtime": 109.8236, + "eval_samples_per_second": 1.366, + "eval_steps_per_second": 1.366, + "step": 4800 + }, + { + "acc": 0.89971037, + "epoch": 0.6471380471380471, + "grad_norm": 6.96875, + "learning_rate": 1.618002488089355e-05, + "loss": 0.37263277, + "memory(GiB)": 15.04, + "step": 4805, + "train_speed(iter/s)": 0.33463 + }, + { + "acc": 0.89048262, + "epoch": 0.6478114478114478, + "grad_norm": 6.3125, + "learning_rate": 1.617126708120914e-05, + "loss": 0.43812232, + "memory(GiB)": 15.04, + "step": 4810, + "train_speed(iter/s)": 0.334681 + }, + { + "acc": 0.90058737, + "epoch": 0.6484848484848484, + "grad_norm": 8.3125, + "learning_rate": 1.6162501630057566e-05, + "loss": 0.31114213, + "memory(GiB)": 15.04, + "step": 4815, + "train_speed(iter/s)": 0.334756 + }, + { + "acc": 0.90134754, + "epoch": 0.6491582491582492, + "grad_norm": 14.3125, + "learning_rate": 1.6153728538306705e-05, + "loss": 0.25759342, + "memory(GiB)": 15.04, + "step": 4820, + "train_speed(iter/s)": 0.33483 + }, + { + "acc": 0.87186127, + "epoch": 0.6498316498316499, + "grad_norm": 10.0, + "learning_rate": 1.6144947816833902e-05, + "loss": 0.37279108, + "memory(GiB)": 15.04, + "step": 4825, + "train_speed(iter/s)": 0.334884 + }, + { + "acc": 0.84917154, + "epoch": 0.6505050505050505, + "grad_norm": 10.8125, + "learning_rate": 1.6136159476525968e-05, + "loss": 0.52433534, + "memory(GiB)": 15.04, + "step": 4830, + "train_speed(iter/s)": 0.334875 + }, + { + "acc": 0.89658546, + "epoch": 0.6511784511784512, + "grad_norm": 8.3125, + "learning_rate": 1.6127363528279158e-05, + "loss": 0.349422, + "memory(GiB)": 15.04, + "step": 4835, + "train_speed(iter/s)": 0.334896 + }, + { + "acc": 0.87986097, + "epoch": 0.6518518518518519, + "grad_norm": 20.125, + "learning_rate": 1.611855998299916e-05, + "loss": 0.42859497, + "memory(GiB)": 15.04, + "step": 4840, + "train_speed(iter/s)": 0.334958 + }, + { + "acc": 0.83318729, + "epoch": 0.6525252525252525, + "grad_norm": 11.125, + "learning_rate": 1.6109748851601078e-05, + "loss": 0.37896371, + "memory(GiB)": 15.04, + "step": 4845, + "train_speed(iter/s)": 0.335057 + }, + { + "acc": 0.87710838, + "epoch": 0.6531986531986532, + "grad_norm": 7.21875, + "learning_rate": 1.6100930145009427e-05, + "loss": 0.44344339, + "memory(GiB)": 15.04, + "step": 4850, + "train_speed(iter/s)": 0.335064 + }, + { + "acc": 0.84607038, + "epoch": 0.6538720538720538, + "grad_norm": 9.25, + "learning_rate": 1.6092103874158113e-05, + "loss": 0.72218566, + "memory(GiB)": 15.04, + "step": 4855, + "train_speed(iter/s)": 0.335096 + }, + { + "acc": 0.75290537, + "epoch": 0.6545454545454545, + "grad_norm": 12.4375, + "learning_rate": 1.608327004999041e-05, + "loss": 1.13738747, + "memory(GiB)": 15.04, + "step": 4860, + "train_speed(iter/s)": 0.335145 + }, + { + "acc": 0.82909489, + "epoch": 0.6552188552188553, + "grad_norm": 9.25, + "learning_rate": 1.6074428683458972e-05, + "loss": 0.49320173, + "memory(GiB)": 15.04, + "step": 4865, + "train_speed(iter/s)": 0.335121 + }, + { + "acc": 0.86532259, + "epoch": 0.6558922558922559, + "grad_norm": 6.25, + "learning_rate": 1.60655797855258e-05, + "loss": 0.34194508, + "memory(GiB)": 15.04, + "step": 4870, + "train_speed(iter/s)": 0.335165 + }, + { + "acc": 0.83790216, + "epoch": 0.6565656565656566, + "grad_norm": 14.1875, + "learning_rate": 1.605672336716223e-05, + "loss": 0.71522827, + "memory(GiB)": 15.04, + "step": 4875, + "train_speed(iter/s)": 0.335208 + }, + { + "acc": 0.91066542, + "epoch": 0.6572390572390573, + "grad_norm": 6.96875, + "learning_rate": 1.6047859439348923e-05, + "loss": 0.28191049, + "memory(GiB)": 15.04, + "step": 4880, + "train_speed(iter/s)": 0.335235 + }, + { + "acc": 0.90884056, + "epoch": 0.6579124579124579, + "grad_norm": 13.5, + "learning_rate": 1.6038988013075848e-05, + "loss": 0.29040275, + "memory(GiB)": 15.04, + "step": 4885, + "train_speed(iter/s)": 0.335306 + }, + { + "acc": 0.83176765, + "epoch": 0.6585858585858586, + "grad_norm": 5.3125, + "learning_rate": 1.603010909934228e-05, + "loss": 0.79500623, + "memory(GiB)": 15.04, + "step": 4890, + "train_speed(iter/s)": 0.335385 + }, + { + "acc": 0.86799116, + "epoch": 0.6592592592592592, + "grad_norm": 5.78125, + "learning_rate": 1.6021222709156768e-05, + "loss": 0.43854384, + "memory(GiB)": 15.04, + "step": 4895, + "train_speed(iter/s)": 0.335437 + }, + { + "acc": 0.82422266, + "epoch": 0.6599326599326599, + "grad_norm": 10.5, + "learning_rate": 1.6012328853537133e-05, + "loss": 0.73316464, + "memory(GiB)": 15.04, + "step": 4900, + "train_speed(iter/s)": 0.335516 + }, + { + "acc": 0.81474895, + "epoch": 0.6606060606060606, + "grad_norm": 12.25, + "learning_rate": 1.600342754351045e-05, + "loss": 0.39320011, + "memory(GiB)": 15.04, + "step": 4905, + "train_speed(iter/s)": 0.335504 + }, + { + "acc": 0.89751692, + "epoch": 0.6612794612794612, + "grad_norm": 8.0625, + "learning_rate": 1.5994518790113048e-05, + "loss": 0.37473979, + "memory(GiB)": 15.04, + "step": 4910, + "train_speed(iter/s)": 0.335571 + }, + { + "acc": 0.85417538, + "epoch": 0.661952861952862, + "grad_norm": 5.53125, + "learning_rate": 1.5985602604390473e-05, + "loss": 0.32387593, + "memory(GiB)": 15.04, + "step": 4915, + "train_speed(iter/s)": 0.335632 + }, + { + "acc": 0.83993015, + "epoch": 0.6626262626262627, + "grad_norm": 12.0625, + "learning_rate": 1.597667899739749e-05, + "loss": 0.68531394, + "memory(GiB)": 15.04, + "step": 4920, + "train_speed(iter/s)": 0.335674 + }, + { + "acc": 0.87181358, + "epoch": 0.6632996632996633, + "grad_norm": 11.0, + "learning_rate": 1.5967747980198058e-05, + "loss": 0.43111091, + "memory(GiB)": 15.04, + "step": 4925, + "train_speed(iter/s)": 0.33574 + }, + { + "acc": 0.90364523, + "epoch": 0.663973063973064, + "grad_norm": 7.0625, + "learning_rate": 1.595880956386534e-05, + "loss": 0.39029117, + "memory(GiB)": 15.04, + "step": 4930, + "train_speed(iter/s)": 0.33578 + }, + { + "acc": 0.93159332, + "epoch": 0.6646464646464646, + "grad_norm": 6.6875, + "learning_rate": 1.5949863759481653e-05, + "loss": 0.28740838, + "memory(GiB)": 15.04, + "step": 4935, + "train_speed(iter/s)": 0.335823 + }, + { + "acc": 0.81072521, + "epoch": 0.6653198653198653, + "grad_norm": 5.90625, + "learning_rate": 1.594091057813849e-05, + "loss": 0.46846576, + "memory(GiB)": 15.04, + "step": 4940, + "train_speed(iter/s)": 0.335769 + }, + { + "acc": 0.90675364, + "epoch": 0.665993265993266, + "grad_norm": 10.875, + "learning_rate": 1.593195003093648e-05, + "loss": 0.31139865, + "memory(GiB)": 15.04, + "step": 4945, + "train_speed(iter/s)": 0.335806 + }, + { + "acc": 0.92784157, + "epoch": 0.6666666666666666, + "grad_norm": 13.75, + "learning_rate": 1.59229821289854e-05, + "loss": 0.27862949, + "memory(GiB)": 15.04, + "step": 4950, + "train_speed(iter/s)": 0.335856 + }, + { + "acc": 0.91999216, + "epoch": 0.6673400673400673, + "grad_norm": 5.96875, + "learning_rate": 1.5914006883404115e-05, + "loss": 0.28840609, + "memory(GiB)": 15.04, + "step": 4955, + "train_speed(iter/s)": 0.335821 + }, + { + "acc": 0.89679012, + "epoch": 0.6680134680134681, + "grad_norm": 6.71875, + "learning_rate": 1.5905024305320632e-05, + "loss": 0.47731857, + "memory(GiB)": 15.04, + "step": 4960, + "train_speed(iter/s)": 0.335851 + }, + { + "acc": 0.85383654, + "epoch": 0.6686868686868687, + "grad_norm": 5.375, + "learning_rate": 1.589603440587203e-05, + "loss": 0.33904052, + "memory(GiB)": 15.04, + "step": 4965, + "train_speed(iter/s)": 0.335858 + }, + { + "acc": 0.9142395, + "epoch": 0.6693602693602694, + "grad_norm": 12.25, + "learning_rate": 1.588703719620446e-05, + "loss": 0.30967436, + "memory(GiB)": 15.04, + "step": 4970, + "train_speed(iter/s)": 0.335931 + }, + { + "acc": 0.85769234, + "epoch": 0.67003367003367, + "grad_norm": 11.4375, + "learning_rate": 1.5878032687473147e-05, + "loss": 0.4702589, + "memory(GiB)": 15.04, + "step": 4975, + "train_speed(iter/s)": 0.336 + }, + { + "acc": 0.84484854, + "epoch": 0.6707070707070707, + "grad_norm": 11.25, + "learning_rate": 1.5869020890842367e-05, + "loss": 0.26913323, + "memory(GiB)": 15.04, + "step": 4980, + "train_speed(iter/s)": 0.336065 + }, + { + "acc": 0.8702199, + "epoch": 0.6713804713804714, + "grad_norm": 12.375, + "learning_rate": 1.586000181748542e-05, + "loss": 0.45575948, + "memory(GiB)": 15.04, + "step": 4985, + "train_speed(iter/s)": 0.336122 + }, + { + "acc": 0.85966482, + "epoch": 0.672053872053872, + "grad_norm": 7.21875, + "learning_rate": 1.5850975478584643e-05, + "loss": 0.57842598, + "memory(GiB)": 15.04, + "step": 4990, + "train_speed(iter/s)": 0.336161 + }, + { + "acc": 0.87984657, + "epoch": 0.6727272727272727, + "grad_norm": 8.0625, + "learning_rate": 1.584194188533137e-05, + "loss": 0.45647306, + "memory(GiB)": 15.04, + "step": 4995, + "train_speed(iter/s)": 0.336208 + }, + { + "acc": 0.92159386, + "epoch": 0.6734006734006734, + "grad_norm": 8.4375, + "learning_rate": 1.5832901048925932e-05, + "loss": 0.25176334, + "memory(GiB)": 15.04, + "step": 5000, + "train_speed(iter/s)": 0.336284 + }, + { + "acc": 0.82076206, + "epoch": 0.674074074074074, + "grad_norm": 14.125, + "learning_rate": 1.5823852980577647e-05, + "loss": 0.49896278, + "memory(GiB)": 15.04, + "step": 5005, + "train_speed(iter/s)": 0.336272 + }, + { + "acc": 0.84987173, + "epoch": 0.6747474747474748, + "grad_norm": 23.625, + "learning_rate": 1.5814797691504788e-05, + "loss": 0.530128, + "memory(GiB)": 15.04, + "step": 5010, + "train_speed(iter/s)": 0.336299 + }, + { + "acc": 0.91046906, + "epoch": 0.6754208754208754, + "grad_norm": 8.0, + "learning_rate": 1.5805735192934596e-05, + "loss": 0.34117546, + "memory(GiB)": 15.04, + "step": 5015, + "train_speed(iter/s)": 0.336355 + }, + { + "acc": 0.84760551, + "epoch": 0.6760942760942761, + "grad_norm": 13.5625, + "learning_rate": 1.579666549610323e-05, + "loss": 0.57944155, + "memory(GiB)": 15.04, + "step": 5020, + "train_speed(iter/s)": 0.336431 + }, + { + "acc": 0.91725817, + "epoch": 0.6767676767676768, + "grad_norm": 9.3125, + "learning_rate": 1.5787588612255796e-05, + "loss": 0.35195091, + "memory(GiB)": 15.04, + "step": 5025, + "train_speed(iter/s)": 0.336493 + }, + { + "acc": 0.93613453, + "epoch": 0.6774410774410774, + "grad_norm": 9.4375, + "learning_rate": 1.5778504552646293e-05, + "loss": 0.26168051, + "memory(GiB)": 15.04, + "step": 5030, + "train_speed(iter/s)": 0.336542 + }, + { + "acc": 0.88962126, + "epoch": 0.6781144781144781, + "grad_norm": 9.0, + "learning_rate": 1.5769413328537626e-05, + "loss": 0.22385941, + "memory(GiB)": 15.04, + "step": 5035, + "train_speed(iter/s)": 0.336629 + }, + { + "acc": 0.83950338, + "epoch": 0.6787878787878788, + "grad_norm": 10.5625, + "learning_rate": 1.5760314951201585e-05, + "loss": 0.63088756, + "memory(GiB)": 15.04, + "step": 5040, + "train_speed(iter/s)": 0.336704 + }, + { + "acc": 0.88710566, + "epoch": 0.6794612794612794, + "grad_norm": 10.75, + "learning_rate": 1.575120943191882e-05, + "loss": 0.35499609, + "memory(GiB)": 15.04, + "step": 5045, + "train_speed(iter/s)": 0.33669 + }, + { + "acc": 0.94044094, + "epoch": 0.6801346801346801, + "grad_norm": 10.8125, + "learning_rate": 1.5742096781978847e-05, + "loss": 0.27930114, + "memory(GiB)": 15.04, + "step": 5050, + "train_speed(iter/s)": 0.336745 + }, + { + "acc": 0.82651567, + "epoch": 0.6808080808080809, + "grad_norm": 9.375, + "learning_rate": 1.573297701268001e-05, + "loss": 0.37496407, + "memory(GiB)": 15.04, + "step": 5055, + "train_speed(iter/s)": 0.336772 + }, + { + "acc": 0.84456654, + "epoch": 0.6814814814814815, + "grad_norm": 3.984375, + "learning_rate": 1.572385013532949e-05, + "loss": 0.4290081, + "memory(GiB)": 15.04, + "step": 5060, + "train_speed(iter/s)": 0.336806 + }, + { + "acc": 0.83932276, + "epoch": 0.6821548821548822, + "grad_norm": 21.625, + "learning_rate": 1.571471616124328e-05, + "loss": 0.5563108, + "memory(GiB)": 15.04, + "step": 5065, + "train_speed(iter/s)": 0.336824 + }, + { + "acc": 0.76784258, + "epoch": 0.6828282828282828, + "grad_norm": 13.625, + "learning_rate": 1.5705575101746166e-05, + "loss": 0.78404527, + "memory(GiB)": 15.04, + "step": 5070, + "train_speed(iter/s)": 0.336833 + }, + { + "acc": 0.8337821, + "epoch": 0.6835016835016835, + "grad_norm": 9.375, + "learning_rate": 1.569642696817173e-05, + "loss": 0.59160681, + "memory(GiB)": 15.04, + "step": 5075, + "train_speed(iter/s)": 0.336854 + }, + { + "acc": 0.85700722, + "epoch": 0.6841750841750842, + "grad_norm": 8.0, + "learning_rate": 1.5687271771862302e-05, + "loss": 0.50799479, + "memory(GiB)": 15.04, + "step": 5080, + "train_speed(iter/s)": 0.336908 + }, + { + "acc": 0.89061489, + "epoch": 0.6848484848484848, + "grad_norm": 9.25, + "learning_rate": 1.5678109524169002e-05, + "loss": 0.36893265, + "memory(GiB)": 15.04, + "step": 5085, + "train_speed(iter/s)": 0.336942 + }, + { + "acc": 0.84866123, + "epoch": 0.6855218855218855, + "grad_norm": 9.5, + "learning_rate": 1.5668940236451667e-05, + "loss": 0.78797088, + "memory(GiB)": 15.04, + "step": 5090, + "train_speed(iter/s)": 0.337012 + }, + { + "acc": 0.90500269, + "epoch": 0.6861952861952862, + "grad_norm": 26.125, + "learning_rate": 1.565976392007887e-05, + "loss": 0.33935552, + "memory(GiB)": 15.04, + "step": 5095, + "train_speed(iter/s)": 0.33708 + }, + { + "acc": 0.88241081, + "epoch": 0.6868686868686869, + "grad_norm": 5.6875, + "learning_rate": 1.5650580586427903e-05, + "loss": 0.51824064, + "memory(GiB)": 15.04, + "step": 5100, + "train_speed(iter/s)": 0.337077 + }, + { + "epoch": 0.6868686868686869, + "eval_acc": 0.8866254713028907, + "eval_loss": 0.4350052773952484, + "eval_runtime": 109.8937, + "eval_samples_per_second": 1.365, + "eval_steps_per_second": 1.365, + "step": 5100 + }, + { + "acc": 0.84023781, + "epoch": 0.6875420875420876, + "grad_norm": 10.6875, + "learning_rate": 1.564139024688475e-05, + "loss": 0.63163309, + "memory(GiB)": 15.04, + "step": 5105, + "train_speed(iter/s)": 0.334689 + }, + { + "acc": 0.83299837, + "epoch": 0.6882154882154882, + "grad_norm": 13.0625, + "learning_rate": 1.5632192912844084e-05, + "loss": 0.861168, + "memory(GiB)": 15.04, + "step": 5110, + "train_speed(iter/s)": 0.334736 + }, + { + "acc": 0.93508368, + "epoch": 0.6888888888888889, + "grad_norm": 4.59375, + "learning_rate": 1.562298859570926e-05, + "loss": 0.26280169, + "memory(GiB)": 15.04, + "step": 5115, + "train_speed(iter/s)": 0.334729 + }, + { + "acc": 0.91097832, + "epoch": 0.6895622895622896, + "grad_norm": 6.25, + "learning_rate": 1.5613777306892278e-05, + "loss": 0.32177091, + "memory(GiB)": 15.04, + "step": 5120, + "train_speed(iter/s)": 0.334798 + }, + { + "acc": 0.83395815, + "epoch": 0.6902356902356902, + "grad_norm": 7.1875, + "learning_rate": 1.560455905781378e-05, + "loss": 0.52251482, + "memory(GiB)": 15.04, + "step": 5125, + "train_speed(iter/s)": 0.334854 + }, + { + "acc": 0.87596321, + "epoch": 0.6909090909090909, + "grad_norm": 7.4375, + "learning_rate": 1.559533385990306e-05, + "loss": 0.53407445, + "memory(GiB)": 15.04, + "step": 5130, + "train_speed(iter/s)": 0.334883 + }, + { + "acc": 0.86092014, + "epoch": 0.6915824915824916, + "grad_norm": 7.3125, + "learning_rate": 1.5586101724598003e-05, + "loss": 0.63142657, + "memory(GiB)": 15.04, + "step": 5135, + "train_speed(iter/s)": 0.334941 + }, + { + "acc": 0.90926075, + "epoch": 0.6922558922558922, + "grad_norm": 9.0625, + "learning_rate": 1.5576862663345104e-05, + "loss": 0.33125496, + "memory(GiB)": 15.04, + "step": 5140, + "train_speed(iter/s)": 0.335006 + }, + { + "acc": 0.93036556, + "epoch": 0.692929292929293, + "grad_norm": 6.78125, + "learning_rate": 1.5567616687599446e-05, + "loss": 0.24624259, + "memory(GiB)": 15.04, + "step": 5145, + "train_speed(iter/s)": 0.335039 + }, + { + "acc": 0.87925148, + "epoch": 0.6936026936026936, + "grad_norm": 5.25, + "learning_rate": 1.5558363808824682e-05, + "loss": 0.39352739, + "memory(GiB)": 15.04, + "step": 5150, + "train_speed(iter/s)": 0.335049 + }, + { + "acc": 0.81072083, + "epoch": 0.6942760942760943, + "grad_norm": 9.125, + "learning_rate": 1.5549104038493034e-05, + "loss": 0.67198267, + "memory(GiB)": 15.04, + "step": 5155, + "train_speed(iter/s)": 0.335072 + }, + { + "acc": 0.82437325, + "epoch": 0.694949494949495, + "grad_norm": 5.78125, + "learning_rate": 1.5539837388085253e-05, + "loss": 0.66641488, + "memory(GiB)": 15.04, + "step": 5160, + "train_speed(iter/s)": 0.335105 + }, + { + "acc": 0.80968819, + "epoch": 0.6956228956228956, + "grad_norm": 25.5, + "learning_rate": 1.5530563869090633e-05, + "loss": 0.79818001, + "memory(GiB)": 15.04, + "step": 5165, + "train_speed(iter/s)": 0.335141 + }, + { + "acc": 0.90361757, + "epoch": 0.6962962962962963, + "grad_norm": 10.5625, + "learning_rate": 1.5521283493006975e-05, + "loss": 0.38003852, + "memory(GiB)": 15.04, + "step": 5170, + "train_speed(iter/s)": 0.335183 + }, + { + "acc": 0.93689108, + "epoch": 0.696969696969697, + "grad_norm": 7.875, + "learning_rate": 1.551199627134059e-05, + "loss": 0.22883389, + "memory(GiB)": 15.04, + "step": 5175, + "train_speed(iter/s)": 0.335212 + }, + { + "acc": 0.8980669, + "epoch": 0.6976430976430976, + "grad_norm": 5.09375, + "learning_rate": 1.5502702215606272e-05, + "loss": 0.39488235, + "memory(GiB)": 15.04, + "step": 5180, + "train_speed(iter/s)": 0.335239 + }, + { + "acc": 0.86305151, + "epoch": 0.6983164983164983, + "grad_norm": 11.125, + "learning_rate": 1.5493401337327282e-05, + "loss": 0.30057523, + "memory(GiB)": 15.04, + "step": 5185, + "train_speed(iter/s)": 0.335304 + }, + { + "acc": 0.77735128, + "epoch": 0.6989898989898989, + "grad_norm": 5.40625, + "learning_rate": 1.5484093648035357e-05, + "loss": 1.3391305, + "memory(GiB)": 15.04, + "step": 5190, + "train_speed(iter/s)": 0.335318 + }, + { + "acc": 0.90310974, + "epoch": 0.6996632996632997, + "grad_norm": 7.1875, + "learning_rate": 1.547477915927066e-05, + "loss": 0.32603617, + "memory(GiB)": 15.04, + "step": 5195, + "train_speed(iter/s)": 0.335306 + }, + { + "acc": 0.88939428, + "epoch": 0.7003367003367004, + "grad_norm": 5.03125, + "learning_rate": 1.5465457882581797e-05, + "loss": 0.41364112, + "memory(GiB)": 15.04, + "step": 5200, + "train_speed(iter/s)": 0.335289 + }, + { + "acc": 0.8884407, + "epoch": 0.701010101010101, + "grad_norm": 9.5625, + "learning_rate": 1.5456129829525784e-05, + "loss": 0.26326492, + "memory(GiB)": 15.04, + "step": 5205, + "train_speed(iter/s)": 0.335371 + }, + { + "acc": 0.91765385, + "epoch": 0.7016835016835017, + "grad_norm": 10.5625, + "learning_rate": 1.544679501166804e-05, + "loss": 0.31347032, + "memory(GiB)": 15.04, + "step": 5210, + "train_speed(iter/s)": 0.33542 + }, + { + "acc": 0.91512489, + "epoch": 0.7023569023569024, + "grad_norm": 7.0, + "learning_rate": 1.5437453440582372e-05, + "loss": 0.29753859, + "memory(GiB)": 15.04, + "step": 5215, + "train_speed(iter/s)": 0.335418 + }, + { + "acc": 0.89700146, + "epoch": 0.703030303030303, + "grad_norm": 6.46875, + "learning_rate": 1.542810512785096e-05, + "loss": 0.33366153, + "memory(GiB)": 15.04, + "step": 5220, + "train_speed(iter/s)": 0.33542 + }, + { + "acc": 0.92242832, + "epoch": 0.7037037037037037, + "grad_norm": 7.15625, + "learning_rate": 1.5418750085064343e-05, + "loss": 0.2627866, + "memory(GiB)": 15.04, + "step": 5225, + "train_speed(iter/s)": 0.335449 + }, + { + "acc": 0.88747625, + "epoch": 0.7043771043771043, + "grad_norm": 23.375, + "learning_rate": 1.5409388323821403e-05, + "loss": 0.4003212, + "memory(GiB)": 15.04, + "step": 5230, + "train_speed(iter/s)": 0.335531 + }, + { + "acc": 0.93946686, + "epoch": 0.705050505050505, + "grad_norm": 7.78125, + "learning_rate": 1.5400019855729353e-05, + "loss": 0.20857615, + "memory(GiB)": 15.04, + "step": 5235, + "train_speed(iter/s)": 0.335616 + }, + { + "acc": 0.93392143, + "epoch": 0.7057239057239058, + "grad_norm": 5.375, + "learning_rate": 1.539064469240372e-05, + "loss": 0.23336976, + "memory(GiB)": 15.04, + "step": 5240, + "train_speed(iter/s)": 0.335687 + }, + { + "acc": 0.90790997, + "epoch": 0.7063973063973064, + "grad_norm": 10.0625, + "learning_rate": 1.5381262845468336e-05, + "loss": 0.28862476, + "memory(GiB)": 15.04, + "step": 5245, + "train_speed(iter/s)": 0.335738 + }, + { + "acc": 0.88231773, + "epoch": 0.7070707070707071, + "grad_norm": 8.0625, + "learning_rate": 1.537187432655531e-05, + "loss": 0.54753809, + "memory(GiB)": 15.04, + "step": 5250, + "train_speed(iter/s)": 0.335788 + }, + { + "acc": 0.90225554, + "epoch": 0.7077441077441078, + "grad_norm": 8.9375, + "learning_rate": 1.536247914730504e-05, + "loss": 0.38348703, + "memory(GiB)": 15.04, + "step": 5255, + "train_speed(iter/s)": 0.335785 + }, + { + "acc": 0.89147949, + "epoch": 0.7084175084175084, + "grad_norm": 6.90625, + "learning_rate": 1.535307731936616e-05, + "loss": 0.40294757, + "memory(GiB)": 15.04, + "step": 5260, + "train_speed(iter/s)": 0.335827 + }, + { + "acc": 0.85820866, + "epoch": 0.7090909090909091, + "grad_norm": 8.125, + "learning_rate": 1.5343668854395574e-05, + "loss": 0.85416241, + "memory(GiB)": 15.04, + "step": 5265, + "train_speed(iter/s)": 0.335769 + }, + { + "acc": 0.89848557, + "epoch": 0.7097643097643098, + "grad_norm": 9.9375, + "learning_rate": 1.5334253764058387e-05, + "loss": 0.36731422, + "memory(GiB)": 15.04, + "step": 5270, + "train_speed(iter/s)": 0.335817 + }, + { + "acc": 0.80676889, + "epoch": 0.7104377104377104, + "grad_norm": 10.5, + "learning_rate": 1.5324832060027938e-05, + "loss": 0.36697528, + "memory(GiB)": 15.04, + "step": 5275, + "train_speed(iter/s)": 0.335857 + }, + { + "acc": 0.92737293, + "epoch": 0.7111111111111111, + "grad_norm": 6.25, + "learning_rate": 1.531540375398576e-05, + "loss": 0.35305939, + "memory(GiB)": 15.04, + "step": 5280, + "train_speed(iter/s)": 0.335899 + }, + { + "acc": 0.90856133, + "epoch": 0.7117845117845117, + "grad_norm": 9.8125, + "learning_rate": 1.5305968857621572e-05, + "loss": 0.2692791, + "memory(GiB)": 15.04, + "step": 5285, + "train_speed(iter/s)": 0.335917 + }, + { + "acc": 0.88260841, + "epoch": 0.7124579124579125, + "grad_norm": 5.75, + "learning_rate": 1.5296527382633262e-05, + "loss": 0.32560964, + "memory(GiB)": 15.04, + "step": 5290, + "train_speed(iter/s)": 0.335979 + }, + { + "acc": 0.8704318, + "epoch": 0.7131313131313132, + "grad_norm": 11.375, + "learning_rate": 1.5287079340726874e-05, + "loss": 0.48602247, + "memory(GiB)": 15.04, + "step": 5295, + "train_speed(iter/s)": 0.335989 + }, + { + "acc": 0.88656054, + "epoch": 0.7138047138047138, + "grad_norm": 14.25, + "learning_rate": 1.5277624743616597e-05, + "loss": 0.41485868, + "memory(GiB)": 15.04, + "step": 5300, + "train_speed(iter/s)": 0.336018 + }, + { + "acc": 0.89056721, + "epoch": 0.7144781144781145, + "grad_norm": 13.3125, + "learning_rate": 1.526816360302475e-05, + "loss": 0.34193959, + "memory(GiB)": 15.04, + "step": 5305, + "train_speed(iter/s)": 0.33607 + }, + { + "acc": 0.93542423, + "epoch": 0.7151515151515152, + "grad_norm": 12.0, + "learning_rate": 1.5258695930681757e-05, + "loss": 0.2437969, + "memory(GiB)": 15.04, + "step": 5310, + "train_speed(iter/s)": 0.336119 + }, + { + "acc": 0.86053991, + "epoch": 0.7158249158249158, + "grad_norm": 8.4375, + "learning_rate": 1.5249221738326147e-05, + "loss": 0.72318025, + "memory(GiB)": 15.04, + "step": 5315, + "train_speed(iter/s)": 0.336183 + }, + { + "acc": 0.77432585, + "epoch": 0.7164983164983165, + "grad_norm": 4.84375, + "learning_rate": 1.5239741037704531e-05, + "loss": 0.46462812, + "memory(GiB)": 15.04, + "step": 5320, + "train_speed(iter/s)": 0.336247 + }, + { + "acc": 0.88148537, + "epoch": 0.7171717171717171, + "grad_norm": 8.0625, + "learning_rate": 1.5230253840571585e-05, + "loss": 0.48239102, + "memory(GiB)": 15.04, + "step": 5325, + "train_speed(iter/s)": 0.336308 + }, + { + "acc": 0.85648298, + "epoch": 0.7178451178451178, + "grad_norm": 5.46875, + "learning_rate": 1.522076015869005e-05, + "loss": 0.56495943, + "memory(GiB)": 15.04, + "step": 5330, + "train_speed(iter/s)": 0.336338 + }, + { + "acc": 0.8542367, + "epoch": 0.7185185185185186, + "grad_norm": 7.65625, + "learning_rate": 1.5211260003830695e-05, + "loss": 0.53311505, + "memory(GiB)": 15.04, + "step": 5335, + "train_speed(iter/s)": 0.336398 + }, + { + "acc": 0.91923809, + "epoch": 0.7191919191919192, + "grad_norm": 5.9375, + "learning_rate": 1.5201753387772327e-05, + "loss": 0.30996644, + "memory(GiB)": 15.04, + "step": 5340, + "train_speed(iter/s)": 0.336457 + }, + { + "acc": 0.90340796, + "epoch": 0.7198653198653199, + "grad_norm": 18.125, + "learning_rate": 1.519224032230175e-05, + "loss": 0.45624275, + "memory(GiB)": 15.04, + "step": 5345, + "train_speed(iter/s)": 0.336531 + }, + { + "acc": 0.92789698, + "epoch": 0.7205387205387206, + "grad_norm": 3.25, + "learning_rate": 1.5182720819213772e-05, + "loss": 0.27522459, + "memory(GiB)": 15.04, + "step": 5350, + "train_speed(iter/s)": 0.33658 + }, + { + "acc": 0.89438334, + "epoch": 0.7212121212121212, + "grad_norm": 7.9375, + "learning_rate": 1.5173194890311189e-05, + "loss": 0.39296758, + "memory(GiB)": 15.04, + "step": 5355, + "train_speed(iter/s)": 0.336614 + }, + { + "acc": 0.91784678, + "epoch": 0.7218855218855219, + "grad_norm": 7.03125, + "learning_rate": 1.5163662547404752e-05, + "loss": 0.30004089, + "memory(GiB)": 15.04, + "step": 5360, + "train_speed(iter/s)": 0.336586 + }, + { + "acc": 0.91970224, + "epoch": 0.7225589225589225, + "grad_norm": 15.5625, + "learning_rate": 1.5154123802313173e-05, + "loss": 0.29415617, + "memory(GiB)": 15.04, + "step": 5365, + "train_speed(iter/s)": 0.336656 + }, + { + "acc": 0.91070747, + "epoch": 0.7232323232323232, + "grad_norm": 5.9375, + "learning_rate": 1.5144578666863095e-05, + "loss": 0.3511692, + "memory(GiB)": 15.04, + "step": 5370, + "train_speed(iter/s)": 0.33668 + }, + { + "acc": 0.88041229, + "epoch": 0.7239057239057239, + "grad_norm": 22.75, + "learning_rate": 1.513502715288909e-05, + "loss": 0.62181106, + "memory(GiB)": 15.04, + "step": 5375, + "train_speed(iter/s)": 0.336748 + }, + { + "acc": 0.88765354, + "epoch": 0.7245791245791245, + "grad_norm": 6.75, + "learning_rate": 1.512546927223364e-05, + "loss": 0.32409241, + "memory(GiB)": 15.04, + "step": 5380, + "train_speed(iter/s)": 0.336784 + }, + { + "acc": 0.92649918, + "epoch": 0.7252525252525253, + "grad_norm": 8.8125, + "learning_rate": 1.5115905036747109e-05, + "loss": 0.33871119, + "memory(GiB)": 15.04, + "step": 5385, + "train_speed(iter/s)": 0.336837 + }, + { + "acc": 0.7644515, + "epoch": 0.725925925925926, + "grad_norm": 7.6875, + "learning_rate": 1.5106334458287753e-05, + "loss": 0.89487886, + "memory(GiB)": 15.04, + "step": 5390, + "train_speed(iter/s)": 0.336855 + }, + { + "acc": 0.87101507, + "epoch": 0.7265993265993266, + "grad_norm": 7.5, + "learning_rate": 1.5096757548721685e-05, + "loss": 0.38332465, + "memory(GiB)": 15.04, + "step": 5395, + "train_speed(iter/s)": 0.336881 + }, + { + "acc": 0.89714222, + "epoch": 0.7272727272727273, + "grad_norm": 12.625, + "learning_rate": 1.5087174319922873e-05, + "loss": 0.39275725, + "memory(GiB)": 15.04, + "step": 5400, + "train_speed(iter/s)": 0.33689 + }, + { + "epoch": 0.7272727272727273, + "eval_acc": 0.8893768729978299, + "eval_loss": 0.42885732650756836, + "eval_runtime": 110.102, + "eval_samples_per_second": 1.362, + "eval_steps_per_second": 1.362, + "step": 5400 + }, + { + "acc": 0.91678247, + "epoch": 0.7279461279461279, + "grad_norm": 12.0, + "learning_rate": 1.5077584783773112e-05, + "loss": 0.31800861, + "memory(GiB)": 15.04, + "step": 5405, + "train_speed(iter/s)": 0.33467 + }, + { + "acc": 0.92087908, + "epoch": 0.7286195286195286, + "grad_norm": 6.5625, + "learning_rate": 1.5067988952162026e-05, + "loss": 0.22192085, + "memory(GiB)": 15.04, + "step": 5410, + "train_speed(iter/s)": 0.334707 + }, + { + "acc": 0.91095114, + "epoch": 0.7292929292929293, + "grad_norm": 7.75, + "learning_rate": 1.505838683698704e-05, + "loss": 0.34651372, + "memory(GiB)": 15.04, + "step": 5415, + "train_speed(iter/s)": 0.334748 + }, + { + "acc": 0.91784496, + "epoch": 0.7299663299663299, + "grad_norm": 5.59375, + "learning_rate": 1.504877845015337e-05, + "loss": 0.31098709, + "memory(GiB)": 15.04, + "step": 5420, + "train_speed(iter/s)": 0.334746 + }, + { + "acc": 0.89352703, + "epoch": 0.7306397306397306, + "grad_norm": 12.0, + "learning_rate": 1.5039163803574006e-05, + "loss": 0.33507795, + "memory(GiB)": 15.04, + "step": 5425, + "train_speed(iter/s)": 0.334791 + }, + { + "acc": 0.85959349, + "epoch": 0.7313131313131314, + "grad_norm": 8.125, + "learning_rate": 1.5029542909169706e-05, + "loss": 0.33322883, + "memory(GiB)": 15.04, + "step": 5430, + "train_speed(iter/s)": 0.334819 + }, + { + "acc": 0.83815746, + "epoch": 0.731986531986532, + "grad_norm": 7.5625, + "learning_rate": 1.5019915778868965e-05, + "loss": 0.58955793, + "memory(GiB)": 15.04, + "step": 5435, + "train_speed(iter/s)": 0.334867 + }, + { + "acc": 0.84119377, + "epoch": 0.7326599326599327, + "grad_norm": 9.9375, + "learning_rate": 1.5010282424608016e-05, + "loss": 0.5419806, + "memory(GiB)": 15.04, + "step": 5440, + "train_speed(iter/s)": 0.334891 + }, + { + "acc": 0.91294098, + "epoch": 0.7333333333333333, + "grad_norm": 13.125, + "learning_rate": 1.5000642858330805e-05, + "loss": 0.2769104, + "memory(GiB)": 15.04, + "step": 5445, + "train_speed(iter/s)": 0.334947 + }, + { + "acc": 0.8449502, + "epoch": 0.734006734006734, + "grad_norm": 7.96875, + "learning_rate": 1.4990997091988989e-05, + "loss": 0.38201289, + "memory(GiB)": 15.04, + "step": 5450, + "train_speed(iter/s)": 0.334942 + }, + { + "acc": 0.93443613, + "epoch": 0.7346801346801347, + "grad_norm": 4.78125, + "learning_rate": 1.4981345137541898e-05, + "loss": 0.24895678, + "memory(GiB)": 15.04, + "step": 5455, + "train_speed(iter/s)": 0.334957 + }, + { + "acc": 0.91406021, + "epoch": 0.7353535353535353, + "grad_norm": 8.1875, + "learning_rate": 1.4971687006956545e-05, + "loss": 0.34090152, + "memory(GiB)": 15.04, + "step": 5460, + "train_speed(iter/s)": 0.334957 + }, + { + "acc": 0.88916616, + "epoch": 0.736026936026936, + "grad_norm": 9.1875, + "learning_rate": 1.4962022712207598e-05, + "loss": 0.38841276, + "memory(GiB)": 15.04, + "step": 5465, + "train_speed(iter/s)": 0.334992 + }, + { + "acc": 0.87221527, + "epoch": 0.7367003367003367, + "grad_norm": 8.5, + "learning_rate": 1.4952352265277363e-05, + "loss": 0.47804179, + "memory(GiB)": 15.04, + "step": 5470, + "train_speed(iter/s)": 0.335001 + }, + { + "acc": 0.91606455, + "epoch": 0.7373737373737373, + "grad_norm": 6.375, + "learning_rate": 1.494267567815578e-05, + "loss": 0.31870043, + "memory(GiB)": 15.04, + "step": 5475, + "train_speed(iter/s)": 0.335005 + }, + { + "acc": 0.92161064, + "epoch": 0.7380471380471381, + "grad_norm": 7.3125, + "learning_rate": 1.49329929628404e-05, + "loss": 0.26377363, + "memory(GiB)": 15.04, + "step": 5480, + "train_speed(iter/s)": 0.335015 + }, + { + "acc": 0.91865702, + "epoch": 0.7387205387205387, + "grad_norm": 6.46875, + "learning_rate": 1.4923304131336371e-05, + "loss": 0.19029766, + "memory(GiB)": 15.04, + "step": 5485, + "train_speed(iter/s)": 0.33505 + }, + { + "acc": 0.86236744, + "epoch": 0.7393939393939394, + "grad_norm": 12.5625, + "learning_rate": 1.4913609195656427e-05, + "loss": 0.57456055, + "memory(GiB)": 15.04, + "step": 5490, + "train_speed(iter/s)": 0.335079 + }, + { + "acc": 0.85322609, + "epoch": 0.7400673400673401, + "grad_norm": 6.125, + "learning_rate": 1.4903908167820862e-05, + "loss": 0.47080436, + "memory(GiB)": 15.04, + "step": 5495, + "train_speed(iter/s)": 0.335091 + }, + { + "acc": 0.76626668, + "epoch": 0.7407407407407407, + "grad_norm": 9.3125, + "learning_rate": 1.4894201059857536e-05, + "loss": 0.58807654, + "memory(GiB)": 15.04, + "step": 5500, + "train_speed(iter/s)": 0.335139 + }, + { + "acc": 0.85379887, + "epoch": 0.7414141414141414, + "grad_norm": 17.125, + "learning_rate": 1.4884487883801837e-05, + "loss": 0.47466197, + "memory(GiB)": 15.04, + "step": 5505, + "train_speed(iter/s)": 0.335088 + }, + { + "acc": 0.92498798, + "epoch": 0.7420875420875421, + "grad_norm": 6.625, + "learning_rate": 1.487476865169668e-05, + "loss": 0.25954266, + "memory(GiB)": 15.04, + "step": 5510, + "train_speed(iter/s)": 0.335153 + }, + { + "acc": 0.86269741, + "epoch": 0.7427609427609427, + "grad_norm": 19.0, + "learning_rate": 1.4865043375592493e-05, + "loss": 0.65633082, + "memory(GiB)": 15.04, + "step": 5515, + "train_speed(iter/s)": 0.335191 + }, + { + "acc": 0.92104807, + "epoch": 0.7434343434343434, + "grad_norm": 5.0625, + "learning_rate": 1.4855312067547187e-05, + "loss": 0.33057408, + "memory(GiB)": 15.04, + "step": 5520, + "train_speed(iter/s)": 0.335231 + }, + { + "acc": 0.89448395, + "epoch": 0.7441077441077442, + "grad_norm": 10.5, + "learning_rate": 1.4845574739626167e-05, + "loss": 0.34970629, + "memory(GiB)": 15.04, + "step": 5525, + "train_speed(iter/s)": 0.335235 + }, + { + "acc": 0.86514912, + "epoch": 0.7447811447811448, + "grad_norm": 7.4375, + "learning_rate": 1.4835831403902288e-05, + "loss": 0.40157342, + "memory(GiB)": 15.04, + "step": 5530, + "train_speed(iter/s)": 0.335189 + }, + { + "acc": 0.8821455, + "epoch": 0.7454545454545455, + "grad_norm": 5.65625, + "learning_rate": 1.482608207245586e-05, + "loss": 0.44763246, + "memory(GiB)": 15.04, + "step": 5535, + "train_speed(iter/s)": 0.335147 + }, + { + "acc": 0.90621281, + "epoch": 0.7461279461279461, + "grad_norm": 6.59375, + "learning_rate": 1.4816326757374627e-05, + "loss": 0.27925975, + "memory(GiB)": 15.04, + "step": 5540, + "train_speed(iter/s)": 0.335197 + }, + { + "acc": 0.83214693, + "epoch": 0.7468013468013468, + "grad_norm": 12.5625, + "learning_rate": 1.4806565470753747e-05, + "loss": 0.59721742, + "memory(GiB)": 15.04, + "step": 5545, + "train_speed(iter/s)": 0.335195 + }, + { + "acc": 0.88033695, + "epoch": 0.7474747474747475, + "grad_norm": 14.8125, + "learning_rate": 1.4796798224695787e-05, + "loss": 0.37324944, + "memory(GiB)": 15.04, + "step": 5550, + "train_speed(iter/s)": 0.335178 + }, + { + "acc": 0.89488153, + "epoch": 0.7481481481481481, + "grad_norm": 9.8125, + "learning_rate": 1.4787025031310706e-05, + "loss": 0.58029065, + "memory(GiB)": 15.04, + "step": 5555, + "train_speed(iter/s)": 0.335221 + }, + { + "acc": 0.85531311, + "epoch": 0.7488215488215488, + "grad_norm": 19.875, + "learning_rate": 1.4777245902715827e-05, + "loss": 0.50645776, + "memory(GiB)": 15.04, + "step": 5560, + "train_speed(iter/s)": 0.335265 + }, + { + "acc": 0.9132328, + "epoch": 0.7494949494949495, + "grad_norm": 7.875, + "learning_rate": 1.4767460851035838e-05, + "loss": 0.27047372, + "memory(GiB)": 15.04, + "step": 5565, + "train_speed(iter/s)": 0.335324 + }, + { + "acc": 0.89535303, + "epoch": 0.7501683501683502, + "grad_norm": 7.21875, + "learning_rate": 1.475766988840277e-05, + "loss": 0.42600985, + "memory(GiB)": 15.04, + "step": 5570, + "train_speed(iter/s)": 0.335372 + }, + { + "acc": 0.8320179, + "epoch": 0.7508417508417509, + "grad_norm": 6.6875, + "learning_rate": 1.4747873026955986e-05, + "loss": 0.37951889, + "memory(GiB)": 15.04, + "step": 5575, + "train_speed(iter/s)": 0.335425 + }, + { + "acc": 0.86511326, + "epoch": 0.7515151515151515, + "grad_norm": 14.1875, + "learning_rate": 1.4738070278842152e-05, + "loss": 0.5175148, + "memory(GiB)": 15.04, + "step": 5580, + "train_speed(iter/s)": 0.335493 + }, + { + "acc": 0.80341415, + "epoch": 0.7521885521885522, + "grad_norm": 11.25, + "learning_rate": 1.4728261656215243e-05, + "loss": 0.8064867, + "memory(GiB)": 15.04, + "step": 5585, + "train_speed(iter/s)": 0.335538 + }, + { + "acc": 0.85371695, + "epoch": 0.7528619528619529, + "grad_norm": 17.625, + "learning_rate": 1.4718447171236514e-05, + "loss": 0.46963062, + "memory(GiB)": 15.04, + "step": 5590, + "train_speed(iter/s)": 0.335607 + }, + { + "acc": 0.84597397, + "epoch": 0.7535353535353535, + "grad_norm": 7.21875, + "learning_rate": 1.4708626836074489e-05, + "loss": 0.335937, + "memory(GiB)": 15.04, + "step": 5595, + "train_speed(iter/s)": 0.33565 + }, + { + "acc": 0.90138521, + "epoch": 0.7542087542087542, + "grad_norm": 9.1875, + "learning_rate": 1.4698800662904948e-05, + "loss": 0.32173617, + "memory(GiB)": 15.04, + "step": 5600, + "train_speed(iter/s)": 0.335678 + }, + { + "acc": 0.89959269, + "epoch": 0.7548821548821549, + "grad_norm": 5.03125, + "learning_rate": 1.46889686639109e-05, + "loss": 0.27304873, + "memory(GiB)": 15.04, + "step": 5605, + "train_speed(iter/s)": 0.335685 + }, + { + "acc": 0.87979374, + "epoch": 0.7555555555555555, + "grad_norm": 10.375, + "learning_rate": 1.467913085128259e-05, + "loss": 0.36787922, + "memory(GiB)": 15.04, + "step": 5610, + "train_speed(iter/s)": 0.335717 + }, + { + "acc": 0.9064332, + "epoch": 0.7562289562289563, + "grad_norm": 12.0625, + "learning_rate": 1.4669287237217458e-05, + "loss": 0.33975303, + "memory(GiB)": 15.04, + "step": 5615, + "train_speed(iter/s)": 0.335765 + }, + { + "acc": 0.89176922, + "epoch": 0.7569023569023569, + "grad_norm": 8.625, + "learning_rate": 1.4659437833920149e-05, + "loss": 0.29958005, + "memory(GiB)": 15.04, + "step": 5620, + "train_speed(iter/s)": 0.335801 + }, + { + "acc": 0.89875574, + "epoch": 0.7575757575757576, + "grad_norm": 9.375, + "learning_rate": 1.464958265360248e-05, + "loss": 0.32082832, + "memory(GiB)": 15.04, + "step": 5625, + "train_speed(iter/s)": 0.33582 + }, + { + "acc": 0.93867149, + "epoch": 0.7582491582491583, + "grad_norm": 5.6875, + "learning_rate": 1.4639721708483428e-05, + "loss": 0.1997571, + "memory(GiB)": 15.04, + "step": 5630, + "train_speed(iter/s)": 0.335842 + }, + { + "acc": 0.89692726, + "epoch": 0.7589225589225589, + "grad_norm": 7.46875, + "learning_rate": 1.462985501078912e-05, + "loss": 0.38000989, + "memory(GiB)": 15.04, + "step": 5635, + "train_speed(iter/s)": 0.335852 + }, + { + "acc": 0.90923653, + "epoch": 0.7595959595959596, + "grad_norm": 11.25, + "learning_rate": 1.4619982572752816e-05, + "loss": 0.28477805, + "memory(GiB)": 15.04, + "step": 5640, + "train_speed(iter/s)": 0.335893 + }, + { + "acc": 0.80711994, + "epoch": 0.7602693602693603, + "grad_norm": 13.8125, + "learning_rate": 1.4610104406614897e-05, + "loss": 0.64299107, + "memory(GiB)": 15.04, + "step": 5645, + "train_speed(iter/s)": 0.335968 + }, + { + "acc": 0.84009895, + "epoch": 0.7609427609427609, + "grad_norm": 42.25, + "learning_rate": 1.4600220524622838e-05, + "loss": 0.65882978, + "memory(GiB)": 15.04, + "step": 5650, + "train_speed(iter/s)": 0.335993 + }, + { + "acc": 0.84721203, + "epoch": 0.7616161616161616, + "grad_norm": 18.75, + "learning_rate": 1.459033093903121e-05, + "loss": 0.42750006, + "memory(GiB)": 15.04, + "step": 5655, + "train_speed(iter/s)": 0.336039 + }, + { + "acc": 0.93362293, + "epoch": 0.7622895622895622, + "grad_norm": 5.0625, + "learning_rate": 1.4580435662101642e-05, + "loss": 0.20740576, + "memory(GiB)": 15.04, + "step": 5660, + "train_speed(iter/s)": 0.336086 + }, + { + "acc": 0.86373415, + "epoch": 0.762962962962963, + "grad_norm": 6.0, + "learning_rate": 1.4570534706102835e-05, + "loss": 0.41281152, + "memory(GiB)": 15.04, + "step": 5665, + "train_speed(iter/s)": 0.33612 + }, + { + "acc": 0.89508467, + "epoch": 0.7636363636363637, + "grad_norm": 4.84375, + "learning_rate": 1.4560628083310523e-05, + "loss": 0.39009066, + "memory(GiB)": 15.04, + "step": 5670, + "train_speed(iter/s)": 0.336198 + }, + { + "acc": 0.87871475, + "epoch": 0.7643097643097643, + "grad_norm": 10.0625, + "learning_rate": 1.4550715806007461e-05, + "loss": 0.54682922, + "memory(GiB)": 15.04, + "step": 5675, + "train_speed(iter/s)": 0.33618 + }, + { + "acc": 0.85852232, + "epoch": 0.764983164983165, + "grad_norm": 10.4375, + "learning_rate": 1.4540797886483429e-05, + "loss": 0.70370178, + "memory(GiB)": 15.04, + "step": 5680, + "train_speed(iter/s)": 0.336219 + }, + { + "acc": 0.92808132, + "epoch": 0.7656565656565657, + "grad_norm": 9.5, + "learning_rate": 1.4530874337035188e-05, + "loss": 0.27153614, + "memory(GiB)": 15.04, + "step": 5685, + "train_speed(iter/s)": 0.336278 + }, + { + "acc": 0.91171694, + "epoch": 0.7663299663299663, + "grad_norm": 12.5, + "learning_rate": 1.4520945169966487e-05, + "loss": 0.32486353, + "memory(GiB)": 15.04, + "step": 5690, + "train_speed(iter/s)": 0.336306 + }, + { + "acc": 0.86730728, + "epoch": 0.767003367003367, + "grad_norm": 10.4375, + "learning_rate": 1.4511010397588044e-05, + "loss": 0.69123378, + "memory(GiB)": 15.04, + "step": 5695, + "train_speed(iter/s)": 0.336353 + }, + { + "acc": 0.931775, + "epoch": 0.7676767676767676, + "grad_norm": 7.46875, + "learning_rate": 1.4501070032217515e-05, + "loss": 0.28064499, + "memory(GiB)": 15.04, + "step": 5700, + "train_speed(iter/s)": 0.336418 + }, + { + "epoch": 0.7676767676767676, + "eval_acc": 0.8887719748492237, + "eval_loss": 0.4258766770362854, + "eval_runtime": 110.5315, + "eval_samples_per_second": 1.357, + "eval_steps_per_second": 1.357, + "step": 5700 + }, + { + "acc": 0.86454439, + "epoch": 0.7683501683501683, + "grad_norm": 15.875, + "learning_rate": 1.44911240861795e-05, + "loss": 0.37299004, + "memory(GiB)": 15.04, + "step": 5705, + "train_speed(iter/s)": 0.334292 + }, + { + "acc": 0.8620121, + "epoch": 0.769023569023569, + "grad_norm": 6.03125, + "learning_rate": 1.4481172571805515e-05, + "loss": 0.39818571, + "memory(GiB)": 15.04, + "step": 5710, + "train_speed(iter/s)": 0.334331 + }, + { + "acc": 0.90279598, + "epoch": 0.7696969696969697, + "grad_norm": 9.25, + "learning_rate": 1.4471215501433978e-05, + "loss": 0.32868023, + "memory(GiB)": 15.04, + "step": 5715, + "train_speed(iter/s)": 0.334383 + }, + { + "acc": 0.90749531, + "epoch": 0.7703703703703704, + "grad_norm": 11.875, + "learning_rate": 1.44612528874102e-05, + "loss": 0.24195716, + "memory(GiB)": 15.04, + "step": 5720, + "train_speed(iter/s)": 0.334467 + }, + { + "acc": 0.92181034, + "epoch": 0.7710437710437711, + "grad_norm": 11.125, + "learning_rate": 1.4451284742086363e-05, + "loss": 0.30442991, + "memory(GiB)": 15.04, + "step": 5725, + "train_speed(iter/s)": 0.334473 + }, + { + "acc": 0.87175598, + "epoch": 0.7717171717171717, + "grad_norm": 6.59375, + "learning_rate": 1.4441311077821505e-05, + "loss": 0.61085787, + "memory(GiB)": 15.04, + "step": 5730, + "train_speed(iter/s)": 0.334528 + }, + { + "acc": 0.91578388, + "epoch": 0.7723905723905724, + "grad_norm": 12.625, + "learning_rate": 1.443133190698151e-05, + "loss": 0.2823009, + "memory(GiB)": 15.04, + "step": 5735, + "train_speed(iter/s)": 0.334578 + }, + { + "acc": 0.89059925, + "epoch": 0.773063973063973, + "grad_norm": 12.125, + "learning_rate": 1.4421347241939085e-05, + "loss": 0.4863658, + "memory(GiB)": 15.04, + "step": 5740, + "train_speed(iter/s)": 0.334593 + }, + { + "acc": 0.9332159, + "epoch": 0.7737373737373737, + "grad_norm": 9.4375, + "learning_rate": 1.4411357095073761e-05, + "loss": 0.35421152, + "memory(GiB)": 15.04, + "step": 5745, + "train_speed(iter/s)": 0.334644 + }, + { + "acc": 0.92666492, + "epoch": 0.7744107744107744, + "grad_norm": 6.03125, + "learning_rate": 1.4401361478771847e-05, + "loss": 0.25091832, + "memory(GiB)": 15.04, + "step": 5750, + "train_speed(iter/s)": 0.334685 + }, + { + "acc": 0.91748695, + "epoch": 0.775084175084175, + "grad_norm": 9.0, + "learning_rate": 1.4391360405426447e-05, + "loss": 0.3200659, + "memory(GiB)": 15.04, + "step": 5755, + "train_speed(iter/s)": 0.334717 + }, + { + "acc": 0.89587498, + "epoch": 0.7757575757575758, + "grad_norm": 10.875, + "learning_rate": 1.4381353887437426e-05, + "loss": 0.47424874, + "memory(GiB)": 15.04, + "step": 5760, + "train_speed(iter/s)": 0.33474 + }, + { + "acc": 0.91452494, + "epoch": 0.7764309764309765, + "grad_norm": 13.6875, + "learning_rate": 1.43713419372114e-05, + "loss": 0.33517017, + "memory(GiB)": 15.04, + "step": 5765, + "train_speed(iter/s)": 0.334776 + }, + { + "acc": 0.87971125, + "epoch": 0.7771043771043771, + "grad_norm": 7.53125, + "learning_rate": 1.4361324567161723e-05, + "loss": 0.31353204, + "memory(GiB)": 15.04, + "step": 5770, + "train_speed(iter/s)": 0.334803 + }, + { + "acc": 0.75419145, + "epoch": 0.7777777777777778, + "grad_norm": 14.6875, + "learning_rate": 1.4351301789708465e-05, + "loss": 0.72678986, + "memory(GiB)": 15.04, + "step": 5775, + "train_speed(iter/s)": 0.334848 + }, + { + "acc": 0.8997736, + "epoch": 0.7784511784511785, + "grad_norm": 10.25, + "learning_rate": 1.43412736172784e-05, + "loss": 0.36349277, + "memory(GiB)": 15.04, + "step": 5780, + "train_speed(iter/s)": 0.334887 + }, + { + "acc": 0.87870064, + "epoch": 0.7791245791245791, + "grad_norm": 12.125, + "learning_rate": 1.4331240062304996e-05, + "loss": 0.39855471, + "memory(GiB)": 15.04, + "step": 5785, + "train_speed(iter/s)": 0.334888 + }, + { + "acc": 0.89457045, + "epoch": 0.7797979797979798, + "grad_norm": 5.78125, + "learning_rate": 1.432120113722839e-05, + "loss": 0.3033524, + "memory(GiB)": 15.04, + "step": 5790, + "train_speed(iter/s)": 0.33492 + }, + { + "acc": 0.9024951, + "epoch": 0.7804713804713804, + "grad_norm": 10.0625, + "learning_rate": 1.4311156854495378e-05, + "loss": 0.41811175, + "memory(GiB)": 15.04, + "step": 5795, + "train_speed(iter/s)": 0.334956 + }, + { + "acc": 0.86573696, + "epoch": 0.7811447811447811, + "grad_norm": 18.125, + "learning_rate": 1.4301107226559399e-05, + "loss": 0.31564784, + "memory(GiB)": 15.04, + "step": 5800, + "train_speed(iter/s)": 0.335035 + }, + { + "acc": 0.90242767, + "epoch": 0.7818181818181819, + "grad_norm": 8.1875, + "learning_rate": 1.4291052265880521e-05, + "loss": 0.35957475, + "memory(GiB)": 15.04, + "step": 5805, + "train_speed(iter/s)": 0.335085 + }, + { + "acc": 0.88742027, + "epoch": 0.7824915824915825, + "grad_norm": 8.625, + "learning_rate": 1.4280991984925421e-05, + "loss": 0.3438597, + "memory(GiB)": 15.04, + "step": 5810, + "train_speed(iter/s)": 0.335131 + }, + { + "acc": 0.88498249, + "epoch": 0.7831649831649832, + "grad_norm": 10.4375, + "learning_rate": 1.4270926396167374e-05, + "loss": 0.60539603, + "memory(GiB)": 15.04, + "step": 5815, + "train_speed(iter/s)": 0.335169 + }, + { + "acc": 0.90841513, + "epoch": 0.7838383838383839, + "grad_norm": 10.0, + "learning_rate": 1.4260855512086236e-05, + "loss": 0.3165911, + "memory(GiB)": 15.04, + "step": 5820, + "train_speed(iter/s)": 0.335184 + }, + { + "acc": 0.82462864, + "epoch": 0.7845117845117845, + "grad_norm": 7.6875, + "learning_rate": 1.4250779345168428e-05, + "loss": 0.47752819, + "memory(GiB)": 15.04, + "step": 5825, + "train_speed(iter/s)": 0.335219 + }, + { + "acc": 0.93924837, + "epoch": 0.7851851851851852, + "grad_norm": 6.34375, + "learning_rate": 1.4240697907906922e-05, + "loss": 0.19553621, + "memory(GiB)": 15.04, + "step": 5830, + "train_speed(iter/s)": 0.335274 + }, + { + "acc": 0.83240747, + "epoch": 0.7858585858585858, + "grad_norm": 13.0625, + "learning_rate": 1.423061121280122e-05, + "loss": 0.84983807, + "memory(GiB)": 15.04, + "step": 5835, + "train_speed(iter/s)": 0.335349 + }, + { + "acc": 0.92390738, + "epoch": 0.7865319865319865, + "grad_norm": 12.0625, + "learning_rate": 1.422051927235735e-05, + "loss": 0.23929789, + "memory(GiB)": 15.04, + "step": 5840, + "train_speed(iter/s)": 0.335399 + }, + { + "acc": 0.93130398, + "epoch": 0.7872053872053872, + "grad_norm": 12.3125, + "learning_rate": 1.4210422099087837e-05, + "loss": 0.22142253, + "memory(GiB)": 15.04, + "step": 5845, + "train_speed(iter/s)": 0.335458 + }, + { + "acc": 0.9127141, + "epoch": 0.7878787878787878, + "grad_norm": 12.625, + "learning_rate": 1.4200319705511698e-05, + "loss": 0.37274418, + "memory(GiB)": 15.04, + "step": 5850, + "train_speed(iter/s)": 0.335495 + }, + { + "acc": 0.84959421, + "epoch": 0.7885521885521886, + "grad_norm": 10.8125, + "learning_rate": 1.4190212104154422e-05, + "loss": 0.63109818, + "memory(GiB)": 15.04, + "step": 5855, + "train_speed(iter/s)": 0.335559 + }, + { + "acc": 0.81121473, + "epoch": 0.7892255892255893, + "grad_norm": 7.46875, + "learning_rate": 1.4180099307547952e-05, + "loss": 0.52778053, + "memory(GiB)": 15.04, + "step": 5860, + "train_speed(iter/s)": 0.335577 + }, + { + "acc": 0.8930335, + "epoch": 0.7898989898989899, + "grad_norm": 9.9375, + "learning_rate": 1.4169981328230676e-05, + "loss": 0.36625161, + "memory(GiB)": 15.04, + "step": 5865, + "train_speed(iter/s)": 0.335604 + }, + { + "acc": 0.86561136, + "epoch": 0.7905723905723906, + "grad_norm": 10.5, + "learning_rate": 1.4159858178747406e-05, + "loss": 0.60013657, + "memory(GiB)": 15.04, + "step": 5870, + "train_speed(iter/s)": 0.335668 + }, + { + "acc": 0.92761574, + "epoch": 0.7912457912457912, + "grad_norm": 6.90625, + "learning_rate": 1.4149729871649363e-05, + "loss": 0.27232482, + "memory(GiB)": 15.04, + "step": 5875, + "train_speed(iter/s)": 0.335704 + }, + { + "acc": 0.90905743, + "epoch": 0.7919191919191919, + "grad_norm": 14.1875, + "learning_rate": 1.4139596419494167e-05, + "loss": 0.38415422, + "memory(GiB)": 15.04, + "step": 5880, + "train_speed(iter/s)": 0.335758 + }, + { + "acc": 0.88108702, + "epoch": 0.7925925925925926, + "grad_norm": 11.25, + "learning_rate": 1.412945783484581e-05, + "loss": 0.38889017, + "memory(GiB)": 15.04, + "step": 5885, + "train_speed(iter/s)": 0.335827 + }, + { + "acc": 0.90847778, + "epoch": 0.7932659932659932, + "grad_norm": 8.4375, + "learning_rate": 1.4119314130274655e-05, + "loss": 0.39114232, + "memory(GiB)": 15.04, + "step": 5890, + "train_speed(iter/s)": 0.335872 + }, + { + "acc": 0.93595724, + "epoch": 0.793939393939394, + "grad_norm": 15.625, + "learning_rate": 1.4109165318357409e-05, + "loss": 0.30136671, + "memory(GiB)": 15.04, + "step": 5895, + "train_speed(iter/s)": 0.33592 + }, + { + "acc": 0.94308987, + "epoch": 0.7946127946127947, + "grad_norm": 5.40625, + "learning_rate": 1.4099011411677115e-05, + "loss": 0.29036045, + "memory(GiB)": 15.04, + "step": 5900, + "train_speed(iter/s)": 0.335952 + }, + { + "acc": 0.80220032, + "epoch": 0.7952861952861953, + "grad_norm": 5.03125, + "learning_rate": 1.4088852422823125e-05, + "loss": 0.53842239, + "memory(GiB)": 15.04, + "step": 5905, + "train_speed(iter/s)": 0.335929 + }, + { + "acc": 0.90533876, + "epoch": 0.795959595959596, + "grad_norm": 7.9375, + "learning_rate": 1.4078688364391097e-05, + "loss": 0.31355054, + "memory(GiB)": 15.04, + "step": 5910, + "train_speed(iter/s)": 0.335926 + }, + { + "acc": 0.83117151, + "epoch": 0.7966329966329966, + "grad_norm": 6.34375, + "learning_rate": 1.4068519248982976e-05, + "loss": 0.54436059, + "memory(GiB)": 15.04, + "step": 5915, + "train_speed(iter/s)": 0.335966 + }, + { + "acc": 0.81759272, + "epoch": 0.7973063973063973, + "grad_norm": 16.625, + "learning_rate": 1.4058345089206981e-05, + "loss": 0.66891346, + "memory(GiB)": 15.04, + "step": 5920, + "train_speed(iter/s)": 0.335993 + }, + { + "acc": 0.90339985, + "epoch": 0.797979797979798, + "grad_norm": 6.15625, + "learning_rate": 1.4048165897677572e-05, + "loss": 0.34642906, + "memory(GiB)": 15.04, + "step": 5925, + "train_speed(iter/s)": 0.33602 + }, + { + "acc": 0.8771596, + "epoch": 0.7986531986531986, + "grad_norm": 5.4375, + "learning_rate": 1.4037981687015459e-05, + "loss": 0.53214455, + "memory(GiB)": 15.04, + "step": 5930, + "train_speed(iter/s)": 0.336046 + }, + { + "acc": 0.90402775, + "epoch": 0.7993265993265993, + "grad_norm": 6.6875, + "learning_rate": 1.402779246984757e-05, + "loss": 0.33912227, + "memory(GiB)": 15.04, + "step": 5935, + "train_speed(iter/s)": 0.336051 + }, + { + "acc": 0.9046689, + "epoch": 0.8, + "grad_norm": 13.5, + "learning_rate": 1.4017598258807042e-05, + "loss": 0.35386438, + "memory(GiB)": 15.04, + "step": 5940, + "train_speed(iter/s)": 0.336088 + }, + { + "acc": 0.93352156, + "epoch": 0.8006734006734006, + "grad_norm": 6.5, + "learning_rate": 1.4007399066533203e-05, + "loss": 0.20259643, + "memory(GiB)": 15.04, + "step": 5945, + "train_speed(iter/s)": 0.336132 + }, + { + "acc": 0.85727062, + "epoch": 0.8013468013468014, + "grad_norm": 7.125, + "learning_rate": 1.3997194905671558e-05, + "loss": 0.54228849, + "memory(GiB)": 15.04, + "step": 5950, + "train_speed(iter/s)": 0.336163 + }, + { + "acc": 0.89675951, + "epoch": 0.802020202020202, + "grad_norm": 11.5, + "learning_rate": 1.3986985788873772e-05, + "loss": 0.39463236, + "memory(GiB)": 15.04, + "step": 5955, + "train_speed(iter/s)": 0.336178 + }, + { + "acc": 0.9124465, + "epoch": 0.8026936026936027, + "grad_norm": 8.1875, + "learning_rate": 1.3976771728797651e-05, + "loss": 0.31261494, + "memory(GiB)": 15.04, + "step": 5960, + "train_speed(iter/s)": 0.336201 + }, + { + "acc": 0.89148684, + "epoch": 0.8033670033670034, + "grad_norm": 6.125, + "learning_rate": 1.396655273810714e-05, + "loss": 0.34783628, + "memory(GiB)": 15.04, + "step": 5965, + "train_speed(iter/s)": 0.336242 + }, + { + "acc": 0.87334671, + "epoch": 0.804040404040404, + "grad_norm": 5.5625, + "learning_rate": 1.3956328829472286e-05, + "loss": 0.457757, + "memory(GiB)": 15.04, + "step": 5970, + "train_speed(iter/s)": 0.336217 + }, + { + "acc": 0.88221893, + "epoch": 0.8047138047138047, + "grad_norm": 21.25, + "learning_rate": 1.3946100015569237e-05, + "loss": 0.35697987, + "memory(GiB)": 15.04, + "step": 5975, + "train_speed(iter/s)": 0.336257 + }, + { + "acc": 0.90383415, + "epoch": 0.8053872053872054, + "grad_norm": 6.375, + "learning_rate": 1.3935866309080225e-05, + "loss": 0.20525901, + "memory(GiB)": 15.04, + "step": 5980, + "train_speed(iter/s)": 0.336311 + }, + { + "acc": 0.85681868, + "epoch": 0.806060606060606, + "grad_norm": 6.3125, + "learning_rate": 1.3925627722693549e-05, + "loss": 0.49668331, + "memory(GiB)": 15.04, + "step": 5985, + "train_speed(iter/s)": 0.336315 + }, + { + "acc": 0.87076035, + "epoch": 0.8067340067340067, + "grad_norm": 7.5, + "learning_rate": 1.3915384269103553e-05, + "loss": 0.32496142, + "memory(GiB)": 15.04, + "step": 5990, + "train_speed(iter/s)": 0.336339 + }, + { + "acc": 0.94674578, + "epoch": 0.8074074074074075, + "grad_norm": 4.5625, + "learning_rate": 1.3905135961010623e-05, + "loss": 0.20143983, + "memory(GiB)": 15.04, + "step": 5995, + "train_speed(iter/s)": 0.336373 + }, + { + "acc": 0.91290941, + "epoch": 0.8080808080808081, + "grad_norm": 22.125, + "learning_rate": 1.3894882811121155e-05, + "loss": 0.38342104, + "memory(GiB)": 15.04, + "step": 6000, + "train_speed(iter/s)": 0.336415 + }, + { + "epoch": 0.8080808080808081, + "eval_acc": 0.8914740951777989, + "eval_loss": 0.42146316170692444, + "eval_runtime": 109.8221, + "eval_samples_per_second": 1.366, + "eval_steps_per_second": 1.366, + "step": 6000 + }, + { + "acc": 0.9116683, + "epoch": 0.8087542087542088, + "grad_norm": 7.03125, + "learning_rate": 1.3884624832147558e-05, + "loss": 0.25487845, + "memory(GiB)": 15.04, + "step": 6005, + "train_speed(iter/s)": 0.334333 + }, + { + "acc": 0.89876242, + "epoch": 0.8094276094276094, + "grad_norm": 6.59375, + "learning_rate": 1.387436203680822e-05, + "loss": 0.36956875, + "memory(GiB)": 15.04, + "step": 6010, + "train_speed(iter/s)": 0.3343 + }, + { + "acc": 0.87323742, + "epoch": 0.8101010101010101, + "grad_norm": 7.0, + "learning_rate": 1.3864094437827502e-05, + "loss": 0.40369444, + "memory(GiB)": 15.04, + "step": 6015, + "train_speed(iter/s)": 0.334279 + }, + { + "acc": 0.90273552, + "epoch": 0.8107744107744108, + "grad_norm": 14.6875, + "learning_rate": 1.3853822047935727e-05, + "loss": 0.30668547, + "memory(GiB)": 15.04, + "step": 6020, + "train_speed(iter/s)": 0.334331 + }, + { + "acc": 0.88416748, + "epoch": 0.8114478114478114, + "grad_norm": 12.625, + "learning_rate": 1.3843544879869151e-05, + "loss": 0.44226246, + "memory(GiB)": 15.04, + "step": 6025, + "train_speed(iter/s)": 0.33436 + }, + { + "acc": 0.92972078, + "epoch": 0.8121212121212121, + "grad_norm": 6.4375, + "learning_rate": 1.3833262946369959e-05, + "loss": 0.25983531, + "memory(GiB)": 15.04, + "step": 6030, + "train_speed(iter/s)": 0.334384 + }, + { + "acc": 0.83782463, + "epoch": 0.8127946127946128, + "grad_norm": 5.40625, + "learning_rate": 1.3822976260186237e-05, + "loss": 0.37785616, + "memory(GiB)": 15.04, + "step": 6035, + "train_speed(iter/s)": 0.334426 + }, + { + "acc": 0.88986654, + "epoch": 0.8134680134680135, + "grad_norm": 13.625, + "learning_rate": 1.3812684834071976e-05, + "loss": 0.43530312, + "memory(GiB)": 15.04, + "step": 6040, + "train_speed(iter/s)": 0.334442 + }, + { + "acc": 0.89353771, + "epoch": 0.8141414141414142, + "grad_norm": 5.96875, + "learning_rate": 1.3802388680787033e-05, + "loss": 0.37451477, + "memory(GiB)": 15.04, + "step": 6045, + "train_speed(iter/s)": 0.334502 + }, + { + "acc": 0.88501167, + "epoch": 0.8148148148148148, + "grad_norm": 8.125, + "learning_rate": 1.3792087813097132e-05, + "loss": 0.42273898, + "memory(GiB)": 15.04, + "step": 6050, + "train_speed(iter/s)": 0.334552 + }, + { + "acc": 0.9307291, + "epoch": 0.8154882154882155, + "grad_norm": 8.5625, + "learning_rate": 1.3781782243773836e-05, + "loss": 0.2445611, + "memory(GiB)": 15.04, + "step": 6055, + "train_speed(iter/s)": 0.334605 + }, + { + "acc": 0.88975878, + "epoch": 0.8161616161616162, + "grad_norm": 8.25, + "learning_rate": 1.3771471985594545e-05, + "loss": 0.28871515, + "memory(GiB)": 15.04, + "step": 6060, + "train_speed(iter/s)": 0.334643 + }, + { + "acc": 0.79767194, + "epoch": 0.8168350168350168, + "grad_norm": 62.0, + "learning_rate": 1.3761157051342469e-05, + "loss": 0.74234476, + "memory(GiB)": 15.04, + "step": 6065, + "train_speed(iter/s)": 0.334657 + }, + { + "acc": 0.8930418, + "epoch": 0.8175084175084175, + "grad_norm": 5.34375, + "learning_rate": 1.375083745380661e-05, + "loss": 0.48860698, + "memory(GiB)": 15.04, + "step": 6070, + "train_speed(iter/s)": 0.334652 + }, + { + "acc": 0.92133274, + "epoch": 0.8181818181818182, + "grad_norm": 6.90625, + "learning_rate": 1.3740513205781768e-05, + "loss": 0.25783746, + "memory(GiB)": 15.04, + "step": 6075, + "train_speed(iter/s)": 0.334671 + }, + { + "acc": 0.88961411, + "epoch": 0.8188552188552188, + "grad_norm": 14.8125, + "learning_rate": 1.3730184320068484e-05, + "loss": 0.41849647, + "memory(GiB)": 15.04, + "step": 6080, + "train_speed(iter/s)": 0.334647 + }, + { + "acc": 0.91754131, + "epoch": 0.8195286195286196, + "grad_norm": 7.53125, + "learning_rate": 1.3719850809473076e-05, + "loss": 0.32514465, + "memory(GiB)": 15.04, + "step": 6085, + "train_speed(iter/s)": 0.334632 + }, + { + "acc": 0.89978971, + "epoch": 0.8202020202020202, + "grad_norm": 10.3125, + "learning_rate": 1.3709512686807578e-05, + "loss": 0.47395735, + "memory(GiB)": 15.04, + "step": 6090, + "train_speed(iter/s)": 0.334675 + }, + { + "acc": 0.92563581, + "epoch": 0.8208754208754209, + "grad_norm": 12.1875, + "learning_rate": 1.3699169964889746e-05, + "loss": 0.29105644, + "memory(GiB)": 15.04, + "step": 6095, + "train_speed(iter/s)": 0.334726 + }, + { + "acc": 0.9316227, + "epoch": 0.8215488215488216, + "grad_norm": 7.71875, + "learning_rate": 1.3688822656543044e-05, + "loss": 0.23778965, + "memory(GiB)": 15.04, + "step": 6100, + "train_speed(iter/s)": 0.334742 + }, + { + "acc": 0.88348303, + "epoch": 0.8222222222222222, + "grad_norm": 8.125, + "learning_rate": 1.3678470774596615e-05, + "loss": 0.2383842, + "memory(GiB)": 15.04, + "step": 6105, + "train_speed(iter/s)": 0.334782 + }, + { + "acc": 0.92106791, + "epoch": 0.8228956228956229, + "grad_norm": 12.6875, + "learning_rate": 1.366811433188528e-05, + "loss": 0.25929103, + "memory(GiB)": 15.04, + "step": 6110, + "train_speed(iter/s)": 0.334839 + }, + { + "acc": 0.9059947, + "epoch": 0.8235690235690236, + "grad_norm": 7.0, + "learning_rate": 1.3657753341249506e-05, + "loss": 0.29663203, + "memory(GiB)": 15.04, + "step": 6115, + "train_speed(iter/s)": 0.334848 + }, + { + "acc": 0.906668, + "epoch": 0.8242424242424242, + "grad_norm": 5.40625, + "learning_rate": 1.3647387815535407e-05, + "loss": 0.27174668, + "memory(GiB)": 15.04, + "step": 6120, + "train_speed(iter/s)": 0.334847 + }, + { + "acc": 0.89451036, + "epoch": 0.8249158249158249, + "grad_norm": 20.0, + "learning_rate": 1.3637017767594718e-05, + "loss": 0.33886986, + "memory(GiB)": 15.04, + "step": 6125, + "train_speed(iter/s)": 0.334894 + }, + { + "acc": 0.84128714, + "epoch": 0.8255892255892255, + "grad_norm": 15.0, + "learning_rate": 1.362664321028477e-05, + "loss": 0.66911936, + "memory(GiB)": 15.04, + "step": 6130, + "train_speed(iter/s)": 0.334914 + }, + { + "acc": 0.90483494, + "epoch": 0.8262626262626263, + "grad_norm": 7.125, + "learning_rate": 1.3616264156468509e-05, + "loss": 0.43964629, + "memory(GiB)": 15.04, + "step": 6135, + "train_speed(iter/s)": 0.334958 + }, + { + "acc": 0.88671465, + "epoch": 0.826936026936027, + "grad_norm": 15.25, + "learning_rate": 1.360588061901443e-05, + "loss": 0.45865536, + "memory(GiB)": 15.04, + "step": 6140, + "train_speed(iter/s)": 0.334997 + }, + { + "acc": 0.8760848, + "epoch": 0.8276094276094276, + "grad_norm": 7.65625, + "learning_rate": 1.3595492610796604e-05, + "loss": 0.53596601, + "memory(GiB)": 15.04, + "step": 6145, + "train_speed(iter/s)": 0.335031 + }, + { + "acc": 0.9154603, + "epoch": 0.8282828282828283, + "grad_norm": 12.0, + "learning_rate": 1.3585100144694637e-05, + "loss": 0.38792026, + "memory(GiB)": 15.04, + "step": 6150, + "train_speed(iter/s)": 0.335013 + }, + { + "acc": 0.92935333, + "epoch": 0.828956228956229, + "grad_norm": 25.125, + "learning_rate": 1.3574703233593663e-05, + "loss": 0.26510258, + "memory(GiB)": 15.04, + "step": 6155, + "train_speed(iter/s)": 0.33504 + }, + { + "acc": 0.90549126, + "epoch": 0.8296296296296296, + "grad_norm": 6.0, + "learning_rate": 1.3564301890384333e-05, + "loss": 0.25224361, + "memory(GiB)": 15.04, + "step": 6160, + "train_speed(iter/s)": 0.335069 + }, + { + "acc": 0.77151456, + "epoch": 0.8303030303030303, + "grad_norm": 11.875, + "learning_rate": 1.3553896127962785e-05, + "loss": 0.59610553, + "memory(GiB)": 15.04, + "step": 6165, + "train_speed(iter/s)": 0.335109 + }, + { + "acc": 0.89386921, + "epoch": 0.8309764309764309, + "grad_norm": 9.625, + "learning_rate": 1.3543485959230644e-05, + "loss": 0.36909132, + "memory(GiB)": 15.04, + "step": 6170, + "train_speed(iter/s)": 0.335162 + }, + { + "acc": 0.89750032, + "epoch": 0.8316498316498316, + "grad_norm": 15.4375, + "learning_rate": 1.3533071397094992e-05, + "loss": 0.32972035, + "memory(GiB)": 15.04, + "step": 6175, + "train_speed(iter/s)": 0.335223 + }, + { + "acc": 0.88388786, + "epoch": 0.8323232323232324, + "grad_norm": 6.78125, + "learning_rate": 1.3522652454468359e-05, + "loss": 0.52773705, + "memory(GiB)": 15.04, + "step": 6180, + "train_speed(iter/s)": 0.335262 + }, + { + "acc": 0.91888924, + "epoch": 0.832996632996633, + "grad_norm": 4.90625, + "learning_rate": 1.3512229144268712e-05, + "loss": 0.24450941, + "memory(GiB)": 15.04, + "step": 6185, + "train_speed(iter/s)": 0.33531 + }, + { + "acc": 0.94059229, + "epoch": 0.8336700336700337, + "grad_norm": 4.53125, + "learning_rate": 1.3501801479419423e-05, + "loss": 0.20464752, + "memory(GiB)": 15.04, + "step": 6190, + "train_speed(iter/s)": 0.335347 + }, + { + "acc": 0.84352541, + "epoch": 0.8343434343434344, + "grad_norm": 6.875, + "learning_rate": 1.3491369472849275e-05, + "loss": 0.36902907, + "memory(GiB)": 15.04, + "step": 6195, + "train_speed(iter/s)": 0.33539 + }, + { + "acc": 0.90285397, + "epoch": 0.835016835016835, + "grad_norm": 7.28125, + "learning_rate": 1.3480933137492423e-05, + "loss": 0.50938802, + "memory(GiB)": 15.04, + "step": 6200, + "train_speed(iter/s)": 0.335342 + }, + { + "acc": 0.86406784, + "epoch": 0.8356902356902357, + "grad_norm": 16.75, + "learning_rate": 1.3470492486288394e-05, + "loss": 0.57964182, + "memory(GiB)": 15.04, + "step": 6205, + "train_speed(iter/s)": 0.335369 + }, + { + "acc": 0.85142422, + "epoch": 0.8363636363636363, + "grad_norm": 4.125, + "learning_rate": 1.3460047532182068e-05, + "loss": 0.52853956, + "memory(GiB)": 15.04, + "step": 6210, + "train_speed(iter/s)": 0.335396 + }, + { + "acc": 0.90336504, + "epoch": 0.837037037037037, + "grad_norm": 8.375, + "learning_rate": 1.344959828812366e-05, + "loss": 0.39982934, + "memory(GiB)": 15.04, + "step": 6215, + "train_speed(iter/s)": 0.335434 + }, + { + "acc": 0.92998228, + "epoch": 0.8377104377104377, + "grad_norm": 6.375, + "learning_rate": 1.3439144767068699e-05, + "loss": 0.25489178, + "memory(GiB)": 15.04, + "step": 6220, + "train_speed(iter/s)": 0.335396 + }, + { + "acc": 0.9138279, + "epoch": 0.8383838383838383, + "grad_norm": 10.9375, + "learning_rate": 1.342868698197802e-05, + "loss": 0.23432858, + "memory(GiB)": 15.04, + "step": 6225, + "train_speed(iter/s)": 0.335433 + }, + { + "acc": 0.93223724, + "epoch": 0.8390572390572391, + "grad_norm": 9.0, + "learning_rate": 1.3418224945817747e-05, + "loss": 0.22406673, + "memory(GiB)": 15.04, + "step": 6230, + "train_speed(iter/s)": 0.335507 + }, + { + "acc": 0.91276436, + "epoch": 0.8397306397306398, + "grad_norm": 11.6875, + "learning_rate": 1.340775867155927e-05, + "loss": 0.28521492, + "memory(GiB)": 15.04, + "step": 6235, + "train_speed(iter/s)": 0.335553 + }, + { + "acc": 0.86696482, + "epoch": 0.8404040404040404, + "grad_norm": 5.59375, + "learning_rate": 1.3397288172179237e-05, + "loss": 0.60414639, + "memory(GiB)": 15.04, + "step": 6240, + "train_speed(iter/s)": 0.335602 + }, + { + "acc": 0.89616995, + "epoch": 0.8410774410774411, + "grad_norm": 7.78125, + "learning_rate": 1.3386813460659532e-05, + "loss": 0.26624556, + "memory(GiB)": 15.04, + "step": 6245, + "train_speed(iter/s)": 0.335636 + }, + { + "acc": 0.909869, + "epoch": 0.8417508417508418, + "grad_norm": 9.75, + "learning_rate": 1.3376334549987262e-05, + "loss": 0.36458156, + "memory(GiB)": 15.04, + "step": 6250, + "train_speed(iter/s)": 0.335691 + }, + { + "acc": 0.89609671, + "epoch": 0.8424242424242424, + "grad_norm": 21.0, + "learning_rate": 1.3365851453154744e-05, + "loss": 0.40891066, + "memory(GiB)": 15.04, + "step": 6255, + "train_speed(iter/s)": 0.335739 + }, + { + "acc": 0.80581694, + "epoch": 0.8430976430976431, + "grad_norm": 6.28125, + "learning_rate": 1.335536418315948e-05, + "loss": 0.46938071, + "memory(GiB)": 15.04, + "step": 6260, + "train_speed(iter/s)": 0.335723 + }, + { + "acc": 0.84534855, + "epoch": 0.8437710437710437, + "grad_norm": 16.5, + "learning_rate": 1.3344872753004155e-05, + "loss": 0.42049317, + "memory(GiB)": 15.04, + "step": 6265, + "train_speed(iter/s)": 0.335734 + }, + { + "acc": 0.75441942, + "epoch": 0.8444444444444444, + "grad_norm": 20.875, + "learning_rate": 1.3334377175696596e-05, + "loss": 1.06415768, + "memory(GiB)": 15.04, + "step": 6270, + "train_speed(iter/s)": 0.335805 + }, + { + "acc": 0.894419, + "epoch": 0.8451178451178452, + "grad_norm": 7.21875, + "learning_rate": 1.3323877464249787e-05, + "loss": 0.40499711, + "memory(GiB)": 15.04, + "step": 6275, + "train_speed(iter/s)": 0.335828 + }, + { + "acc": 0.89972925, + "epoch": 0.8457912457912458, + "grad_norm": 5.3125, + "learning_rate": 1.3313373631681832e-05, + "loss": 0.28698552, + "memory(GiB)": 15.04, + "step": 6280, + "train_speed(iter/s)": 0.335847 + }, + { + "acc": 0.92284803, + "epoch": 0.8464646464646465, + "grad_norm": 4.75, + "learning_rate": 1.3302865691015943e-05, + "loss": 0.24218967, + "memory(GiB)": 15.04, + "step": 6285, + "train_speed(iter/s)": 0.335839 + }, + { + "acc": 0.87736807, + "epoch": 0.8471380471380472, + "grad_norm": 5.84375, + "learning_rate": 1.3292353655280426e-05, + "loss": 0.41942592, + "memory(GiB)": 15.04, + "step": 6290, + "train_speed(iter/s)": 0.335863 + }, + { + "acc": 0.9020483, + "epoch": 0.8478114478114478, + "grad_norm": 12.5, + "learning_rate": 1.3281837537508668e-05, + "loss": 0.38974915, + "memory(GiB)": 15.04, + "step": 6295, + "train_speed(iter/s)": 0.335896 + }, + { + "acc": 0.84610481, + "epoch": 0.8484848484848485, + "grad_norm": 7.96875, + "learning_rate": 1.3271317350739112e-05, + "loss": 0.42944093, + "memory(GiB)": 15.04, + "step": 6300, + "train_speed(iter/s)": 0.335927 + }, + { + "epoch": 0.8484848484848485, + "eval_acc": 0.891844344081504, + "eval_loss": 0.41997501254081726, + "eval_runtime": 109.7774, + "eval_samples_per_second": 1.366, + "eval_steps_per_second": 1.366, + "step": 6300 + }, + { + "acc": 0.86580458, + "epoch": 0.8491582491582491, + "grad_norm": 14.0, + "learning_rate": 1.3260793108015254e-05, + "loss": 0.59847932, + "memory(GiB)": 15.04, + "step": 6305, + "train_speed(iter/s)": 0.333975 + }, + { + "acc": 0.85472517, + "epoch": 0.8498316498316498, + "grad_norm": 5.59375, + "learning_rate": 1.3250264822385605e-05, + "loss": 0.32474654, + "memory(GiB)": 15.04, + "step": 6310, + "train_speed(iter/s)": 0.334025 + }, + { + "acc": 0.87372379, + "epoch": 0.8505050505050505, + "grad_norm": 9.375, + "learning_rate": 1.3239732506903707e-05, + "loss": 0.53135829, + "memory(GiB)": 15.04, + "step": 6315, + "train_speed(iter/s)": 0.334025 + }, + { + "acc": 0.83493977, + "epoch": 0.8511784511784511, + "grad_norm": 15.8125, + "learning_rate": 1.3229196174628078e-05, + "loss": 0.62159672, + "memory(GiB)": 15.04, + "step": 6320, + "train_speed(iter/s)": 0.33409 + }, + { + "acc": 0.87602005, + "epoch": 0.8518518518518519, + "grad_norm": 15.6875, + "learning_rate": 1.3218655838622232e-05, + "loss": 0.50940185, + "memory(GiB)": 15.04, + "step": 6325, + "train_speed(iter/s)": 0.334123 + }, + { + "acc": 0.93578596, + "epoch": 0.8525252525252526, + "grad_norm": 9.0625, + "learning_rate": 1.3208111511954641e-05, + "loss": 0.29616165, + "memory(GiB)": 15.04, + "step": 6330, + "train_speed(iter/s)": 0.33415 + }, + { + "acc": 0.91249781, + "epoch": 0.8531986531986532, + "grad_norm": 7.125, + "learning_rate": 1.3197563207698729e-05, + "loss": 0.27347684, + "memory(GiB)": 15.04, + "step": 6335, + "train_speed(iter/s)": 0.334187 + }, + { + "acc": 0.92190247, + "epoch": 0.8538720538720539, + "grad_norm": 6.8125, + "learning_rate": 1.3187010938932842e-05, + "loss": 0.24746497, + "memory(GiB)": 15.04, + "step": 6340, + "train_speed(iter/s)": 0.334208 + }, + { + "acc": 0.93268328, + "epoch": 0.8545454545454545, + "grad_norm": 6.5625, + "learning_rate": 1.317645471874025e-05, + "loss": 0.37441986, + "memory(GiB)": 15.04, + "step": 6345, + "train_speed(iter/s)": 0.334261 + }, + { + "acc": 0.92450418, + "epoch": 0.8552188552188552, + "grad_norm": 13.5, + "learning_rate": 1.3165894560209118e-05, + "loss": 0.30913606, + "memory(GiB)": 15.04, + "step": 6350, + "train_speed(iter/s)": 0.334303 + }, + { + "acc": 0.90470371, + "epoch": 0.8558922558922559, + "grad_norm": 15.6875, + "learning_rate": 1.3155330476432497e-05, + "loss": 0.36733928, + "memory(GiB)": 15.04, + "step": 6355, + "train_speed(iter/s)": 0.334335 + }, + { + "acc": 0.92587414, + "epoch": 0.8565656565656565, + "grad_norm": 9.6875, + "learning_rate": 1.3144762480508306e-05, + "loss": 0.33982816, + "memory(GiB)": 15.04, + "step": 6360, + "train_speed(iter/s)": 0.334366 + }, + { + "acc": 0.92668858, + "epoch": 0.8572390572390572, + "grad_norm": 3.953125, + "learning_rate": 1.313419058553931e-05, + "loss": 0.27973926, + "memory(GiB)": 15.04, + "step": 6365, + "train_speed(iter/s)": 0.334311 + }, + { + "acc": 0.87961559, + "epoch": 0.857912457912458, + "grad_norm": 7.34375, + "learning_rate": 1.312361480463311e-05, + "loss": 0.32757173, + "memory(GiB)": 15.04, + "step": 6370, + "train_speed(iter/s)": 0.334382 + }, + { + "acc": 0.88145189, + "epoch": 0.8585858585858586, + "grad_norm": 7.9375, + "learning_rate": 1.3113035150902122e-05, + "loss": 0.24424481, + "memory(GiB)": 15.04, + "step": 6375, + "train_speed(iter/s)": 0.334421 + }, + { + "acc": 0.92479296, + "epoch": 0.8592592592592593, + "grad_norm": 9.5625, + "learning_rate": 1.3102451637463572e-05, + "loss": 0.28027315, + "memory(GiB)": 15.04, + "step": 6380, + "train_speed(iter/s)": 0.334473 + }, + { + "acc": 0.94058914, + "epoch": 0.8599326599326599, + "grad_norm": 4.65625, + "learning_rate": 1.3091864277439461e-05, + "loss": 0.22502601, + "memory(GiB)": 15.04, + "step": 6385, + "train_speed(iter/s)": 0.334502 + }, + { + "acc": 0.89868279, + "epoch": 0.8606060606060606, + "grad_norm": 8.375, + "learning_rate": 1.308127308395657e-05, + "loss": 0.36915667, + "memory(GiB)": 15.04, + "step": 6390, + "train_speed(iter/s)": 0.334537 + }, + { + "acc": 0.89056492, + "epoch": 0.8612794612794613, + "grad_norm": 4.65625, + "learning_rate": 1.3070678070146424e-05, + "loss": 0.50626311, + "memory(GiB)": 15.04, + "step": 6395, + "train_speed(iter/s)": 0.334567 + }, + { + "acc": 0.91969786, + "epoch": 0.8619528619528619, + "grad_norm": 7.78125, + "learning_rate": 1.3060079249145288e-05, + "loss": 0.27441006, + "memory(GiB)": 15.04, + "step": 6400, + "train_speed(iter/s)": 0.334602 + }, + { + "acc": 0.88433943, + "epoch": 0.8626262626262626, + "grad_norm": 13.875, + "learning_rate": 1.3049476634094147e-05, + "loss": 0.47878642, + "memory(GiB)": 15.04, + "step": 6405, + "train_speed(iter/s)": 0.334659 + }, + { + "acc": 0.8990777, + "epoch": 0.8632996632996633, + "grad_norm": 7.03125, + "learning_rate": 1.3038870238138694e-05, + "loss": 0.45147247, + "memory(GiB)": 15.04, + "step": 6410, + "train_speed(iter/s)": 0.334685 + }, + { + "acc": 0.88826809, + "epoch": 0.863973063973064, + "grad_norm": 4.4375, + "learning_rate": 1.3028260074429304e-05, + "loss": 0.38848712, + "memory(GiB)": 15.04, + "step": 6415, + "train_speed(iter/s)": 0.334679 + }, + { + "acc": 0.90999537, + "epoch": 0.8646464646464647, + "grad_norm": 10.375, + "learning_rate": 1.3017646156121026e-05, + "loss": 0.29897921, + "memory(GiB)": 15.04, + "step": 6420, + "train_speed(iter/s)": 0.334736 + }, + { + "acc": 0.91853495, + "epoch": 0.8653198653198653, + "grad_norm": 12.0625, + "learning_rate": 1.3007028496373561e-05, + "loss": 0.28005025, + "memory(GiB)": 15.04, + "step": 6425, + "train_speed(iter/s)": 0.334787 + }, + { + "acc": 0.8939868, + "epoch": 0.865993265993266, + "grad_norm": 6.1875, + "learning_rate": 1.2996407108351256e-05, + "loss": 0.43116326, + "memory(GiB)": 15.04, + "step": 6430, + "train_speed(iter/s)": 0.334741 + }, + { + "acc": 0.89180508, + "epoch": 0.8666666666666667, + "grad_norm": 7.46875, + "learning_rate": 1.2985782005223077e-05, + "loss": 0.29863009, + "memory(GiB)": 15.04, + "step": 6435, + "train_speed(iter/s)": 0.334797 + }, + { + "acc": 0.93351583, + "epoch": 0.8673400673400673, + "grad_norm": 8.875, + "learning_rate": 1.2975153200162592e-05, + "loss": 0.21974232, + "memory(GiB)": 15.04, + "step": 6440, + "train_speed(iter/s)": 0.334834 + }, + { + "acc": 0.89463205, + "epoch": 0.868013468013468, + "grad_norm": 7.09375, + "learning_rate": 1.2964520706347963e-05, + "loss": 0.40170579, + "memory(GiB)": 15.04, + "step": 6445, + "train_speed(iter/s)": 0.334853 + }, + { + "acc": 0.91431379, + "epoch": 0.8686868686868687, + "grad_norm": 10.5625, + "learning_rate": 1.2953884536961925e-05, + "loss": 0.29531341, + "memory(GiB)": 15.04, + "step": 6450, + "train_speed(iter/s)": 0.334872 + }, + { + "acc": 0.93744087, + "epoch": 0.8693602693602693, + "grad_norm": 8.625, + "learning_rate": 1.2943244705191772e-05, + "loss": 0.24498906, + "memory(GiB)": 15.04, + "step": 6455, + "train_speed(iter/s)": 0.334875 + }, + { + "acc": 0.81136913, + "epoch": 0.87003367003367, + "grad_norm": 6.9375, + "learning_rate": 1.2932601224229333e-05, + "loss": 0.56593361, + "memory(GiB)": 15.04, + "step": 6460, + "train_speed(iter/s)": 0.334891 + }, + { + "acc": 0.9021183, + "epoch": 0.8707070707070707, + "grad_norm": 8.5625, + "learning_rate": 1.2921954107270966e-05, + "loss": 0.48122501, + "memory(GiB)": 15.04, + "step": 6465, + "train_speed(iter/s)": 0.334934 + }, + { + "acc": 0.91989908, + "epoch": 0.8713804713804714, + "grad_norm": 6.46875, + "learning_rate": 1.2911303367517541e-05, + "loss": 0.27749155, + "memory(GiB)": 15.04, + "step": 6470, + "train_speed(iter/s)": 0.33499 + }, + { + "acc": 0.93246155, + "epoch": 0.8720538720538721, + "grad_norm": 5.34375, + "learning_rate": 1.2900649018174407e-05, + "loss": 0.27817974, + "memory(GiB)": 15.04, + "step": 6475, + "train_speed(iter/s)": 0.335037 + }, + { + "acc": 0.92235565, + "epoch": 0.8727272727272727, + "grad_norm": 5.6875, + "learning_rate": 1.2889991072451404e-05, + "loss": 0.28813252, + "memory(GiB)": 15.04, + "step": 6480, + "train_speed(iter/s)": 0.335094 + }, + { + "acc": 0.78675847, + "epoch": 0.8734006734006734, + "grad_norm": 38.25, + "learning_rate": 1.287932954356282e-05, + "loss": 0.74885602, + "memory(GiB)": 15.04, + "step": 6485, + "train_speed(iter/s)": 0.335152 + }, + { + "acc": 0.90806704, + "epoch": 0.8740740740740741, + "grad_norm": 7.46875, + "learning_rate": 1.286866444472739e-05, + "loss": 0.48535357, + "memory(GiB)": 15.04, + "step": 6490, + "train_speed(iter/s)": 0.33519 + }, + { + "acc": 0.86187124, + "epoch": 0.8747474747474747, + "grad_norm": 10.375, + "learning_rate": 1.2857995789168272e-05, + "loss": 0.41511011, + "memory(GiB)": 15.04, + "step": 6495, + "train_speed(iter/s)": 0.335235 + }, + { + "acc": 0.87467813, + "epoch": 0.8754208754208754, + "grad_norm": 7.5625, + "learning_rate": 1.2847323590113039e-05, + "loss": 0.41906748, + "memory(GiB)": 15.04, + "step": 6500, + "train_speed(iter/s)": 0.335259 + }, + { + "acc": 0.88989744, + "epoch": 0.8760942760942761, + "grad_norm": 18.0, + "learning_rate": 1.2836647860793653e-05, + "loss": 0.3864934, + "memory(GiB)": 15.04, + "step": 6505, + "train_speed(iter/s)": 0.335239 + }, + { + "acc": 0.86787643, + "epoch": 0.8767676767676768, + "grad_norm": 11.3125, + "learning_rate": 1.2825968614446456e-05, + "loss": 0.44939446, + "memory(GiB)": 15.04, + "step": 6510, + "train_speed(iter/s)": 0.335285 + }, + { + "acc": 0.80952492, + "epoch": 0.8774410774410775, + "grad_norm": 7.375, + "learning_rate": 1.2815285864312148e-05, + "loss": 0.69332781, + "memory(GiB)": 15.04, + "step": 6515, + "train_speed(iter/s)": 0.335328 + }, + { + "acc": 0.87130203, + "epoch": 0.8781144781144781, + "grad_norm": 13.625, + "learning_rate": 1.2804599623635771e-05, + "loss": 0.45426693, + "memory(GiB)": 15.04, + "step": 6520, + "train_speed(iter/s)": 0.335367 + }, + { + "acc": 0.92601309, + "epoch": 0.8787878787878788, + "grad_norm": 6.34375, + "learning_rate": 1.2793909905666703e-05, + "loss": 0.24447515, + "memory(GiB)": 15.04, + "step": 6525, + "train_speed(iter/s)": 0.335405 + }, + { + "acc": 0.9039216, + "epoch": 0.8794612794612795, + "grad_norm": 9.9375, + "learning_rate": 1.278321672365863e-05, + "loss": 0.37623487, + "memory(GiB)": 15.04, + "step": 6530, + "train_speed(iter/s)": 0.335451 + }, + { + "acc": 0.88665333, + "epoch": 0.8801346801346801, + "grad_norm": 7.65625, + "learning_rate": 1.2772520090869525e-05, + "loss": 0.3670491, + "memory(GiB)": 15.04, + "step": 6535, + "train_speed(iter/s)": 0.335478 + }, + { + "acc": 0.84381504, + "epoch": 0.8808080808080808, + "grad_norm": 7.71875, + "learning_rate": 1.2761820020561649e-05, + "loss": 0.5703146, + "memory(GiB)": 15.04, + "step": 6540, + "train_speed(iter/s)": 0.335515 + }, + { + "acc": 0.88605452, + "epoch": 0.8814814814814815, + "grad_norm": 7.84375, + "learning_rate": 1.2751116526001519e-05, + "loss": 0.67496691, + "memory(GiB)": 15.04, + "step": 6545, + "train_speed(iter/s)": 0.335553 + }, + { + "acc": 0.8932662, + "epoch": 0.8821548821548821, + "grad_norm": 10.625, + "learning_rate": 1.2740409620459906e-05, + "loss": 0.37912779, + "memory(GiB)": 15.04, + "step": 6550, + "train_speed(iter/s)": 0.335575 + }, + { + "acc": 0.93029604, + "epoch": 0.8828282828282829, + "grad_norm": 9.75, + "learning_rate": 1.2729699317211799e-05, + "loss": 0.28949127, + "memory(GiB)": 15.04, + "step": 6555, + "train_speed(iter/s)": 0.33562 + }, + { + "acc": 0.89841614, + "epoch": 0.8835016835016835, + "grad_norm": 4.78125, + "learning_rate": 1.2718985629536408e-05, + "loss": 0.32285652, + "memory(GiB)": 15.04, + "step": 6560, + "train_speed(iter/s)": 0.335603 + }, + { + "acc": 0.82716017, + "epoch": 0.8841750841750842, + "grad_norm": 6.3125, + "learning_rate": 1.2708268570717138e-05, + "loss": 0.85316229, + "memory(GiB)": 15.04, + "step": 6565, + "train_speed(iter/s)": 0.335654 + }, + { + "acc": 0.903016, + "epoch": 0.8848484848484849, + "grad_norm": 4.90625, + "learning_rate": 1.2697548154041564e-05, + "loss": 0.45106721, + "memory(GiB)": 15.04, + "step": 6570, + "train_speed(iter/s)": 0.335682 + }, + { + "acc": 0.94780951, + "epoch": 0.8855218855218855, + "grad_norm": 8.875, + "learning_rate": 1.268682439280144e-05, + "loss": 0.17702886, + "memory(GiB)": 15.04, + "step": 6575, + "train_speed(iter/s)": 0.335723 + }, + { + "acc": 0.91942225, + "epoch": 0.8861952861952862, + "grad_norm": 11.9375, + "learning_rate": 1.2676097300292659e-05, + "loss": 0.32949536, + "memory(GiB)": 15.04, + "step": 6580, + "train_speed(iter/s)": 0.335714 + }, + { + "acc": 0.900741, + "epoch": 0.8868686868686869, + "grad_norm": 8.0, + "learning_rate": 1.2665366889815237e-05, + "loss": 0.39808414, + "memory(GiB)": 15.04, + "step": 6585, + "train_speed(iter/s)": 0.335729 + }, + { + "acc": 0.90363607, + "epoch": 0.8875420875420875, + "grad_norm": 9.375, + "learning_rate": 1.2654633174673321e-05, + "loss": 0.35671287, + "memory(GiB)": 15.04, + "step": 6590, + "train_speed(iter/s)": 0.33575 + }, + { + "acc": 0.85732718, + "epoch": 0.8882154882154882, + "grad_norm": 4.90625, + "learning_rate": 1.2643896168175137e-05, + "loss": 0.635991, + "memory(GiB)": 15.04, + "step": 6595, + "train_speed(iter/s)": 0.335704 + }, + { + "acc": 0.85181875, + "epoch": 0.8888888888888888, + "grad_norm": 8.25, + "learning_rate": 1.2633155883633009e-05, + "loss": 0.42253537, + "memory(GiB)": 15.04, + "step": 6600, + "train_speed(iter/s)": 0.335715 + }, + { + "epoch": 0.8888888888888888, + "eval_acc": 0.8936819510176637, + "eval_loss": 0.4132220149040222, + "eval_runtime": 110.1754, + "eval_samples_per_second": 1.361, + "eval_steps_per_second": 1.361, + "step": 6600 + }, + { + "acc": 0.89893084, + "epoch": 0.8895622895622896, + "grad_norm": 10.0, + "learning_rate": 1.2622412334363307e-05, + "loss": 0.313694, + "memory(GiB)": 15.04, + "step": 6605, + "train_speed(iter/s)": 0.333881 + }, + { + "acc": 0.92133274, + "epoch": 0.8902356902356903, + "grad_norm": 7.90625, + "learning_rate": 1.2611665533686464e-05, + "loss": 0.29927416, + "memory(GiB)": 15.04, + "step": 6610, + "train_speed(iter/s)": 0.33393 + }, + { + "acc": 0.77386642, + "epoch": 0.8909090909090909, + "grad_norm": 9.4375, + "learning_rate": 1.2600915494926937e-05, + "loss": 0.86817532, + "memory(GiB)": 15.04, + "step": 6615, + "train_speed(iter/s)": 0.333975 + }, + { + "acc": 0.90358448, + "epoch": 0.8915824915824916, + "grad_norm": 9.75, + "learning_rate": 1.25901622314132e-05, + "loss": 0.36347694, + "memory(GiB)": 15.04, + "step": 6620, + "train_speed(iter/s)": 0.334032 + }, + { + "acc": 0.86857395, + "epoch": 0.8922558922558923, + "grad_norm": 10.0, + "learning_rate": 1.2579405756477723e-05, + "loss": 0.51028337, + "memory(GiB)": 15.04, + "step": 6625, + "train_speed(iter/s)": 0.334083 + }, + { + "acc": 0.9134038, + "epoch": 0.8929292929292929, + "grad_norm": 11.0625, + "learning_rate": 1.2568646083456963e-05, + "loss": 0.33207953, + "memory(GiB)": 15.04, + "step": 6630, + "train_speed(iter/s)": 0.334128 + }, + { + "acc": 0.90302343, + "epoch": 0.8936026936026936, + "grad_norm": 8.0625, + "learning_rate": 1.2557883225691331e-05, + "loss": 0.36005437, + "memory(GiB)": 15.04, + "step": 6635, + "train_speed(iter/s)": 0.334167 + }, + { + "acc": 0.90513153, + "epoch": 0.8942760942760942, + "grad_norm": 7.0, + "learning_rate": 1.2547117196525202e-05, + "loss": 0.46652622, + "memory(GiB)": 15.04, + "step": 6640, + "train_speed(iter/s)": 0.334143 + }, + { + "acc": 0.88937864, + "epoch": 0.8949494949494949, + "grad_norm": 18.5, + "learning_rate": 1.2536348009306871e-05, + "loss": 0.42260156, + "memory(GiB)": 15.04, + "step": 6645, + "train_speed(iter/s)": 0.334196 + }, + { + "acc": 0.84189262, + "epoch": 0.8956228956228957, + "grad_norm": 15.0, + "learning_rate": 1.2525575677388552e-05, + "loss": 0.52167392, + "memory(GiB)": 15.04, + "step": 6650, + "train_speed(iter/s)": 0.334213 + }, + { + "acc": 0.8754921, + "epoch": 0.8962962962962963, + "grad_norm": 17.375, + "learning_rate": 1.251480021412636e-05, + "loss": 0.32569854, + "memory(GiB)": 15.04, + "step": 6655, + "train_speed(iter/s)": 0.334265 + }, + { + "acc": 0.92789869, + "epoch": 0.896969696969697, + "grad_norm": 15.625, + "learning_rate": 1.2504021632880294e-05, + "loss": 0.30449035, + "memory(GiB)": 15.04, + "step": 6660, + "train_speed(iter/s)": 0.334247 + }, + { + "acc": 0.88237553, + "epoch": 0.8976430976430977, + "grad_norm": 14.0625, + "learning_rate": 1.249323994701421e-05, + "loss": 0.36932364, + "memory(GiB)": 15.04, + "step": 6665, + "train_speed(iter/s)": 0.334317 + }, + { + "acc": 0.92880669, + "epoch": 0.8983164983164983, + "grad_norm": 4.78125, + "learning_rate": 1.2482455169895822e-05, + "loss": 0.28703864, + "memory(GiB)": 15.04, + "step": 6670, + "train_speed(iter/s)": 0.334359 + }, + { + "acc": 0.89129295, + "epoch": 0.898989898989899, + "grad_norm": 10.75, + "learning_rate": 1.2471667314896674e-05, + "loss": 0.37683971, + "memory(GiB)": 15.04, + "step": 6675, + "train_speed(iter/s)": 0.334417 + }, + { + "acc": 0.91925354, + "epoch": 0.8996632996632996, + "grad_norm": 11.0, + "learning_rate": 1.2460876395392126e-05, + "loss": 0.25543945, + "memory(GiB)": 15.04, + "step": 6680, + "train_speed(iter/s)": 0.33446 + }, + { + "acc": 0.82515974, + "epoch": 0.9003367003367003, + "grad_norm": 6.1875, + "learning_rate": 1.2450082424761336e-05, + "loss": 0.51135798, + "memory(GiB)": 15.04, + "step": 6685, + "train_speed(iter/s)": 0.3345 + }, + { + "acc": 0.94111023, + "epoch": 0.901010101010101, + "grad_norm": 8.3125, + "learning_rate": 1.2439285416387248e-05, + "loss": 0.23880255, + "memory(GiB)": 15.04, + "step": 6690, + "train_speed(iter/s)": 0.334507 + }, + { + "acc": 0.89929008, + "epoch": 0.9016835016835016, + "grad_norm": 6.09375, + "learning_rate": 1.2428485383656565e-05, + "loss": 0.3595645, + "memory(GiB)": 15.04, + "step": 6695, + "train_speed(iter/s)": 0.334512 + }, + { + "acc": 0.92091246, + "epoch": 0.9023569023569024, + "grad_norm": 13.6875, + "learning_rate": 1.2417682339959755e-05, + "loss": 0.28632832, + "memory(GiB)": 15.04, + "step": 6700, + "train_speed(iter/s)": 0.334487 + }, + { + "acc": 0.90725622, + "epoch": 0.9030303030303031, + "grad_norm": 6.15625, + "learning_rate": 1.2406876298691006e-05, + "loss": 0.32967763, + "memory(GiB)": 15.04, + "step": 6705, + "train_speed(iter/s)": 0.334526 + }, + { + "acc": 0.83416014, + "epoch": 0.9037037037037037, + "grad_norm": 8.25, + "learning_rate": 1.2396067273248224e-05, + "loss": 0.64012122, + "memory(GiB)": 15.04, + "step": 6710, + "train_speed(iter/s)": 0.334581 + }, + { + "acc": 0.9107769, + "epoch": 0.9043771043771044, + "grad_norm": 9.75, + "learning_rate": 1.2385255277033022e-05, + "loss": 0.39353929, + "memory(GiB)": 15.04, + "step": 6715, + "train_speed(iter/s)": 0.334603 + }, + { + "acc": 0.88990326, + "epoch": 0.9050505050505051, + "grad_norm": 11.9375, + "learning_rate": 1.2374440323450685e-05, + "loss": 0.35531456, + "memory(GiB)": 15.04, + "step": 6720, + "train_speed(iter/s)": 0.334648 + }, + { + "acc": 0.88896084, + "epoch": 0.9057239057239057, + "grad_norm": 8.375, + "learning_rate": 1.2363622425910173e-05, + "loss": 0.2434422, + "memory(GiB)": 15.04, + "step": 6725, + "train_speed(iter/s)": 0.334704 + }, + { + "acc": 0.94749165, + "epoch": 0.9063973063973064, + "grad_norm": 6.3125, + "learning_rate": 1.2352801597824098e-05, + "loss": 0.20872343, + "memory(GiB)": 15.04, + "step": 6730, + "train_speed(iter/s)": 0.334752 + }, + { + "acc": 0.84323606, + "epoch": 0.907070707070707, + "grad_norm": 10.9375, + "learning_rate": 1.2341977852608698e-05, + "loss": 0.67311592, + "memory(GiB)": 15.04, + "step": 6735, + "train_speed(iter/s)": 0.334801 + }, + { + "acc": 0.9264308, + "epoch": 0.9077441077441077, + "grad_norm": 8.75, + "learning_rate": 1.2331151203683832e-05, + "loss": 0.28203523, + "memory(GiB)": 15.04, + "step": 6740, + "train_speed(iter/s)": 0.334853 + }, + { + "acc": 0.87206812, + "epoch": 0.9084175084175085, + "grad_norm": 12.0625, + "learning_rate": 1.2320321664472958e-05, + "loss": 0.39646292, + "memory(GiB)": 15.04, + "step": 6745, + "train_speed(iter/s)": 0.334901 + }, + { + "acc": 0.92845144, + "epoch": 0.9090909090909091, + "grad_norm": 4.34375, + "learning_rate": 1.2309489248403121e-05, + "loss": 0.31394038, + "memory(GiB)": 15.04, + "step": 6750, + "train_speed(iter/s)": 0.334909 + }, + { + "acc": 0.85453014, + "epoch": 0.9097643097643098, + "grad_norm": 14.6875, + "learning_rate": 1.229865396890493e-05, + "loss": 0.53902259, + "memory(GiB)": 15.04, + "step": 6755, + "train_speed(iter/s)": 0.334952 + }, + { + "acc": 0.92648659, + "epoch": 0.9104377104377105, + "grad_norm": 9.0625, + "learning_rate": 1.2287815839412543e-05, + "loss": 0.2476675, + "memory(GiB)": 15.04, + "step": 6760, + "train_speed(iter/s)": 0.334979 + }, + { + "acc": 0.94983616, + "epoch": 0.9111111111111111, + "grad_norm": 7.1875, + "learning_rate": 1.227697487336365e-05, + "loss": 0.21728773, + "memory(GiB)": 15.04, + "step": 6765, + "train_speed(iter/s)": 0.335031 + }, + { + "acc": 0.8695796, + "epoch": 0.9117845117845118, + "grad_norm": 13.4375, + "learning_rate": 1.2266131084199467e-05, + "loss": 0.48047376, + "memory(GiB)": 15.04, + "step": 6770, + "train_speed(iter/s)": 0.33503 + }, + { + "acc": 0.90174141, + "epoch": 0.9124579124579124, + "grad_norm": 7.09375, + "learning_rate": 1.22552844853647e-05, + "loss": 0.35227265, + "memory(GiB)": 15.04, + "step": 6775, + "train_speed(iter/s)": 0.335068 + }, + { + "acc": 0.92144661, + "epoch": 0.9131313131313131, + "grad_norm": 11.3125, + "learning_rate": 1.2244435090307542e-05, + "loss": 0.35098794, + "memory(GiB)": 15.04, + "step": 6780, + "train_speed(iter/s)": 0.335109 + }, + { + "acc": 0.91706858, + "epoch": 0.9138047138047138, + "grad_norm": 6.03125, + "learning_rate": 1.2233582912479658e-05, + "loss": 0.28804533, + "memory(GiB)": 15.04, + "step": 6785, + "train_speed(iter/s)": 0.335141 + }, + { + "acc": 0.89750051, + "epoch": 0.9144781144781144, + "grad_norm": 9.25, + "learning_rate": 1.2222727965336151e-05, + "loss": 0.38108807, + "memory(GiB)": 15.04, + "step": 6790, + "train_speed(iter/s)": 0.33518 + }, + { + "acc": 0.91727934, + "epoch": 0.9151515151515152, + "grad_norm": 7.84375, + "learning_rate": 1.2211870262335574e-05, + "loss": 0.40908642, + "memory(GiB)": 15.04, + "step": 6795, + "train_speed(iter/s)": 0.335223 + }, + { + "acc": 0.92927933, + "epoch": 0.9158249158249159, + "grad_norm": 6.5, + "learning_rate": 1.2201009816939886e-05, + "loss": 0.23740695, + "memory(GiB)": 15.04, + "step": 6800, + "train_speed(iter/s)": 0.335266 + }, + { + "acc": 0.88172426, + "epoch": 0.9164983164983165, + "grad_norm": 5.71875, + "learning_rate": 1.2190146642614444e-05, + "loss": 0.4228384, + "memory(GiB)": 15.04, + "step": 6805, + "train_speed(iter/s)": 0.33528 + }, + { + "acc": 0.94038963, + "epoch": 0.9171717171717172, + "grad_norm": 7.0625, + "learning_rate": 1.2179280752828e-05, + "loss": 0.22256649, + "memory(GiB)": 15.04, + "step": 6810, + "train_speed(iter/s)": 0.335311 + }, + { + "acc": 0.87440615, + "epoch": 0.9178451178451178, + "grad_norm": 4.875, + "learning_rate": 1.2168412161052654e-05, + "loss": 0.6039371, + "memory(GiB)": 15.04, + "step": 6815, + "train_speed(iter/s)": 0.335342 + }, + { + "acc": 0.93131466, + "epoch": 0.9185185185185185, + "grad_norm": 14.125, + "learning_rate": 1.215754088076388e-05, + "loss": 0.24507437, + "memory(GiB)": 15.04, + "step": 6820, + "train_speed(iter/s)": 0.335399 + }, + { + "acc": 0.8701457, + "epoch": 0.9191919191919192, + "grad_norm": 8.1875, + "learning_rate": 1.2146666925440467e-05, + "loss": 0.42550588, + "memory(GiB)": 15.04, + "step": 6825, + "train_speed(iter/s)": 0.335434 + }, + { + "acc": 0.91583204, + "epoch": 0.9198653198653198, + "grad_norm": 7.4375, + "learning_rate": 1.2135790308564527e-05, + "loss": 0.41826954, + "memory(GiB)": 15.04, + "step": 6830, + "train_speed(iter/s)": 0.335431 + }, + { + "acc": 0.92180929, + "epoch": 0.9205387205387205, + "grad_norm": 6.90625, + "learning_rate": 1.2124911043621472e-05, + "loss": 0.20690181, + "memory(GiB)": 15.04, + "step": 6835, + "train_speed(iter/s)": 0.335485 + }, + { + "acc": 0.88640709, + "epoch": 0.9212121212121213, + "grad_norm": 6.84375, + "learning_rate": 1.2114029144099997e-05, + "loss": 0.39851635, + "memory(GiB)": 15.04, + "step": 6840, + "train_speed(iter/s)": 0.335497 + }, + { + "acc": 0.91542492, + "epoch": 0.9218855218855219, + "grad_norm": 17.5, + "learning_rate": 1.2103144623492065e-05, + "loss": 0.25600076, + "memory(GiB)": 15.04, + "step": 6845, + "train_speed(iter/s)": 0.335553 + }, + { + "acc": 0.90082397, + "epoch": 0.9225589225589226, + "grad_norm": 5.4375, + "learning_rate": 1.2092257495292884e-05, + "loss": 0.38325059, + "memory(GiB)": 15.04, + "step": 6850, + "train_speed(iter/s)": 0.335574 + }, + { + "acc": 0.85676031, + "epoch": 0.9232323232323232, + "grad_norm": 13.5, + "learning_rate": 1.2081367773000901e-05, + "loss": 0.4454113, + "memory(GiB)": 15.04, + "step": 6855, + "train_speed(iter/s)": 0.33563 + }, + { + "acc": 0.88738985, + "epoch": 0.9239057239057239, + "grad_norm": 7.0, + "learning_rate": 1.2070475470117772e-05, + "loss": 0.57540016, + "memory(GiB)": 15.04, + "step": 6860, + "train_speed(iter/s)": 0.33567 + }, + { + "acc": 0.94030704, + "epoch": 0.9245791245791246, + "grad_norm": 6.125, + "learning_rate": 1.2059580600148362e-05, + "loss": 0.21582599, + "memory(GiB)": 15.04, + "step": 6865, + "train_speed(iter/s)": 0.335706 + }, + { + "acc": 0.84658775, + "epoch": 0.9252525252525252, + "grad_norm": 13.625, + "learning_rate": 1.2048683176600714e-05, + "loss": 0.41957035, + "memory(GiB)": 15.04, + "step": 6870, + "train_speed(iter/s)": 0.335708 + }, + { + "acc": 0.85046186, + "epoch": 0.9259259259259259, + "grad_norm": 13.9375, + "learning_rate": 1.2037783212986032e-05, + "loss": 0.53905044, + "memory(GiB)": 15.04, + "step": 6875, + "train_speed(iter/s)": 0.335689 + }, + { + "acc": 0.88738012, + "epoch": 0.9265993265993266, + "grad_norm": 17.125, + "learning_rate": 1.202688072281868e-05, + "loss": 0.4662982, + "memory(GiB)": 15.04, + "step": 6880, + "train_speed(iter/s)": 0.335725 + }, + { + "acc": 0.88524971, + "epoch": 0.9272727272727272, + "grad_norm": 7.875, + "learning_rate": 1.2015975719616142e-05, + "loss": 0.34116085, + "memory(GiB)": 15.04, + "step": 6885, + "train_speed(iter/s)": 0.335773 + }, + { + "acc": 0.92624969, + "epoch": 0.927946127946128, + "grad_norm": 8.25, + "learning_rate": 1.200506821689903e-05, + "loss": 0.23012111, + "memory(GiB)": 15.04, + "step": 6890, + "train_speed(iter/s)": 0.335804 + }, + { + "acc": 0.90716, + "epoch": 0.9286195286195286, + "grad_norm": 8.9375, + "learning_rate": 1.1994158228191048e-05, + "loss": 0.32048681, + "memory(GiB)": 15.04, + "step": 6895, + "train_speed(iter/s)": 0.335827 + }, + { + "acc": 0.91858282, + "epoch": 0.9292929292929293, + "grad_norm": 8.25, + "learning_rate": 1.1983245767018983e-05, + "loss": 0.30142379, + "memory(GiB)": 15.04, + "step": 6900, + "train_speed(iter/s)": 0.335826 + }, + { + "epoch": 0.9292929292929293, + "eval_acc": 0.8917195774165503, + "eval_loss": 0.42348816990852356, + "eval_runtime": 109.8053, + "eval_samples_per_second": 1.366, + "eval_steps_per_second": 1.366, + "step": 6900 + }, + { + "acc": 0.90472231, + "epoch": 0.92996632996633, + "grad_norm": 5.59375, + "learning_rate": 1.197233084691269e-05, + "loss": 0.41530919, + "memory(GiB)": 15.04, + "step": 6905, + "train_speed(iter/s)": 0.334045 + }, + { + "acc": 0.92123251, + "epoch": 0.9306397306397306, + "grad_norm": 14.625, + "learning_rate": 1.196141348140507e-05, + "loss": 0.34660089, + "memory(GiB)": 15.04, + "step": 6910, + "train_speed(iter/s)": 0.334068 + }, + { + "acc": 0.84240036, + "epoch": 0.9313131313131313, + "grad_norm": 33.0, + "learning_rate": 1.1950493684032052e-05, + "loss": 0.61305223, + "memory(GiB)": 15.04, + "step": 6915, + "train_speed(iter/s)": 0.334123 + }, + { + "acc": 0.85562725, + "epoch": 0.931986531986532, + "grad_norm": 7.4375, + "learning_rate": 1.1939571468332593e-05, + "loss": 0.56112657, + "memory(GiB)": 15.04, + "step": 6920, + "train_speed(iter/s)": 0.334131 + }, + { + "acc": 0.89052143, + "epoch": 0.9326599326599326, + "grad_norm": 10.0, + "learning_rate": 1.1928646847848639e-05, + "loss": 0.38444917, + "memory(GiB)": 15.04, + "step": 6925, + "train_speed(iter/s)": 0.334166 + }, + { + "acc": 0.88111267, + "epoch": 0.9333333333333333, + "grad_norm": 9.5625, + "learning_rate": 1.1917719836125118e-05, + "loss": 0.52355328, + "memory(GiB)": 15.04, + "step": 6930, + "train_speed(iter/s)": 0.334224 + }, + { + "acc": 0.93286581, + "epoch": 0.934006734006734, + "grad_norm": 8.6875, + "learning_rate": 1.1906790446709922e-05, + "loss": 0.26666851, + "memory(GiB)": 15.04, + "step": 6935, + "train_speed(iter/s)": 0.334265 + }, + { + "acc": 0.84633722, + "epoch": 0.9346801346801347, + "grad_norm": 21.75, + "learning_rate": 1.1895858693153892e-05, + "loss": 0.66107178, + "memory(GiB)": 15.04, + "step": 6940, + "train_speed(iter/s)": 0.33427 + }, + { + "acc": 0.86352634, + "epoch": 0.9353535353535354, + "grad_norm": 10.25, + "learning_rate": 1.1884924589010805e-05, + "loss": 0.36745579, + "memory(GiB)": 15.04, + "step": 6945, + "train_speed(iter/s)": 0.334324 + }, + { + "acc": 0.84268894, + "epoch": 0.936026936026936, + "grad_norm": 8.875, + "learning_rate": 1.1873988147837347e-05, + "loss": 0.5519784, + "memory(GiB)": 15.04, + "step": 6950, + "train_speed(iter/s)": 0.334349 + }, + { + "acc": 0.91525087, + "epoch": 0.9367003367003367, + "grad_norm": 6.90625, + "learning_rate": 1.1863049383193103e-05, + "loss": 0.28298798, + "memory(GiB)": 15.04, + "step": 6955, + "train_speed(iter/s)": 0.33438 + }, + { + "acc": 0.89190407, + "epoch": 0.9373737373737374, + "grad_norm": 8.75, + "learning_rate": 1.1852108308640535e-05, + "loss": 0.22739718, + "memory(GiB)": 15.04, + "step": 6960, + "train_speed(iter/s)": 0.334425 + }, + { + "acc": 0.9698102, + "epoch": 0.938047138047138, + "grad_norm": 11.875, + "learning_rate": 1.184116493774498e-05, + "loss": 0.12442456, + "memory(GiB)": 15.04, + "step": 6965, + "train_speed(iter/s)": 0.334486 + }, + { + "acc": 0.84846802, + "epoch": 0.9387205387205387, + "grad_norm": 7.71875, + "learning_rate": 1.183021928407461e-05, + "loss": 0.65881367, + "memory(GiB)": 15.04, + "step": 6970, + "train_speed(iter/s)": 0.334462 + }, + { + "acc": 0.91213017, + "epoch": 0.9393939393939394, + "grad_norm": 7.03125, + "learning_rate": 1.1819271361200435e-05, + "loss": 0.32764463, + "memory(GiB)": 15.04, + "step": 6975, + "train_speed(iter/s)": 0.334474 + }, + { + "acc": 0.9128233, + "epoch": 0.94006734006734, + "grad_norm": 8.5625, + "learning_rate": 1.1808321182696271e-05, + "loss": 0.32920997, + "memory(GiB)": 15.04, + "step": 6980, + "train_speed(iter/s)": 0.334502 + }, + { + "acc": 0.9396574, + "epoch": 0.9407407407407408, + "grad_norm": 9.25, + "learning_rate": 1.179736876213874e-05, + "loss": 0.20859499, + "memory(GiB)": 15.04, + "step": 6985, + "train_speed(iter/s)": 0.334565 + }, + { + "acc": 0.86029367, + "epoch": 0.9414141414141414, + "grad_norm": 13.5, + "learning_rate": 1.1786414113107236e-05, + "loss": 0.5239604, + "memory(GiB)": 15.04, + "step": 6990, + "train_speed(iter/s)": 0.334586 + }, + { + "acc": 0.90799704, + "epoch": 0.9420875420875421, + "grad_norm": 7.625, + "learning_rate": 1.1775457249183922e-05, + "loss": 0.44904799, + "memory(GiB)": 15.04, + "step": 6995, + "train_speed(iter/s)": 0.334574 + }, + { + "acc": 0.91598625, + "epoch": 0.9427609427609428, + "grad_norm": 8.75, + "learning_rate": 1.1764498183953701e-05, + "loss": 0.31580322, + "memory(GiB)": 15.04, + "step": 7000, + "train_speed(iter/s)": 0.334615 + }, + { + "acc": 0.85193005, + "epoch": 0.9434343434343434, + "grad_norm": 14.75, + "learning_rate": 1.1753536931004211e-05, + "loss": 0.65886221, + "memory(GiB)": 15.04, + "step": 7005, + "train_speed(iter/s)": 0.334488 + }, + { + "acc": 0.89839611, + "epoch": 0.9441077441077441, + "grad_norm": 3.765625, + "learning_rate": 1.1742573503925794e-05, + "loss": 0.46067529, + "memory(GiB)": 15.04, + "step": 7010, + "train_speed(iter/s)": 0.334519 + }, + { + "acc": 0.90813351, + "epoch": 0.9447811447811448, + "grad_norm": 7.125, + "learning_rate": 1.1731607916311503e-05, + "loss": 0.3260469, + "memory(GiB)": 15.04, + "step": 7015, + "train_speed(iter/s)": 0.334521 + }, + { + "acc": 0.91946726, + "epoch": 0.9454545454545454, + "grad_norm": 6.375, + "learning_rate": 1.1720640181757055e-05, + "loss": 0.26359484, + "memory(GiB)": 15.04, + "step": 7020, + "train_speed(iter/s)": 0.334562 + }, + { + "acc": 0.8410737, + "epoch": 0.9461279461279462, + "grad_norm": 6.96875, + "learning_rate": 1.1709670313860835e-05, + "loss": 0.40910587, + "memory(GiB)": 15.04, + "step": 7025, + "train_speed(iter/s)": 0.334598 + }, + { + "acc": 0.94576159, + "epoch": 0.9468013468013468, + "grad_norm": 10.25, + "learning_rate": 1.1698698326223872e-05, + "loss": 0.23115597, + "memory(GiB)": 15.04, + "step": 7030, + "train_speed(iter/s)": 0.33464 + }, + { + "acc": 0.91468258, + "epoch": 0.9474747474747475, + "grad_norm": 4.5, + "learning_rate": 1.1687724232449823e-05, + "loss": 0.30396674, + "memory(GiB)": 15.04, + "step": 7035, + "train_speed(iter/s)": 0.334677 + }, + { + "acc": 0.91524286, + "epoch": 0.9481481481481482, + "grad_norm": 7.75, + "learning_rate": 1.1676748046144957e-05, + "loss": 0.35731499, + "memory(GiB)": 15.04, + "step": 7040, + "train_speed(iter/s)": 0.3347 + }, + { + "acc": 0.82774048, + "epoch": 0.9488215488215488, + "grad_norm": 9.0, + "learning_rate": 1.1665769780918139e-05, + "loss": 0.62082992, + "memory(GiB)": 15.04, + "step": 7045, + "train_speed(iter/s)": 0.334749 + }, + { + "acc": 0.90356588, + "epoch": 0.9494949494949495, + "grad_norm": 7.25, + "learning_rate": 1.1654789450380805e-05, + "loss": 0.42908516, + "memory(GiB)": 15.04, + "step": 7050, + "train_speed(iter/s)": 0.334731 + }, + { + "acc": 0.92638025, + "epoch": 0.9501683501683502, + "grad_norm": 7.0625, + "learning_rate": 1.1643807068146964e-05, + "loss": 0.25998495, + "memory(GiB)": 15.04, + "step": 7055, + "train_speed(iter/s)": 0.334743 + }, + { + "acc": 0.94073849, + "epoch": 0.9508417508417508, + "grad_norm": 7.71875, + "learning_rate": 1.1632822647833155e-05, + "loss": 0.25604682, + "memory(GiB)": 15.04, + "step": 7060, + "train_speed(iter/s)": 0.334807 + }, + { + "acc": 0.95495081, + "epoch": 0.9515151515151515, + "grad_norm": 5.21875, + "learning_rate": 1.1621836203058452e-05, + "loss": 0.25055363, + "memory(GiB)": 15.04, + "step": 7065, + "train_speed(iter/s)": 0.334855 + }, + { + "acc": 0.89045639, + "epoch": 0.9521885521885521, + "grad_norm": 7.34375, + "learning_rate": 1.1610847747444435e-05, + "loss": 0.44178624, + "memory(GiB)": 15.04, + "step": 7070, + "train_speed(iter/s)": 0.334892 + }, + { + "acc": 0.89794683, + "epoch": 0.9528619528619529, + "grad_norm": 24.75, + "learning_rate": 1.1599857294615184e-05, + "loss": 0.53782544, + "memory(GiB)": 15.04, + "step": 7075, + "train_speed(iter/s)": 0.334906 + }, + { + "acc": 0.88415861, + "epoch": 0.9535353535353536, + "grad_norm": 11.3125, + "learning_rate": 1.1588864858197246e-05, + "loss": 0.42191024, + "memory(GiB)": 15.04, + "step": 7080, + "train_speed(iter/s)": 0.334954 + }, + { + "acc": 0.87026701, + "epoch": 0.9542087542087542, + "grad_norm": 7.65625, + "learning_rate": 1.1577870451819633e-05, + "loss": 0.62059903, + "memory(GiB)": 15.04, + "step": 7085, + "train_speed(iter/s)": 0.334844 + }, + { + "acc": 0.87935076, + "epoch": 0.9548821548821549, + "grad_norm": 26.75, + "learning_rate": 1.15668740891138e-05, + "loss": 0.42825451, + "memory(GiB)": 15.04, + "step": 7090, + "train_speed(iter/s)": 0.334906 + }, + { + "acc": 0.88299179, + "epoch": 0.9555555555555556, + "grad_norm": 5.21875, + "learning_rate": 1.1555875783713627e-05, + "loss": 0.41017833, + "memory(GiB)": 15.04, + "step": 7095, + "train_speed(iter/s)": 0.334944 + }, + { + "acc": 0.88696423, + "epoch": 0.9562289562289562, + "grad_norm": 13.8125, + "learning_rate": 1.1544875549255396e-05, + "loss": 0.3364373, + "memory(GiB)": 15.04, + "step": 7100, + "train_speed(iter/s)": 0.334975 + }, + { + "acc": 0.93549919, + "epoch": 0.9569023569023569, + "grad_norm": 6.34375, + "learning_rate": 1.1533873399377792e-05, + "loss": 0.25334518, + "memory(GiB)": 15.04, + "step": 7105, + "train_speed(iter/s)": 0.335026 + }, + { + "acc": 0.92688274, + "epoch": 0.9575757575757575, + "grad_norm": 8.3125, + "learning_rate": 1.1522869347721863e-05, + "loss": 0.30110722, + "memory(GiB)": 15.04, + "step": 7110, + "train_speed(iter/s)": 0.335076 + }, + { + "acc": 0.91134062, + "epoch": 0.9582491582491582, + "grad_norm": 5.875, + "learning_rate": 1.151186340793103e-05, + "loss": 0.29459972, + "memory(GiB)": 15.04, + "step": 7115, + "train_speed(iter/s)": 0.335103 + }, + { + "acc": 0.89624596, + "epoch": 0.958922558922559, + "grad_norm": 10.3125, + "learning_rate": 1.150085559365104e-05, + "loss": 0.35935357, + "memory(GiB)": 15.04, + "step": 7120, + "train_speed(iter/s)": 0.335115 + }, + { + "acc": 0.88149929, + "epoch": 0.9595959595959596, + "grad_norm": 8.6875, + "learning_rate": 1.1489845918529971e-05, + "loss": 0.32850969, + "memory(GiB)": 15.04, + "step": 7125, + "train_speed(iter/s)": 0.335121 + }, + { + "acc": 0.88787622, + "epoch": 0.9602693602693603, + "grad_norm": 7.90625, + "learning_rate": 1.1478834396218208e-05, + "loss": 0.3603116, + "memory(GiB)": 15.04, + "step": 7130, + "train_speed(iter/s)": 0.335139 + }, + { + "acc": 0.90232334, + "epoch": 0.960942760942761, + "grad_norm": 14.375, + "learning_rate": 1.1467821040368423e-05, + "loss": 0.46831064, + "memory(GiB)": 15.04, + "step": 7135, + "train_speed(iter/s)": 0.335192 + }, + { + "acc": 0.90447445, + "epoch": 0.9616161616161616, + "grad_norm": 4.90625, + "learning_rate": 1.145680586463557e-05, + "loss": 0.2544513, + "memory(GiB)": 15.04, + "step": 7140, + "train_speed(iter/s)": 0.335236 + }, + { + "acc": 0.90101051, + "epoch": 0.9622895622895623, + "grad_norm": 9.875, + "learning_rate": 1.1445788882676848e-05, + "loss": 0.33424101, + "memory(GiB)": 15.04, + "step": 7145, + "train_speed(iter/s)": 0.335282 + }, + { + "acc": 0.93389111, + "epoch": 0.9629629629629629, + "grad_norm": 7.84375, + "learning_rate": 1.14347701081517e-05, + "loss": 0.19712846, + "memory(GiB)": 15.04, + "step": 7150, + "train_speed(iter/s)": 0.335336 + }, + { + "acc": 0.89447041, + "epoch": 0.9636363636363636, + "grad_norm": 10.0625, + "learning_rate": 1.1423749554721799e-05, + "loss": 0.25301185, + "memory(GiB)": 15.04, + "step": 7155, + "train_speed(iter/s)": 0.335379 + }, + { + "acc": 0.84386959, + "epoch": 0.9643097643097643, + "grad_norm": 6.125, + "learning_rate": 1.1412727236051012e-05, + "loss": 0.4289156, + "memory(GiB)": 15.04, + "step": 7160, + "train_speed(iter/s)": 0.335409 + }, + { + "acc": 0.83983965, + "epoch": 0.9649831649831649, + "grad_norm": 9.375, + "learning_rate": 1.1401703165805398e-05, + "loss": 0.37435858, + "memory(GiB)": 15.04, + "step": 7165, + "train_speed(iter/s)": 0.335473 + }, + { + "acc": 0.9178792, + "epoch": 0.9656565656565657, + "grad_norm": 16.0, + "learning_rate": 1.139067735765319e-05, + "loss": 0.32167628, + "memory(GiB)": 15.04, + "step": 7170, + "train_speed(iter/s)": 0.335511 + }, + { + "acc": 0.89984436, + "epoch": 0.9663299663299664, + "grad_norm": 8.625, + "learning_rate": 1.1379649825264781e-05, + "loss": 0.22635565, + "memory(GiB)": 15.04, + "step": 7175, + "train_speed(iter/s)": 0.33555 + }, + { + "acc": 0.8739892, + "epoch": 0.967003367003367, + "grad_norm": 8.1875, + "learning_rate": 1.1368620582312684e-05, + "loss": 0.39380751, + "memory(GiB)": 15.04, + "step": 7180, + "train_speed(iter/s)": 0.335599 + }, + { + "acc": 0.92747126, + "epoch": 0.9676767676767677, + "grad_norm": 9.875, + "learning_rate": 1.1357589642471556e-05, + "loss": 0.19782375, + "memory(GiB)": 15.04, + "step": 7185, + "train_speed(iter/s)": 0.33564 + }, + { + "acc": 0.91667948, + "epoch": 0.9683501683501684, + "grad_norm": 7.53125, + "learning_rate": 1.1346557019418144e-05, + "loss": 0.38190329, + "memory(GiB)": 15.04, + "step": 7190, + "train_speed(iter/s)": 0.33565 + }, + { + "acc": 0.93237314, + "epoch": 0.969023569023569, + "grad_norm": 7.625, + "learning_rate": 1.1335522726831278e-05, + "loss": 0.27520278, + "memory(GiB)": 15.04, + "step": 7195, + "train_speed(iter/s)": 0.335657 + }, + { + "acc": 0.83783598, + "epoch": 0.9696969696969697, + "grad_norm": 13.5625, + "learning_rate": 1.1324486778391872e-05, + "loss": 0.40921426, + "memory(GiB)": 15.04, + "step": 7200, + "train_speed(iter/s)": 0.335689 + }, + { + "epoch": 0.9696969696969697, + "eval_acc": 0.8939006043970545, + "eval_loss": 0.4129711389541626, + "eval_runtime": 109.8029, + "eval_samples_per_second": 1.366, + "eval_steps_per_second": 1.366, + "step": 7200 + }, + { + "acc": 0.9090373, + "epoch": 0.9703703703703703, + "grad_norm": 4.5, + "learning_rate": 1.131344918778288e-05, + "loss": 0.45668912, + "memory(GiB)": 15.04, + "step": 7205, + "train_speed(iter/s)": 0.334004 + }, + { + "acc": 0.84333715, + "epoch": 0.971043771043771, + "grad_norm": 17.875, + "learning_rate": 1.1302409968689301e-05, + "loss": 0.59896164, + "memory(GiB)": 15.04, + "step": 7210, + "train_speed(iter/s)": 0.334016 + }, + { + "acc": 0.918678, + "epoch": 0.9717171717171718, + "grad_norm": 13.9375, + "learning_rate": 1.129136913479815e-05, + "loss": 0.33298759, + "memory(GiB)": 15.04, + "step": 7215, + "train_speed(iter/s)": 0.334048 + }, + { + "acc": 0.8880929, + "epoch": 0.9723905723905724, + "grad_norm": 12.8125, + "learning_rate": 1.128032669979844e-05, + "loss": 0.38684759, + "memory(GiB)": 15.04, + "step": 7220, + "train_speed(iter/s)": 0.334108 + }, + { + "acc": 0.90387564, + "epoch": 0.9730639730639731, + "grad_norm": 5.65625, + "learning_rate": 1.1269282677381177e-05, + "loss": 0.30169425, + "memory(GiB)": 15.04, + "step": 7225, + "train_speed(iter/s)": 0.334122 + }, + { + "acc": 0.90293312, + "epoch": 0.9737373737373738, + "grad_norm": 9.9375, + "learning_rate": 1.1258237081239324e-05, + "loss": 0.31314328, + "memory(GiB)": 15.04, + "step": 7230, + "train_speed(iter/s)": 0.334171 + }, + { + "acc": 0.85950842, + "epoch": 0.9744107744107744, + "grad_norm": 21.5, + "learning_rate": 1.1247189925067812e-05, + "loss": 0.51640587, + "memory(GiB)": 15.04, + "step": 7235, + "train_speed(iter/s)": 0.334217 + }, + { + "acc": 0.90892792, + "epoch": 0.9750841750841751, + "grad_norm": 4.8125, + "learning_rate": 1.123614122256349e-05, + "loss": 0.31192939, + "memory(GiB)": 15.04, + "step": 7240, + "train_speed(iter/s)": 0.334196 + }, + { + "acc": 0.94763403, + "epoch": 0.9757575757575757, + "grad_norm": 5.0, + "learning_rate": 1.1225090987425134e-05, + "loss": 0.25039723, + "memory(GiB)": 15.04, + "step": 7245, + "train_speed(iter/s)": 0.334213 + }, + { + "acc": 0.87983856, + "epoch": 0.9764309764309764, + "grad_norm": 11.8125, + "learning_rate": 1.1214039233353413e-05, + "loss": 0.40090127, + "memory(GiB)": 15.04, + "step": 7250, + "train_speed(iter/s)": 0.334251 + }, + { + "acc": 0.90854416, + "epoch": 0.9771043771043771, + "grad_norm": 4.375, + "learning_rate": 1.1202985974050884e-05, + "loss": 0.39664159, + "memory(GiB)": 15.04, + "step": 7255, + "train_speed(iter/s)": 0.33428 + }, + { + "acc": 0.88465996, + "epoch": 0.9777777777777777, + "grad_norm": 7.75, + "learning_rate": 1.119193122322197e-05, + "loss": 0.43538961, + "memory(GiB)": 15.04, + "step": 7260, + "train_speed(iter/s)": 0.334268 + }, + { + "acc": 0.89416714, + "epoch": 0.9784511784511785, + "grad_norm": 7.9375, + "learning_rate": 1.1180874994572946e-05, + "loss": 0.30163293, + "memory(GiB)": 15.04, + "step": 7265, + "train_speed(iter/s)": 0.334288 + }, + { + "acc": 0.89555483, + "epoch": 0.9791245791245792, + "grad_norm": 5.25, + "learning_rate": 1.1169817301811911e-05, + "loss": 0.30586894, + "memory(GiB)": 15.04, + "step": 7270, + "train_speed(iter/s)": 0.334282 + }, + { + "acc": 0.89075212, + "epoch": 0.9797979797979798, + "grad_norm": 7.46875, + "learning_rate": 1.1158758158648786e-05, + "loss": 0.4049346, + "memory(GiB)": 15.04, + "step": 7275, + "train_speed(iter/s)": 0.334323 + }, + { + "acc": 0.88342791, + "epoch": 0.9804713804713805, + "grad_norm": 6.90625, + "learning_rate": 1.1147697578795287e-05, + "loss": 0.37540126, + "memory(GiB)": 15.04, + "step": 7280, + "train_speed(iter/s)": 0.334364 + }, + { + "acc": 0.8891819, + "epoch": 0.9811447811447811, + "grad_norm": 7.71875, + "learning_rate": 1.1136635575964916e-05, + "loss": 0.33928139, + "memory(GiB)": 15.04, + "step": 7285, + "train_speed(iter/s)": 0.334394 + }, + { + "acc": 0.80094166, + "epoch": 0.9818181818181818, + "grad_norm": 16.625, + "learning_rate": 1.1125572163872936e-05, + "loss": 0.76204133, + "memory(GiB)": 15.04, + "step": 7290, + "train_speed(iter/s)": 0.334436 + }, + { + "acc": 0.92983532, + "epoch": 0.9824915824915825, + "grad_norm": 12.0625, + "learning_rate": 1.1114507356236354e-05, + "loss": 0.25833547, + "memory(GiB)": 15.04, + "step": 7295, + "train_speed(iter/s)": 0.334443 + }, + { + "acc": 0.87583199, + "epoch": 0.9831649831649831, + "grad_norm": 21.375, + "learning_rate": 1.1103441166773911e-05, + "loss": 0.66637115, + "memory(GiB)": 15.04, + "step": 7300, + "train_speed(iter/s)": 0.334493 + }, + { + "acc": 0.94558496, + "epoch": 0.9838383838383838, + "grad_norm": 8.8125, + "learning_rate": 1.1092373609206064e-05, + "loss": 0.20286543, + "memory(GiB)": 15.04, + "step": 7305, + "train_speed(iter/s)": 0.334534 + }, + { + "acc": 0.90613937, + "epoch": 0.9845117845117846, + "grad_norm": 9.625, + "learning_rate": 1.108130469725496e-05, + "loss": 0.38738461, + "memory(GiB)": 15.04, + "step": 7310, + "train_speed(iter/s)": 0.33459 + }, + { + "acc": 0.90719738, + "epoch": 0.9851851851851852, + "grad_norm": 6.96875, + "learning_rate": 1.1070234444644432e-05, + "loss": 0.40077448, + "memory(GiB)": 15.04, + "step": 7315, + "train_speed(iter/s)": 0.334619 + }, + { + "acc": 0.91612635, + "epoch": 0.9858585858585859, + "grad_norm": 8.4375, + "learning_rate": 1.1059162865099969e-05, + "loss": 0.33301401, + "memory(GiB)": 15.04, + "step": 7320, + "train_speed(iter/s)": 0.334645 + }, + { + "acc": 0.94373732, + "epoch": 0.9865319865319865, + "grad_norm": 5.375, + "learning_rate": 1.1048089972348705e-05, + "loss": 0.20315571, + "memory(GiB)": 15.04, + "step": 7325, + "train_speed(iter/s)": 0.334692 + }, + { + "acc": 0.93327198, + "epoch": 0.9872053872053872, + "grad_norm": 6.125, + "learning_rate": 1.1037015780119412e-05, + "loss": 0.23666122, + "memory(GiB)": 15.04, + "step": 7330, + "train_speed(iter/s)": 0.334721 + }, + { + "acc": 0.83606768, + "epoch": 0.9878787878787879, + "grad_norm": 11.1875, + "learning_rate": 1.1025940302142461e-05, + "loss": 0.47933102, + "memory(GiB)": 15.04, + "step": 7335, + "train_speed(iter/s)": 0.33475 + }, + { + "acc": 0.89996281, + "epoch": 0.9885521885521885, + "grad_norm": 9.3125, + "learning_rate": 1.1014863552149823e-05, + "loss": 0.38059247, + "memory(GiB)": 15.04, + "step": 7340, + "train_speed(iter/s)": 0.334786 + }, + { + "acc": 0.86802759, + "epoch": 0.9892255892255892, + "grad_norm": 11.9375, + "learning_rate": 1.1003785543875045e-05, + "loss": 0.47677841, + "memory(GiB)": 15.04, + "step": 7345, + "train_speed(iter/s)": 0.334832 + }, + { + "acc": 0.88216543, + "epoch": 0.98989898989899, + "grad_norm": 10.75, + "learning_rate": 1.0992706291053237e-05, + "loss": 0.26247544, + "memory(GiB)": 15.04, + "step": 7350, + "train_speed(iter/s)": 0.33482 + }, + { + "acc": 0.94089928, + "epoch": 0.9905723905723905, + "grad_norm": 6.46875, + "learning_rate": 1.0981625807421043e-05, + "loss": 0.20868149, + "memory(GiB)": 15.04, + "step": 7355, + "train_speed(iter/s)": 0.334842 + }, + { + "acc": 0.92048578, + "epoch": 0.9912457912457913, + "grad_norm": 10.5, + "learning_rate": 1.0970544106716649e-05, + "loss": 0.29919848, + "memory(GiB)": 15.04, + "step": 7360, + "train_speed(iter/s)": 0.334893 + }, + { + "acc": 0.89477882, + "epoch": 0.9919191919191919, + "grad_norm": 6.28125, + "learning_rate": 1.0959461202679735e-05, + "loss": 0.45753975, + "memory(GiB)": 15.04, + "step": 7365, + "train_speed(iter/s)": 0.334937 + }, + { + "acc": 0.90612965, + "epoch": 0.9925925925925926, + "grad_norm": 30.25, + "learning_rate": 1.0948377109051481e-05, + "loss": 0.39858489, + "memory(GiB)": 15.04, + "step": 7370, + "train_speed(iter/s)": 0.334978 + }, + { + "acc": 0.90982237, + "epoch": 0.9932659932659933, + "grad_norm": 5.09375, + "learning_rate": 1.0937291839574532e-05, + "loss": 0.31556346, + "memory(GiB)": 15.04, + "step": 7375, + "train_speed(iter/s)": 0.334975 + }, + { + "acc": 0.94280615, + "epoch": 0.9939393939393939, + "grad_norm": 8.4375, + "learning_rate": 1.0926205407993007e-05, + "loss": 0.25707188, + "memory(GiB)": 15.04, + "step": 7380, + "train_speed(iter/s)": 0.334987 + }, + { + "acc": 0.90444498, + "epoch": 0.9946127946127946, + "grad_norm": 9.4375, + "learning_rate": 1.0915117828052457e-05, + "loss": 0.35674734, + "memory(GiB)": 15.04, + "step": 7385, + "train_speed(iter/s)": 0.335024 + }, + { + "acc": 0.90108099, + "epoch": 0.9952861952861953, + "grad_norm": 9.875, + "learning_rate": 1.0904029113499852e-05, + "loss": 0.32676821, + "memory(GiB)": 15.04, + "step": 7390, + "train_speed(iter/s)": 0.335062 + }, + { + "acc": 0.93637123, + "epoch": 0.9959595959595959, + "grad_norm": 7.59375, + "learning_rate": 1.0892939278083577e-05, + "loss": 0.24299397, + "memory(GiB)": 15.04, + "step": 7395, + "train_speed(iter/s)": 0.335104 + }, + { + "acc": 0.91700153, + "epoch": 0.9966329966329966, + "grad_norm": 5.84375, + "learning_rate": 1.08818483355534e-05, + "loss": 0.29521022, + "memory(GiB)": 15.04, + "step": 7400, + "train_speed(iter/s)": 0.335135 + }, + { + "acc": 0.85686979, + "epoch": 0.9973063973063973, + "grad_norm": 13.8125, + "learning_rate": 1.0870756299660466e-05, + "loss": 0.4336318, + "memory(GiB)": 15.04, + "step": 7405, + "train_speed(iter/s)": 0.335186 + }, + { + "acc": 0.87050056, + "epoch": 0.997979797979798, + "grad_norm": 11.8125, + "learning_rate": 1.085966318415728e-05, + "loss": 0.57098708, + "memory(GiB)": 15.04, + "step": 7410, + "train_speed(iter/s)": 0.335217 + }, + { + "acc": 0.91617241, + "epoch": 0.9986531986531987, + "grad_norm": 4.6875, + "learning_rate": 1.0848569002797674e-05, + "loss": 0.33973608, + "memory(GiB)": 15.04, + "step": 7415, + "train_speed(iter/s)": 0.335261 + }, + { + "acc": 0.84956837, + "epoch": 0.9993265993265993, + "grad_norm": 8.3125, + "learning_rate": 1.083747376933681e-05, + "loss": 0.4439743, + "memory(GiB)": 15.04, + "step": 7420, + "train_speed(iter/s)": 0.335282 + }, + { + "acc": 0.91084309, + "epoch": 1.0, + "grad_norm": 3.515625, + "learning_rate": 1.082637749753115e-05, + "loss": 0.33124869, + "memory(GiB)": 15.04, + "step": 7425, + "train_speed(iter/s)": 0.335267 + }, + { + "acc": 0.91813211, + "epoch": 1.0006734006734006, + "grad_norm": 6.03125, + "learning_rate": 1.0815280201138451e-05, + "loss": 0.3201786, + "memory(GiB)": 15.04, + "step": 7430, + "train_speed(iter/s)": 0.335285 + }, + { + "acc": 0.87773285, + "epoch": 1.0013468013468014, + "grad_norm": 7.5625, + "learning_rate": 1.080418189391773e-05, + "loss": 0.43178754, + "memory(GiB)": 15.04, + "step": 7435, + "train_speed(iter/s)": 0.335303 + }, + { + "acc": 0.90512619, + "epoch": 1.002020202020202, + "grad_norm": 5.75, + "learning_rate": 1.0793082589629264e-05, + "loss": 0.46502805, + "memory(GiB)": 15.04, + "step": 7440, + "train_speed(iter/s)": 0.335306 + }, + { + "acc": 0.83395872, + "epoch": 1.0026936026936026, + "grad_norm": 9.75, + "learning_rate": 1.0781982302034563e-05, + "loss": 0.53723869, + "memory(GiB)": 15.04, + "step": 7445, + "train_speed(iter/s)": 0.335343 + }, + { + "acc": 0.81441259, + "epoch": 1.0033670033670035, + "grad_norm": 12.0625, + "learning_rate": 1.077088104489636e-05, + "loss": 0.49833088, + "memory(GiB)": 15.04, + "step": 7450, + "train_speed(iter/s)": 0.335361 + }, + { + "acc": 0.90716105, + "epoch": 1.004040404040404, + "grad_norm": 16.125, + "learning_rate": 1.0759778831978585e-05, + "loss": 0.46950393, + "memory(GiB)": 15.04, + "step": 7455, + "train_speed(iter/s)": 0.335399 + }, + { + "acc": 0.92520227, + "epoch": 1.0047138047138047, + "grad_norm": 5.71875, + "learning_rate": 1.0748675677046356e-05, + "loss": 0.27084908, + "memory(GiB)": 15.04, + "step": 7460, + "train_speed(iter/s)": 0.335432 + }, + { + "acc": 0.8178484, + "epoch": 1.0053872053872055, + "grad_norm": 12.0, + "learning_rate": 1.0737571593865963e-05, + "loss": 0.31178315, + "memory(GiB)": 15.04, + "step": 7465, + "train_speed(iter/s)": 0.335465 + }, + { + "acc": 0.91998367, + "epoch": 1.006060606060606, + "grad_norm": 5.375, + "learning_rate": 1.0726466596204836e-05, + "loss": 0.28148334, + "memory(GiB)": 15.04, + "step": 7470, + "train_speed(iter/s)": 0.335501 + }, + { + "acc": 0.85789928, + "epoch": 1.0067340067340067, + "grad_norm": 4.21875, + "learning_rate": 1.0715360697831547e-05, + "loss": 0.26114526, + "memory(GiB)": 15.04, + "step": 7475, + "train_speed(iter/s)": 0.335504 + }, + { + "acc": 0.930474, + "epoch": 1.0074074074074073, + "grad_norm": 5.625, + "learning_rate": 1.0704253912515787e-05, + "loss": 0.28305035, + "memory(GiB)": 15.04, + "step": 7480, + "train_speed(iter/s)": 0.335516 + }, + { + "acc": 0.88864784, + "epoch": 1.0080808080808081, + "grad_norm": 7.09375, + "learning_rate": 1.0693146254028342e-05, + "loss": 0.39999893, + "memory(GiB)": 15.04, + "step": 7485, + "train_speed(iter/s)": 0.335522 + }, + { + "acc": 0.92239819, + "epoch": 1.0087542087542087, + "grad_norm": 17.125, + "learning_rate": 1.0682037736141078e-05, + "loss": 0.24749136, + "memory(GiB)": 15.04, + "step": 7490, + "train_speed(iter/s)": 0.335561 + }, + { + "acc": 0.9252677, + "epoch": 1.0094276094276093, + "grad_norm": 10.1875, + "learning_rate": 1.0670928372626932e-05, + "loss": 0.26599128, + "memory(GiB)": 15.04, + "step": 7495, + "train_speed(iter/s)": 0.335574 + }, + { + "acc": 0.90563536, + "epoch": 1.0101010101010102, + "grad_norm": 14.9375, + "learning_rate": 1.0659818177259886e-05, + "loss": 0.46962347, + "memory(GiB)": 15.04, + "step": 7500, + "train_speed(iter/s)": 0.335579 + }, + { + "epoch": 1.0101010101010102, + "eval_acc": 0.8932916324312863, + "eval_loss": 0.41181936860084534, + "eval_runtime": 109.715, + "eval_samples_per_second": 1.367, + "eval_steps_per_second": 1.367, + "step": 7500 + }, + { + "acc": 0.92174358, + "epoch": 1.0107744107744108, + "grad_norm": 6.875, + "learning_rate": 1.0648707163814957e-05, + "loss": 0.29238164, + "memory(GiB)": 15.04, + "step": 7505, + "train_speed(iter/s)": 0.333896 + }, + { + "acc": 0.84976702, + "epoch": 1.0114478114478114, + "grad_norm": 10.5, + "learning_rate": 1.0637595346068173e-05, + "loss": 0.43344164, + "memory(GiB)": 15.04, + "step": 7510, + "train_speed(iter/s)": 0.333939 + }, + { + "acc": 0.89667292, + "epoch": 1.0121212121212122, + "grad_norm": 22.125, + "learning_rate": 1.062648273779656e-05, + "loss": 0.46906109, + "memory(GiB)": 15.04, + "step": 7515, + "train_speed(iter/s)": 0.333972 + }, + { + "acc": 0.87726889, + "epoch": 1.0127946127946128, + "grad_norm": 15.5, + "learning_rate": 1.0615369352778122e-05, + "loss": 0.32460937, + "memory(GiB)": 15.04, + "step": 7520, + "train_speed(iter/s)": 0.333997 + }, + { + "acc": 0.89613562, + "epoch": 1.0134680134680134, + "grad_norm": 12.9375, + "learning_rate": 1.0604255204791831e-05, + "loss": 0.48519855, + "memory(GiB)": 15.04, + "step": 7525, + "train_speed(iter/s)": 0.334044 + }, + { + "acc": 0.89590931, + "epoch": 1.0141414141414142, + "grad_norm": 7.34375, + "learning_rate": 1.0593140307617604e-05, + "loss": 0.43767009, + "memory(GiB)": 15.04, + "step": 7530, + "train_speed(iter/s)": 0.333996 + }, + { + "acc": 0.78652587, + "epoch": 1.0148148148148148, + "grad_norm": 14.5, + "learning_rate": 1.0582024675036282e-05, + "loss": 1.00399866, + "memory(GiB)": 15.04, + "step": 7535, + "train_speed(iter/s)": 0.333975 + }, + { + "acc": 0.90799513, + "epoch": 1.0154882154882154, + "grad_norm": 5.9375, + "learning_rate": 1.0570908320829625e-05, + "loss": 0.31158729, + "memory(GiB)": 15.04, + "step": 7540, + "train_speed(iter/s)": 0.333995 + }, + { + "acc": 0.90999622, + "epoch": 1.0161616161616163, + "grad_norm": 6.0625, + "learning_rate": 1.055979125878028e-05, + "loss": 0.31318541, + "memory(GiB)": 15.04, + "step": 7545, + "train_speed(iter/s)": 0.334028 + }, + { + "acc": 0.92610779, + "epoch": 1.0168350168350169, + "grad_norm": 7.8125, + "learning_rate": 1.0548673502671776e-05, + "loss": 0.26086628, + "memory(GiB)": 15.04, + "step": 7550, + "train_speed(iter/s)": 0.334072 + }, + { + "acc": 0.82934303, + "epoch": 1.0175084175084175, + "grad_norm": 8.0, + "learning_rate": 1.0537555066288503e-05, + "loss": 0.51228423, + "memory(GiB)": 15.04, + "step": 7555, + "train_speed(iter/s)": 0.334116 + }, + { + "acc": 0.87914476, + "epoch": 1.018181818181818, + "grad_norm": 6.5, + "learning_rate": 1.0526435963415695e-05, + "loss": 0.61368141, + "memory(GiB)": 15.04, + "step": 7560, + "train_speed(iter/s)": 0.33416 + }, + { + "acc": 0.92587614, + "epoch": 1.018855218855219, + "grad_norm": 11.0625, + "learning_rate": 1.051531620783941e-05, + "loss": 0.2348141, + "memory(GiB)": 15.04, + "step": 7565, + "train_speed(iter/s)": 0.334197 + }, + { + "acc": 0.90355387, + "epoch": 1.0195286195286195, + "grad_norm": 7.46875, + "learning_rate": 1.0504195813346511e-05, + "loss": 0.37921202, + "memory(GiB)": 15.04, + "step": 7570, + "train_speed(iter/s)": 0.334233 + }, + { + "acc": 0.93372288, + "epoch": 1.02020202020202, + "grad_norm": 8.125, + "learning_rate": 1.0493074793724665e-05, + "loss": 0.21990647, + "memory(GiB)": 15.04, + "step": 7575, + "train_speed(iter/s)": 0.33427 + }, + { + "acc": 0.91960363, + "epoch": 1.020875420875421, + "grad_norm": 13.9375, + "learning_rate": 1.0481953162762302e-05, + "loss": 0.28417826, + "memory(GiB)": 15.04, + "step": 7580, + "train_speed(iter/s)": 0.33429 + }, + { + "acc": 0.91729288, + "epoch": 1.0215488215488215, + "grad_norm": 5.65625, + "learning_rate": 1.047083093424862e-05, + "loss": 0.24105039, + "memory(GiB)": 15.04, + "step": 7585, + "train_speed(iter/s)": 0.334287 + }, + { + "acc": 0.85426855, + "epoch": 1.0222222222222221, + "grad_norm": 15.8125, + "learning_rate": 1.045970812197355e-05, + "loss": 0.43264589, + "memory(GiB)": 15.04, + "step": 7590, + "train_speed(iter/s)": 0.334299 + }, + { + "acc": 0.90263243, + "epoch": 1.022895622895623, + "grad_norm": 12.4375, + "learning_rate": 1.0448584739727752e-05, + "loss": 0.39218068, + "memory(GiB)": 15.04, + "step": 7595, + "train_speed(iter/s)": 0.334333 + }, + { + "acc": 0.88058176, + "epoch": 1.0235690235690236, + "grad_norm": 6.78125, + "learning_rate": 1.0437460801302586e-05, + "loss": 0.6238133, + "memory(GiB)": 15.04, + "step": 7600, + "train_speed(iter/s)": 0.334378 + }, + { + "acc": 0.89416704, + "epoch": 1.0242424242424242, + "grad_norm": 6.53125, + "learning_rate": 1.0426336320490112e-05, + "loss": 0.40273023, + "memory(GiB)": 15.04, + "step": 7605, + "train_speed(iter/s)": 0.334388 + }, + { + "acc": 0.86264629, + "epoch": 1.024915824915825, + "grad_norm": 4.9375, + "learning_rate": 1.0415211311083053e-05, + "loss": 0.52320361, + "memory(GiB)": 15.04, + "step": 7610, + "train_speed(iter/s)": 0.334381 + }, + { + "acc": 0.92643089, + "epoch": 1.0255892255892256, + "grad_norm": 6.1875, + "learning_rate": 1.0404085786874792e-05, + "loss": 0.20462449, + "memory(GiB)": 15.04, + "step": 7615, + "train_speed(iter/s)": 0.334432 + }, + { + "acc": 0.93033438, + "epoch": 1.0262626262626262, + "grad_norm": 6.1875, + "learning_rate": 1.0392959761659348e-05, + "loss": 0.24749184, + "memory(GiB)": 15.04, + "step": 7620, + "train_speed(iter/s)": 0.334479 + }, + { + "acc": 0.92216396, + "epoch": 1.026936026936027, + "grad_norm": 9.8125, + "learning_rate": 1.038183324923136e-05, + "loss": 0.39832199, + "memory(GiB)": 15.04, + "step": 7625, + "train_speed(iter/s)": 0.334519 + }, + { + "acc": 0.9161252, + "epoch": 1.0276094276094276, + "grad_norm": 9.5625, + "learning_rate": 1.0370706263386083e-05, + "loss": 0.25948415, + "memory(GiB)": 15.04, + "step": 7630, + "train_speed(iter/s)": 0.334551 + }, + { + "acc": 0.89475193, + "epoch": 1.0282828282828282, + "grad_norm": 12.1875, + "learning_rate": 1.035957881791934e-05, + "loss": 0.48776073, + "memory(GiB)": 15.04, + "step": 7635, + "train_speed(iter/s)": 0.334579 + }, + { + "acc": 0.87355528, + "epoch": 1.028956228956229, + "grad_norm": 4.46875, + "learning_rate": 1.034845092662754e-05, + "loss": 0.31079061, + "memory(GiB)": 15.04, + "step": 7640, + "train_speed(iter/s)": 0.334613 + }, + { + "acc": 0.91904821, + "epoch": 1.0296296296296297, + "grad_norm": 5.59375, + "learning_rate": 1.0337322603307631e-05, + "loss": 0.26461391, + "memory(GiB)": 15.04, + "step": 7645, + "train_speed(iter/s)": 0.334624 + }, + { + "acc": 0.89145298, + "epoch": 1.0303030303030303, + "grad_norm": 8.625, + "learning_rate": 1.032619386175711e-05, + "loss": 0.49507813, + "memory(GiB)": 15.04, + "step": 7650, + "train_speed(iter/s)": 0.334664 + }, + { + "acc": 0.89694061, + "epoch": 1.0309764309764309, + "grad_norm": 6.09375, + "learning_rate": 1.0315064715773983e-05, + "loss": 0.38549335, + "memory(GiB)": 15.04, + "step": 7655, + "train_speed(iter/s)": 0.33469 + }, + { + "acc": 0.93146105, + "epoch": 1.0316498316498317, + "grad_norm": 5.15625, + "learning_rate": 1.0303935179156762e-05, + "loss": 0.22203453, + "memory(GiB)": 15.04, + "step": 7660, + "train_speed(iter/s)": 0.334726 + }, + { + "acc": 0.9121562, + "epoch": 1.0323232323232323, + "grad_norm": 6.375, + "learning_rate": 1.0292805265704442e-05, + "loss": 0.3183358, + "memory(GiB)": 15.04, + "step": 7665, + "train_speed(iter/s)": 0.33477 + }, + { + "acc": 0.89644375, + "epoch": 1.032996632996633, + "grad_norm": 11.25, + "learning_rate": 1.0281674989216483e-05, + "loss": 0.39723806, + "memory(GiB)": 15.04, + "step": 7670, + "train_speed(iter/s)": 0.334797 + }, + { + "acc": 0.90520725, + "epoch": 1.0336700336700337, + "grad_norm": 6.09375, + "learning_rate": 1.0270544363492803e-05, + "loss": 0.41350541, + "memory(GiB)": 15.04, + "step": 7675, + "train_speed(iter/s)": 0.334764 + }, + { + "acc": 0.91033335, + "epoch": 1.0343434343434343, + "grad_norm": 7.84375, + "learning_rate": 1.0259413402333743e-05, + "loss": 0.38219395, + "memory(GiB)": 15.04, + "step": 7680, + "train_speed(iter/s)": 0.334789 + }, + { + "acc": 0.89836197, + "epoch": 1.035016835016835, + "grad_norm": 11.5, + "learning_rate": 1.0248282119540065e-05, + "loss": 0.29855702, + "memory(GiB)": 15.04, + "step": 7685, + "train_speed(iter/s)": 0.334833 + }, + { + "acc": 0.88722687, + "epoch": 1.0356902356902358, + "grad_norm": 7.4375, + "learning_rate": 1.023715052891293e-05, + "loss": 0.58175788, + "memory(GiB)": 15.04, + "step": 7690, + "train_speed(iter/s)": 0.334852 + }, + { + "acc": 0.89256382, + "epoch": 1.0363636363636364, + "grad_norm": 6.875, + "learning_rate": 1.0226018644253874e-05, + "loss": 0.3838747, + "memory(GiB)": 15.04, + "step": 7695, + "train_speed(iter/s)": 0.334865 + }, + { + "acc": 0.87644978, + "epoch": 1.037037037037037, + "grad_norm": 5.15625, + "learning_rate": 1.0214886479364811e-05, + "loss": 0.54048905, + "memory(GiB)": 15.04, + "step": 7700, + "train_speed(iter/s)": 0.334906 + }, + { + "acc": 0.91026125, + "epoch": 1.0377104377104378, + "grad_norm": 11.375, + "learning_rate": 1.0203754048047994e-05, + "loss": 0.38877017, + "memory(GiB)": 15.04, + "step": 7705, + "train_speed(iter/s)": 0.334939 + }, + { + "acc": 0.9281456, + "epoch": 1.0383838383838384, + "grad_norm": 5.5625, + "learning_rate": 1.0192621364106003e-05, + "loss": 0.30660315, + "memory(GiB)": 15.04, + "step": 7710, + "train_speed(iter/s)": 0.334969 + }, + { + "acc": 0.90934849, + "epoch": 1.039057239057239, + "grad_norm": 10.5, + "learning_rate": 1.0181488441341738e-05, + "loss": 0.34135413, + "memory(GiB)": 15.04, + "step": 7715, + "train_speed(iter/s)": 0.335007 + }, + { + "acc": 0.89176168, + "epoch": 1.0397306397306396, + "grad_norm": 13.375, + "learning_rate": 1.0170355293558389e-05, + "loss": 0.37481618, + "memory(GiB)": 15.04, + "step": 7720, + "train_speed(iter/s)": 0.335022 + }, + { + "acc": 0.90818529, + "epoch": 1.0404040404040404, + "grad_norm": 6.96875, + "learning_rate": 1.0159221934559435e-05, + "loss": 0.28701985, + "memory(GiB)": 15.04, + "step": 7725, + "train_speed(iter/s)": 0.335053 + }, + { + "acc": 0.92640896, + "epoch": 1.041077441077441, + "grad_norm": 6.96875, + "learning_rate": 1.0148088378148604e-05, + "loss": 0.3465523, + "memory(GiB)": 15.04, + "step": 7730, + "train_speed(iter/s)": 0.33508 + }, + { + "acc": 0.92203531, + "epoch": 1.0417508417508416, + "grad_norm": 8.375, + "learning_rate": 1.013695463812988e-05, + "loss": 0.24410901, + "memory(GiB)": 15.04, + "step": 7735, + "train_speed(iter/s)": 0.335127 + }, + { + "acc": 0.89423656, + "epoch": 1.0424242424242425, + "grad_norm": 8.3125, + "learning_rate": 1.0125820728307463e-05, + "loss": 0.56631393, + "memory(GiB)": 15.04, + "step": 7740, + "train_speed(iter/s)": 0.335166 + }, + { + "acc": 0.93116884, + "epoch": 1.043097643097643, + "grad_norm": 9.625, + "learning_rate": 1.0114686662485776e-05, + "loss": 0.26160462, + "memory(GiB)": 15.04, + "step": 7745, + "train_speed(iter/s)": 0.335201 + }, + { + "acc": 0.84082174, + "epoch": 1.0437710437710437, + "grad_norm": 7.5, + "learning_rate": 1.0103552454469427e-05, + "loss": 0.66812701, + "memory(GiB)": 15.04, + "step": 7750, + "train_speed(iter/s)": 0.335207 + }, + { + "acc": 0.83968897, + "epoch": 1.0444444444444445, + "grad_norm": 7.46875, + "learning_rate": 1.0092418118063202e-05, + "loss": 0.53751125, + "memory(GiB)": 15.04, + "step": 7755, + "train_speed(iter/s)": 0.335239 + }, + { + "acc": 0.88964825, + "epoch": 1.0451178451178451, + "grad_norm": 15.125, + "learning_rate": 1.0081283667072053e-05, + "loss": 0.48736072, + "memory(GiB)": 15.04, + "step": 7760, + "train_speed(iter/s)": 0.335221 + }, + { + "acc": 0.88361464, + "epoch": 1.0457912457912457, + "grad_norm": 7.71875, + "learning_rate": 1.0070149115301062e-05, + "loss": 0.33016255, + "memory(GiB)": 15.04, + "step": 7765, + "train_speed(iter/s)": 0.335268 + }, + { + "acc": 0.90930042, + "epoch": 1.0464646464646465, + "grad_norm": 6.96875, + "learning_rate": 1.0059014476555444e-05, + "loss": 0.29139402, + "memory(GiB)": 15.04, + "step": 7770, + "train_speed(iter/s)": 0.335293 + }, + { + "acc": 0.86369629, + "epoch": 1.0471380471380471, + "grad_norm": 11.75, + "learning_rate": 1.004787976464052e-05, + "loss": 0.42862406, + "memory(GiB)": 15.04, + "step": 7775, + "train_speed(iter/s)": 0.335312 + }, + { + "acc": 0.8694375, + "epoch": 1.0478114478114477, + "grad_norm": 8.25, + "learning_rate": 1.0036744993361703e-05, + "loss": 0.47458825, + "memory(GiB)": 15.04, + "step": 7780, + "train_speed(iter/s)": 0.335347 + }, + { + "acc": 0.9060977, + "epoch": 1.0484848484848486, + "grad_norm": 9.3125, + "learning_rate": 1.0025610176524477e-05, + "loss": 0.31830359, + "memory(GiB)": 15.04, + "step": 7785, + "train_speed(iter/s)": 0.335326 + }, + { + "acc": 0.92732201, + "epoch": 1.0491582491582492, + "grad_norm": 12.875, + "learning_rate": 1.0014475327934381e-05, + "loss": 0.25894213, + "memory(GiB)": 15.04, + "step": 7790, + "train_speed(iter/s)": 0.335372 + }, + { + "acc": 0.92505512, + "epoch": 1.0498316498316498, + "grad_norm": 9.3125, + "learning_rate": 1.0003340461396999e-05, + "loss": 0.28632023, + "memory(GiB)": 15.04, + "step": 7795, + "train_speed(iter/s)": 0.335398 + }, + { + "acc": 0.93215771, + "epoch": 1.0505050505050506, + "grad_norm": 8.375, + "learning_rate": 9.992205590717936e-06, + "loss": 0.23813188, + "memory(GiB)": 15.04, + "step": 7800, + "train_speed(iter/s)": 0.335438 + }, + { + "epoch": 1.0505050505050506, + "eval_acc": 0.8949522150060721, + "eval_loss": 0.4078538119792938, + "eval_runtime": 109.7415, + "eval_samples_per_second": 1.367, + "eval_steps_per_second": 1.367, + "step": 7800 + }, + { + "acc": 0.92810698, + "epoch": 1.0511784511784512, + "grad_norm": 9.875, + "learning_rate": 9.981070729702795e-06, + "loss": 0.2444669, + "memory(GiB)": 15.04, + "step": 7805, + "train_speed(iter/s)": 0.333911 + }, + { + "acc": 0.85111637, + "epoch": 1.0518518518518518, + "grad_norm": 6.3125, + "learning_rate": 9.969935892157182e-06, + "loss": 0.68557639, + "memory(GiB)": 15.04, + "step": 7810, + "train_speed(iter/s)": 0.333935 + }, + { + "acc": 0.85432844, + "epoch": 1.0525252525252524, + "grad_norm": 7.34375, + "learning_rate": 9.958801091886654e-06, + "loss": 0.37782602, + "memory(GiB)": 15.04, + "step": 7815, + "train_speed(iter/s)": 0.333973 + }, + { + "acc": 0.93435078, + "epoch": 1.0531986531986532, + "grad_norm": 8.1875, + "learning_rate": 9.947666342696742e-06, + "loss": 0.21125178, + "memory(GiB)": 15.04, + "step": 7820, + "train_speed(iter/s)": 0.333977 + }, + { + "acc": 0.92402878, + "epoch": 1.0538720538720538, + "grad_norm": 10.875, + "learning_rate": 9.936531658392894e-06, + "loss": 0.26423035, + "memory(GiB)": 15.04, + "step": 7825, + "train_speed(iter/s)": 0.334024 + }, + { + "acc": 0.92936287, + "epoch": 1.0545454545454545, + "grad_norm": 8.1875, + "learning_rate": 9.925397052780491e-06, + "loss": 0.23406742, + "memory(GiB)": 15.04, + "step": 7830, + "train_speed(iter/s)": 0.334044 + }, + { + "acc": 0.8435791, + "epoch": 1.0552188552188553, + "grad_norm": 8.1875, + "learning_rate": 9.91426253966482e-06, + "loss": 0.48717136, + "memory(GiB)": 15.04, + "step": 7835, + "train_speed(iter/s)": 0.334048 + }, + { + "acc": 0.85716352, + "epoch": 1.0558922558922559, + "grad_norm": 6.03125, + "learning_rate": 9.903128132851036e-06, + "loss": 0.71113405, + "memory(GiB)": 15.04, + "step": 7840, + "train_speed(iter/s)": 0.334045 + }, + { + "acc": 0.89160318, + "epoch": 1.0565656565656565, + "grad_norm": 12.125, + "learning_rate": 9.89199384614418e-06, + "loss": 0.48323383, + "memory(GiB)": 15.04, + "step": 7845, + "train_speed(iter/s)": 0.3341 + }, + { + "acc": 0.93457861, + "epoch": 1.0572390572390573, + "grad_norm": 7.0625, + "learning_rate": 9.880859693349129e-06, + "loss": 0.23514717, + "memory(GiB)": 15.04, + "step": 7850, + "train_speed(iter/s)": 0.334146 + }, + { + "acc": 0.90697279, + "epoch": 1.057912457912458, + "grad_norm": 6.53125, + "learning_rate": 9.869725688270609e-06, + "loss": 0.29730036, + "memory(GiB)": 15.04, + "step": 7855, + "train_speed(iter/s)": 0.33419 + }, + { + "acc": 0.92781496, + "epoch": 1.0585858585858585, + "grad_norm": 8.1875, + "learning_rate": 9.85859184471315e-06, + "loss": 0.28114614, + "memory(GiB)": 15.04, + "step": 7860, + "train_speed(iter/s)": 0.334225 + }, + { + "acc": 0.8981905, + "epoch": 1.0592592592592593, + "grad_norm": 4.46875, + "learning_rate": 9.84745817648109e-06, + "loss": 0.26651657, + "memory(GiB)": 15.04, + "step": 7865, + "train_speed(iter/s)": 0.334207 + }, + { + "acc": 0.92359161, + "epoch": 1.05993265993266, + "grad_norm": 6.0, + "learning_rate": 9.836324697378546e-06, + "loss": 0.21482296, + "memory(GiB)": 15.04, + "step": 7870, + "train_speed(iter/s)": 0.334248 + }, + { + "acc": 0.86257191, + "epoch": 1.0606060606060606, + "grad_norm": 10.125, + "learning_rate": 9.8251914212094e-06, + "loss": 0.37756774, + "memory(GiB)": 15.04, + "step": 7875, + "train_speed(iter/s)": 0.334297 + }, + { + "acc": 0.86224775, + "epoch": 1.0612794612794614, + "grad_norm": 12.0, + "learning_rate": 9.814058361777282e-06, + "loss": 0.56825967, + "memory(GiB)": 15.04, + "step": 7880, + "train_speed(iter/s)": 0.334338 + }, + { + "acc": 0.76086092, + "epoch": 1.061952861952862, + "grad_norm": 4.9375, + "learning_rate": 9.802925532885562e-06, + "loss": 0.5710423, + "memory(GiB)": 15.04, + "step": 7885, + "train_speed(iter/s)": 0.334364 + }, + { + "acc": 0.8451333, + "epoch": 1.0626262626262626, + "grad_norm": 7.75, + "learning_rate": 9.791792948337308e-06, + "loss": 0.49727669, + "memory(GiB)": 15.04, + "step": 7890, + "train_speed(iter/s)": 0.334383 + }, + { + "acc": 0.92011862, + "epoch": 1.0632996632996634, + "grad_norm": 7.625, + "learning_rate": 9.780660621935304e-06, + "loss": 0.33122869, + "memory(GiB)": 15.04, + "step": 7895, + "train_speed(iter/s)": 0.334383 + }, + { + "acc": 0.84289303, + "epoch": 1.063973063973064, + "grad_norm": 11.5, + "learning_rate": 9.76952856748199e-06, + "loss": 0.53285923, + "memory(GiB)": 15.04, + "step": 7900, + "train_speed(iter/s)": 0.334374 + }, + { + "acc": 0.89361753, + "epoch": 1.0646464646464646, + "grad_norm": 10.5, + "learning_rate": 9.758396798779493e-06, + "loss": 0.47205482, + "memory(GiB)": 15.04, + "step": 7905, + "train_speed(iter/s)": 0.334424 + }, + { + "acc": 0.87449894, + "epoch": 1.0653198653198652, + "grad_norm": 9.25, + "learning_rate": 9.747265329629578e-06, + "loss": 0.35123575, + "memory(GiB)": 15.04, + "step": 7910, + "train_speed(iter/s)": 0.334435 + }, + { + "acc": 0.92202873, + "epoch": 1.065993265993266, + "grad_norm": 7.28125, + "learning_rate": 9.736134173833629e-06, + "loss": 0.21412907, + "memory(GiB)": 15.04, + "step": 7915, + "train_speed(iter/s)": 0.334464 + }, + { + "acc": 0.89996395, + "epoch": 1.0666666666666667, + "grad_norm": 10.6875, + "learning_rate": 9.725003345192652e-06, + "loss": 0.59659462, + "memory(GiB)": 15.04, + "step": 7920, + "train_speed(iter/s)": 0.334492 + }, + { + "acc": 0.89880562, + "epoch": 1.0673400673400673, + "grad_norm": 6.71875, + "learning_rate": 9.713872857507242e-06, + "loss": 0.49608169, + "memory(GiB)": 15.04, + "step": 7925, + "train_speed(iter/s)": 0.334479 + }, + { + "acc": 0.91478395, + "epoch": 1.068013468013468, + "grad_norm": 6.46875, + "learning_rate": 9.702742724577573e-06, + "loss": 0.31762419, + "memory(GiB)": 15.04, + "step": 7930, + "train_speed(iter/s)": 0.334505 + }, + { + "acc": 0.92808399, + "epoch": 1.0686868686868687, + "grad_norm": 6.53125, + "learning_rate": 9.691612960203385e-06, + "loss": 0.27756267, + "memory(GiB)": 15.04, + "step": 7935, + "train_speed(iter/s)": 0.334537 + }, + { + "acc": 0.86438789, + "epoch": 1.0693602693602693, + "grad_norm": 5.28125, + "learning_rate": 9.68048357818395e-06, + "loss": 0.49684315, + "memory(GiB)": 15.04, + "step": 7940, + "train_speed(iter/s)": 0.334567 + }, + { + "acc": 0.94519253, + "epoch": 1.0700336700336701, + "grad_norm": 5.0, + "learning_rate": 9.669354592318072e-06, + "loss": 0.26527648, + "memory(GiB)": 15.04, + "step": 7945, + "train_speed(iter/s)": 0.334609 + }, + { + "acc": 0.90258961, + "epoch": 1.0707070707070707, + "grad_norm": 8.125, + "learning_rate": 9.658226016404065e-06, + "loss": 0.4713563, + "memory(GiB)": 15.04, + "step": 7950, + "train_speed(iter/s)": 0.334619 + }, + { + "acc": 0.9072814, + "epoch": 1.0713804713804713, + "grad_norm": 8.0625, + "learning_rate": 9.647097864239728e-06, + "loss": 0.30933635, + "memory(GiB)": 15.04, + "step": 7955, + "train_speed(iter/s)": 0.334651 + }, + { + "acc": 0.92713327, + "epoch": 1.0720538720538721, + "grad_norm": 2.390625, + "learning_rate": 9.63597014962235e-06, + "loss": 0.2353159, + "memory(GiB)": 15.04, + "step": 7960, + "train_speed(iter/s)": 0.334681 + }, + { + "acc": 0.83974371, + "epoch": 1.0727272727272728, + "grad_norm": 6.34375, + "learning_rate": 9.624842886348654e-06, + "loss": 0.42585888, + "memory(GiB)": 15.04, + "step": 7965, + "train_speed(iter/s)": 0.334715 + }, + { + "acc": 0.87823381, + "epoch": 1.0734006734006734, + "grad_norm": 6.9375, + "learning_rate": 9.613716088214827e-06, + "loss": 0.31924102, + "memory(GiB)": 15.04, + "step": 7970, + "train_speed(iter/s)": 0.334722 + }, + { + "acc": 0.90528374, + "epoch": 1.074074074074074, + "grad_norm": 7.0625, + "learning_rate": 9.602589769016461e-06, + "loss": 0.25827124, + "memory(GiB)": 15.04, + "step": 7975, + "train_speed(iter/s)": 0.334775 + }, + { + "acc": 0.93018322, + "epoch": 1.0747474747474748, + "grad_norm": 7.78125, + "learning_rate": 9.591463942548565e-06, + "loss": 0.2314424, + "memory(GiB)": 15.04, + "step": 7980, + "train_speed(iter/s)": 0.334794 + }, + { + "acc": 0.8915287, + "epoch": 1.0754208754208754, + "grad_norm": 15.375, + "learning_rate": 9.580338622605541e-06, + "loss": 0.24742427, + "memory(GiB)": 15.04, + "step": 7985, + "train_speed(iter/s)": 0.334846 + }, + { + "acc": 0.94772692, + "epoch": 1.076094276094276, + "grad_norm": 7.78125, + "learning_rate": 9.569213822981142e-06, + "loss": 0.24680769, + "memory(GiB)": 15.04, + "step": 7990, + "train_speed(iter/s)": 0.334871 + }, + { + "acc": 0.89446325, + "epoch": 1.0767676767676768, + "grad_norm": 4.9375, + "learning_rate": 9.558089557468506e-06, + "loss": 0.47502527, + "memory(GiB)": 15.04, + "step": 7995, + "train_speed(iter/s)": 0.334899 + }, + { + "acc": 0.84791069, + "epoch": 1.0774410774410774, + "grad_norm": 6.96875, + "learning_rate": 9.546965839860077e-06, + "loss": 0.33400559, + "memory(GiB)": 15.04, + "step": 8000, + "train_speed(iter/s)": 0.334924 + }, + { + "acc": 0.93035774, + "epoch": 1.078114478114478, + "grad_norm": 5.0625, + "learning_rate": 9.535842683947642e-06, + "loss": 0.30570438, + "memory(GiB)": 15.04, + "step": 8005, + "train_speed(iter/s)": 0.334873 + }, + { + "acc": 0.84862366, + "epoch": 1.0787878787878789, + "grad_norm": 11.6875, + "learning_rate": 9.52472010352229e-06, + "loss": 0.37180922, + "memory(GiB)": 15.04, + "step": 8010, + "train_speed(iter/s)": 0.334905 + }, + { + "acc": 0.8975028, + "epoch": 1.0794612794612795, + "grad_norm": 6.03125, + "learning_rate": 9.513598112374383e-06, + "loss": 0.34270852, + "memory(GiB)": 15.04, + "step": 8015, + "train_speed(iter/s)": 0.334896 + }, + { + "acc": 0.87457514, + "epoch": 1.08013468013468, + "grad_norm": 9.75, + "learning_rate": 9.502476724293569e-06, + "loss": 0.44135003, + "memory(GiB)": 15.04, + "step": 8020, + "train_speed(iter/s)": 0.334948 + }, + { + "acc": 0.92260237, + "epoch": 1.0808080808080809, + "grad_norm": 6.6875, + "learning_rate": 9.49135595306873e-06, + "loss": 0.26781821, + "memory(GiB)": 15.04, + "step": 8025, + "train_speed(iter/s)": 0.334976 + }, + { + "acc": 0.91439962, + "epoch": 1.0814814814814815, + "grad_norm": 7.0, + "learning_rate": 9.480235812488003e-06, + "loss": 0.30432069, + "memory(GiB)": 15.04, + "step": 8030, + "train_speed(iter/s)": 0.335021 + }, + { + "acc": 0.92259712, + "epoch": 1.082154882154882, + "grad_norm": 4.875, + "learning_rate": 9.46911631633873e-06, + "loss": 0.29038439, + "memory(GiB)": 15.04, + "step": 8035, + "train_speed(iter/s)": 0.335033 + }, + { + "acc": 0.92675104, + "epoch": 1.082828282828283, + "grad_norm": 12.8125, + "learning_rate": 9.457997478407453e-06, + "loss": 0.23149812, + "memory(GiB)": 15.04, + "step": 8040, + "train_speed(iter/s)": 0.335066 + }, + { + "acc": 0.89729252, + "epoch": 1.0835016835016835, + "grad_norm": 8.5, + "learning_rate": 9.446879312479909e-06, + "loss": 0.41467619, + "memory(GiB)": 15.04, + "step": 8045, + "train_speed(iter/s)": 0.335056 + }, + { + "acc": 0.83994951, + "epoch": 1.0841750841750841, + "grad_norm": 9.4375, + "learning_rate": 9.43576183234099e-06, + "loss": 0.4559495, + "memory(GiB)": 15.04, + "step": 8050, + "train_speed(iter/s)": 0.335098 + }, + { + "acc": 0.92020178, + "epoch": 1.084848484848485, + "grad_norm": 5.5625, + "learning_rate": 9.424645051774744e-06, + "loss": 0.31291816, + "memory(GiB)": 15.04, + "step": 8055, + "train_speed(iter/s)": 0.335117 + }, + { + "acc": 0.89911575, + "epoch": 1.0855218855218856, + "grad_norm": 9.9375, + "learning_rate": 9.413528984564354e-06, + "loss": 0.25901747, + "memory(GiB)": 15.04, + "step": 8060, + "train_speed(iter/s)": 0.335135 + }, + { + "acc": 0.91648579, + "epoch": 1.0861952861952862, + "grad_norm": 6.34375, + "learning_rate": 9.402413644492108e-06, + "loss": 0.31045237, + "memory(GiB)": 15.04, + "step": 8065, + "train_speed(iter/s)": 0.335162 + }, + { + "acc": 0.88398638, + "epoch": 1.0868686868686868, + "grad_norm": 11.3125, + "learning_rate": 9.391299045339409e-06, + "loss": 0.35305567, + "memory(GiB)": 15.04, + "step": 8070, + "train_speed(iter/s)": 0.335161 + }, + { + "acc": 0.85972319, + "epoch": 1.0875420875420876, + "grad_norm": 12.0, + "learning_rate": 9.380185200886722e-06, + "loss": 0.51338773, + "memory(GiB)": 15.04, + "step": 8075, + "train_speed(iter/s)": 0.335131 + }, + { + "acc": 0.91868744, + "epoch": 1.0882154882154882, + "grad_norm": 6.125, + "learning_rate": 9.36907212491359e-06, + "loss": 0.28593349, + "memory(GiB)": 15.04, + "step": 8080, + "train_speed(iter/s)": 0.335153 + }, + { + "acc": 0.86711378, + "epoch": 1.0888888888888888, + "grad_norm": 4.53125, + "learning_rate": 9.357959831198603e-06, + "loss": 0.5976912, + "memory(GiB)": 15.04, + "step": 8085, + "train_speed(iter/s)": 0.335185 + }, + { + "acc": 0.92944174, + "epoch": 1.0895622895622896, + "grad_norm": 12.125, + "learning_rate": 9.34684833351937e-06, + "loss": 0.32996786, + "memory(GiB)": 15.04, + "step": 8090, + "train_speed(iter/s)": 0.335182 + }, + { + "acc": 0.91718426, + "epoch": 1.0902356902356902, + "grad_norm": 7.0625, + "learning_rate": 9.33573764565253e-06, + "loss": 0.28149164, + "memory(GiB)": 15.04, + "step": 8095, + "train_speed(iter/s)": 0.33522 + }, + { + "acc": 0.91738691, + "epoch": 1.0909090909090908, + "grad_norm": 8.375, + "learning_rate": 9.324627781373699e-06, + "loss": 0.35977407, + "memory(GiB)": 15.04, + "step": 8100, + "train_speed(iter/s)": 0.335243 + }, + { + "epoch": 1.0909090909090908, + "eval_acc": 0.8937275796581557, + "eval_loss": 0.41258442401885986, + "eval_runtime": 109.7511, + "eval_samples_per_second": 1.367, + "eval_steps_per_second": 1.367, + "step": 8100 + }, + { + "acc": 0.89032269, + "epoch": 1.0915824915824917, + "grad_norm": 8.1875, + "learning_rate": 9.313518754457482e-06, + "loss": 0.30160944, + "memory(GiB)": 15.04, + "step": 8105, + "train_speed(iter/s)": 0.333743 + }, + { + "acc": 0.87330008, + "epoch": 1.0922558922558923, + "grad_norm": 6.5625, + "learning_rate": 9.302410578677456e-06, + "loss": 0.35457988, + "memory(GiB)": 15.04, + "step": 8110, + "train_speed(iter/s)": 0.333777 + }, + { + "acc": 0.90822659, + "epoch": 1.0929292929292929, + "grad_norm": 8.0625, + "learning_rate": 9.291303267806117e-06, + "loss": 0.26888912, + "memory(GiB)": 15.04, + "step": 8115, + "train_speed(iter/s)": 0.333815 + }, + { + "acc": 0.91463413, + "epoch": 1.0936026936026937, + "grad_norm": 5.96875, + "learning_rate": 9.280196835614916e-06, + "loss": 0.30835445, + "memory(GiB)": 15.04, + "step": 8120, + "train_speed(iter/s)": 0.333812 + }, + { + "acc": 0.87086973, + "epoch": 1.0942760942760943, + "grad_norm": 14.0, + "learning_rate": 9.269091295874193e-06, + "loss": 0.4832408, + "memory(GiB)": 15.04, + "step": 8125, + "train_speed(iter/s)": 0.33385 + }, + { + "acc": 0.94408932, + "epoch": 1.094949494949495, + "grad_norm": 7.1875, + "learning_rate": 9.257986662353192e-06, + "loss": 0.26666822, + "memory(GiB)": 15.04, + "step": 8130, + "train_speed(iter/s)": 0.333875 + }, + { + "acc": 0.91334887, + "epoch": 1.0956228956228957, + "grad_norm": 5.21875, + "learning_rate": 9.246882948820038e-06, + "loss": 0.26005599, + "memory(GiB)": 15.04, + "step": 8135, + "train_speed(iter/s)": 0.333909 + }, + { + "acc": 0.91448956, + "epoch": 1.0962962962962963, + "grad_norm": 10.625, + "learning_rate": 9.235780169041702e-06, + "loss": 0.28071885, + "memory(GiB)": 15.04, + "step": 8140, + "train_speed(iter/s)": 0.333926 + }, + { + "acc": 0.92785759, + "epoch": 1.096969696969697, + "grad_norm": 15.4375, + "learning_rate": 9.22467833678401e-06, + "loss": 0.24446368, + "memory(GiB)": 15.04, + "step": 8145, + "train_speed(iter/s)": 0.333966 + }, + { + "acc": 0.93406315, + "epoch": 1.0976430976430978, + "grad_norm": 6.40625, + "learning_rate": 9.2135774658116e-06, + "loss": 0.23972368, + "memory(GiB)": 15.04, + "step": 8150, + "train_speed(iter/s)": 0.333988 + }, + { + "acc": 0.8919508, + "epoch": 1.0983164983164984, + "grad_norm": 7.34375, + "learning_rate": 9.202477569887932e-06, + "loss": 0.41952968, + "memory(GiB)": 15.04, + "step": 8155, + "train_speed(iter/s)": 0.33394 + }, + { + "acc": 0.90486822, + "epoch": 1.098989898989899, + "grad_norm": 7.65625, + "learning_rate": 9.191378662775253e-06, + "loss": 0.27907114, + "memory(GiB)": 15.04, + "step": 8160, + "train_speed(iter/s)": 0.333987 + }, + { + "acc": 0.94088945, + "epoch": 1.0996632996632996, + "grad_norm": 5.71875, + "learning_rate": 9.180280758234575e-06, + "loss": 0.19171976, + "memory(GiB)": 15.04, + "step": 8165, + "train_speed(iter/s)": 0.333981 + }, + { + "acc": 0.91611347, + "epoch": 1.1003367003367004, + "grad_norm": 6.03125, + "learning_rate": 9.169183870025682e-06, + "loss": 0.2421226, + "memory(GiB)": 15.04, + "step": 8170, + "train_speed(iter/s)": 0.334011 + }, + { + "acc": 0.91907644, + "epoch": 1.101010101010101, + "grad_norm": 8.1875, + "learning_rate": 9.158088011907081e-06, + "loss": 0.33632712, + "memory(GiB)": 15.04, + "step": 8175, + "train_speed(iter/s)": 0.333984 + }, + { + "acc": 0.8622076, + "epoch": 1.1016835016835016, + "grad_norm": 15.875, + "learning_rate": 9.146993197636015e-06, + "loss": 0.46791787, + "memory(GiB)": 15.04, + "step": 8180, + "train_speed(iter/s)": 0.333996 + }, + { + "acc": 0.88847284, + "epoch": 1.1023569023569024, + "grad_norm": 11.625, + "learning_rate": 9.135899440968435e-06, + "loss": 0.40275388, + "memory(GiB)": 15.04, + "step": 8185, + "train_speed(iter/s)": 0.334046 + }, + { + "acc": 0.92154865, + "epoch": 1.103030303030303, + "grad_norm": 6.65625, + "learning_rate": 9.12480675565896e-06, + "loss": 0.30134928, + "memory(GiB)": 15.04, + "step": 8190, + "train_speed(iter/s)": 0.334075 + }, + { + "acc": 0.91268005, + "epoch": 1.1037037037037036, + "grad_norm": 7.8125, + "learning_rate": 9.11371515546091e-06, + "loss": 0.21697178, + "memory(GiB)": 15.04, + "step": 8195, + "train_speed(iter/s)": 0.334113 + }, + { + "acc": 0.91839352, + "epoch": 1.1043771043771045, + "grad_norm": 7.875, + "learning_rate": 9.10262465412623e-06, + "loss": 0.23478787, + "memory(GiB)": 15.04, + "step": 8200, + "train_speed(iter/s)": 0.334159 + }, + { + "acc": 0.94088354, + "epoch": 1.105050505050505, + "grad_norm": 8.1875, + "learning_rate": 9.091535265405528e-06, + "loss": 0.22988636, + "memory(GiB)": 15.04, + "step": 8205, + "train_speed(iter/s)": 0.334196 + }, + { + "acc": 0.83029928, + "epoch": 1.1057239057239057, + "grad_norm": 7.125, + "learning_rate": 9.080447003048016e-06, + "loss": 0.7598547, + "memory(GiB)": 15.04, + "step": 8210, + "train_speed(iter/s)": 0.334215 + }, + { + "acc": 0.90943527, + "epoch": 1.1063973063973065, + "grad_norm": 9.0, + "learning_rate": 9.069359880801518e-06, + "loss": 0.28078151, + "memory(GiB)": 15.04, + "step": 8215, + "train_speed(iter/s)": 0.334258 + }, + { + "acc": 0.91230097, + "epoch": 1.107070707070707, + "grad_norm": 5.625, + "learning_rate": 9.05827391241244e-06, + "loss": 0.28510058, + "memory(GiB)": 15.04, + "step": 8220, + "train_speed(iter/s)": 0.33424 + }, + { + "acc": 0.92466831, + "epoch": 1.1077441077441077, + "grad_norm": 6.34375, + "learning_rate": 9.04718911162576e-06, + "loss": 0.24214656, + "memory(GiB)": 15.04, + "step": 8225, + "train_speed(iter/s)": 0.334284 + }, + { + "acc": 0.91876364, + "epoch": 1.1084175084175083, + "grad_norm": 4.4375, + "learning_rate": 9.036105492185003e-06, + "loss": 0.22657027, + "memory(GiB)": 15.04, + "step": 8230, + "train_speed(iter/s)": 0.334324 + }, + { + "acc": 0.86891813, + "epoch": 1.1090909090909091, + "grad_norm": 14.0625, + "learning_rate": 9.025023067832239e-06, + "loss": 0.31826494, + "memory(GiB)": 15.04, + "step": 8235, + "train_speed(iter/s)": 0.334355 + }, + { + "acc": 0.93562517, + "epoch": 1.1097643097643097, + "grad_norm": 5.3125, + "learning_rate": 9.013941852308046e-06, + "loss": 0.17317027, + "memory(GiB)": 15.04, + "step": 8240, + "train_speed(iter/s)": 0.334385 + }, + { + "acc": 0.88516407, + "epoch": 1.1104377104377103, + "grad_norm": 12.75, + "learning_rate": 9.00286185935151e-06, + "loss": 0.35928683, + "memory(GiB)": 15.04, + "step": 8245, + "train_speed(iter/s)": 0.334401 + }, + { + "acc": 0.88753223, + "epoch": 1.1111111111111112, + "grad_norm": 10.0, + "learning_rate": 8.991783102700203e-06, + "loss": 0.60402765, + "memory(GiB)": 15.04, + "step": 8250, + "train_speed(iter/s)": 0.334426 + }, + { + "acc": 0.94273224, + "epoch": 1.1117845117845118, + "grad_norm": 7.5, + "learning_rate": 8.980705596090154e-06, + "loss": 0.24888756, + "memory(GiB)": 15.04, + "step": 8255, + "train_speed(iter/s)": 0.334436 + }, + { + "acc": 0.93126106, + "epoch": 1.1124579124579124, + "grad_norm": 6.59375, + "learning_rate": 8.969629353255855e-06, + "loss": 0.45041056, + "memory(GiB)": 15.04, + "step": 8260, + "train_speed(iter/s)": 0.334467 + }, + { + "acc": 0.92297277, + "epoch": 1.1131313131313132, + "grad_norm": 14.4375, + "learning_rate": 8.958554387930216e-06, + "loss": 0.27016611, + "memory(GiB)": 15.04, + "step": 8265, + "train_speed(iter/s)": 0.3345 + }, + { + "acc": 0.90376577, + "epoch": 1.1138047138047138, + "grad_norm": 8.0625, + "learning_rate": 8.947480713844578e-06, + "loss": 0.31735523, + "memory(GiB)": 15.04, + "step": 8270, + "train_speed(iter/s)": 0.334519 + }, + { + "acc": 0.91011925, + "epoch": 1.1144781144781144, + "grad_norm": 7.15625, + "learning_rate": 8.936408344728676e-06, + "loss": 0.33862257, + "memory(GiB)": 15.04, + "step": 8275, + "train_speed(iter/s)": 0.334533 + }, + { + "acc": 0.87243767, + "epoch": 1.1151515151515152, + "grad_norm": 11.875, + "learning_rate": 8.92533729431062e-06, + "loss": 0.65410371, + "memory(GiB)": 15.04, + "step": 8280, + "train_speed(iter/s)": 0.334538 + }, + { + "acc": 0.91496019, + "epoch": 1.1158249158249158, + "grad_norm": 12.25, + "learning_rate": 8.914267576316898e-06, + "loss": 0.35794642, + "memory(GiB)": 15.04, + "step": 8285, + "train_speed(iter/s)": 0.334567 + }, + { + "acc": 0.82232246, + "epoch": 1.1164983164983164, + "grad_norm": 9.375, + "learning_rate": 8.903199204472329e-06, + "loss": 0.51466088, + "memory(GiB)": 15.04, + "step": 8290, + "train_speed(iter/s)": 0.334615 + }, + { + "acc": 0.9278451, + "epoch": 1.1171717171717173, + "grad_norm": 9.125, + "learning_rate": 8.892132192500082e-06, + "loss": 0.28508511, + "memory(GiB)": 15.04, + "step": 8295, + "train_speed(iter/s)": 0.334637 + }, + { + "acc": 0.84856863, + "epoch": 1.1178451178451179, + "grad_norm": 5.3125, + "learning_rate": 8.881066554121625e-06, + "loss": 0.67713985, + "memory(GiB)": 15.04, + "step": 8300, + "train_speed(iter/s)": 0.334639 + }, + { + "acc": 0.92465773, + "epoch": 1.1185185185185185, + "grad_norm": 12.4375, + "learning_rate": 8.87000230305673e-06, + "loss": 0.34076834, + "memory(GiB)": 15.04, + "step": 8305, + "train_speed(iter/s)": 0.334669 + }, + { + "acc": 0.8347146, + "epoch": 1.1191919191919193, + "grad_norm": 26.375, + "learning_rate": 8.85893945302345e-06, + "loss": 0.42291145, + "memory(GiB)": 15.04, + "step": 8310, + "train_speed(iter/s)": 0.334698 + }, + { + "acc": 0.90365276, + "epoch": 1.11986531986532, + "grad_norm": 6.90625, + "learning_rate": 8.847878017738097e-06, + "loss": 0.34632828, + "memory(GiB)": 15.04, + "step": 8315, + "train_speed(iter/s)": 0.334721 + }, + { + "acc": 0.91383142, + "epoch": 1.1205387205387205, + "grad_norm": 20.625, + "learning_rate": 8.836818010915226e-06, + "loss": 0.36084635, + "memory(GiB)": 15.04, + "step": 8320, + "train_speed(iter/s)": 0.334764 + }, + { + "acc": 0.92835274, + "epoch": 1.121212121212121, + "grad_norm": 9.4375, + "learning_rate": 8.825759446267634e-06, + "loss": 0.25349114, + "memory(GiB)": 15.04, + "step": 8325, + "train_speed(iter/s)": 0.334799 + }, + { + "acc": 0.89334583, + "epoch": 1.121885521885522, + "grad_norm": 5.5625, + "learning_rate": 8.814702337506311e-06, + "loss": 0.5174253, + "memory(GiB)": 15.04, + "step": 8330, + "train_speed(iter/s)": 0.33481 + }, + { + "acc": 0.88716631, + "epoch": 1.1225589225589225, + "grad_norm": 16.25, + "learning_rate": 8.803646698340463e-06, + "loss": 0.27215698, + "memory(GiB)": 15.04, + "step": 8335, + "train_speed(iter/s)": 0.334855 + }, + { + "acc": 0.8864501, + "epoch": 1.1232323232323231, + "grad_norm": 6.84375, + "learning_rate": 8.792592542477451e-06, + "loss": 0.34078074, + "memory(GiB)": 15.04, + "step": 8340, + "train_speed(iter/s)": 0.334895 + }, + { + "acc": 0.89195995, + "epoch": 1.123905723905724, + "grad_norm": 11.25, + "learning_rate": 8.781539883622818e-06, + "loss": 0.46099796, + "memory(GiB)": 15.04, + "step": 8345, + "train_speed(iter/s)": 0.334932 + }, + { + "acc": 0.84340038, + "epoch": 1.1245791245791246, + "grad_norm": 13.1875, + "learning_rate": 8.770488735480244e-06, + "loss": 0.60841222, + "memory(GiB)": 15.04, + "step": 8350, + "train_speed(iter/s)": 0.334954 + }, + { + "acc": 0.82589006, + "epoch": 1.1252525252525252, + "grad_norm": 9.6875, + "learning_rate": 8.759439111751523e-06, + "loss": 0.52863579, + "memory(GiB)": 15.04, + "step": 8355, + "train_speed(iter/s)": 0.334984 + }, + { + "acc": 0.89610262, + "epoch": 1.125925925925926, + "grad_norm": 10.0625, + "learning_rate": 8.748391026136582e-06, + "loss": 0.36142957, + "memory(GiB)": 15.04, + "step": 8360, + "train_speed(iter/s)": 0.335005 + }, + { + "acc": 0.90552168, + "epoch": 1.1265993265993266, + "grad_norm": 5.40625, + "learning_rate": 8.737344492333417e-06, + "loss": 0.34427059, + "memory(GiB)": 15.04, + "step": 8365, + "train_speed(iter/s)": 0.334991 + }, + { + "acc": 0.91735144, + "epoch": 1.1272727272727272, + "grad_norm": 8.0625, + "learning_rate": 8.72629952403812e-06, + "loss": 0.26472263, + "memory(GiB)": 15.04, + "step": 8370, + "train_speed(iter/s)": 0.335022 + }, + { + "acc": 0.84729338, + "epoch": 1.127946127946128, + "grad_norm": 20.125, + "learning_rate": 8.715256134944831e-06, + "loss": 0.56816874, + "memory(GiB)": 15.04, + "step": 8375, + "train_speed(iter/s)": 0.335051 + }, + { + "acc": 0.91038513, + "epoch": 1.1286195286195286, + "grad_norm": 14.0625, + "learning_rate": 8.704214338745735e-06, + "loss": 0.29438369, + "memory(GiB)": 15.04, + "step": 8380, + "train_speed(iter/s)": 0.335078 + }, + { + "acc": 0.82746258, + "epoch": 1.1292929292929292, + "grad_norm": 7.59375, + "learning_rate": 8.693174149131042e-06, + "loss": 0.83370857, + "memory(GiB)": 15.04, + "step": 8385, + "train_speed(iter/s)": 0.335125 + }, + { + "acc": 0.88657513, + "epoch": 1.12996632996633, + "grad_norm": 12.75, + "learning_rate": 8.68213557978897e-06, + "loss": 0.42116256, + "memory(GiB)": 15.04, + "step": 8390, + "train_speed(iter/s)": 0.335136 + }, + { + "acc": 0.91205168, + "epoch": 1.1306397306397307, + "grad_norm": 11.3125, + "learning_rate": 8.671098644405726e-06, + "loss": 0.43715248, + "memory(GiB)": 15.04, + "step": 8395, + "train_speed(iter/s)": 0.335124 + }, + { + "acc": 0.85746775, + "epoch": 1.1313131313131313, + "grad_norm": 8.625, + "learning_rate": 8.660063356665498e-06, + "loss": 0.60507174, + "memory(GiB)": 15.04, + "step": 8400, + "train_speed(iter/s)": 0.335164 + }, + { + "epoch": 1.1313131313131313, + "eval_acc": 0.8942114714270625, + "eval_loss": 0.4126920700073242, + "eval_runtime": 109.7133, + "eval_samples_per_second": 1.367, + "eval_steps_per_second": 1.367, + "step": 8400 + }, + { + "acc": 0.89846535, + "epoch": 1.131986531986532, + "grad_norm": 10.4375, + "learning_rate": 8.649029730250418e-06, + "loss": 0.4828084, + "memory(GiB)": 15.04, + "step": 8405, + "train_speed(iter/s)": 0.333698 + }, + { + "acc": 0.84215689, + "epoch": 1.1326599326599327, + "grad_norm": 9.3125, + "learning_rate": 8.637997778840577e-06, + "loss": 0.51783433, + "memory(GiB)": 15.04, + "step": 8410, + "train_speed(iter/s)": 0.333741 + }, + { + "acc": 0.91730366, + "epoch": 1.1333333333333333, + "grad_norm": 14.0625, + "learning_rate": 8.626967516113968e-06, + "loss": 0.29312508, + "memory(GiB)": 15.04, + "step": 8415, + "train_speed(iter/s)": 0.333762 + }, + { + "acc": 0.92496166, + "epoch": 1.134006734006734, + "grad_norm": 5.5, + "learning_rate": 8.615938955746508e-06, + "loss": 0.37733843, + "memory(GiB)": 15.04, + "step": 8420, + "train_speed(iter/s)": 0.333765 + }, + { + "acc": 0.8840188, + "epoch": 1.1346801346801347, + "grad_norm": 7.53125, + "learning_rate": 8.604912111411998e-06, + "loss": 0.33286364, + "memory(GiB)": 15.04, + "step": 8425, + "train_speed(iter/s)": 0.333805 + }, + { + "acc": 0.92879047, + "epoch": 1.1353535353535353, + "grad_norm": 8.8125, + "learning_rate": 8.5938869967821e-06, + "loss": 0.26940417, + "memory(GiB)": 15.04, + "step": 8430, + "train_speed(iter/s)": 0.333816 + }, + { + "acc": 0.84250507, + "epoch": 1.136026936026936, + "grad_norm": 23.625, + "learning_rate": 8.582863625526351e-06, + "loss": 0.83595047, + "memory(GiB)": 15.04, + "step": 8435, + "train_speed(iter/s)": 0.333845 + }, + { + "acc": 0.90578241, + "epoch": 1.1367003367003368, + "grad_norm": 6.75, + "learning_rate": 8.571842011312111e-06, + "loss": 0.42703586, + "memory(GiB)": 15.04, + "step": 8440, + "train_speed(iter/s)": 0.333846 + }, + { + "acc": 0.92058296, + "epoch": 1.1373737373737374, + "grad_norm": 5.21875, + "learning_rate": 8.560822167804567e-06, + "loss": 0.27597072, + "memory(GiB)": 15.04, + "step": 8445, + "train_speed(iter/s)": 0.333846 + }, + { + "acc": 0.91123238, + "epoch": 1.138047138047138, + "grad_norm": 18.875, + "learning_rate": 8.549804108666717e-06, + "loss": 0.40116172, + "memory(GiB)": 15.04, + "step": 8450, + "train_speed(iter/s)": 0.333879 + }, + { + "acc": 0.92618265, + "epoch": 1.1387205387205388, + "grad_norm": 9.6875, + "learning_rate": 8.538787847559332e-06, + "loss": 0.25792506, + "memory(GiB)": 15.04, + "step": 8455, + "train_speed(iter/s)": 0.333916 + }, + { + "acc": 0.92828245, + "epoch": 1.1393939393939394, + "grad_norm": 6.59375, + "learning_rate": 8.52777339814097e-06, + "loss": 0.19393549, + "memory(GiB)": 15.04, + "step": 8460, + "train_speed(iter/s)": 0.333945 + }, + { + "acc": 0.92836123, + "epoch": 1.14006734006734, + "grad_norm": 6.4375, + "learning_rate": 8.516760774067927e-06, + "loss": 0.31603966, + "memory(GiB)": 15.04, + "step": 8465, + "train_speed(iter/s)": 0.333983 + }, + { + "acc": 0.94478445, + "epoch": 1.1407407407407408, + "grad_norm": 6.5, + "learning_rate": 8.505749988994247e-06, + "loss": 0.24762452, + "memory(GiB)": 15.04, + "step": 8470, + "train_speed(iter/s)": 0.333978 + }, + { + "acc": 0.83944941, + "epoch": 1.1414141414141414, + "grad_norm": 11.375, + "learning_rate": 8.494741056571693e-06, + "loss": 0.82006397, + "memory(GiB)": 15.04, + "step": 8475, + "train_speed(iter/s)": 0.333993 + }, + { + "acc": 0.89516182, + "epoch": 1.142087542087542, + "grad_norm": 6.75, + "learning_rate": 8.483733990449725e-06, + "loss": 0.38534896, + "memory(GiB)": 15.04, + "step": 8480, + "train_speed(iter/s)": 0.334006 + }, + { + "acc": 0.8848381, + "epoch": 1.1427609427609426, + "grad_norm": 9.0625, + "learning_rate": 8.472728804275496e-06, + "loss": 0.41047716, + "memory(GiB)": 15.04, + "step": 8485, + "train_speed(iter/s)": 0.334045 + }, + { + "acc": 0.7664793, + "epoch": 1.1434343434343435, + "grad_norm": 20.625, + "learning_rate": 8.46172551169382e-06, + "loss": 0.73766646, + "memory(GiB)": 15.04, + "step": 8490, + "train_speed(iter/s)": 0.334091 + }, + { + "acc": 0.82757874, + "epoch": 1.144107744107744, + "grad_norm": 11.0, + "learning_rate": 8.450724126347169e-06, + "loss": 0.59155359, + "memory(GiB)": 15.04, + "step": 8495, + "train_speed(iter/s)": 0.334104 + }, + { + "acc": 0.91042652, + "epoch": 1.144781144781145, + "grad_norm": 3.6875, + "learning_rate": 8.439724661875657e-06, + "loss": 0.31208146, + "memory(GiB)": 15.04, + "step": 8500, + "train_speed(iter/s)": 0.334089 + }, + { + "acc": 0.95034885, + "epoch": 1.1454545454545455, + "grad_norm": 5.09375, + "learning_rate": 8.428727131916996e-06, + "loss": 0.19542702, + "memory(GiB)": 15.04, + "step": 8505, + "train_speed(iter/s)": 0.33408 + }, + { + "acc": 0.90736942, + "epoch": 1.146127946127946, + "grad_norm": 8.8125, + "learning_rate": 8.417731550106526e-06, + "loss": 0.40063429, + "memory(GiB)": 15.04, + "step": 8510, + "train_speed(iter/s)": 0.33411 + }, + { + "acc": 0.89917545, + "epoch": 1.1468013468013467, + "grad_norm": 7.46875, + "learning_rate": 8.406737930077143e-06, + "loss": 0.36739912, + "memory(GiB)": 15.04, + "step": 8515, + "train_speed(iter/s)": 0.334111 + }, + { + "acc": 0.90421133, + "epoch": 1.1474747474747475, + "grad_norm": 11.6875, + "learning_rate": 8.395746285459333e-06, + "loss": 0.31956413, + "memory(GiB)": 15.04, + "step": 8520, + "train_speed(iter/s)": 0.334126 + }, + { + "acc": 0.88792953, + "epoch": 1.1481481481481481, + "grad_norm": 18.0, + "learning_rate": 8.38475662988113e-06, + "loss": 0.60335183, + "memory(GiB)": 15.04, + "step": 8525, + "train_speed(iter/s)": 0.334166 + }, + { + "acc": 0.87934303, + "epoch": 1.1488215488215487, + "grad_norm": 7.1875, + "learning_rate": 8.373768976968088e-06, + "loss": 0.53611398, + "memory(GiB)": 15.04, + "step": 8530, + "train_speed(iter/s)": 0.334193 + }, + { + "acc": 0.94821157, + "epoch": 1.1494949494949496, + "grad_norm": 6.40625, + "learning_rate": 8.362783340343294e-06, + "loss": 0.19109724, + "memory(GiB)": 15.04, + "step": 8535, + "train_speed(iter/s)": 0.334229 + }, + { + "acc": 0.85498476, + "epoch": 1.1501683501683502, + "grad_norm": 4.78125, + "learning_rate": 8.351799733627322e-06, + "loss": 0.26411538, + "memory(GiB)": 15.04, + "step": 8540, + "train_speed(iter/s)": 0.33427 + }, + { + "acc": 0.94778214, + "epoch": 1.1508417508417508, + "grad_norm": 7.34375, + "learning_rate": 8.340818170438239e-06, + "loss": 0.19438212, + "memory(GiB)": 15.04, + "step": 8545, + "train_speed(iter/s)": 0.334308 + }, + { + "acc": 0.93892117, + "epoch": 1.1515151515151516, + "grad_norm": 5.34375, + "learning_rate": 8.329838664391578e-06, + "loss": 0.26018372, + "memory(GiB)": 15.04, + "step": 8550, + "train_speed(iter/s)": 0.33432 + }, + { + "acc": 0.872229, + "epoch": 1.1521885521885522, + "grad_norm": 11.4375, + "learning_rate": 8.318861229100309e-06, + "loss": 0.30497761, + "memory(GiB)": 15.04, + "step": 8555, + "train_speed(iter/s)": 0.334359 + }, + { + "acc": 0.87170715, + "epoch": 1.1528619528619528, + "grad_norm": 7.875, + "learning_rate": 8.307885878174853e-06, + "loss": 0.37016969, + "memory(GiB)": 15.04, + "step": 8560, + "train_speed(iter/s)": 0.334365 + }, + { + "acc": 0.86693535, + "epoch": 1.1535353535353536, + "grad_norm": 7.21875, + "learning_rate": 8.296912625223034e-06, + "loss": 0.58177495, + "memory(GiB)": 15.04, + "step": 8565, + "train_speed(iter/s)": 0.334398 + }, + { + "acc": 0.85612116, + "epoch": 1.1542087542087542, + "grad_norm": 6.90625, + "learning_rate": 8.285941483850073e-06, + "loss": 0.5755075, + "memory(GiB)": 15.04, + "step": 8570, + "train_speed(iter/s)": 0.334441 + }, + { + "acc": 0.87171307, + "epoch": 1.1548821548821548, + "grad_norm": 6.15625, + "learning_rate": 8.274972467658589e-06, + "loss": 0.52622843, + "memory(GiB)": 15.04, + "step": 8575, + "train_speed(iter/s)": 0.334449 + }, + { + "acc": 0.93473549, + "epoch": 1.1555555555555554, + "grad_norm": 6.21875, + "learning_rate": 8.264005590248544e-06, + "loss": 0.20657101, + "memory(GiB)": 15.04, + "step": 8580, + "train_speed(iter/s)": 0.334462 + }, + { + "acc": 0.91747732, + "epoch": 1.1562289562289563, + "grad_norm": 6.90625, + "learning_rate": 8.253040865217269e-06, + "loss": 0.31283977, + "memory(GiB)": 15.04, + "step": 8585, + "train_speed(iter/s)": 0.334494 + }, + { + "acc": 0.90545425, + "epoch": 1.1569023569023569, + "grad_norm": 5.625, + "learning_rate": 8.242078306159408e-06, + "loss": 0.34917042, + "memory(GiB)": 15.04, + "step": 8590, + "train_speed(iter/s)": 0.334515 + }, + { + "acc": 0.85144119, + "epoch": 1.1575757575757575, + "grad_norm": 14.625, + "learning_rate": 8.231117926666932e-06, + "loss": 0.37468622, + "memory(GiB)": 15.04, + "step": 8595, + "train_speed(iter/s)": 0.334544 + }, + { + "acc": 0.83755226, + "epoch": 1.1582491582491583, + "grad_norm": 17.25, + "learning_rate": 8.220159740329113e-06, + "loss": 0.25008359, + "memory(GiB)": 15.04, + "step": 8600, + "train_speed(iter/s)": 0.334571 + }, + { + "acc": 0.84674406, + "epoch": 1.158922558922559, + "grad_norm": 8.375, + "learning_rate": 8.209203760732483e-06, + "loss": 0.34826465, + "memory(GiB)": 15.04, + "step": 8605, + "train_speed(iter/s)": 0.334597 + }, + { + "acc": 0.9512723, + "epoch": 1.1595959595959595, + "grad_norm": 11.9375, + "learning_rate": 8.198250001460867e-06, + "loss": 0.18209766, + "memory(GiB)": 15.04, + "step": 8610, + "train_speed(iter/s)": 0.334635 + }, + { + "acc": 0.91602087, + "epoch": 1.1602693602693603, + "grad_norm": 21.0, + "learning_rate": 8.187298476095308e-06, + "loss": 0.38796551, + "memory(GiB)": 15.04, + "step": 8615, + "train_speed(iter/s)": 0.334674 + }, + { + "acc": 0.92153492, + "epoch": 1.160942760942761, + "grad_norm": 5.25, + "learning_rate": 8.1763491982141e-06, + "loss": 0.25587325, + "memory(GiB)": 15.04, + "step": 8620, + "train_speed(iter/s)": 0.334709 + }, + { + "acc": 0.89310303, + "epoch": 1.1616161616161615, + "grad_norm": 6.34375, + "learning_rate": 8.165402181392748e-06, + "loss": 0.3570641, + "memory(GiB)": 15.04, + "step": 8625, + "train_speed(iter/s)": 0.334702 + }, + { + "acc": 0.91990938, + "epoch": 1.1622895622895624, + "grad_norm": 4.65625, + "learning_rate": 8.154457439203937e-06, + "loss": 0.33330026, + "memory(GiB)": 15.04, + "step": 8630, + "train_speed(iter/s)": 0.334722 + }, + { + "acc": 0.88193007, + "epoch": 1.162962962962963, + "grad_norm": 11.75, + "learning_rate": 8.14351498521756e-06, + "loss": 0.21499176, + "memory(GiB)": 15.04, + "step": 8635, + "train_speed(iter/s)": 0.334763 + }, + { + "acc": 0.94585333, + "epoch": 1.1636363636363636, + "grad_norm": 4.71875, + "learning_rate": 8.132574833000642e-06, + "loss": 0.16870201, + "memory(GiB)": 15.04, + "step": 8640, + "train_speed(iter/s)": 0.334807 + }, + { + "acc": 0.89892464, + "epoch": 1.1643097643097644, + "grad_norm": 8.9375, + "learning_rate": 8.121636996117377e-06, + "loss": 0.3604188, + "memory(GiB)": 15.04, + "step": 8645, + "train_speed(iter/s)": 0.334838 + }, + { + "acc": 0.85911007, + "epoch": 1.164983164983165, + "grad_norm": 6.25, + "learning_rate": 8.11070148812908e-06, + "loss": 0.51609712, + "memory(GiB)": 15.04, + "step": 8650, + "train_speed(iter/s)": 0.334873 + }, + { + "acc": 0.91215897, + "epoch": 1.1656565656565656, + "grad_norm": 9.5625, + "learning_rate": 8.099768322594178e-06, + "loss": 0.24298859, + "memory(GiB)": 15.04, + "step": 8655, + "train_speed(iter/s)": 0.334911 + }, + { + "acc": 0.86823225, + "epoch": 1.1663299663299664, + "grad_norm": 4.1875, + "learning_rate": 8.088837513068192e-06, + "loss": 0.30021667, + "memory(GiB)": 15.04, + "step": 8660, + "train_speed(iter/s)": 0.334917 + }, + { + "acc": 0.93826561, + "epoch": 1.167003367003367, + "grad_norm": 15.75, + "learning_rate": 8.07790907310373e-06, + "loss": 0.22655346, + "memory(GiB)": 15.04, + "step": 8665, + "train_speed(iter/s)": 0.334947 + }, + { + "acc": 0.9213007, + "epoch": 1.1676767676767676, + "grad_norm": 12.625, + "learning_rate": 8.06698301625045e-06, + "loss": 0.25660739, + "memory(GiB)": 15.04, + "step": 8670, + "train_speed(iter/s)": 0.334975 + }, + { + "acc": 0.90065212, + "epoch": 1.1683501683501682, + "grad_norm": 8.3125, + "learning_rate": 8.056059356055072e-06, + "loss": 0.36666379, + "memory(GiB)": 15.04, + "step": 8675, + "train_speed(iter/s)": 0.335 + }, + { + "acc": 0.90083942, + "epoch": 1.169023569023569, + "grad_norm": 9.6875, + "learning_rate": 8.045138106061323e-06, + "loss": 0.32792065, + "memory(GiB)": 15.04, + "step": 8680, + "train_speed(iter/s)": 0.335016 + }, + { + "acc": 0.89946604, + "epoch": 1.1696969696969697, + "grad_norm": 13.8125, + "learning_rate": 8.034219279809959e-06, + "loss": 0.51767335, + "memory(GiB)": 15.04, + "step": 8685, + "train_speed(iter/s)": 0.334977 + }, + { + "acc": 0.88581705, + "epoch": 1.1703703703703703, + "grad_norm": 13.4375, + "learning_rate": 8.023302890838729e-06, + "loss": 0.39377327, + "memory(GiB)": 15.04, + "step": 8690, + "train_speed(iter/s)": 0.335001 + }, + { + "acc": 0.89919348, + "epoch": 1.171043771043771, + "grad_norm": 10.25, + "learning_rate": 8.012388952682345e-06, + "loss": 0.40560446, + "memory(GiB)": 15.04, + "step": 8695, + "train_speed(iter/s)": 0.335028 + }, + { + "acc": 0.94321012, + "epoch": 1.1717171717171717, + "grad_norm": 5.84375, + "learning_rate": 8.001477478872504e-06, + "loss": 0.20610557, + "memory(GiB)": 15.04, + "step": 8700, + "train_speed(iter/s)": 0.335056 + }, + { + "epoch": 1.1717171717171717, + "eval_acc": 0.8943338839508115, + "eval_loss": 0.41293954849243164, + "eval_runtime": 109.9265, + "eval_samples_per_second": 1.365, + "eval_steps_per_second": 1.365, + "step": 8700 + }, + { + "acc": 0.91216431, + "epoch": 1.1723905723905723, + "grad_norm": 7.59375, + "learning_rate": 7.990568482937826e-06, + "loss": 0.28307076, + "memory(GiB)": 15.04, + "step": 8705, + "train_speed(iter/s)": 0.333645 + }, + { + "acc": 0.84530878, + "epoch": 1.1730639730639731, + "grad_norm": 12.8125, + "learning_rate": 7.97966197840387e-06, + "loss": 0.58879704, + "memory(GiB)": 15.04, + "step": 8710, + "train_speed(iter/s)": 0.333696 + }, + { + "acc": 0.87714481, + "epoch": 1.1737373737373737, + "grad_norm": 13.5, + "learning_rate": 7.968757978793111e-06, + "loss": 0.53552775, + "memory(GiB)": 15.04, + "step": 8715, + "train_speed(iter/s)": 0.333733 + }, + { + "acc": 0.92908802, + "epoch": 1.1744107744107743, + "grad_norm": 8.9375, + "learning_rate": 7.9578564976249e-06, + "loss": 0.23767047, + "memory(GiB)": 15.04, + "step": 8720, + "train_speed(iter/s)": 0.33376 + }, + { + "acc": 0.88402081, + "epoch": 1.1750841750841752, + "grad_norm": 20.375, + "learning_rate": 7.946957548415488e-06, + "loss": 0.44677601, + "memory(GiB)": 15.04, + "step": 8725, + "train_speed(iter/s)": 0.33379 + }, + { + "acc": 0.91932278, + "epoch": 1.1757575757575758, + "grad_norm": 12.375, + "learning_rate": 7.936061144677964e-06, + "loss": 0.32225776, + "memory(GiB)": 15.04, + "step": 8730, + "train_speed(iter/s)": 0.333818 + }, + { + "acc": 0.91353588, + "epoch": 1.1764309764309764, + "grad_norm": 9.1875, + "learning_rate": 7.92516729992228e-06, + "loss": 0.29596422, + "memory(GiB)": 15.04, + "step": 8735, + "train_speed(iter/s)": 0.33384 + }, + { + "acc": 0.92825842, + "epoch": 1.177104377104377, + "grad_norm": 8.9375, + "learning_rate": 7.914276027655208e-06, + "loss": 0.27144384, + "memory(GiB)": 15.04, + "step": 8740, + "train_speed(iter/s)": 0.333883 + }, + { + "acc": 0.94485922, + "epoch": 1.1777777777777778, + "grad_norm": 6.03125, + "learning_rate": 7.903387341380325e-06, + "loss": 0.25880239, + "memory(GiB)": 15.04, + "step": 8745, + "train_speed(iter/s)": 0.333909 + }, + { + "acc": 0.86168137, + "epoch": 1.1784511784511784, + "grad_norm": 8.75, + "learning_rate": 7.892501254598011e-06, + "loss": 0.31010644, + "memory(GiB)": 15.04, + "step": 8750, + "train_speed(iter/s)": 0.33393 + }, + { + "acc": 0.85700951, + "epoch": 1.1791245791245792, + "grad_norm": 9.5625, + "learning_rate": 7.881617780805419e-06, + "loss": 0.78340473, + "memory(GiB)": 15.04, + "step": 8755, + "train_speed(iter/s)": 0.333984 + }, + { + "acc": 0.9334198, + "epoch": 1.1797979797979798, + "grad_norm": 11.875, + "learning_rate": 7.870736933496457e-06, + "loss": 0.29740202, + "memory(GiB)": 15.04, + "step": 8760, + "train_speed(iter/s)": 0.33403 + }, + { + "acc": 0.89856739, + "epoch": 1.1804713804713804, + "grad_norm": 14.625, + "learning_rate": 7.85985872616179e-06, + "loss": 0.28176606, + "memory(GiB)": 15.04, + "step": 8765, + "train_speed(iter/s)": 0.334055 + }, + { + "acc": 0.89273281, + "epoch": 1.181144781144781, + "grad_norm": 6.4375, + "learning_rate": 7.848983172288796e-06, + "loss": 0.36913018, + "memory(GiB)": 15.04, + "step": 8770, + "train_speed(iter/s)": 0.334086 + }, + { + "acc": 0.9282836, + "epoch": 1.1818181818181819, + "grad_norm": 8.0, + "learning_rate": 7.83811028536157e-06, + "loss": 0.26287441, + "memory(GiB)": 15.04, + "step": 8775, + "train_speed(iter/s)": 0.334108 + }, + { + "acc": 0.8915103, + "epoch": 1.1824915824915825, + "grad_norm": 4.6875, + "learning_rate": 7.827240078860898e-06, + "loss": 0.23198943, + "memory(GiB)": 15.04, + "step": 8780, + "train_speed(iter/s)": 0.334145 + }, + { + "acc": 0.90011988, + "epoch": 1.183164983164983, + "grad_norm": 12.125, + "learning_rate": 7.816372566264243e-06, + "loss": 0.33637316, + "memory(GiB)": 15.04, + "step": 8785, + "train_speed(iter/s)": 0.334143 + }, + { + "acc": 0.91603937, + "epoch": 1.183838383838384, + "grad_norm": 5.96875, + "learning_rate": 7.805507761045734e-06, + "loss": 0.26426532, + "memory(GiB)": 15.04, + "step": 8790, + "train_speed(iter/s)": 0.334172 + }, + { + "acc": 0.89845304, + "epoch": 1.1845117845117845, + "grad_norm": 13.25, + "learning_rate": 7.794645676676132e-06, + "loss": 0.5090816, + "memory(GiB)": 15.04, + "step": 8795, + "train_speed(iter/s)": 0.334204 + }, + { + "acc": 0.86508207, + "epoch": 1.1851851851851851, + "grad_norm": 6.625, + "learning_rate": 7.783786326622837e-06, + "loss": 0.34729381, + "memory(GiB)": 15.04, + "step": 8800, + "train_speed(iter/s)": 0.334233 + }, + { + "acc": 0.87080355, + "epoch": 1.185858585858586, + "grad_norm": 12.8125, + "learning_rate": 7.772929724349843e-06, + "loss": 0.55419102, + "memory(GiB)": 15.04, + "step": 8805, + "train_speed(iter/s)": 0.334252 + }, + { + "acc": 0.80040865, + "epoch": 1.1865319865319865, + "grad_norm": 12.4375, + "learning_rate": 7.762075883317753e-06, + "loss": 1.01047974, + "memory(GiB)": 15.04, + "step": 8810, + "train_speed(iter/s)": 0.334259 + }, + { + "acc": 0.92868814, + "epoch": 1.1872053872053872, + "grad_norm": 8.25, + "learning_rate": 7.751224816983737e-06, + "loss": 0.24025538, + "memory(GiB)": 15.04, + "step": 8815, + "train_speed(iter/s)": 0.334292 + }, + { + "acc": 0.92831297, + "epoch": 1.187878787878788, + "grad_norm": 7.40625, + "learning_rate": 7.740376538801533e-06, + "loss": 0.2216753, + "memory(GiB)": 15.04, + "step": 8820, + "train_speed(iter/s)": 0.334326 + }, + { + "acc": 0.93432398, + "epoch": 1.1885521885521886, + "grad_norm": 4.34375, + "learning_rate": 7.72953106222141e-06, + "loss": 0.25570393, + "memory(GiB)": 15.04, + "step": 8825, + "train_speed(iter/s)": 0.334316 + }, + { + "acc": 0.87599707, + "epoch": 1.1892255892255892, + "grad_norm": 8.875, + "learning_rate": 7.718688400690174e-06, + "loss": 0.35427902, + "memory(GiB)": 15.04, + "step": 8830, + "train_speed(iter/s)": 0.334355 + }, + { + "acc": 0.91540041, + "epoch": 1.1898989898989898, + "grad_norm": 5.40625, + "learning_rate": 7.707848567651134e-06, + "loss": 0.35271568, + "memory(GiB)": 15.04, + "step": 8835, + "train_speed(iter/s)": 0.334376 + }, + { + "acc": 0.86296587, + "epoch": 1.1905723905723906, + "grad_norm": 6.21875, + "learning_rate": 7.697011576544102e-06, + "loss": 0.34237313, + "memory(GiB)": 15.04, + "step": 8840, + "train_speed(iter/s)": 0.334363 + }, + { + "acc": 0.87622728, + "epoch": 1.1912457912457912, + "grad_norm": 6.09375, + "learning_rate": 7.68617744080535e-06, + "loss": 0.37369969, + "memory(GiB)": 15.04, + "step": 8845, + "train_speed(iter/s)": 0.334401 + }, + { + "acc": 0.92973967, + "epoch": 1.1919191919191918, + "grad_norm": 8.4375, + "learning_rate": 7.675346173867627e-06, + "loss": 0.31743107, + "memory(GiB)": 15.04, + "step": 8850, + "train_speed(iter/s)": 0.334417 + }, + { + "acc": 0.91796322, + "epoch": 1.1925925925925926, + "grad_norm": 4.25, + "learning_rate": 7.664517789160111e-06, + "loss": 0.30589209, + "memory(GiB)": 15.04, + "step": 8855, + "train_speed(iter/s)": 0.334446 + }, + { + "acc": 0.87161016, + "epoch": 1.1932659932659933, + "grad_norm": 5.03125, + "learning_rate": 7.653692300108416e-06, + "loss": 0.46378088, + "memory(GiB)": 15.04, + "step": 8860, + "train_speed(iter/s)": 0.334489 + }, + { + "acc": 0.93547935, + "epoch": 1.1939393939393939, + "grad_norm": 5.96875, + "learning_rate": 7.642869720134567e-06, + "loss": 0.26447537, + "memory(GiB)": 15.04, + "step": 8865, + "train_speed(iter/s)": 0.33453 + }, + { + "acc": 0.88174496, + "epoch": 1.1946127946127947, + "grad_norm": 8.75, + "learning_rate": 7.63205006265697e-06, + "loss": 0.54033194, + "memory(GiB)": 15.04, + "step": 8870, + "train_speed(iter/s)": 0.334539 + }, + { + "acc": 0.8227273, + "epoch": 1.1952861952861953, + "grad_norm": 8.8125, + "learning_rate": 7.621233341090421e-06, + "loss": 0.38368597, + "memory(GiB)": 15.04, + "step": 8875, + "train_speed(iter/s)": 0.334566 + }, + { + "acc": 0.91086254, + "epoch": 1.195959595959596, + "grad_norm": 7.1875, + "learning_rate": 7.6104195688460655e-06, + "loss": 0.27185309, + "memory(GiB)": 15.04, + "step": 8880, + "train_speed(iter/s)": 0.334595 + }, + { + "acc": 0.83424644, + "epoch": 1.1966329966329967, + "grad_norm": 9.75, + "learning_rate": 7.599608759331398e-06, + "loss": 0.70110936, + "memory(GiB)": 15.04, + "step": 8885, + "train_speed(iter/s)": 0.334626 + }, + { + "acc": 0.87613516, + "epoch": 1.1973063973063973, + "grad_norm": 10.75, + "learning_rate": 7.588800925950246e-06, + "loss": 0.35930502, + "memory(GiB)": 15.04, + "step": 8890, + "train_speed(iter/s)": 0.334654 + }, + { + "acc": 0.92512188, + "epoch": 1.197979797979798, + "grad_norm": 10.125, + "learning_rate": 7.577996082102729e-06, + "loss": 0.25872157, + "memory(GiB)": 15.04, + "step": 8895, + "train_speed(iter/s)": 0.334691 + }, + { + "acc": 0.92029266, + "epoch": 1.1986531986531987, + "grad_norm": 5.375, + "learning_rate": 7.567194241185279e-06, + "loss": 0.26382964, + "memory(GiB)": 15.04, + "step": 8900, + "train_speed(iter/s)": 0.334705 + }, + { + "acc": 0.88651953, + "epoch": 1.1993265993265994, + "grad_norm": 15.8125, + "learning_rate": 7.556395416590589e-06, + "loss": 0.59845929, + "memory(GiB)": 15.04, + "step": 8905, + "train_speed(iter/s)": 0.334739 + }, + { + "acc": 0.89042358, + "epoch": 1.2, + "grad_norm": 12.5625, + "learning_rate": 7.545599621707625e-06, + "loss": 0.36238532, + "memory(GiB)": 15.04, + "step": 8910, + "train_speed(iter/s)": 0.334779 + }, + { + "acc": 0.9247798, + "epoch": 1.2006734006734008, + "grad_norm": 11.1875, + "learning_rate": 7.534806869921592e-06, + "loss": 0.25445011, + "memory(GiB)": 15.04, + "step": 8915, + "train_speed(iter/s)": 0.334808 + }, + { + "acc": 0.9430151, + "epoch": 1.2013468013468014, + "grad_norm": 10.5625, + "learning_rate": 7.524017174613916e-06, + "loss": 0.32521975, + "memory(GiB)": 15.04, + "step": 8920, + "train_speed(iter/s)": 0.334847 + }, + { + "acc": 0.91420298, + "epoch": 1.202020202020202, + "grad_norm": 9.3125, + "learning_rate": 7.5132305491622425e-06, + "loss": 0.3544997, + "memory(GiB)": 15.04, + "step": 8925, + "train_speed(iter/s)": 0.334837 + }, + { + "acc": 0.78509068, + "epoch": 1.2026936026936026, + "grad_norm": 6.3125, + "learning_rate": 7.502447006940406e-06, + "loss": 1.27279234, + "memory(GiB)": 15.04, + "step": 8930, + "train_speed(iter/s)": 0.334862 + }, + { + "acc": 0.94311543, + "epoch": 1.2033670033670034, + "grad_norm": 9.125, + "learning_rate": 7.491666561318416e-06, + "loss": 0.19692328, + "memory(GiB)": 15.04, + "step": 8935, + "train_speed(iter/s)": 0.334891 + }, + { + "acc": 0.90329046, + "epoch": 1.204040404040404, + "grad_norm": 6.0625, + "learning_rate": 7.480889225662454e-06, + "loss": 0.26504488, + "memory(GiB)": 15.04, + "step": 8940, + "train_speed(iter/s)": 0.334916 + }, + { + "acc": 0.82259665, + "epoch": 1.2047138047138046, + "grad_norm": 9.4375, + "learning_rate": 7.470115013334829e-06, + "loss": 0.56307988, + "memory(GiB)": 15.04, + "step": 8945, + "train_speed(iter/s)": 0.334962 + }, + { + "acc": 0.90707951, + "epoch": 1.2053872053872055, + "grad_norm": 18.75, + "learning_rate": 7.459343937693992e-06, + "loss": 0.33230534, + "memory(GiB)": 15.04, + "step": 8950, + "train_speed(iter/s)": 0.334993 + }, + { + "acc": 0.8344964, + "epoch": 1.206060606060606, + "grad_norm": 9.0, + "learning_rate": 7.448576012094492e-06, + "loss": 0.43879514, + "memory(GiB)": 15.04, + "step": 8955, + "train_speed(iter/s)": 0.335031 + }, + { + "acc": 0.90590734, + "epoch": 1.2067340067340067, + "grad_norm": 5.875, + "learning_rate": 7.437811249886985e-06, + "loss": 0.35281451, + "memory(GiB)": 15.04, + "step": 8960, + "train_speed(iter/s)": 0.335057 + }, + { + "acc": 0.82382822, + "epoch": 1.2074074074074075, + "grad_norm": 12.4375, + "learning_rate": 7.427049664418202e-06, + "loss": 0.63607116, + "memory(GiB)": 15.04, + "step": 8965, + "train_speed(iter/s)": 0.33509 + }, + { + "acc": 0.87669516, + "epoch": 1.208080808080808, + "grad_norm": 8.5, + "learning_rate": 7.416291269030923e-06, + "loss": 0.59069467, + "memory(GiB)": 15.04, + "step": 8970, + "train_speed(iter/s)": 0.335114 + }, + { + "acc": 0.8820632, + "epoch": 1.2087542087542087, + "grad_norm": 7.84375, + "learning_rate": 7.4055360770639925e-06, + "loss": 0.71371379, + "memory(GiB)": 15.04, + "step": 8975, + "train_speed(iter/s)": 0.335138 + }, + { + "acc": 0.9173686, + "epoch": 1.2094276094276095, + "grad_norm": 4.1875, + "learning_rate": 7.394784101852265e-06, + "loss": 0.26022398, + "memory(GiB)": 15.04, + "step": 8980, + "train_speed(iter/s)": 0.335154 + }, + { + "acc": 0.8300209, + "epoch": 1.2101010101010101, + "grad_norm": 15.4375, + "learning_rate": 7.384035356726618e-06, + "loss": 0.51802835, + "memory(GiB)": 15.04, + "step": 8985, + "train_speed(iter/s)": 0.335202 + }, + { + "acc": 0.89961033, + "epoch": 1.2107744107744107, + "grad_norm": 7.15625, + "learning_rate": 7.373289855013924e-06, + "loss": 0.29598281, + "memory(GiB)": 15.04, + "step": 8990, + "train_speed(iter/s)": 0.335226 + }, + { + "acc": 0.84554043, + "epoch": 1.2114478114478113, + "grad_norm": 7.84375, + "learning_rate": 7.3625476100370254e-06, + "loss": 0.50715199, + "memory(GiB)": 15.04, + "step": 8995, + "train_speed(iter/s)": 0.335254 + }, + { + "acc": 0.87353363, + "epoch": 1.2121212121212122, + "grad_norm": 8.625, + "learning_rate": 7.351808635114736e-06, + "loss": 0.51893387, + "memory(GiB)": 15.04, + "step": 9000, + "train_speed(iter/s)": 0.335262 + }, + { + "epoch": 1.2121212121212122, + "eval_acc": 0.894475415466723, + "eval_loss": 0.41569873690605164, + "eval_runtime": 109.7912, + "eval_samples_per_second": 1.366, + "eval_steps_per_second": 1.366, + "step": 9000 + }, + { + "acc": 0.8929966, + "epoch": 1.2127946127946128, + "grad_norm": 14.375, + "learning_rate": 7.341072943561811e-06, + "loss": 0.47138219, + "memory(GiB)": 15.04, + "step": 9005, + "train_speed(iter/s)": 0.333891 + }, + { + "acc": 0.87038364, + "epoch": 1.2134680134680136, + "grad_norm": 43.0, + "learning_rate": 7.330340548688933e-06, + "loss": 0.37270429, + "memory(GiB)": 15.04, + "step": 9010, + "train_speed(iter/s)": 0.333881 + }, + { + "acc": 0.8217186, + "epoch": 1.2141414141414142, + "grad_norm": 12.4375, + "learning_rate": 7.319611463802705e-06, + "loss": 0.64520493, + "memory(GiB)": 15.04, + "step": 9015, + "train_speed(iter/s)": 0.333923 + }, + { + "acc": 0.93706741, + "epoch": 1.2148148148148148, + "grad_norm": 4.53125, + "learning_rate": 7.308885702205612e-06, + "loss": 0.16838611, + "memory(GiB)": 15.04, + "step": 9020, + "train_speed(iter/s)": 0.333962 + }, + { + "acc": 0.92356606, + "epoch": 1.2154882154882154, + "grad_norm": 5.53125, + "learning_rate": 7.298163277196035e-06, + "loss": 0.29079893, + "memory(GiB)": 15.04, + "step": 9025, + "train_speed(iter/s)": 0.333967 + }, + { + "acc": 0.89644585, + "epoch": 1.2161616161616162, + "grad_norm": 8.375, + "learning_rate": 7.2874442020682056e-06, + "loss": 0.38219862, + "memory(GiB)": 15.04, + "step": 9030, + "train_speed(iter/s)": 0.333971 + }, + { + "acc": 0.93895531, + "epoch": 1.2168350168350168, + "grad_norm": 5.59375, + "learning_rate": 7.276728490112208e-06, + "loss": 0.22326462, + "memory(GiB)": 15.04, + "step": 9035, + "train_speed(iter/s)": 0.33401 + }, + { + "acc": 0.9340889, + "epoch": 1.2175084175084174, + "grad_norm": 12.0625, + "learning_rate": 7.266016154613959e-06, + "loss": 0.24871545, + "memory(GiB)": 15.04, + "step": 9040, + "train_speed(iter/s)": 0.33405 + }, + { + "acc": 0.90456057, + "epoch": 1.2181818181818183, + "grad_norm": 25.875, + "learning_rate": 7.255307208855178e-06, + "loss": 0.44414439, + "memory(GiB)": 15.04, + "step": 9045, + "train_speed(iter/s)": 0.334094 + }, + { + "acc": 0.90100832, + "epoch": 1.2188552188552189, + "grad_norm": 8.375, + "learning_rate": 7.244601666113397e-06, + "loss": 0.33543525, + "memory(GiB)": 15.04, + "step": 9050, + "train_speed(iter/s)": 0.334091 + }, + { + "acc": 0.86368971, + "epoch": 1.2195286195286195, + "grad_norm": 14.75, + "learning_rate": 7.2338995396619135e-06, + "loss": 0.34672532, + "memory(GiB)": 15.04, + "step": 9055, + "train_speed(iter/s)": 0.33408 + }, + { + "acc": 0.82249479, + "epoch": 1.2202020202020203, + "grad_norm": 5.53125, + "learning_rate": 7.2232008427698e-06, + "loss": 0.53450623, + "memory(GiB)": 15.04, + "step": 9060, + "train_speed(iter/s)": 0.334108 + }, + { + "acc": 0.91870232, + "epoch": 1.220875420875421, + "grad_norm": 11.4375, + "learning_rate": 7.212505588701877e-06, + "loss": 0.3474076, + "memory(GiB)": 15.04, + "step": 9065, + "train_speed(iter/s)": 0.334139 + }, + { + "acc": 0.90108252, + "epoch": 1.2215488215488215, + "grad_norm": 6.40625, + "learning_rate": 7.201813790718686e-06, + "loss": 0.35729029, + "memory(GiB)": 15.04, + "step": 9070, + "train_speed(iter/s)": 0.334176 + }, + { + "acc": 0.94556217, + "epoch": 1.2222222222222223, + "grad_norm": 4.875, + "learning_rate": 7.191125462076497e-06, + "loss": 0.21146717, + "memory(GiB)": 15.04, + "step": 9075, + "train_speed(iter/s)": 0.334183 + }, + { + "acc": 0.90873528, + "epoch": 1.222895622895623, + "grad_norm": 5.09375, + "learning_rate": 7.180440616027264e-06, + "loss": 0.34912663, + "memory(GiB)": 15.04, + "step": 9080, + "train_speed(iter/s)": 0.334191 + }, + { + "acc": 0.90023823, + "epoch": 1.2235690235690235, + "grad_norm": 9.375, + "learning_rate": 7.169759265818637e-06, + "loss": 0.34831755, + "memory(GiB)": 15.04, + "step": 9085, + "train_speed(iter/s)": 0.334208 + }, + { + "acc": 0.89683199, + "epoch": 1.2242424242424241, + "grad_norm": 15.75, + "learning_rate": 7.159081424693925e-06, + "loss": 0.35634911, + "memory(GiB)": 15.04, + "step": 9090, + "train_speed(iter/s)": 0.334224 + }, + { + "acc": 0.93440924, + "epoch": 1.224915824915825, + "grad_norm": 11.3125, + "learning_rate": 7.148407105892085e-06, + "loss": 0.20357089, + "memory(GiB)": 15.04, + "step": 9095, + "train_speed(iter/s)": 0.334247 + }, + { + "acc": 0.93140469, + "epoch": 1.2255892255892256, + "grad_norm": 9.5, + "learning_rate": 7.137736322647708e-06, + "loss": 0.2412384, + "memory(GiB)": 15.04, + "step": 9100, + "train_speed(iter/s)": 0.334278 + }, + { + "acc": 0.88848457, + "epoch": 1.2262626262626264, + "grad_norm": 6.09375, + "learning_rate": 7.1270690881910055e-06, + "loss": 0.47499456, + "memory(GiB)": 15.04, + "step": 9105, + "train_speed(iter/s)": 0.334313 + }, + { + "acc": 0.93150959, + "epoch": 1.226936026936027, + "grad_norm": 12.125, + "learning_rate": 7.116405415747779e-06, + "loss": 0.24199398, + "memory(GiB)": 15.04, + "step": 9110, + "train_speed(iter/s)": 0.334351 + }, + { + "acc": 0.92064953, + "epoch": 1.2276094276094276, + "grad_norm": 9.3125, + "learning_rate": 7.10574531853943e-06, + "loss": 0.39521904, + "memory(GiB)": 15.04, + "step": 9115, + "train_speed(iter/s)": 0.334367 + }, + { + "acc": 0.8341259, + "epoch": 1.2282828282828282, + "grad_norm": 22.5, + "learning_rate": 7.095088809782909e-06, + "loss": 0.61440983, + "memory(GiB)": 15.04, + "step": 9120, + "train_speed(iter/s)": 0.334394 + }, + { + "acc": 0.88455887, + "epoch": 1.228956228956229, + "grad_norm": 8.0, + "learning_rate": 7.084435902690727e-06, + "loss": 0.35118048, + "memory(GiB)": 15.04, + "step": 9125, + "train_speed(iter/s)": 0.334398 + }, + { + "acc": 0.91548948, + "epoch": 1.2296296296296296, + "grad_norm": 9.1875, + "learning_rate": 7.073786610470935e-06, + "loss": 0.32967341, + "memory(GiB)": 15.04, + "step": 9130, + "train_speed(iter/s)": 0.334435 + }, + { + "acc": 0.88200417, + "epoch": 1.2303030303030302, + "grad_norm": 5.96875, + "learning_rate": 7.063140946327086e-06, + "loss": 0.36016295, + "memory(GiB)": 15.04, + "step": 9135, + "train_speed(iter/s)": 0.334469 + }, + { + "acc": 0.90623474, + "epoch": 1.230976430976431, + "grad_norm": 11.25, + "learning_rate": 7.052498923458253e-06, + "loss": 0.31132498, + "memory(GiB)": 15.04, + "step": 9140, + "train_speed(iter/s)": 0.334475 + }, + { + "acc": 0.85107536, + "epoch": 1.2316498316498317, + "grad_norm": 10.6875, + "learning_rate": 7.041860555058977e-06, + "loss": 0.74530754, + "memory(GiB)": 15.04, + "step": 9145, + "train_speed(iter/s)": 0.334513 + }, + { + "acc": 0.90378151, + "epoch": 1.2323232323232323, + "grad_norm": 5.65625, + "learning_rate": 7.031225854319281e-06, + "loss": 0.35486798, + "memory(GiB)": 15.04, + "step": 9150, + "train_speed(iter/s)": 0.334511 + }, + { + "acc": 0.94854698, + "epoch": 1.232996632996633, + "grad_norm": 5.125, + "learning_rate": 7.020594834424639e-06, + "loss": 0.23054857, + "memory(GiB)": 15.04, + "step": 9155, + "train_speed(iter/s)": 0.334522 + }, + { + "acc": 0.88515387, + "epoch": 1.2336700336700337, + "grad_norm": 5.96875, + "learning_rate": 7.009967508555952e-06, + "loss": 0.41824684, + "memory(GiB)": 15.04, + "step": 9160, + "train_speed(iter/s)": 0.334524 + }, + { + "acc": 0.91940241, + "epoch": 1.2343434343434343, + "grad_norm": 8.3125, + "learning_rate": 6.999343889889553e-06, + "loss": 0.2349196, + "memory(GiB)": 15.04, + "step": 9165, + "train_speed(iter/s)": 0.334557 + }, + { + "acc": 0.85173903, + "epoch": 1.2350168350168351, + "grad_norm": 6.8125, + "learning_rate": 6.988723991597166e-06, + "loss": 0.71610355, + "memory(GiB)": 15.04, + "step": 9170, + "train_speed(iter/s)": 0.334571 + }, + { + "acc": 0.85665131, + "epoch": 1.2356902356902357, + "grad_norm": 10.8125, + "learning_rate": 6.978107826845914e-06, + "loss": 0.59850006, + "memory(GiB)": 15.04, + "step": 9175, + "train_speed(iter/s)": 0.334547 + }, + { + "acc": 0.91978998, + "epoch": 1.2363636363636363, + "grad_norm": 6.5, + "learning_rate": 6.967495408798288e-06, + "loss": 0.28697846, + "memory(GiB)": 15.04, + "step": 9180, + "train_speed(iter/s)": 0.334573 + }, + { + "acc": 0.88025265, + "epoch": 1.237037037037037, + "grad_norm": 7.65625, + "learning_rate": 6.9568867506121285e-06, + "loss": 0.73406439, + "memory(GiB)": 15.04, + "step": 9185, + "train_speed(iter/s)": 0.334608 + }, + { + "acc": 0.91176662, + "epoch": 1.2377104377104378, + "grad_norm": 6.65625, + "learning_rate": 6.94628186544062e-06, + "loss": 0.37211435, + "memory(GiB)": 15.04, + "step": 9190, + "train_speed(iter/s)": 0.334617 + }, + { + "acc": 0.90800314, + "epoch": 1.2383838383838384, + "grad_norm": 9.0625, + "learning_rate": 6.93568076643226e-06, + "loss": 0.28418851, + "memory(GiB)": 15.04, + "step": 9195, + "train_speed(iter/s)": 0.334651 + }, + { + "acc": 0.8702795, + "epoch": 1.239057239057239, + "grad_norm": 16.25, + "learning_rate": 6.925083466730864e-06, + "loss": 0.3472441, + "memory(GiB)": 15.04, + "step": 9200, + "train_speed(iter/s)": 0.334689 + }, + { + "acc": 0.90860109, + "epoch": 1.2397306397306398, + "grad_norm": 12.9375, + "learning_rate": 6.914489979475536e-06, + "loss": 0.32622931, + "memory(GiB)": 15.04, + "step": 9205, + "train_speed(iter/s)": 0.334713 + }, + { + "acc": 0.92050362, + "epoch": 1.2404040404040404, + "grad_norm": 12.5, + "learning_rate": 6.903900317800637e-06, + "loss": 0.30244985, + "memory(GiB)": 15.04, + "step": 9210, + "train_speed(iter/s)": 0.334741 + }, + { + "acc": 0.89314461, + "epoch": 1.241077441077441, + "grad_norm": 6.6875, + "learning_rate": 6.893314494835806e-06, + "loss": 0.4402411, + "memory(GiB)": 15.04, + "step": 9215, + "train_speed(iter/s)": 0.334738 + }, + { + "acc": 0.92904959, + "epoch": 1.2417508417508418, + "grad_norm": 12.1875, + "learning_rate": 6.882732523705906e-06, + "loss": 0.31815796, + "memory(GiB)": 15.04, + "step": 9220, + "train_speed(iter/s)": 0.334752 + }, + { + "acc": 0.91005831, + "epoch": 1.2424242424242424, + "grad_norm": 11.625, + "learning_rate": 6.872154417531034e-06, + "loss": 0.41316104, + "memory(GiB)": 15.04, + "step": 9225, + "train_speed(iter/s)": 0.334771 + }, + { + "acc": 0.92818918, + "epoch": 1.243097643097643, + "grad_norm": 10.25, + "learning_rate": 6.861580189426495e-06, + "loss": 0.35050559, + "memory(GiB)": 15.04, + "step": 9230, + "train_speed(iter/s)": 0.334801 + }, + { + "acc": 0.86370831, + "epoch": 1.2437710437710439, + "grad_norm": 15.8125, + "learning_rate": 6.851009852502777e-06, + "loss": 0.67675142, + "memory(GiB)": 15.04, + "step": 9235, + "train_speed(iter/s)": 0.334819 + }, + { + "acc": 0.85959864, + "epoch": 1.2444444444444445, + "grad_norm": 20.875, + "learning_rate": 6.840443419865556e-06, + "loss": 0.58502169, + "memory(GiB)": 15.04, + "step": 9240, + "train_speed(iter/s)": 0.334826 + }, + { + "acc": 0.8739397, + "epoch": 1.245117845117845, + "grad_norm": 8.0625, + "learning_rate": 6.829880904615652e-06, + "loss": 0.45994511, + "memory(GiB)": 15.04, + "step": 9245, + "train_speed(iter/s)": 0.33481 + }, + { + "acc": 0.93040657, + "epoch": 1.2457912457912457, + "grad_norm": 6.34375, + "learning_rate": 6.819322319849044e-06, + "loss": 0.24262767, + "memory(GiB)": 15.04, + "step": 9250, + "train_speed(iter/s)": 0.334831 + }, + { + "acc": 0.91367722, + "epoch": 1.2464646464646465, + "grad_norm": 10.3125, + "learning_rate": 6.808767678656829e-06, + "loss": 0.32125883, + "memory(GiB)": 15.04, + "step": 9255, + "train_speed(iter/s)": 0.33482 + }, + { + "acc": 0.92794933, + "epoch": 1.247138047138047, + "grad_norm": 16.5, + "learning_rate": 6.798216994125213e-06, + "loss": 0.23239794, + "memory(GiB)": 15.04, + "step": 9260, + "train_speed(iter/s)": 0.33486 + }, + { + "acc": 0.89250135, + "epoch": 1.247811447811448, + "grad_norm": 14.8125, + "learning_rate": 6.7876702793355035e-06, + "loss": 0.43735785, + "memory(GiB)": 15.04, + "step": 9265, + "train_speed(iter/s)": 0.334857 + }, + { + "acc": 0.86579876, + "epoch": 1.2484848484848485, + "grad_norm": 18.625, + "learning_rate": 6.777127547364078e-06, + "loss": 0.36714532, + "memory(GiB)": 15.04, + "step": 9270, + "train_speed(iter/s)": 0.334887 + }, + { + "acc": 0.91549988, + "epoch": 1.2491582491582491, + "grad_norm": 8.1875, + "learning_rate": 6.766588811282379e-06, + "loss": 0.27293491, + "memory(GiB)": 15.04, + "step": 9275, + "train_speed(iter/s)": 0.334891 + }, + { + "acc": 0.90694761, + "epoch": 1.2498316498316497, + "grad_norm": 7.375, + "learning_rate": 6.756054084156902e-06, + "loss": 0.31676784, + "memory(GiB)": 15.04, + "step": 9280, + "train_speed(iter/s)": 0.33492 + }, + { + "acc": 0.89656353, + "epoch": 1.2505050505050506, + "grad_norm": 8.875, + "learning_rate": 6.745523379049157e-06, + "loss": 0.33908391, + "memory(GiB)": 15.04, + "step": 9285, + "train_speed(iter/s)": 0.334946 + }, + { + "acc": 0.91946945, + "epoch": 1.2511784511784512, + "grad_norm": 6.4375, + "learning_rate": 6.734996709015684e-06, + "loss": 0.32658114, + "memory(GiB)": 15.04, + "step": 9290, + "train_speed(iter/s)": 0.334981 + }, + { + "acc": 0.81229038, + "epoch": 1.2518518518518518, + "grad_norm": 8.3125, + "learning_rate": 6.724474087108004e-06, + "loss": 0.68991122, + "memory(GiB)": 15.04, + "step": 9295, + "train_speed(iter/s)": 0.334984 + }, + { + "acc": 0.92849064, + "epoch": 1.2525252525252526, + "grad_norm": 8.5, + "learning_rate": 6.713955526372629e-06, + "loss": 0.24314618, + "memory(GiB)": 15.04, + "step": 9300, + "train_speed(iter/s)": 0.335028 + }, + { + "epoch": 1.2525252525252526, + "eval_acc": 0.8943658280922432, + "eval_loss": 0.4082954525947571, + "eval_runtime": 109.9044, + "eval_samples_per_second": 1.365, + "eval_steps_per_second": 1.365, + "step": 9300 + }, + { + "acc": 0.8597661, + "epoch": 1.2531986531986532, + "grad_norm": 13.375, + "learning_rate": 6.70344103985104e-06, + "loss": 0.40775919, + "memory(GiB)": 15.04, + "step": 9305, + "train_speed(iter/s)": 0.333702 + }, + { + "acc": 0.86505661, + "epoch": 1.2538720538720538, + "grad_norm": 11.0, + "learning_rate": 6.692930640579651e-06, + "loss": 0.22979717, + "memory(GiB)": 15.04, + "step": 9310, + "train_speed(iter/s)": 0.333749 + }, + { + "acc": 0.92401609, + "epoch": 1.2545454545454544, + "grad_norm": 5.15625, + "learning_rate": 6.682424341589824e-06, + "loss": 0.32938907, + "memory(GiB)": 15.04, + "step": 9315, + "train_speed(iter/s)": 0.33378 + }, + { + "acc": 0.92805328, + "epoch": 1.2552188552188552, + "grad_norm": 8.8125, + "learning_rate": 6.671922155907826e-06, + "loss": 0.26388872, + "memory(GiB)": 15.04, + "step": 9320, + "train_speed(iter/s)": 0.333811 + }, + { + "acc": 0.91945896, + "epoch": 1.2558922558922558, + "grad_norm": 10.0625, + "learning_rate": 6.661424096554829e-06, + "loss": 0.20681672, + "memory(GiB)": 15.04, + "step": 9325, + "train_speed(iter/s)": 0.33384 + }, + { + "acc": 0.83211021, + "epoch": 1.2565656565656567, + "grad_norm": 8.9375, + "learning_rate": 6.650930176546896e-06, + "loss": 0.64398093, + "memory(GiB)": 15.04, + "step": 9330, + "train_speed(iter/s)": 0.333839 + }, + { + "acc": 0.88890476, + "epoch": 1.2572390572390573, + "grad_norm": 11.6875, + "learning_rate": 6.64044040889494e-06, + "loss": 0.34757118, + "memory(GiB)": 15.04, + "step": 9335, + "train_speed(iter/s)": 0.333826 + }, + { + "acc": 0.89659986, + "epoch": 1.2579124579124579, + "grad_norm": 31.25, + "learning_rate": 6.629954806604746e-06, + "loss": 0.6473115, + "memory(GiB)": 15.04, + "step": 9340, + "train_speed(iter/s)": 0.333846 + }, + { + "acc": 0.91689882, + "epoch": 1.2585858585858585, + "grad_norm": 25.625, + "learning_rate": 6.619473382676917e-06, + "loss": 0.26747901, + "memory(GiB)": 15.04, + "step": 9345, + "train_speed(iter/s)": 0.333884 + }, + { + "acc": 0.90223913, + "epoch": 1.2592592592592593, + "grad_norm": 6.5, + "learning_rate": 6.6089961501068875e-06, + "loss": 0.36214557, + "memory(GiB)": 15.04, + "step": 9350, + "train_speed(iter/s)": 0.333903 + }, + { + "acc": 0.81011677, + "epoch": 1.25993265993266, + "grad_norm": 18.5, + "learning_rate": 6.59852312188489e-06, + "loss": 0.58135796, + "memory(GiB)": 15.04, + "step": 9355, + "train_speed(iter/s)": 0.333939 + }, + { + "acc": 0.91339312, + "epoch": 1.2606060606060607, + "grad_norm": 6.21875, + "learning_rate": 6.588054310995946e-06, + "loss": 0.28903656, + "memory(GiB)": 15.04, + "step": 9360, + "train_speed(iter/s)": 0.33397 + }, + { + "acc": 0.93569059, + "epoch": 1.2612794612794613, + "grad_norm": 11.125, + "learning_rate": 6.5775897304198464e-06, + "loss": 0.24312882, + "memory(GiB)": 15.04, + "step": 9365, + "train_speed(iter/s)": 0.333995 + }, + { + "acc": 0.89483786, + "epoch": 1.261952861952862, + "grad_norm": 28.875, + "learning_rate": 6.567129393131139e-06, + "loss": 0.37268546, + "memory(GiB)": 15.04, + "step": 9370, + "train_speed(iter/s)": 0.334042 + }, + { + "acc": 0.90341072, + "epoch": 1.2626262626262625, + "grad_norm": 8.5625, + "learning_rate": 6.556673312099106e-06, + "loss": 0.32093859, + "memory(GiB)": 15.04, + "step": 9375, + "train_speed(iter/s)": 0.334058 + }, + { + "acc": 0.90460091, + "epoch": 1.2632996632996634, + "grad_norm": 8.0625, + "learning_rate": 6.546221500287766e-06, + "loss": 0.36141028, + "memory(GiB)": 15.04, + "step": 9380, + "train_speed(iter/s)": 0.33409 + }, + { + "acc": 0.89143772, + "epoch": 1.263973063973064, + "grad_norm": 5.28125, + "learning_rate": 6.535773970655823e-06, + "loss": 0.3237045, + "memory(GiB)": 15.04, + "step": 9385, + "train_speed(iter/s)": 0.334088 + }, + { + "acc": 0.91589622, + "epoch": 1.2646464646464646, + "grad_norm": 5.28125, + "learning_rate": 6.525330736156692e-06, + "loss": 0.2698863, + "memory(GiB)": 15.04, + "step": 9390, + "train_speed(iter/s)": 0.334122 + }, + { + "acc": 0.91980467, + "epoch": 1.2653198653198654, + "grad_norm": 5.53125, + "learning_rate": 6.514891809738446e-06, + "loss": 0.27117088, + "memory(GiB)": 15.04, + "step": 9395, + "train_speed(iter/s)": 0.334162 + }, + { + "acc": 0.9274991, + "epoch": 1.265993265993266, + "grad_norm": 19.375, + "learning_rate": 6.5044572043438305e-06, + "loss": 0.2792767, + "memory(GiB)": 15.04, + "step": 9400, + "train_speed(iter/s)": 0.334172 + }, + { + "acc": 0.89860916, + "epoch": 1.2666666666666666, + "grad_norm": 6.5, + "learning_rate": 6.494026932910229e-06, + "loss": 0.23947976, + "memory(GiB)": 15.04, + "step": 9405, + "train_speed(iter/s)": 0.334204 + }, + { + "acc": 0.89169331, + "epoch": 1.2673400673400672, + "grad_norm": 12.3125, + "learning_rate": 6.483601008369645e-06, + "loss": 0.32476013, + "memory(GiB)": 15.04, + "step": 9410, + "train_speed(iter/s)": 0.334253 + }, + { + "acc": 0.91733112, + "epoch": 1.268013468013468, + "grad_norm": 5.59375, + "learning_rate": 6.473179443648703e-06, + "loss": 0.29361489, + "memory(GiB)": 15.04, + "step": 9415, + "train_speed(iter/s)": 0.334258 + }, + { + "acc": 0.8154501, + "epoch": 1.2686868686868686, + "grad_norm": 7.0, + "learning_rate": 6.462762251668609e-06, + "loss": 0.85370541, + "memory(GiB)": 15.04, + "step": 9420, + "train_speed(iter/s)": 0.334288 + }, + { + "acc": 0.89090977, + "epoch": 1.2693602693602695, + "grad_norm": 12.5625, + "learning_rate": 6.452349445345159e-06, + "loss": 0.42459707, + "memory(GiB)": 15.04, + "step": 9425, + "train_speed(iter/s)": 0.334292 + }, + { + "acc": 0.87531862, + "epoch": 1.27003367003367, + "grad_norm": 4.25, + "learning_rate": 6.441941037588712e-06, + "loss": 0.36733196, + "memory(GiB)": 15.04, + "step": 9430, + "train_speed(iter/s)": 0.334323 + }, + { + "acc": 0.89692774, + "epoch": 1.2707070707070707, + "grad_norm": 8.4375, + "learning_rate": 6.4315370413041655e-06, + "loss": 0.43209648, + "memory(GiB)": 15.04, + "step": 9435, + "train_speed(iter/s)": 0.334348 + }, + { + "acc": 0.82759171, + "epoch": 1.2713804713804713, + "grad_norm": 17.75, + "learning_rate": 6.421137469390949e-06, + "loss": 0.4820406, + "memory(GiB)": 15.04, + "step": 9440, + "train_speed(iter/s)": 0.334392 + }, + { + "acc": 0.90950403, + "epoch": 1.272053872053872, + "grad_norm": 3.78125, + "learning_rate": 6.41074233474301e-06, + "loss": 0.2940321, + "memory(GiB)": 15.04, + "step": 9445, + "train_speed(iter/s)": 0.334429 + }, + { + "acc": 0.91077566, + "epoch": 1.2727272727272727, + "grad_norm": 6.90625, + "learning_rate": 6.400351650248788e-06, + "loss": 0.40922956, + "memory(GiB)": 15.04, + "step": 9450, + "train_speed(iter/s)": 0.334387 + }, + { + "acc": 0.89681053, + "epoch": 1.2734006734006735, + "grad_norm": 6.90625, + "learning_rate": 6.3899654287912204e-06, + "loss": 0.40920839, + "memory(GiB)": 15.04, + "step": 9455, + "train_speed(iter/s)": 0.334419 + }, + { + "acc": 0.91836929, + "epoch": 1.2740740740740741, + "grad_norm": 15.0, + "learning_rate": 6.3795836832476895e-06, + "loss": 0.33368034, + "memory(GiB)": 15.04, + "step": 9460, + "train_speed(iter/s)": 0.334458 + }, + { + "acc": 0.86629801, + "epoch": 1.2747474747474747, + "grad_norm": 11.8125, + "learning_rate": 6.369206426490048e-06, + "loss": 0.32990625, + "memory(GiB)": 15.04, + "step": 9465, + "train_speed(iter/s)": 0.334479 + }, + { + "acc": 0.89950666, + "epoch": 1.2754208754208753, + "grad_norm": 8.0625, + "learning_rate": 6.358833671384565e-06, + "loss": 0.39896271, + "memory(GiB)": 15.04, + "step": 9470, + "train_speed(iter/s)": 0.334459 + }, + { + "acc": 0.93626051, + "epoch": 1.2760942760942762, + "grad_norm": 5.28125, + "learning_rate": 6.3484654307919415e-06, + "loss": 0.246946, + "memory(GiB)": 15.04, + "step": 9475, + "train_speed(iter/s)": 0.334492 + }, + { + "acc": 0.91166868, + "epoch": 1.2767676767676768, + "grad_norm": 8.75, + "learning_rate": 6.338101717567282e-06, + "loss": 0.31343174, + "memory(GiB)": 15.04, + "step": 9480, + "train_speed(iter/s)": 0.3345 + }, + { + "acc": 0.88177662, + "epoch": 1.2774410774410774, + "grad_norm": 16.875, + "learning_rate": 6.327742544560063e-06, + "loss": 0.36417689, + "memory(GiB)": 15.04, + "step": 9485, + "train_speed(iter/s)": 0.334495 + }, + { + "acc": 0.93200769, + "epoch": 1.2781144781144782, + "grad_norm": 5.53125, + "learning_rate": 6.317387924614151e-06, + "loss": 0.20921798, + "memory(GiB)": 15.04, + "step": 9490, + "train_speed(iter/s)": 0.334517 + }, + { + "acc": 0.93198013, + "epoch": 1.2787878787878788, + "grad_norm": 7.34375, + "learning_rate": 6.307037870567751e-06, + "loss": 0.23144667, + "memory(GiB)": 15.04, + "step": 9495, + "train_speed(iter/s)": 0.334545 + }, + { + "acc": 0.92326565, + "epoch": 1.2794612794612794, + "grad_norm": 8.4375, + "learning_rate": 6.296692395253415e-06, + "loss": 0.29435198, + "memory(GiB)": 15.04, + "step": 9500, + "train_speed(iter/s)": 0.334583 + }, + { + "acc": 0.92107267, + "epoch": 1.28013468013468, + "grad_norm": 3.265625, + "learning_rate": 6.286351511498024e-06, + "loss": 0.26065884, + "memory(GiB)": 15.04, + "step": 9505, + "train_speed(iter/s)": 0.334575 + }, + { + "acc": 0.90917654, + "epoch": 1.2808080808080808, + "grad_norm": 7.5625, + "learning_rate": 6.276015232122748e-06, + "loss": 0.29355597, + "memory(GiB)": 15.04, + "step": 9510, + "train_speed(iter/s)": 0.334582 + }, + { + "acc": 0.89617958, + "epoch": 1.2814814814814814, + "grad_norm": 8.625, + "learning_rate": 6.265683569943069e-06, + "loss": 0.26834872, + "memory(GiB)": 15.04, + "step": 9515, + "train_speed(iter/s)": 0.334588 + }, + { + "acc": 0.8536458, + "epoch": 1.2821548821548823, + "grad_norm": 6.03125, + "learning_rate": 6.255356537768725e-06, + "loss": 0.42065563, + "memory(GiB)": 15.04, + "step": 9520, + "train_speed(iter/s)": 0.334618 + }, + { + "acc": 0.93801365, + "epoch": 1.2828282828282829, + "grad_norm": 6.1875, + "learning_rate": 6.2450341484037325e-06, + "loss": 0.23328383, + "memory(GiB)": 15.04, + "step": 9525, + "train_speed(iter/s)": 0.334643 + }, + { + "acc": 0.87468853, + "epoch": 1.2835016835016835, + "grad_norm": 8.4375, + "learning_rate": 6.2347164146463355e-06, + "loss": 0.35560603, + "memory(GiB)": 15.04, + "step": 9530, + "train_speed(iter/s)": 0.334664 + }, + { + "acc": 0.91180153, + "epoch": 1.284175084175084, + "grad_norm": 7.28125, + "learning_rate": 6.224403349289018e-06, + "loss": 0.27251148, + "memory(GiB)": 15.04, + "step": 9535, + "train_speed(iter/s)": 0.334702 + }, + { + "acc": 0.8807745, + "epoch": 1.284848484848485, + "grad_norm": 7.46875, + "learning_rate": 6.214094965118466e-06, + "loss": 0.33747103, + "memory(GiB)": 15.04, + "step": 9540, + "train_speed(iter/s)": 0.334738 + }, + { + "acc": 0.88320541, + "epoch": 1.2855218855218855, + "grad_norm": 19.75, + "learning_rate": 6.203791274915567e-06, + "loss": 0.54292784, + "memory(GiB)": 15.04, + "step": 9545, + "train_speed(iter/s)": 0.334771 + }, + { + "acc": 0.89294491, + "epoch": 1.2861952861952861, + "grad_norm": 7.8125, + "learning_rate": 6.193492291455385e-06, + "loss": 0.48767056, + "memory(GiB)": 15.04, + "step": 9550, + "train_speed(iter/s)": 0.334779 + }, + { + "acc": 0.92093458, + "epoch": 1.286868686868687, + "grad_norm": 5.90625, + "learning_rate": 6.183198027507158e-06, + "loss": 0.26070592, + "memory(GiB)": 15.04, + "step": 9555, + "train_speed(iter/s)": 0.334805 + }, + { + "acc": 0.88449621, + "epoch": 1.2875420875420875, + "grad_norm": 7.34375, + "learning_rate": 6.172908495834258e-06, + "loss": 0.49047241, + "memory(GiB)": 15.04, + "step": 9560, + "train_speed(iter/s)": 0.334811 + }, + { + "acc": 0.89118357, + "epoch": 1.2882154882154881, + "grad_norm": 6.71875, + "learning_rate": 6.162623709194202e-06, + "loss": 0.42326612, + "memory(GiB)": 15.04, + "step": 9565, + "train_speed(iter/s)": 0.334809 + }, + { + "acc": 0.92075138, + "epoch": 1.2888888888888888, + "grad_norm": 4.71875, + "learning_rate": 6.152343680338614e-06, + "loss": 0.29220011, + "memory(GiB)": 15.04, + "step": 9570, + "train_speed(iter/s)": 0.334812 + }, + { + "acc": 0.88515062, + "epoch": 1.2895622895622896, + "grad_norm": 22.875, + "learning_rate": 6.142068422013226e-06, + "loss": 0.42076855, + "memory(GiB)": 15.04, + "step": 9575, + "train_speed(iter/s)": 0.334828 + }, + { + "acc": 0.8957715, + "epoch": 1.2902356902356902, + "grad_norm": 8.375, + "learning_rate": 6.131797946957857e-06, + "loss": 0.28677437, + "memory(GiB)": 15.04, + "step": 9580, + "train_speed(iter/s)": 0.334851 + }, + { + "acc": 0.86107922, + "epoch": 1.290909090909091, + "grad_norm": 8.375, + "learning_rate": 6.1215322679063846e-06, + "loss": 0.2829767, + "memory(GiB)": 15.04, + "step": 9585, + "train_speed(iter/s)": 0.334872 + }, + { + "acc": 0.92166348, + "epoch": 1.2915824915824916, + "grad_norm": 6.40625, + "learning_rate": 6.111271397586751e-06, + "loss": 0.27973387, + "memory(GiB)": 15.04, + "step": 9590, + "train_speed(iter/s)": 0.334893 + }, + { + "acc": 0.85127792, + "epoch": 1.2922558922558922, + "grad_norm": 26.125, + "learning_rate": 6.101015348720934e-06, + "loss": 0.44502549, + "memory(GiB)": 15.04, + "step": 9595, + "train_speed(iter/s)": 0.334932 + }, + { + "acc": 0.90770893, + "epoch": 1.2929292929292928, + "grad_norm": 4.96875, + "learning_rate": 6.090764134024927e-06, + "loss": 0.42872896, + "memory(GiB)": 15.04, + "step": 9600, + "train_speed(iter/s)": 0.334918 + }, + { + "epoch": 1.2929292929292928, + "eval_acc": 0.8960018608016954, + "eval_loss": 0.4059368968009949, + "eval_runtime": 110.1772, + "eval_samples_per_second": 1.361, + "eval_steps_per_second": 1.361, + "step": 9600 + }, + { + "acc": 0.8971241, + "epoch": 1.2936026936026936, + "grad_norm": 8.5, + "learning_rate": 6.080517766208742e-06, + "loss": 0.31131194, + "memory(GiB)": 15.04, + "step": 9605, + "train_speed(iter/s)": 0.333671 + }, + { + "acc": 0.86708736, + "epoch": 1.2942760942760942, + "grad_norm": 12.25, + "learning_rate": 6.070276257976364e-06, + "loss": 0.60133257, + "memory(GiB)": 15.04, + "step": 9610, + "train_speed(iter/s)": 0.333707 + }, + { + "acc": 0.89065332, + "epoch": 1.294949494949495, + "grad_norm": 9.125, + "learning_rate": 6.0600396220257705e-06, + "loss": 0.33532963, + "memory(GiB)": 15.04, + "step": 9615, + "train_speed(iter/s)": 0.333742 + }, + { + "acc": 0.90499363, + "epoch": 1.2956228956228957, + "grad_norm": 9.0, + "learning_rate": 6.049807871048889e-06, + "loss": 0.27627466, + "memory(GiB)": 15.04, + "step": 9620, + "train_speed(iter/s)": 0.333763 + }, + { + "acc": 0.88430939, + "epoch": 1.2962962962962963, + "grad_norm": 12.4375, + "learning_rate": 6.039581017731591e-06, + "loss": 0.63434253, + "memory(GiB)": 15.04, + "step": 9625, + "train_speed(iter/s)": 0.333803 + }, + { + "acc": 0.93028517, + "epoch": 1.2969696969696969, + "grad_norm": 4.875, + "learning_rate": 6.029359074753679e-06, + "loss": 0.24705272, + "memory(GiB)": 15.04, + "step": 9630, + "train_speed(iter/s)": 0.333814 + }, + { + "acc": 0.93611622, + "epoch": 1.2976430976430977, + "grad_norm": 5.40625, + "learning_rate": 6.019142054788858e-06, + "loss": 0.24927502, + "memory(GiB)": 15.04, + "step": 9635, + "train_speed(iter/s)": 0.333856 + }, + { + "acc": 0.92476377, + "epoch": 1.2983164983164983, + "grad_norm": 7.625, + "learning_rate": 6.00892997050474e-06, + "loss": 0.38260887, + "memory(GiB)": 15.04, + "step": 9640, + "train_speed(iter/s)": 0.333882 + }, + { + "acc": 0.93132019, + "epoch": 1.298989898989899, + "grad_norm": 20.25, + "learning_rate": 5.99872283456282e-06, + "loss": 0.26264632, + "memory(GiB)": 15.04, + "step": 9645, + "train_speed(iter/s)": 0.333908 + }, + { + "acc": 0.912959, + "epoch": 1.2996632996632997, + "grad_norm": 6.96875, + "learning_rate": 5.98852065961844e-06, + "loss": 0.28866189, + "memory(GiB)": 15.04, + "step": 9650, + "train_speed(iter/s)": 0.333922 + }, + { + "acc": 0.9070096, + "epoch": 1.3003367003367003, + "grad_norm": 8.75, + "learning_rate": 5.978323458320814e-06, + "loss": 0.46759634, + "memory(GiB)": 15.04, + "step": 9655, + "train_speed(iter/s)": 0.333965 + }, + { + "acc": 0.8527648, + "epoch": 1.301010101010101, + "grad_norm": 10.0625, + "learning_rate": 5.9681312433129656e-06, + "loss": 0.49304399, + "memory(GiB)": 15.04, + "step": 9660, + "train_speed(iter/s)": 0.33399 + }, + { + "acc": 0.94254284, + "epoch": 1.3016835016835016, + "grad_norm": 7.34375, + "learning_rate": 5.957944027231756e-06, + "loss": 0.26958907, + "memory(GiB)": 15.04, + "step": 9665, + "train_speed(iter/s)": 0.333995 + }, + { + "acc": 0.91601257, + "epoch": 1.3023569023569024, + "grad_norm": 9.1875, + "learning_rate": 5.947761822707842e-06, + "loss": 0.28514488, + "memory(GiB)": 15.04, + "step": 9670, + "train_speed(iter/s)": 0.334035 + }, + { + "acc": 0.83065281, + "epoch": 1.303030303030303, + "grad_norm": 5.53125, + "learning_rate": 5.937584642365661e-06, + "loss": 0.4698401, + "memory(GiB)": 15.04, + "step": 9675, + "train_speed(iter/s)": 0.334057 + }, + { + "acc": 0.88610077, + "epoch": 1.3037037037037038, + "grad_norm": 7.34375, + "learning_rate": 5.927412498823431e-06, + "loss": 0.39201999, + "memory(GiB)": 15.04, + "step": 9680, + "train_speed(iter/s)": 0.334072 + }, + { + "acc": 0.89949703, + "epoch": 1.3043771043771044, + "grad_norm": 14.4375, + "learning_rate": 5.9172454046931125e-06, + "loss": 0.34470611, + "memory(GiB)": 15.04, + "step": 9685, + "train_speed(iter/s)": 0.334092 + }, + { + "acc": 0.91302719, + "epoch": 1.305050505050505, + "grad_norm": 5.125, + "learning_rate": 5.90708337258042e-06, + "loss": 0.26164844, + "memory(GiB)": 15.04, + "step": 9690, + "train_speed(iter/s)": 0.334101 + }, + { + "acc": 0.93276243, + "epoch": 1.3057239057239056, + "grad_norm": 7.25, + "learning_rate": 5.89692641508478e-06, + "loss": 0.23191741, + "memory(GiB)": 15.04, + "step": 9695, + "train_speed(iter/s)": 0.334123 + }, + { + "acc": 0.92170534, + "epoch": 1.3063973063973064, + "grad_norm": 4.1875, + "learning_rate": 5.886774544799337e-06, + "loss": 0.20918903, + "memory(GiB)": 15.04, + "step": 9700, + "train_speed(iter/s)": 0.334138 + }, + { + "acc": 0.9261858, + "epoch": 1.307070707070707, + "grad_norm": 11.5, + "learning_rate": 5.876627774310917e-06, + "loss": 0.26228995, + "memory(GiB)": 15.04, + "step": 9705, + "train_speed(iter/s)": 0.334169 + }, + { + "acc": 0.92261524, + "epoch": 1.3077441077441079, + "grad_norm": 5.8125, + "learning_rate": 5.866486116200033e-06, + "loss": 0.22712448, + "memory(GiB)": 15.04, + "step": 9710, + "train_speed(iter/s)": 0.334203 + }, + { + "acc": 0.8621439, + "epoch": 1.3084175084175085, + "grad_norm": 8.8125, + "learning_rate": 5.8563495830408525e-06, + "loss": 0.42797208, + "memory(GiB)": 15.04, + "step": 9715, + "train_speed(iter/s)": 0.334241 + }, + { + "acc": 0.87616835, + "epoch": 1.309090909090909, + "grad_norm": 5.84375, + "learning_rate": 5.8462181874011955e-06, + "loss": 0.46978016, + "memory(GiB)": 15.04, + "step": 9720, + "train_speed(iter/s)": 0.334275 + }, + { + "acc": 0.87344656, + "epoch": 1.3097643097643097, + "grad_norm": 10.3125, + "learning_rate": 5.836091941842506e-06, + "loss": 0.44679465, + "memory(GiB)": 15.04, + "step": 9725, + "train_speed(iter/s)": 0.334299 + }, + { + "acc": 0.91853428, + "epoch": 1.3104377104377105, + "grad_norm": 6.34375, + "learning_rate": 5.825970858919847e-06, + "loss": 0.45350027, + "memory(GiB)": 15.04, + "step": 9730, + "train_speed(iter/s)": 0.334331 + }, + { + "acc": 0.82459059, + "epoch": 1.3111111111111111, + "grad_norm": 8.875, + "learning_rate": 5.815854951181874e-06, + "loss": 0.64473267, + "memory(GiB)": 15.04, + "step": 9735, + "train_speed(iter/s)": 0.334364 + }, + { + "acc": 0.85284615, + "epoch": 1.3117845117845117, + "grad_norm": 5.59375, + "learning_rate": 5.805744231170833e-06, + "loss": 0.53016238, + "memory(GiB)": 15.04, + "step": 9740, + "train_speed(iter/s)": 0.334396 + }, + { + "acc": 0.90661678, + "epoch": 1.3124579124579125, + "grad_norm": 12.0, + "learning_rate": 5.795638711422542e-06, + "loss": 0.32342672, + "memory(GiB)": 15.04, + "step": 9745, + "train_speed(iter/s)": 0.334419 + }, + { + "acc": 0.89141331, + "epoch": 1.3131313131313131, + "grad_norm": 6.1875, + "learning_rate": 5.785538404466355e-06, + "loss": 0.31523666, + "memory(GiB)": 15.04, + "step": 9750, + "train_speed(iter/s)": 0.334442 + }, + { + "acc": 0.88405666, + "epoch": 1.3138047138047138, + "grad_norm": 8.0, + "learning_rate": 5.775443322825183e-06, + "loss": 0.36176951, + "memory(GiB)": 15.04, + "step": 9755, + "train_speed(iter/s)": 0.334482 + }, + { + "acc": 0.91502209, + "epoch": 1.3144781144781144, + "grad_norm": 8.6875, + "learning_rate": 5.765353479015438e-06, + "loss": 0.29535065, + "memory(GiB)": 15.04, + "step": 9760, + "train_speed(iter/s)": 0.334499 + }, + { + "acc": 0.90932894, + "epoch": 1.3151515151515152, + "grad_norm": 4.15625, + "learning_rate": 5.755268885547054e-06, + "loss": 0.45657849, + "memory(GiB)": 15.04, + "step": 9765, + "train_speed(iter/s)": 0.334504 + }, + { + "acc": 0.92367249, + "epoch": 1.3158249158249158, + "grad_norm": 4.5, + "learning_rate": 5.745189554923454e-06, + "loss": 0.2430377, + "memory(GiB)": 15.04, + "step": 9770, + "train_speed(iter/s)": 0.334532 + }, + { + "acc": 0.90884066, + "epoch": 1.3164983164983166, + "grad_norm": 7.59375, + "learning_rate": 5.7351154996415215e-06, + "loss": 0.34123187, + "memory(GiB)": 15.04, + "step": 9775, + "train_speed(iter/s)": 0.334537 + }, + { + "acc": 0.80925026, + "epoch": 1.3171717171717172, + "grad_norm": 13.0625, + "learning_rate": 5.725046732191619e-06, + "loss": 0.4762979, + "memory(GiB)": 15.04, + "step": 9780, + "train_speed(iter/s)": 0.334572 + }, + { + "acc": 0.88695049, + "epoch": 1.3178451178451178, + "grad_norm": 7.78125, + "learning_rate": 5.7149832650575365e-06, + "loss": 0.51415014, + "memory(GiB)": 15.04, + "step": 9785, + "train_speed(iter/s)": 0.334594 + }, + { + "acc": 0.9239048, + "epoch": 1.3185185185185184, + "grad_norm": 8.5625, + "learning_rate": 5.704925110716499e-06, + "loss": 0.31108632, + "memory(GiB)": 15.04, + "step": 9790, + "train_speed(iter/s)": 0.33462 + }, + { + "acc": 0.90207672, + "epoch": 1.3191919191919192, + "grad_norm": 6.4375, + "learning_rate": 5.6948722816391525e-06, + "loss": 0.27707739, + "memory(GiB)": 15.04, + "step": 9795, + "train_speed(iter/s)": 0.334658 + }, + { + "acc": 0.903545, + "epoch": 1.3198653198653199, + "grad_norm": 11.625, + "learning_rate": 5.6848247902895215e-06, + "loss": 0.34115701, + "memory(GiB)": 15.04, + "step": 9800, + "train_speed(iter/s)": 0.334671 + }, + { + "acc": 0.86289349, + "epoch": 1.3205387205387205, + "grad_norm": 12.0, + "learning_rate": 5.6747826491250326e-06, + "loss": 0.51363149, + "memory(GiB)": 15.04, + "step": 9805, + "train_speed(iter/s)": 0.334693 + }, + { + "acc": 0.88835621, + "epoch": 1.3212121212121213, + "grad_norm": 6.4375, + "learning_rate": 5.664745870596462e-06, + "loss": 0.48059182, + "memory(GiB)": 15.04, + "step": 9810, + "train_speed(iter/s)": 0.334691 + }, + { + "acc": 0.82850885, + "epoch": 1.3218855218855219, + "grad_norm": 9.75, + "learning_rate": 5.654714467147951e-06, + "loss": 0.50740252, + "memory(GiB)": 15.04, + "step": 9815, + "train_speed(iter/s)": 0.334718 + }, + { + "acc": 0.84253283, + "epoch": 1.3225589225589225, + "grad_norm": 14.0, + "learning_rate": 5.644688451216968e-06, + "loss": 0.34741228, + "memory(GiB)": 15.04, + "step": 9820, + "train_speed(iter/s)": 0.33476 + }, + { + "acc": 0.88747406, + "epoch": 1.3232323232323233, + "grad_norm": 10.875, + "learning_rate": 5.634667835234302e-06, + "loss": 0.36861217, + "memory(GiB)": 15.04, + "step": 9825, + "train_speed(iter/s)": 0.3348 + }, + { + "acc": 0.947155, + "epoch": 1.323905723905724, + "grad_norm": 5.59375, + "learning_rate": 5.624652631624056e-06, + "loss": 0.20252576, + "memory(GiB)": 15.04, + "step": 9830, + "train_speed(iter/s)": 0.334773 + }, + { + "acc": 0.89678516, + "epoch": 1.3245791245791245, + "grad_norm": 10.375, + "learning_rate": 5.614642852803604e-06, + "loss": 0.38581629, + "memory(GiB)": 15.04, + "step": 9835, + "train_speed(iter/s)": 0.334802 + }, + { + "acc": 0.89338531, + "epoch": 1.3252525252525253, + "grad_norm": 8.25, + "learning_rate": 5.604638511183619e-06, + "loss": 0.32135746, + "memory(GiB)": 15.04, + "step": 9840, + "train_speed(iter/s)": 0.334833 + }, + { + "acc": 0.92641726, + "epoch": 1.325925925925926, + "grad_norm": 7.34375, + "learning_rate": 5.594639619168005e-06, + "loss": 0.25842469, + "memory(GiB)": 15.04, + "step": 9845, + "train_speed(iter/s)": 0.334855 + }, + { + "acc": 0.92931995, + "epoch": 1.3265993265993266, + "grad_norm": 5.0, + "learning_rate": 5.584646189153937e-06, + "loss": 0.21179109, + "memory(GiB)": 15.04, + "step": 9850, + "train_speed(iter/s)": 0.334867 + }, + { + "acc": 0.8109643, + "epoch": 1.3272727272727272, + "grad_norm": 6.5625, + "learning_rate": 5.574658233531796e-06, + "loss": 0.35859571, + "memory(GiB)": 15.04, + "step": 9855, + "train_speed(iter/s)": 0.334897 + }, + { + "acc": 0.85045424, + "epoch": 1.327946127946128, + "grad_norm": 8.0625, + "learning_rate": 5.564675764685181e-06, + "loss": 0.75530548, + "memory(GiB)": 15.04, + "step": 9860, + "train_speed(iter/s)": 0.334895 + }, + { + "acc": 0.93578234, + "epoch": 1.3286195286195286, + "grad_norm": 6.5625, + "learning_rate": 5.554698794990896e-06, + "loss": 0.21489677, + "memory(GiB)": 15.04, + "step": 9865, + "train_speed(iter/s)": 0.334915 + }, + { + "acc": 0.89079962, + "epoch": 1.3292929292929294, + "grad_norm": 5.53125, + "learning_rate": 5.5447273368189255e-06, + "loss": 0.39067354, + "memory(GiB)": 15.04, + "step": 9870, + "train_speed(iter/s)": 0.334916 + }, + { + "acc": 0.91762409, + "epoch": 1.32996632996633, + "grad_norm": 8.125, + "learning_rate": 5.53476140253241e-06, + "loss": 0.26952195, + "memory(GiB)": 15.04, + "step": 9875, + "train_speed(iter/s)": 0.334939 + }, + { + "acc": 0.90775633, + "epoch": 1.3306397306397306, + "grad_norm": 6.65625, + "learning_rate": 5.524801004487652e-06, + "loss": 0.34524853, + "memory(GiB)": 15.04, + "step": 9880, + "train_speed(iter/s)": 0.334968 + }, + { + "acc": 0.87291451, + "epoch": 1.3313131313131312, + "grad_norm": 11.125, + "learning_rate": 5.514846155034084e-06, + "loss": 0.40953698, + "memory(GiB)": 15.04, + "step": 9885, + "train_speed(iter/s)": 0.334983 + }, + { + "acc": 0.92717562, + "epoch": 1.331986531986532, + "grad_norm": 7.1875, + "learning_rate": 5.50489686651426e-06, + "loss": 0.31727536, + "memory(GiB)": 15.04, + "step": 9890, + "train_speed(iter/s)": 0.334996 + }, + { + "acc": 0.93518858, + "epoch": 1.3326599326599327, + "grad_norm": 7.03125, + "learning_rate": 5.494953151263847e-06, + "loss": 0.28765833, + "memory(GiB)": 15.04, + "step": 9895, + "train_speed(iter/s)": 0.335006 + }, + { + "acc": 0.87127934, + "epoch": 1.3333333333333333, + "grad_norm": 20.25, + "learning_rate": 5.485015021611587e-06, + "loss": 0.47570858, + "memory(GiB)": 15.04, + "step": 9900, + "train_speed(iter/s)": 0.335046 + }, + { + "epoch": 1.3333333333333333, + "eval_acc": 0.8956226336808257, + "eval_loss": 0.4088384509086609, + "eval_runtime": 110.0565, + "eval_samples_per_second": 1.363, + "eval_steps_per_second": 1.363, + "step": 9900 + }, + { + "acc": 0.93745594, + "epoch": 1.334006734006734, + "grad_norm": 4.65625, + "learning_rate": 5.475082489879313e-06, + "loss": 0.38016248, + "memory(GiB)": 15.04, + "step": 9905, + "train_speed(iter/s)": 0.333844 + }, + { + "acc": 0.94206533, + "epoch": 1.3346801346801347, + "grad_norm": 8.6875, + "learning_rate": 5.465155568381899e-06, + "loss": 0.20986974, + "memory(GiB)": 15.04, + "step": 9910, + "train_speed(iter/s)": 0.33388 + }, + { + "acc": 0.90396404, + "epoch": 1.3353535353535353, + "grad_norm": 8.5625, + "learning_rate": 5.455234269427281e-06, + "loss": 0.36379542, + "memory(GiB)": 15.04, + "step": 9915, + "train_speed(iter/s)": 0.333903 + }, + { + "acc": 0.8936451, + "epoch": 1.336026936026936, + "grad_norm": 5.5625, + "learning_rate": 5.445318605316418e-06, + "loss": 0.28865125, + "memory(GiB)": 15.04, + "step": 9920, + "train_speed(iter/s)": 0.33391 + }, + { + "acc": 0.93893976, + "epoch": 1.3367003367003367, + "grad_norm": 3.90625, + "learning_rate": 5.4354085883432736e-06, + "loss": 0.24402006, + "memory(GiB)": 15.04, + "step": 9925, + "train_speed(iter/s)": 0.333893 + }, + { + "acc": 0.88604975, + "epoch": 1.3373737373737373, + "grad_norm": 9.8125, + "learning_rate": 5.425504230794827e-06, + "loss": 0.35277283, + "memory(GiB)": 15.04, + "step": 9930, + "train_speed(iter/s)": 0.333924 + }, + { + "acc": 0.85546255, + "epoch": 1.3380471380471382, + "grad_norm": 16.625, + "learning_rate": 5.415605544951019e-06, + "loss": 0.55175953, + "memory(GiB)": 15.04, + "step": 9935, + "train_speed(iter/s)": 0.333952 + }, + { + "acc": 0.91053429, + "epoch": 1.3387205387205388, + "grad_norm": 8.9375, + "learning_rate": 5.405712543084777e-06, + "loss": 0.35091743, + "memory(GiB)": 15.04, + "step": 9940, + "train_speed(iter/s)": 0.333969 + }, + { + "acc": 0.86309891, + "epoch": 1.3393939393939394, + "grad_norm": 5.90625, + "learning_rate": 5.395825237461976e-06, + "loss": 0.644346, + "memory(GiB)": 15.04, + "step": 9945, + "train_speed(iter/s)": 0.333983 + }, + { + "acc": 0.88862133, + "epoch": 1.34006734006734, + "grad_norm": 8.5625, + "learning_rate": 5.38594364034142e-06, + "loss": 0.30490239, + "memory(GiB)": 15.04, + "step": 9950, + "train_speed(iter/s)": 0.334023 + }, + { + "acc": 0.93491707, + "epoch": 1.3407407407407408, + "grad_norm": 6.59375, + "learning_rate": 5.37606776397485e-06, + "loss": 0.25334756, + "memory(GiB)": 15.04, + "step": 9955, + "train_speed(iter/s)": 0.334038 + }, + { + "acc": 0.91741714, + "epoch": 1.3414141414141414, + "grad_norm": 8.0, + "learning_rate": 5.366197620606899e-06, + "loss": 0.24480448, + "memory(GiB)": 15.04, + "step": 9960, + "train_speed(iter/s)": 0.334041 + }, + { + "acc": 0.81197901, + "epoch": 1.3420875420875422, + "grad_norm": 17.125, + "learning_rate": 5.3563332224750985e-06, + "loss": 0.49897423, + "memory(GiB)": 15.04, + "step": 9965, + "train_speed(iter/s)": 0.33405 + }, + { + "acc": 0.87177372, + "epoch": 1.3427609427609428, + "grad_norm": 9.3125, + "learning_rate": 5.346474581809866e-06, + "loss": 0.380778, + "memory(GiB)": 15.04, + "step": 9970, + "train_speed(iter/s)": 0.334081 + }, + { + "acc": 0.93990059, + "epoch": 1.3434343434343434, + "grad_norm": 6.78125, + "learning_rate": 5.336621710834462e-06, + "loss": 0.22352836, + "memory(GiB)": 15.04, + "step": 9975, + "train_speed(iter/s)": 0.334107 + }, + { + "acc": 0.84761782, + "epoch": 1.344107744107744, + "grad_norm": 14.6875, + "learning_rate": 5.326774621765009e-06, + "loss": 0.48472433, + "memory(GiB)": 15.04, + "step": 9980, + "train_speed(iter/s)": 0.334112 + }, + { + "acc": 0.92594032, + "epoch": 1.3447811447811449, + "grad_norm": 6.125, + "learning_rate": 5.316933326810452e-06, + "loss": 0.30096884, + "memory(GiB)": 15.04, + "step": 9985, + "train_speed(iter/s)": 0.334142 + }, + { + "acc": 0.897118, + "epoch": 1.3454545454545455, + "grad_norm": 5.8125, + "learning_rate": 5.3070978381725546e-06, + "loss": 0.64908366, + "memory(GiB)": 15.04, + "step": 9990, + "train_speed(iter/s)": 0.334175 + }, + { + "acc": 0.90277023, + "epoch": 1.346127946127946, + "grad_norm": 5.96875, + "learning_rate": 5.297268168045887e-06, + "loss": 0.35519619, + "memory(GiB)": 15.04, + "step": 9995, + "train_speed(iter/s)": 0.334188 + }, + { + "acc": 0.87940149, + "epoch": 1.3468013468013469, + "grad_norm": 10.625, + "learning_rate": 5.287444328617793e-06, + "loss": 0.47951393, + "memory(GiB)": 15.04, + "step": 10000, + "train_speed(iter/s)": 0.334201 + }, + { + "acc": 0.82602615, + "epoch": 1.3474747474747475, + "grad_norm": 17.875, + "learning_rate": 5.277626332068402e-06, + "loss": 0.52059345, + "memory(GiB)": 15.04, + "step": 10005, + "train_speed(iter/s)": 0.334192 + }, + { + "acc": 0.91368418, + "epoch": 1.348148148148148, + "grad_norm": 6.59375, + "learning_rate": 5.267814190570584e-06, + "loss": 0.21461604, + "memory(GiB)": 15.04, + "step": 10010, + "train_speed(iter/s)": 0.334219 + }, + { + "acc": 0.91373196, + "epoch": 1.3488215488215487, + "grad_norm": 7.5625, + "learning_rate": 5.258007916289965e-06, + "loss": 0.26179779, + "memory(GiB)": 15.04, + "step": 10015, + "train_speed(iter/s)": 0.334237 + }, + { + "acc": 0.91855373, + "epoch": 1.3494949494949495, + "grad_norm": 9.125, + "learning_rate": 5.24820752138488e-06, + "loss": 0.31510684, + "memory(GiB)": 15.04, + "step": 10020, + "train_speed(iter/s)": 0.334273 + }, + { + "acc": 0.91080656, + "epoch": 1.3501683501683501, + "grad_norm": 9.9375, + "learning_rate": 5.238413018006392e-06, + "loss": 0.2690109, + "memory(GiB)": 15.04, + "step": 10025, + "train_speed(iter/s)": 0.334306 + }, + { + "acc": 0.93179283, + "epoch": 1.350841750841751, + "grad_norm": 5.34375, + "learning_rate": 5.228624418298241e-06, + "loss": 0.2307549, + "memory(GiB)": 15.04, + "step": 10030, + "train_speed(iter/s)": 0.334341 + }, + { + "acc": 0.86719303, + "epoch": 1.3515151515151516, + "grad_norm": 9.375, + "learning_rate": 5.2188417343968645e-06, + "loss": 0.48214526, + "memory(GiB)": 15.04, + "step": 10035, + "train_speed(iter/s)": 0.334341 + }, + { + "acc": 0.8814992, + "epoch": 1.3521885521885522, + "grad_norm": 12.9375, + "learning_rate": 5.209064978431353e-06, + "loss": 0.29228759, + "memory(GiB)": 15.04, + "step": 10040, + "train_speed(iter/s)": 0.334386 + }, + { + "acc": 0.8772913, + "epoch": 1.3528619528619528, + "grad_norm": 8.6875, + "learning_rate": 5.199294162523455e-06, + "loss": 0.52333031, + "memory(GiB)": 15.04, + "step": 10045, + "train_speed(iter/s)": 0.334412 + }, + { + "acc": 0.83181, + "epoch": 1.3535353535353536, + "grad_norm": 11.0625, + "learning_rate": 5.189529298787546e-06, + "loss": 0.71318178, + "memory(GiB)": 15.04, + "step": 10050, + "train_speed(iter/s)": 0.334427 + }, + { + "acc": 0.92688646, + "epoch": 1.3542087542087542, + "grad_norm": 7.3125, + "learning_rate": 5.179770399330629e-06, + "loss": 0.30033717, + "memory(GiB)": 15.04, + "step": 10055, + "train_speed(iter/s)": 0.334453 + }, + { + "acc": 0.88847084, + "epoch": 1.354882154882155, + "grad_norm": 5.71875, + "learning_rate": 5.170017476252316e-06, + "loss": 0.57049932, + "memory(GiB)": 15.04, + "step": 10060, + "train_speed(iter/s)": 0.334476 + }, + { + "acc": 0.83316708, + "epoch": 1.3555555555555556, + "grad_norm": 12.0, + "learning_rate": 5.160270541644792e-06, + "loss": 0.25934565, + "memory(GiB)": 15.04, + "step": 10065, + "train_speed(iter/s)": 0.334505 + }, + { + "acc": 0.94990816, + "epoch": 1.3562289562289562, + "grad_norm": 10.3125, + "learning_rate": 5.150529607592838e-06, + "loss": 0.20809155, + "memory(GiB)": 15.04, + "step": 10070, + "train_speed(iter/s)": 0.334522 + }, + { + "acc": 0.89981232, + "epoch": 1.3569023569023568, + "grad_norm": 5.4375, + "learning_rate": 5.140794686173777e-06, + "loss": 0.40964913, + "memory(GiB)": 15.04, + "step": 10075, + "train_speed(iter/s)": 0.334545 + }, + { + "acc": 0.93371658, + "epoch": 1.3575757575757577, + "grad_norm": 11.0, + "learning_rate": 5.131065789457489e-06, + "loss": 0.24967754, + "memory(GiB)": 15.04, + "step": 10080, + "train_speed(iter/s)": 0.334573 + }, + { + "acc": 0.8034173, + "epoch": 1.3582491582491583, + "grad_norm": 17.875, + "learning_rate": 5.121342929506386e-06, + "loss": 0.67588377, + "memory(GiB)": 15.04, + "step": 10085, + "train_speed(iter/s)": 0.334605 + }, + { + "acc": 0.84909592, + "epoch": 1.3589225589225589, + "grad_norm": 11.375, + "learning_rate": 5.111626118375379e-06, + "loss": 0.5578403, + "memory(GiB)": 15.04, + "step": 10090, + "train_speed(iter/s)": 0.334616 + }, + { + "acc": 0.88609409, + "epoch": 1.3595959595959597, + "grad_norm": 7.84375, + "learning_rate": 5.1019153681119024e-06, + "loss": 0.41217747, + "memory(GiB)": 15.04, + "step": 10095, + "train_speed(iter/s)": 0.334635 + }, + { + "acc": 0.90607328, + "epoch": 1.3602693602693603, + "grad_norm": 7.90625, + "learning_rate": 5.092210690755853e-06, + "loss": 0.50391126, + "memory(GiB)": 15.04, + "step": 10100, + "train_speed(iter/s)": 0.334646 + }, + { + "acc": 0.89073992, + "epoch": 1.360942760942761, + "grad_norm": 8.3125, + "learning_rate": 5.082512098339616e-06, + "loss": 0.36505196, + "memory(GiB)": 15.04, + "step": 10105, + "train_speed(iter/s)": 0.33467 + }, + { + "acc": 0.89976683, + "epoch": 1.3616161616161615, + "grad_norm": 5.78125, + "learning_rate": 5.0728196028880265e-06, + "loss": 0.70888257, + "memory(GiB)": 15.04, + "step": 10110, + "train_speed(iter/s)": 0.334702 + }, + { + "acc": 0.92454653, + "epoch": 1.3622895622895623, + "grad_norm": 5.0, + "learning_rate": 5.063133216418351e-06, + "loss": 0.2980185, + "memory(GiB)": 15.04, + "step": 10115, + "train_speed(iter/s)": 0.334702 + }, + { + "acc": 0.89257593, + "epoch": 1.362962962962963, + "grad_norm": 6.53125, + "learning_rate": 5.0534529509402995e-06, + "loss": 0.31065969, + "memory(GiB)": 15.04, + "step": 10120, + "train_speed(iter/s)": 0.334706 + }, + { + "acc": 0.91431408, + "epoch": 1.3636363636363638, + "grad_norm": 6.53125, + "learning_rate": 5.0437788184559755e-06, + "loss": 0.39485724, + "memory(GiB)": 15.04, + "step": 10125, + "train_speed(iter/s)": 0.334735 + }, + { + "acc": 0.88906364, + "epoch": 1.3643097643097644, + "grad_norm": 7.71875, + "learning_rate": 5.0341108309598886e-06, + "loss": 0.26602957, + "memory(GiB)": 15.04, + "step": 10130, + "train_speed(iter/s)": 0.334744 + }, + { + "acc": 0.90695925, + "epoch": 1.364983164983165, + "grad_norm": 6.4375, + "learning_rate": 5.024449000438931e-06, + "loss": 0.36310508, + "memory(GiB)": 15.04, + "step": 10135, + "train_speed(iter/s)": 0.334748 + }, + { + "acc": 0.91908731, + "epoch": 1.3656565656565656, + "grad_norm": 8.9375, + "learning_rate": 5.01479333887235e-06, + "loss": 0.37564602, + "memory(GiB)": 15.04, + "step": 10140, + "train_speed(iter/s)": 0.334768 + }, + { + "acc": 0.9059762, + "epoch": 1.3663299663299664, + "grad_norm": 5.96875, + "learning_rate": 5.0051438582317594e-06, + "loss": 0.18879628, + "memory(GiB)": 15.04, + "step": 10145, + "train_speed(iter/s)": 0.334798 + }, + { + "acc": 0.92819109, + "epoch": 1.367003367003367, + "grad_norm": 5.96875, + "learning_rate": 4.9955005704810936e-06, + "loss": 0.30936062, + "memory(GiB)": 15.04, + "step": 10150, + "train_speed(iter/s)": 0.334822 + }, + { + "acc": 0.93115044, + "epoch": 1.3676767676767676, + "grad_norm": 6.71875, + "learning_rate": 4.9858634875766196e-06, + "loss": 0.32543521, + "memory(GiB)": 15.04, + "step": 10155, + "train_speed(iter/s)": 0.334836 + }, + { + "acc": 0.8822731, + "epoch": 1.3683501683501684, + "grad_norm": 5.0, + "learning_rate": 4.9762326214669154e-06, + "loss": 0.6356657, + "memory(GiB)": 15.04, + "step": 10160, + "train_speed(iter/s)": 0.334851 + }, + { + "acc": 0.86856585, + "epoch": 1.369023569023569, + "grad_norm": 11.375, + "learning_rate": 4.966607984092834e-06, + "loss": 0.33516977, + "memory(GiB)": 15.04, + "step": 10165, + "train_speed(iter/s)": 0.334886 + }, + { + "acc": 0.89640121, + "epoch": 1.3696969696969696, + "grad_norm": 8.9375, + "learning_rate": 4.956989587387523e-06, + "loss": 0.41605682, + "memory(GiB)": 15.04, + "step": 10170, + "train_speed(iter/s)": 0.33489 + }, + { + "acc": 0.93657713, + "epoch": 1.3703703703703702, + "grad_norm": 4.875, + "learning_rate": 4.94737744327638e-06, + "loss": 0.24267459, + "memory(GiB)": 15.04, + "step": 10175, + "train_speed(iter/s)": 0.334899 + }, + { + "acc": 0.88825378, + "epoch": 1.371043771043771, + "grad_norm": 6.5625, + "learning_rate": 4.93777156367706e-06, + "loss": 0.40265498, + "memory(GiB)": 15.04, + "step": 10180, + "train_speed(iter/s)": 0.334921 + }, + { + "acc": 0.91289072, + "epoch": 1.3717171717171717, + "grad_norm": 10.5, + "learning_rate": 4.928171960499442e-06, + "loss": 0.30160131, + "memory(GiB)": 15.04, + "step": 10185, + "train_speed(iter/s)": 0.334931 + }, + { + "acc": 0.92027521, + "epoch": 1.3723905723905725, + "grad_norm": 9.0625, + "learning_rate": 4.918578645645635e-06, + "loss": 0.40555534, + "memory(GiB)": 15.04, + "step": 10190, + "train_speed(iter/s)": 0.334964 + }, + { + "acc": 0.85583553, + "epoch": 1.373063973063973, + "grad_norm": 15.0625, + "learning_rate": 4.908991631009936e-06, + "loss": 0.65150752, + "memory(GiB)": 15.04, + "step": 10195, + "train_speed(iter/s)": 0.334969 + }, + { + "acc": 0.94926624, + "epoch": 1.3737373737373737, + "grad_norm": 5.0, + "learning_rate": 4.8994109284788445e-06, + "loss": 0.22038443, + "memory(GiB)": 15.04, + "step": 10200, + "train_speed(iter/s)": 0.334989 + }, + { + "epoch": 1.3737373737373737, + "eval_acc": 0.8955845525240543, + "eval_loss": 0.40956243872642517, + "eval_runtime": 109.7705, + "eval_samples_per_second": 1.366, + "eval_steps_per_second": 1.366, + "step": 10200 + }, + { + "acc": 0.77951794, + "epoch": 1.3744107744107743, + "grad_norm": 15.0625, + "learning_rate": 4.889836549931024e-06, + "loss": 0.83544111, + "memory(GiB)": 15.04, + "step": 10205, + "train_speed(iter/s)": 0.33383 + }, + { + "acc": 0.88952808, + "epoch": 1.3750841750841751, + "grad_norm": 6.125, + "learning_rate": 4.880268507237307e-06, + "loss": 0.37691703, + "memory(GiB)": 15.04, + "step": 10210, + "train_speed(iter/s)": 0.333843 + }, + { + "acc": 0.89511099, + "epoch": 1.3757575757575757, + "grad_norm": 9.625, + "learning_rate": 4.870706812260656e-06, + "loss": 0.32926075, + "memory(GiB)": 15.04, + "step": 10215, + "train_speed(iter/s)": 0.333872 + }, + { + "acc": 0.90368662, + "epoch": 1.3764309764309766, + "grad_norm": 14.75, + "learning_rate": 4.861151476856182e-06, + "loss": 0.44513268, + "memory(GiB)": 15.04, + "step": 10220, + "train_speed(iter/s)": 0.333897 + }, + { + "acc": 0.87467747, + "epoch": 1.3771043771043772, + "grad_norm": 13.9375, + "learning_rate": 4.851602512871092e-06, + "loss": 0.49827414, + "memory(GiB)": 15.04, + "step": 10225, + "train_speed(iter/s)": 0.333921 + }, + { + "acc": 0.93380136, + "epoch": 1.3777777777777778, + "grad_norm": 6.1875, + "learning_rate": 4.8420599321447085e-06, + "loss": 0.2167762, + "memory(GiB)": 15.04, + "step": 10230, + "train_speed(iter/s)": 0.333941 + }, + { + "acc": 0.90687685, + "epoch": 1.3784511784511784, + "grad_norm": 6.34375, + "learning_rate": 4.832523746508434e-06, + "loss": 0.38546033, + "memory(GiB)": 15.04, + "step": 10235, + "train_speed(iter/s)": 0.333938 + }, + { + "acc": 0.89047594, + "epoch": 1.3791245791245792, + "grad_norm": 9.8125, + "learning_rate": 4.8229939677857375e-06, + "loss": 0.41071882, + "memory(GiB)": 15.04, + "step": 10240, + "train_speed(iter/s)": 0.333965 + }, + { + "acc": 0.90846529, + "epoch": 1.3797979797979798, + "grad_norm": 9.9375, + "learning_rate": 4.813470607792154e-06, + "loss": 0.27075646, + "memory(GiB)": 15.04, + "step": 10245, + "train_speed(iter/s)": 0.333999 + }, + { + "acc": 0.92954416, + "epoch": 1.3804713804713804, + "grad_norm": 6.4375, + "learning_rate": 4.803953678335249e-06, + "loss": 0.2295686, + "memory(GiB)": 15.04, + "step": 10250, + "train_speed(iter/s)": 0.334027 + }, + { + "acc": 0.89961386, + "epoch": 1.3811447811447812, + "grad_norm": 9.75, + "learning_rate": 4.794443191214624e-06, + "loss": 0.46945553, + "memory(GiB)": 15.04, + "step": 10255, + "train_speed(iter/s)": 0.334043 + }, + { + "acc": 0.89444189, + "epoch": 1.3818181818181818, + "grad_norm": 11.25, + "learning_rate": 4.784939158221893e-06, + "loss": 0.41053882, + "memory(GiB)": 15.04, + "step": 10260, + "train_speed(iter/s)": 0.334076 + }, + { + "acc": 0.91152878, + "epoch": 1.3824915824915824, + "grad_norm": 6.0625, + "learning_rate": 4.775441591140657e-06, + "loss": 0.3230912, + "memory(GiB)": 15.04, + "step": 10265, + "train_speed(iter/s)": 0.334084 + }, + { + "acc": 0.9085844, + "epoch": 1.383164983164983, + "grad_norm": 5.0625, + "learning_rate": 4.765950501746517e-06, + "loss": 0.41453176, + "memory(GiB)": 15.04, + "step": 10270, + "train_speed(iter/s)": 0.334075 + }, + { + "acc": 0.84861412, + "epoch": 1.3838383838383839, + "grad_norm": 5.3125, + "learning_rate": 4.756465901807025e-06, + "loss": 0.51905255, + "memory(GiB)": 15.04, + "step": 10275, + "train_speed(iter/s)": 0.334048 + }, + { + "acc": 0.89370041, + "epoch": 1.3845117845117845, + "grad_norm": 7.875, + "learning_rate": 4.746987803081698e-06, + "loss": 0.34876211, + "memory(GiB)": 15.04, + "step": 10280, + "train_speed(iter/s)": 0.33408 + }, + { + "acc": 0.88962431, + "epoch": 1.3851851851851853, + "grad_norm": 4.21875, + "learning_rate": 4.737516217321996e-06, + "loss": 0.44635606, + "memory(GiB)": 15.04, + "step": 10285, + "train_speed(iter/s)": 0.334096 + }, + { + "acc": 0.89850054, + "epoch": 1.385858585858586, + "grad_norm": 6.09375, + "learning_rate": 4.728051156271289e-06, + "loss": 0.4509078, + "memory(GiB)": 15.04, + "step": 10290, + "train_speed(iter/s)": 0.334114 + }, + { + "acc": 0.89144163, + "epoch": 1.3865319865319865, + "grad_norm": 4.375, + "learning_rate": 4.718592631664875e-06, + "loss": 0.38697112, + "memory(GiB)": 15.04, + "step": 10295, + "train_speed(iter/s)": 0.334129 + }, + { + "acc": 0.9227663, + "epoch": 1.387205387205387, + "grad_norm": 5.9375, + "learning_rate": 4.70914065522993e-06, + "loss": 0.42397604, + "memory(GiB)": 15.04, + "step": 10300, + "train_speed(iter/s)": 0.334125 + }, + { + "acc": 0.93395185, + "epoch": 1.387878787878788, + "grad_norm": 23.25, + "learning_rate": 4.699695238685526e-06, + "loss": 0.24768918, + "memory(GiB)": 15.04, + "step": 10305, + "train_speed(iter/s)": 0.334162 + }, + { + "acc": 0.89229746, + "epoch": 1.3885521885521885, + "grad_norm": 15.25, + "learning_rate": 4.690256393742596e-06, + "loss": 0.32988002, + "memory(GiB)": 15.04, + "step": 10310, + "train_speed(iter/s)": 0.334197 + }, + { + "acc": 0.834126, + "epoch": 1.3892255892255894, + "grad_norm": 42.25, + "learning_rate": 4.680824132103921e-06, + "loss": 0.7447185, + "memory(GiB)": 15.04, + "step": 10315, + "train_speed(iter/s)": 0.334218 + }, + { + "acc": 0.86799088, + "epoch": 1.38989898989899, + "grad_norm": 13.3125, + "learning_rate": 4.671398465464129e-06, + "loss": 0.33674641, + "memory(GiB)": 15.04, + "step": 10320, + "train_speed(iter/s)": 0.334257 + }, + { + "acc": 0.94394722, + "epoch": 1.3905723905723906, + "grad_norm": 5.03125, + "learning_rate": 4.661979405509659e-06, + "loss": 0.22380381, + "memory(GiB)": 15.04, + "step": 10325, + "train_speed(iter/s)": 0.334271 + }, + { + "acc": 0.88122826, + "epoch": 1.3912457912457912, + "grad_norm": 19.625, + "learning_rate": 4.6525669639187705e-06, + "loss": 0.29948568, + "memory(GiB)": 15.04, + "step": 10330, + "train_speed(iter/s)": 0.334284 + }, + { + "acc": 0.93242474, + "epoch": 1.391919191919192, + "grad_norm": 12.375, + "learning_rate": 4.643161152361515e-06, + "loss": 0.23160937, + "memory(GiB)": 15.04, + "step": 10335, + "train_speed(iter/s)": 0.334312 + }, + { + "acc": 0.84878254, + "epoch": 1.3925925925925926, + "grad_norm": 28.5, + "learning_rate": 4.633761982499713e-06, + "loss": 0.44875698, + "memory(GiB)": 15.04, + "step": 10340, + "train_speed(iter/s)": 0.334354 + }, + { + "acc": 0.89076071, + "epoch": 1.3932659932659932, + "grad_norm": 5.28125, + "learning_rate": 4.624369465986967e-06, + "loss": 0.34800448, + "memory(GiB)": 15.04, + "step": 10345, + "train_speed(iter/s)": 0.334366 + }, + { + "acc": 0.90187988, + "epoch": 1.393939393939394, + "grad_norm": 5.0625, + "learning_rate": 4.614983614468613e-06, + "loss": 0.25995982, + "memory(GiB)": 15.04, + "step": 10350, + "train_speed(iter/s)": 0.334383 + }, + { + "acc": 0.86971121, + "epoch": 1.3946127946127946, + "grad_norm": 11.0625, + "learning_rate": 4.60560443958174e-06, + "loss": 0.32057152, + "memory(GiB)": 15.04, + "step": 10355, + "train_speed(iter/s)": 0.334386 + }, + { + "acc": 0.88873186, + "epoch": 1.3952861952861952, + "grad_norm": 7.5, + "learning_rate": 4.596231952955143e-06, + "loss": 0.4387907, + "memory(GiB)": 15.04, + "step": 10360, + "train_speed(iter/s)": 0.334393 + }, + { + "acc": 0.89455509, + "epoch": 1.3959595959595958, + "grad_norm": 6.0, + "learning_rate": 4.586866166209342e-06, + "loss": 0.25350516, + "memory(GiB)": 15.04, + "step": 10365, + "train_speed(iter/s)": 0.334425 + }, + { + "acc": 0.90793009, + "epoch": 1.3966329966329967, + "grad_norm": 5.4375, + "learning_rate": 4.577507090956529e-06, + "loss": 0.38416338, + "memory(GiB)": 15.04, + "step": 10370, + "train_speed(iter/s)": 0.33443 + }, + { + "acc": 0.91300716, + "epoch": 1.3973063973063973, + "grad_norm": 7.96875, + "learning_rate": 4.568154738800597e-06, + "loss": 0.30698023, + "memory(GiB)": 15.04, + "step": 10375, + "train_speed(iter/s)": 0.334457 + }, + { + "acc": 0.94259071, + "epoch": 1.397979797979798, + "grad_norm": 9.125, + "learning_rate": 4.558809121337086e-06, + "loss": 0.24739785, + "memory(GiB)": 15.04, + "step": 10380, + "train_speed(iter/s)": 0.334486 + }, + { + "acc": 0.9485342, + "epoch": 1.3986531986531987, + "grad_norm": 7.09375, + "learning_rate": 4.549470250153197e-06, + "loss": 0.18721933, + "memory(GiB)": 15.04, + "step": 10385, + "train_speed(iter/s)": 0.334502 + }, + { + "acc": 0.92590742, + "epoch": 1.3993265993265993, + "grad_norm": 3.765625, + "learning_rate": 4.5401381368277555e-06, + "loss": 0.24307759, + "memory(GiB)": 15.04, + "step": 10390, + "train_speed(iter/s)": 0.334516 + }, + { + "acc": 0.92164106, + "epoch": 1.4, + "grad_norm": 6.8125, + "learning_rate": 4.530812792931224e-06, + "loss": 0.25890124, + "memory(GiB)": 15.04, + "step": 10395, + "train_speed(iter/s)": 0.334527 + }, + { + "acc": 0.92346478, + "epoch": 1.4006734006734007, + "grad_norm": 5.78125, + "learning_rate": 4.521494230025655e-06, + "loss": 0.33697472, + "memory(GiB)": 15.04, + "step": 10400, + "train_speed(iter/s)": 0.334535 + }, + { + "acc": 0.8685358, + "epoch": 1.4013468013468013, + "grad_norm": 7.28125, + "learning_rate": 4.512182459664705e-06, + "loss": 0.38889794, + "memory(GiB)": 15.04, + "step": 10405, + "train_speed(iter/s)": 0.334567 + }, + { + "acc": 0.88733273, + "epoch": 1.402020202020202, + "grad_norm": 8.5, + "learning_rate": 4.502877493393607e-06, + "loss": 0.56911922, + "memory(GiB)": 15.04, + "step": 10410, + "train_speed(iter/s)": 0.334578 + }, + { + "acc": 0.93346405, + "epoch": 1.4026936026936028, + "grad_norm": 9.3125, + "learning_rate": 4.493579342749152e-06, + "loss": 0.3124732, + "memory(GiB)": 15.04, + "step": 10415, + "train_speed(iter/s)": 0.334597 + }, + { + "acc": 0.89845037, + "epoch": 1.4033670033670034, + "grad_norm": 7.71875, + "learning_rate": 4.4842880192596896e-06, + "loss": 0.439781, + "memory(GiB)": 15.04, + "step": 10420, + "train_speed(iter/s)": 0.334633 + }, + { + "acc": 0.89592113, + "epoch": 1.404040404040404, + "grad_norm": 11.75, + "learning_rate": 4.475003534445094e-06, + "loss": 0.48291736, + "memory(GiB)": 15.04, + "step": 10425, + "train_speed(iter/s)": 0.334639 + }, + { + "acc": 0.8618639, + "epoch": 1.4047138047138046, + "grad_norm": 7.1875, + "learning_rate": 4.46572589981677e-06, + "loss": 0.4195827, + "memory(GiB)": 15.04, + "step": 10430, + "train_speed(iter/s)": 0.334653 + }, + { + "acc": 0.84246922, + "epoch": 1.4053872053872054, + "grad_norm": 6.5625, + "learning_rate": 4.456455126877627e-06, + "loss": 0.3907258, + "memory(GiB)": 15.04, + "step": 10435, + "train_speed(iter/s)": 0.334658 + }, + { + "acc": 0.88151245, + "epoch": 1.406060606060606, + "grad_norm": 26.0, + "learning_rate": 4.44719122712206e-06, + "loss": 0.45118403, + "memory(GiB)": 15.04, + "step": 10440, + "train_speed(iter/s)": 0.334663 + }, + { + "acc": 0.8924264, + "epoch": 1.4067340067340068, + "grad_norm": 14.1875, + "learning_rate": 4.437934212035954e-06, + "loss": 0.33682299, + "memory(GiB)": 15.04, + "step": 10445, + "train_speed(iter/s)": 0.334696 + }, + { + "acc": 0.91919479, + "epoch": 1.4074074074074074, + "grad_norm": 6.625, + "learning_rate": 4.428684093096647e-06, + "loss": 0.24935861, + "memory(GiB)": 15.04, + "step": 10450, + "train_speed(iter/s)": 0.33472 + }, + { + "acc": 0.90029221, + "epoch": 1.408080808080808, + "grad_norm": 8.25, + "learning_rate": 4.41944088177293e-06, + "loss": 0.52882147, + "memory(GiB)": 15.04, + "step": 10455, + "train_speed(iter/s)": 0.334719 + }, + { + "acc": 0.88726606, + "epoch": 1.4087542087542086, + "grad_norm": 16.75, + "learning_rate": 4.41020458952504e-06, + "loss": 0.64942045, + "memory(GiB)": 15.04, + "step": 10460, + "train_speed(iter/s)": 0.334708 + }, + { + "acc": 0.93211842, + "epoch": 1.4094276094276095, + "grad_norm": 6.3125, + "learning_rate": 4.400975227804616e-06, + "loss": 0.21920092, + "memory(GiB)": 15.04, + "step": 10465, + "train_speed(iter/s)": 0.334722 + }, + { + "acc": 0.85095205, + "epoch": 1.41010101010101, + "grad_norm": 24.125, + "learning_rate": 4.3917528080547225e-06, + "loss": 0.56060896, + "memory(GiB)": 15.04, + "step": 10470, + "train_speed(iter/s)": 0.334743 + }, + { + "acc": 0.93012409, + "epoch": 1.410774410774411, + "grad_norm": 8.625, + "learning_rate": 4.3825373417098015e-06, + "loss": 0.25061631, + "memory(GiB)": 15.04, + "step": 10475, + "train_speed(iter/s)": 0.334761 + }, + { + "acc": 0.93014574, + "epoch": 1.4114478114478115, + "grad_norm": 12.5625, + "learning_rate": 4.373328840195686e-06, + "loss": 0.2959131, + "memory(GiB)": 15.04, + "step": 10480, + "train_speed(iter/s)": 0.334742 + }, + { + "acc": 0.91991816, + "epoch": 1.412121212121212, + "grad_norm": 4.65625, + "learning_rate": 4.364127314929571e-06, + "loss": 0.31254485, + "memory(GiB)": 15.04, + "step": 10485, + "train_speed(iter/s)": 0.334709 + }, + { + "acc": 0.88106947, + "epoch": 1.4127946127946127, + "grad_norm": 6.3125, + "learning_rate": 4.354932777319995e-06, + "loss": 0.40515866, + "memory(GiB)": 15.04, + "step": 10490, + "train_speed(iter/s)": 0.334747 + }, + { + "acc": 0.84238834, + "epoch": 1.4134680134680135, + "grad_norm": 8.5625, + "learning_rate": 4.345745238766842e-06, + "loss": 0.61503758, + "memory(GiB)": 15.04, + "step": 10495, + "train_speed(iter/s)": 0.334777 + }, + { + "acc": 0.90629997, + "epoch": 1.4141414141414141, + "grad_norm": 6.53125, + "learning_rate": 4.3365647106613085e-06, + "loss": 0.35899134, + "memory(GiB)": 15.04, + "step": 10500, + "train_speed(iter/s)": 0.334812 + }, + { + "epoch": 1.4141414141414141, + "eval_acc": 0.8955231477946604, + "eval_loss": 0.4071575999259949, + "eval_runtime": 110.4916, + "eval_samples_per_second": 1.358, + "eval_steps_per_second": 1.358, + "step": 10500 + } + ], + "logging_steps": 5, + "max_steps": 14850, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.608415718624e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}