{ "best_metric": 0.83392954, "best_model_checkpoint": "/yldm0226/llm_sft_output/qwen2-7b/v14-20240829-144156/checkpoint-64500", "epoch": 1.5048338412633604, "eval_steps": 500, "global_step": 64500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.61647654, "epoch": 2.333075722888931e-05, "grad_norm": 43.0, "learning_rate": 3.888024883359254e-09, "loss": 1.74678063, "memory(GiB)": 101.91, "step": 1, "train_speed(iter/s)": 0.078507 }, { "acc": 0.63459184, "epoch": 0.0002333075722888931, "grad_norm": 24.75, "learning_rate": 3.888024883359254e-08, "loss": 1.52511342, "memory(GiB)": 103.58, "step": 10, "train_speed(iter/s)": 0.301552 }, { "acc": 0.62491989, "epoch": 0.0004666151445777862, "grad_norm": 47.25, "learning_rate": 7.776049766718508e-08, "loss": 1.61745605, "memory(GiB)": 106.05, "step": 20, "train_speed(iter/s)": 0.341082 }, { "acc": 0.6284081, "epoch": 0.0006999227168666793, "grad_norm": 29.125, "learning_rate": 1.1664074650077761e-07, "loss": 1.53084373, "memory(GiB)": 106.05, "step": 30, "train_speed(iter/s)": 0.365297 }, { "acc": 0.6367589, "epoch": 0.0009332302891555724, "grad_norm": 41.5, "learning_rate": 1.5552099533437016e-07, "loss": 1.52307425, "memory(GiB)": 106.18, "step": 40, "train_speed(iter/s)": 0.372324 }, { "acc": 0.62523122, "epoch": 0.0011665378614444655, "grad_norm": 22.125, "learning_rate": 1.944012441679627e-07, "loss": 1.56338863, "memory(GiB)": 107.61, "step": 50, "train_speed(iter/s)": 0.373452 }, { "acc": 0.62716923, "epoch": 0.0013998454337333585, "grad_norm": 61.75, "learning_rate": 2.3328149300155523e-07, "loss": 1.61170464, "memory(GiB)": 107.61, "step": 60, "train_speed(iter/s)": 0.377821 }, { "acc": 0.64288507, "epoch": 0.0016331530060222517, "grad_norm": 45.5, "learning_rate": 2.721617418351478e-07, "loss": 1.50410194, "memory(GiB)": 107.61, "step": 70, "train_speed(iter/s)": 0.383983 }, { "acc": 0.62396164, "epoch": 0.0018664605783111448, "grad_norm": 36.75, "learning_rate": 3.110419906687403e-07, "loss": 1.58061781, "memory(GiB)": 107.61, "step": 80, "train_speed(iter/s)": 0.387383 }, { "acc": 0.64201689, "epoch": 0.0020997681506000378, "grad_norm": 27.625, "learning_rate": 3.4992223950233286e-07, "loss": 1.47628975, "memory(GiB)": 107.61, "step": 90, "train_speed(iter/s)": 0.392346 }, { "acc": 0.62364359, "epoch": 0.002333075722888931, "grad_norm": 96.0, "learning_rate": 3.888024883359254e-07, "loss": 1.60199127, "memory(GiB)": 107.73, "step": 100, "train_speed(iter/s)": 0.394553 }, { "acc": 0.63773804, "epoch": 0.0025663832951778242, "grad_norm": 43.25, "learning_rate": 4.2768273716951787e-07, "loss": 1.49887962, "memory(GiB)": 107.73, "step": 110, "train_speed(iter/s)": 0.398459 }, { "acc": 0.63337984, "epoch": 0.002799690867466717, "grad_norm": 21.75, "learning_rate": 4.6656298600311046e-07, "loss": 1.50928802, "memory(GiB)": 107.73, "step": 120, "train_speed(iter/s)": 0.398931 }, { "acc": 0.6219851, "epoch": 0.0030329984397556103, "grad_norm": 13.4375, "learning_rate": 5.054432348367029e-07, "loss": 1.57879667, "memory(GiB)": 107.73, "step": 130, "train_speed(iter/s)": 0.399653 }, { "acc": 0.62025309, "epoch": 0.0032663060120445035, "grad_norm": 23.625, "learning_rate": 5.443234836702956e-07, "loss": 1.56035786, "memory(GiB)": 107.73, "step": 140, "train_speed(iter/s)": 0.400702 }, { "acc": 0.6314353, "epoch": 0.0034996135843333967, "grad_norm": 33.5, "learning_rate": 5.832037325038881e-07, "loss": 1.48340359, "memory(GiB)": 107.73, "step": 150, "train_speed(iter/s)": 0.400995 }, { "acc": 0.63966684, "epoch": 0.0037329211566222895, "grad_norm": 30.125, "learning_rate": 6.220839813374806e-07, "loss": 1.46832266, "memory(GiB)": 109.27, "step": 160, "train_speed(iter/s)": 0.400399 }, { "acc": 0.62565508, "epoch": 0.003966228728911183, "grad_norm": 30.625, "learning_rate": 6.609642301710731e-07, "loss": 1.50919857, "memory(GiB)": 109.27, "step": 170, "train_speed(iter/s)": 0.400965 }, { "acc": 0.64747152, "epoch": 0.0041995363012000755, "grad_norm": 33.25, "learning_rate": 6.998444790046657e-07, "loss": 1.40472679, "memory(GiB)": 109.27, "step": 180, "train_speed(iter/s)": 0.401444 }, { "acc": 0.63570065, "epoch": 0.004432843873488969, "grad_norm": 107.0, "learning_rate": 7.387247278382582e-07, "loss": 1.47104721, "memory(GiB)": 109.27, "step": 190, "train_speed(iter/s)": 0.402174 }, { "acc": 0.64924078, "epoch": 0.004666151445777862, "grad_norm": 26.375, "learning_rate": 7.776049766718508e-07, "loss": 1.41057568, "memory(GiB)": 109.27, "step": 200, "train_speed(iter/s)": 0.400875 }, { "acc": 0.64792824, "epoch": 0.004899459018066755, "grad_norm": 16.625, "learning_rate": 8.164852255054432e-07, "loss": 1.41193724, "memory(GiB)": 109.27, "step": 210, "train_speed(iter/s)": 0.40145 }, { "acc": 0.65794325, "epoch": 0.0051327665903556485, "grad_norm": 27.625, "learning_rate": 8.553654743390357e-07, "loss": 1.32516994, "memory(GiB)": 109.27, "step": 220, "train_speed(iter/s)": 0.400563 }, { "acc": 0.64404802, "epoch": 0.005366074162644541, "grad_norm": 24.75, "learning_rate": 8.942457231726284e-07, "loss": 1.40193615, "memory(GiB)": 109.27, "step": 230, "train_speed(iter/s)": 0.40142 }, { "acc": 0.6396965, "epoch": 0.005599381734933434, "grad_norm": 33.25, "learning_rate": 9.331259720062209e-07, "loss": 1.45375385, "memory(GiB)": 109.27, "step": 240, "train_speed(iter/s)": 0.401781 }, { "acc": 0.67226753, "epoch": 0.005832689307222328, "grad_norm": 13.9375, "learning_rate": 9.720062208398133e-07, "loss": 1.27868023, "memory(GiB)": 109.65, "step": 250, "train_speed(iter/s)": 0.402589 }, { "acc": 0.65854316, "epoch": 0.0060659968795112205, "grad_norm": 13.1875, "learning_rate": 1.0108864696734059e-06, "loss": 1.35100641, "memory(GiB)": 109.65, "step": 260, "train_speed(iter/s)": 0.402595 }, { "acc": 0.65821409, "epoch": 0.006299304451800113, "grad_norm": 17.0, "learning_rate": 1.0497667185069986e-06, "loss": 1.34223928, "memory(GiB)": 109.77, "step": 270, "train_speed(iter/s)": 0.402759 }, { "acc": 0.69331517, "epoch": 0.006532612024089007, "grad_norm": 13.3125, "learning_rate": 1.0886469673405912e-06, "loss": 1.19433346, "memory(GiB)": 109.77, "step": 280, "train_speed(iter/s)": 0.404285 }, { "acc": 0.68652458, "epoch": 0.0067659195963779, "grad_norm": 19.375, "learning_rate": 1.1275272161741837e-06, "loss": 1.1824007, "memory(GiB)": 109.77, "step": 290, "train_speed(iter/s)": 0.404542 }, { "acc": 0.68696241, "epoch": 0.006999227168666793, "grad_norm": 9.5, "learning_rate": 1.1664074650077762e-06, "loss": 1.22615242, "memory(GiB)": 109.9, "step": 300, "train_speed(iter/s)": 0.405085 }, { "acc": 0.69048529, "epoch": 0.007232534740955686, "grad_norm": 8.0625, "learning_rate": 1.2052877138413686e-06, "loss": 1.19841995, "memory(GiB)": 109.9, "step": 310, "train_speed(iter/s)": 0.405033 }, { "acc": 0.69980855, "epoch": 0.007465842313244579, "grad_norm": 13.5, "learning_rate": 1.2441679626749613e-06, "loss": 1.16620598, "memory(GiB)": 109.9, "step": 320, "train_speed(iter/s)": 0.405535 }, { "acc": 0.69819374, "epoch": 0.007699149885533473, "grad_norm": 14.6875, "learning_rate": 1.2830482115085538e-06, "loss": 1.17274351, "memory(GiB)": 109.9, "step": 330, "train_speed(iter/s)": 0.406566 }, { "acc": 0.69822979, "epoch": 0.007932457457822365, "grad_norm": 11.75, "learning_rate": 1.3219284603421462e-06, "loss": 1.16593056, "memory(GiB)": 109.9, "step": 340, "train_speed(iter/s)": 0.407206 }, { "acc": 0.66752868, "epoch": 0.00816576503011126, "grad_norm": 9.6875, "learning_rate": 1.360808709175739e-06, "loss": 1.30824986, "memory(GiB)": 109.9, "step": 350, "train_speed(iter/s)": 0.40717 }, { "acc": 0.68702393, "epoch": 0.008399072602400151, "grad_norm": 16.25, "learning_rate": 1.3996889580093314e-06, "loss": 1.20300846, "memory(GiB)": 109.9, "step": 360, "train_speed(iter/s)": 0.407871 }, { "acc": 0.68695755, "epoch": 0.008632380174689045, "grad_norm": 11.25, "learning_rate": 1.4385692068429238e-06, "loss": 1.21336107, "memory(GiB)": 109.9, "step": 370, "train_speed(iter/s)": 0.408225 }, { "acc": 0.66215248, "epoch": 0.008865687746977938, "grad_norm": 13.875, "learning_rate": 1.4774494556765165e-06, "loss": 1.30150585, "memory(GiB)": 109.9, "step": 380, "train_speed(iter/s)": 0.40873 }, { "acc": 0.68752451, "epoch": 0.00909899531926683, "grad_norm": 9.125, "learning_rate": 1.5163297045101088e-06, "loss": 1.19580708, "memory(GiB)": 109.9, "step": 390, "train_speed(iter/s)": 0.409299 }, { "acc": 0.69389825, "epoch": 0.009332302891555724, "grad_norm": 18.375, "learning_rate": 1.5552099533437016e-06, "loss": 1.14363232, "memory(GiB)": 109.9, "step": 400, "train_speed(iter/s)": 0.41011 }, { "acc": 0.69924498, "epoch": 0.009565610463844618, "grad_norm": 10.5625, "learning_rate": 1.594090202177294e-06, "loss": 1.11300545, "memory(GiB)": 109.9, "step": 410, "train_speed(iter/s)": 0.410186 }, { "acc": 0.68810396, "epoch": 0.00979891803613351, "grad_norm": 40.75, "learning_rate": 1.6329704510108864e-06, "loss": 1.17614174, "memory(GiB)": 109.9, "step": 420, "train_speed(iter/s)": 0.410775 }, { "acc": 0.70244446, "epoch": 0.010032225608422403, "grad_norm": 19.75, "learning_rate": 1.6718506998444792e-06, "loss": 1.11227798, "memory(GiB)": 109.9, "step": 430, "train_speed(iter/s)": 0.410915 }, { "acc": 0.68015146, "epoch": 0.010265533180711297, "grad_norm": 15.9375, "learning_rate": 1.7107309486780715e-06, "loss": 1.23970881, "memory(GiB)": 109.9, "step": 440, "train_speed(iter/s)": 0.411274 }, { "acc": 0.69839172, "epoch": 0.010498840753000189, "grad_norm": 5.25, "learning_rate": 1.7496111975116642e-06, "loss": 1.17983141, "memory(GiB)": 109.9, "step": 450, "train_speed(iter/s)": 0.410944 }, { "acc": 0.71311102, "epoch": 0.010732148325289083, "grad_norm": 8.9375, "learning_rate": 1.7884914463452568e-06, "loss": 1.10826473, "memory(GiB)": 109.9, "step": 460, "train_speed(iter/s)": 0.41157 }, { "acc": 0.71311436, "epoch": 0.010965455897577976, "grad_norm": 13.9375, "learning_rate": 1.8273716951788493e-06, "loss": 1.1191287, "memory(GiB)": 109.9, "step": 470, "train_speed(iter/s)": 0.411782 }, { "acc": 0.69832649, "epoch": 0.011198763469866868, "grad_norm": 10.625, "learning_rate": 1.8662519440124418e-06, "loss": 1.15418854, "memory(GiB)": 109.9, "step": 480, "train_speed(iter/s)": 0.411901 }, { "acc": 0.70663209, "epoch": 0.011432071042155762, "grad_norm": 9.375, "learning_rate": 1.9051321928460342e-06, "loss": 1.10731459, "memory(GiB)": 109.9, "step": 490, "train_speed(iter/s)": 0.412735 }, { "acc": 0.69591608, "epoch": 0.011665378614444655, "grad_norm": 8.875, "learning_rate": 1.9440124416796267e-06, "loss": 1.16394606, "memory(GiB)": 109.9, "step": 500, "train_speed(iter/s)": 0.413165 }, { "epoch": 0.011665378614444655, "eval_acc": 0.6805360778834394, "eval_loss": 1.1259791851043701, "eval_runtime": 1262.6901, "eval_samples_per_second": 28.503, "eval_steps_per_second": 14.252, "step": 500 }, { "acc": 0.71509686, "epoch": 0.011898686186733547, "grad_norm": 13.4375, "learning_rate": 1.9828926905132194e-06, "loss": 1.10683022, "memory(GiB)": 112.35, "step": 510, "train_speed(iter/s)": 0.202384 }, { "acc": 0.710323, "epoch": 0.012131993759022441, "grad_norm": 13.625, "learning_rate": 2.0217729393468118e-06, "loss": 1.08523674, "memory(GiB)": 112.35, "step": 520, "train_speed(iter/s)": 0.204348 }, { "acc": 0.70022459, "epoch": 0.012365301331311335, "grad_norm": 8.0, "learning_rate": 2.0606531881804045e-06, "loss": 1.18831568, "memory(GiB)": 112.35, "step": 530, "train_speed(iter/s)": 0.206393 }, { "acc": 0.70497704, "epoch": 0.012598608903600227, "grad_norm": 7.15625, "learning_rate": 2.0995334370139973e-06, "loss": 1.12294245, "memory(GiB)": 112.35, "step": 540, "train_speed(iter/s)": 0.208305 }, { "acc": 0.71431627, "epoch": 0.01283191647588912, "grad_norm": 11.625, "learning_rate": 2.1384136858475896e-06, "loss": 1.09484558, "memory(GiB)": 112.35, "step": 550, "train_speed(iter/s)": 0.210236 }, { "acc": 0.69433327, "epoch": 0.013065224048178014, "grad_norm": 9.1875, "learning_rate": 2.1772939346811823e-06, "loss": 1.15555983, "memory(GiB)": 112.35, "step": 560, "train_speed(iter/s)": 0.212059 }, { "acc": 0.722299, "epoch": 0.013298531620466908, "grad_norm": 6.84375, "learning_rate": 2.2161741835147746e-06, "loss": 1.05842762, "memory(GiB)": 112.35, "step": 570, "train_speed(iter/s)": 0.214014 }, { "acc": 0.71955214, "epoch": 0.0135318391927558, "grad_norm": 8.1875, "learning_rate": 2.2550544323483674e-06, "loss": 1.1037324, "memory(GiB)": 112.35, "step": 580, "train_speed(iter/s)": 0.215787 }, { "acc": 0.70316348, "epoch": 0.013765146765044693, "grad_norm": 8.6875, "learning_rate": 2.2939346811819597e-06, "loss": 1.13511295, "memory(GiB)": 112.35, "step": 590, "train_speed(iter/s)": 0.217609 }, { "acc": 0.70167513, "epoch": 0.013998454337333587, "grad_norm": 5.96875, "learning_rate": 2.3328149300155525e-06, "loss": 1.15515079, "memory(GiB)": 112.35, "step": 600, "train_speed(iter/s)": 0.219285 }, { "acc": 0.6906002, "epoch": 0.014231761909622479, "grad_norm": 8.125, "learning_rate": 2.3716951788491448e-06, "loss": 1.20004253, "memory(GiB)": 112.35, "step": 610, "train_speed(iter/s)": 0.221138 }, { "acc": 0.69671326, "epoch": 0.014465069481911372, "grad_norm": 5.03125, "learning_rate": 2.410575427682737e-06, "loss": 1.15085373, "memory(GiB)": 112.35, "step": 620, "train_speed(iter/s)": 0.222865 }, { "acc": 0.71678777, "epoch": 0.014698377054200266, "grad_norm": 8.6875, "learning_rate": 2.44945567651633e-06, "loss": 1.06843872, "memory(GiB)": 112.35, "step": 630, "train_speed(iter/s)": 0.224597 }, { "acc": 0.7299325, "epoch": 0.014931684626489158, "grad_norm": 7.125, "learning_rate": 2.4883359253499226e-06, "loss": 1.03410025, "memory(GiB)": 112.35, "step": 640, "train_speed(iter/s)": 0.226264 }, { "acc": 0.72808428, "epoch": 0.015164992198778052, "grad_norm": 5.09375, "learning_rate": 2.527216174183515e-06, "loss": 1.01569843, "memory(GiB)": 112.35, "step": 650, "train_speed(iter/s)": 0.227757 }, { "acc": 0.71508141, "epoch": 0.015398299771066945, "grad_norm": 7.5, "learning_rate": 2.5660964230171077e-06, "loss": 1.05746775, "memory(GiB)": 112.35, "step": 660, "train_speed(iter/s)": 0.229405 }, { "acc": 0.69491739, "epoch": 0.01563160734335584, "grad_norm": 12.8125, "learning_rate": 2.6049766718507004e-06, "loss": 1.16022482, "memory(GiB)": 112.35, "step": 670, "train_speed(iter/s)": 0.230859 }, { "acc": 0.73366556, "epoch": 0.01586491491564473, "grad_norm": 9.4375, "learning_rate": 2.6438569206842923e-06, "loss": 1.01865559, "memory(GiB)": 112.35, "step": 680, "train_speed(iter/s)": 0.232322 }, { "acc": 0.69907765, "epoch": 0.016098222487933623, "grad_norm": 8.875, "learning_rate": 2.682737169517885e-06, "loss": 1.14045467, "memory(GiB)": 112.35, "step": 690, "train_speed(iter/s)": 0.23393 }, { "acc": 0.72175894, "epoch": 0.01633153006022252, "grad_norm": 9.0625, "learning_rate": 2.721617418351478e-06, "loss": 1.06233292, "memory(GiB)": 112.35, "step": 700, "train_speed(iter/s)": 0.235402 }, { "acc": 0.7145546, "epoch": 0.01656483763251141, "grad_norm": 7.625, "learning_rate": 2.76049766718507e-06, "loss": 1.11091633, "memory(GiB)": 112.35, "step": 710, "train_speed(iter/s)": 0.236915 }, { "acc": 0.70876598, "epoch": 0.016798145204800302, "grad_norm": 35.0, "learning_rate": 2.799377916018663e-06, "loss": 1.08977337, "memory(GiB)": 112.35, "step": 720, "train_speed(iter/s)": 0.238332 }, { "acc": 0.71281333, "epoch": 0.017031452777089198, "grad_norm": 11.25, "learning_rate": 2.838258164852255e-06, "loss": 1.08405428, "memory(GiB)": 112.35, "step": 730, "train_speed(iter/s)": 0.239763 }, { "acc": 0.74212275, "epoch": 0.01726476034937809, "grad_norm": 8.125, "learning_rate": 2.8771384136858475e-06, "loss": 0.97985992, "memory(GiB)": 112.35, "step": 740, "train_speed(iter/s)": 0.241198 }, { "acc": 0.73240829, "epoch": 0.01749806792166698, "grad_norm": 8.25, "learning_rate": 2.9160186625194403e-06, "loss": 1.01898518, "memory(GiB)": 112.35, "step": 750, "train_speed(iter/s)": 0.242605 }, { "acc": 0.7157486, "epoch": 0.017731375493955877, "grad_norm": 8.625, "learning_rate": 2.954898911353033e-06, "loss": 1.10528841, "memory(GiB)": 112.35, "step": 760, "train_speed(iter/s)": 0.243892 }, { "acc": 0.718012, "epoch": 0.01796468306624477, "grad_norm": 7.90625, "learning_rate": 2.9937791601866257e-06, "loss": 1.07659645, "memory(GiB)": 112.35, "step": 770, "train_speed(iter/s)": 0.245149 }, { "acc": 0.72557025, "epoch": 0.01819799063853366, "grad_norm": 14.9375, "learning_rate": 3.0326594090202176e-06, "loss": 1.03978872, "memory(GiB)": 112.35, "step": 780, "train_speed(iter/s)": 0.246572 }, { "acc": 0.73356361, "epoch": 0.018431298210822556, "grad_norm": 7.6875, "learning_rate": 3.0715396578538104e-06, "loss": 0.98564129, "memory(GiB)": 112.35, "step": 790, "train_speed(iter/s)": 0.247963 }, { "acc": 0.7228992, "epoch": 0.018664605783111448, "grad_norm": 6.96875, "learning_rate": 3.110419906687403e-06, "loss": 1.0402339, "memory(GiB)": 112.35, "step": 800, "train_speed(iter/s)": 0.249216 }, { "acc": 0.74984007, "epoch": 0.01889791335540034, "grad_norm": 11.0, "learning_rate": 3.1493001555209955e-06, "loss": 0.94254513, "memory(GiB)": 112.35, "step": 810, "train_speed(iter/s)": 0.250355 }, { "acc": 0.72919197, "epoch": 0.019131220927689235, "grad_norm": 11.875, "learning_rate": 3.188180404354588e-06, "loss": 1.03513203, "memory(GiB)": 112.35, "step": 820, "train_speed(iter/s)": 0.251463 }, { "acc": 0.72337241, "epoch": 0.019364528499978127, "grad_norm": 46.5, "learning_rate": 3.2270606531881805e-06, "loss": 1.0482893, "memory(GiB)": 112.35, "step": 830, "train_speed(iter/s)": 0.252709 }, { "acc": 0.72558317, "epoch": 0.01959783607226702, "grad_norm": 9.3125, "learning_rate": 3.265940902021773e-06, "loss": 1.03368216, "memory(GiB)": 112.35, "step": 840, "train_speed(iter/s)": 0.253857 }, { "acc": 0.71586609, "epoch": 0.019831143644555915, "grad_norm": 7.78125, "learning_rate": 3.3048211508553656e-06, "loss": 1.06373787, "memory(GiB)": 112.35, "step": 850, "train_speed(iter/s)": 0.255056 }, { "acc": 0.73888211, "epoch": 0.020064451216844807, "grad_norm": 6.34375, "learning_rate": 3.3437013996889583e-06, "loss": 0.98285561, "memory(GiB)": 112.35, "step": 860, "train_speed(iter/s)": 0.256242 }, { "acc": 0.71399093, "epoch": 0.0202977587891337, "grad_norm": 9.0625, "learning_rate": 3.382581648522551e-06, "loss": 1.06048183, "memory(GiB)": 112.35, "step": 870, "train_speed(iter/s)": 0.257465 }, { "acc": 0.72095742, "epoch": 0.020531066361422594, "grad_norm": 14.875, "learning_rate": 3.421461897356143e-06, "loss": 1.04118671, "memory(GiB)": 112.35, "step": 880, "train_speed(iter/s)": 0.258731 }, { "acc": 0.72350492, "epoch": 0.020764373933711486, "grad_norm": 6.96875, "learning_rate": 3.4603421461897357e-06, "loss": 1.03228941, "memory(GiB)": 112.35, "step": 890, "train_speed(iter/s)": 0.259889 }, { "acc": 0.72251768, "epoch": 0.020997681506000378, "grad_norm": 9.125, "learning_rate": 3.4992223950233285e-06, "loss": 1.04102535, "memory(GiB)": 112.35, "step": 900, "train_speed(iter/s)": 0.261014 }, { "acc": 0.71974378, "epoch": 0.021230989078289273, "grad_norm": 10.875, "learning_rate": 3.5381026438569212e-06, "loss": 1.05280695, "memory(GiB)": 112.35, "step": 910, "train_speed(iter/s)": 0.26205 }, { "acc": 0.72368603, "epoch": 0.021464296650578165, "grad_norm": 7.8125, "learning_rate": 3.5769828926905135e-06, "loss": 1.02749395, "memory(GiB)": 112.35, "step": 920, "train_speed(iter/s)": 0.263087 }, { "acc": 0.71367736, "epoch": 0.021697604222867057, "grad_norm": 8.375, "learning_rate": 3.615863141524106e-06, "loss": 1.08294706, "memory(GiB)": 112.35, "step": 930, "train_speed(iter/s)": 0.264077 }, { "acc": 0.73049974, "epoch": 0.021930911795155952, "grad_norm": 5.71875, "learning_rate": 3.6547433903576986e-06, "loss": 0.99465523, "memory(GiB)": 112.35, "step": 940, "train_speed(iter/s)": 0.265026 }, { "acc": 0.7236989, "epoch": 0.022164219367444844, "grad_norm": 8.4375, "learning_rate": 3.693623639191291e-06, "loss": 1.03436298, "memory(GiB)": 112.35, "step": 950, "train_speed(iter/s)": 0.266057 }, { "acc": 0.74879866, "epoch": 0.022397526939733736, "grad_norm": 13.9375, "learning_rate": 3.7325038880248837e-06, "loss": 0.94515381, "memory(GiB)": 112.35, "step": 960, "train_speed(iter/s)": 0.266945 }, { "acc": 0.70539465, "epoch": 0.02263083451202263, "grad_norm": 7.71875, "learning_rate": 3.7713841368584764e-06, "loss": 1.12703362, "memory(GiB)": 112.35, "step": 970, "train_speed(iter/s)": 0.26783 }, { "acc": 0.73541384, "epoch": 0.022864142084311524, "grad_norm": 6.53125, "learning_rate": 3.8102643856920683e-06, "loss": 0.98798637, "memory(GiB)": 112.35, "step": 980, "train_speed(iter/s)": 0.268822 }, { "acc": 0.72558088, "epoch": 0.023097449656600415, "grad_norm": 7.25, "learning_rate": 3.849144634525661e-06, "loss": 1.04246445, "memory(GiB)": 112.35, "step": 990, "train_speed(iter/s)": 0.269734 }, { "acc": 0.7282671, "epoch": 0.02333075722888931, "grad_norm": 12.0625, "learning_rate": 3.888024883359253e-06, "loss": 1.00360498, "memory(GiB)": 112.35, "step": 1000, "train_speed(iter/s)": 0.270707 }, { "epoch": 0.02333075722888931, "eval_acc": 0.6965390336851683, "eval_loss": 1.0107953548431396, "eval_runtime": 1263.7692, "eval_samples_per_second": 28.479, "eval_steps_per_second": 14.24, "step": 1000 }, { "acc": 0.73418517, "epoch": 0.023564064801178203, "grad_norm": 4.4375, "learning_rate": 3.9269051321928466e-06, "loss": 0.98490839, "memory(GiB)": 112.35, "step": 1010, "train_speed(iter/s)": 0.201745 }, { "acc": 0.74667645, "epoch": 0.023797372373467095, "grad_norm": 9.4375, "learning_rate": 3.965785381026439e-06, "loss": 0.93045502, "memory(GiB)": 112.35, "step": 1020, "train_speed(iter/s)": 0.202663 }, { "acc": 0.74708214, "epoch": 0.02403067994575599, "grad_norm": 6.8125, "learning_rate": 4.004665629860031e-06, "loss": 0.9263464, "memory(GiB)": 112.35, "step": 1030, "train_speed(iter/s)": 0.20368 }, { "acc": 0.72029591, "epoch": 0.024263987518044882, "grad_norm": 6.59375, "learning_rate": 4.0435458786936235e-06, "loss": 1.03119831, "memory(GiB)": 112.35, "step": 1040, "train_speed(iter/s)": 0.2047 }, { "acc": 0.74541645, "epoch": 0.024497295090333774, "grad_norm": 11.0625, "learning_rate": 4.082426127527217e-06, "loss": 0.9554368, "memory(GiB)": 112.35, "step": 1050, "train_speed(iter/s)": 0.205665 }, { "acc": 0.72473269, "epoch": 0.02473060266262267, "grad_norm": 6.34375, "learning_rate": 4.121306376360809e-06, "loss": 1.02179165, "memory(GiB)": 112.35, "step": 1060, "train_speed(iter/s)": 0.206708 }, { "acc": 0.73439541, "epoch": 0.02496391023491156, "grad_norm": 10.0, "learning_rate": 4.160186625194401e-06, "loss": 0.99017801, "memory(GiB)": 112.35, "step": 1070, "train_speed(iter/s)": 0.207724 }, { "acc": 0.73526278, "epoch": 0.025197217807200453, "grad_norm": 7.84375, "learning_rate": 4.1990668740279945e-06, "loss": 0.98316002, "memory(GiB)": 112.35, "step": 1080, "train_speed(iter/s)": 0.208733 }, { "acc": 0.72961845, "epoch": 0.02543052537948935, "grad_norm": 20.375, "learning_rate": 4.237947122861587e-06, "loss": 0.99067287, "memory(GiB)": 112.35, "step": 1090, "train_speed(iter/s)": 0.209706 }, { "acc": 0.72883358, "epoch": 0.02566383295177824, "grad_norm": 8.5, "learning_rate": 4.276827371695179e-06, "loss": 1.00727882, "memory(GiB)": 112.35, "step": 1100, "train_speed(iter/s)": 0.210633 }, { "acc": 0.74940205, "epoch": 0.025897140524067136, "grad_norm": 7.96875, "learning_rate": 4.3157076205287715e-06, "loss": 0.94974909, "memory(GiB)": 112.35, "step": 1110, "train_speed(iter/s)": 0.211574 }, { "acc": 0.72435718, "epoch": 0.026130448096356028, "grad_norm": 11.375, "learning_rate": 4.354587869362365e-06, "loss": 1.05743275, "memory(GiB)": 112.35, "step": 1120, "train_speed(iter/s)": 0.212541 }, { "acc": 0.74189386, "epoch": 0.02636375566864492, "grad_norm": 7.90625, "learning_rate": 4.393468118195957e-06, "loss": 0.95555878, "memory(GiB)": 112.35, "step": 1130, "train_speed(iter/s)": 0.213455 }, { "acc": 0.73600874, "epoch": 0.026597063240933815, "grad_norm": 8.5625, "learning_rate": 4.432348367029549e-06, "loss": 0.97529106, "memory(GiB)": 112.35, "step": 1140, "train_speed(iter/s)": 0.214441 }, { "acc": 0.71932983, "epoch": 0.026830370813222707, "grad_norm": 6.625, "learning_rate": 4.471228615863142e-06, "loss": 1.04903622, "memory(GiB)": 112.35, "step": 1150, "train_speed(iter/s)": 0.215407 }, { "acc": 0.73875904, "epoch": 0.0270636783855116, "grad_norm": 5.96875, "learning_rate": 4.510108864696735e-06, "loss": 0.94776335, "memory(GiB)": 112.35, "step": 1160, "train_speed(iter/s)": 0.21635 }, { "acc": 0.72604222, "epoch": 0.027296985957800494, "grad_norm": 6.03125, "learning_rate": 4.548989113530327e-06, "loss": 1.02964401, "memory(GiB)": 112.35, "step": 1170, "train_speed(iter/s)": 0.217273 }, { "acc": 0.73982382, "epoch": 0.027530293530089386, "grad_norm": 7.8125, "learning_rate": 4.587869362363919e-06, "loss": 0.95189095, "memory(GiB)": 112.35, "step": 1180, "train_speed(iter/s)": 0.218171 }, { "acc": 0.72973738, "epoch": 0.02776360110237828, "grad_norm": 8.3125, "learning_rate": 4.626749611197512e-06, "loss": 1.01156664, "memory(GiB)": 112.35, "step": 1190, "train_speed(iter/s)": 0.219124 }, { "acc": 0.73390875, "epoch": 0.027996908674667174, "grad_norm": 5.28125, "learning_rate": 4.665629860031105e-06, "loss": 1.00512886, "memory(GiB)": 112.35, "step": 1200, "train_speed(iter/s)": 0.220082 }, { "acc": 0.73757582, "epoch": 0.028230216246956066, "grad_norm": 8.5625, "learning_rate": 4.704510108864697e-06, "loss": 0.97182465, "memory(GiB)": 112.35, "step": 1210, "train_speed(iter/s)": 0.220961 }, { "acc": 0.72584505, "epoch": 0.028463523819244958, "grad_norm": 9.625, "learning_rate": 4.7433903576982896e-06, "loss": 0.9941761, "memory(GiB)": 112.35, "step": 1220, "train_speed(iter/s)": 0.221844 }, { "acc": 0.72198229, "epoch": 0.028696831391533853, "grad_norm": 10.3125, "learning_rate": 4.782270606531883e-06, "loss": 1.03835678, "memory(GiB)": 112.35, "step": 1230, "train_speed(iter/s)": 0.222731 }, { "acc": 0.7207737, "epoch": 0.028930138963822745, "grad_norm": 9.5625, "learning_rate": 4.821150855365474e-06, "loss": 1.0451992, "memory(GiB)": 112.35, "step": 1240, "train_speed(iter/s)": 0.223638 }, { "acc": 0.73655729, "epoch": 0.029163446536111637, "grad_norm": 20.5, "learning_rate": 4.860031104199067e-06, "loss": 0.99565964, "memory(GiB)": 112.35, "step": 1250, "train_speed(iter/s)": 0.22444 }, { "acc": 0.7507812, "epoch": 0.029396754108400532, "grad_norm": 9.1875, "learning_rate": 4.89891135303266e-06, "loss": 0.89867344, "memory(GiB)": 112.35, "step": 1260, "train_speed(iter/s)": 0.225176 }, { "acc": 0.75731397, "epoch": 0.029630061680689424, "grad_norm": 7.5, "learning_rate": 4.937791601866253e-06, "loss": 0.87951641, "memory(GiB)": 112.35, "step": 1270, "train_speed(iter/s)": 0.226051 }, { "acc": 0.73596754, "epoch": 0.029863369252978316, "grad_norm": 7.65625, "learning_rate": 4.976671850699845e-06, "loss": 0.99108791, "memory(GiB)": 112.35, "step": 1280, "train_speed(iter/s)": 0.226857 }, { "acc": 0.75433683, "epoch": 0.03009667682526721, "grad_norm": 7.65625, "learning_rate": 5.0155520995334375e-06, "loss": 0.94422626, "memory(GiB)": 112.35, "step": 1290, "train_speed(iter/s)": 0.227701 }, { "acc": 0.76343012, "epoch": 0.030329984397556103, "grad_norm": 10.375, "learning_rate": 5.05443234836703e-06, "loss": 0.88653755, "memory(GiB)": 112.35, "step": 1300, "train_speed(iter/s)": 0.228478 }, { "acc": 0.73513155, "epoch": 0.030563291969844995, "grad_norm": 11.75, "learning_rate": 5.093312597200622e-06, "loss": 0.984659, "memory(GiB)": 112.35, "step": 1310, "train_speed(iter/s)": 0.229316 }, { "acc": 0.73354354, "epoch": 0.03079659954213389, "grad_norm": 6.625, "learning_rate": 5.132192846034215e-06, "loss": 0.99107714, "memory(GiB)": 112.35, "step": 1320, "train_speed(iter/s)": 0.230137 }, { "acc": 0.73805466, "epoch": 0.031029907114422783, "grad_norm": 6.71875, "learning_rate": 5.171073094867808e-06, "loss": 0.99133682, "memory(GiB)": 112.35, "step": 1330, "train_speed(iter/s)": 0.230915 }, { "acc": 0.72652588, "epoch": 0.03126321468671168, "grad_norm": 7.40625, "learning_rate": 5.209953343701401e-06, "loss": 1.01565933, "memory(GiB)": 112.35, "step": 1340, "train_speed(iter/s)": 0.231717 }, { "acc": 0.71203609, "epoch": 0.03149652225900057, "grad_norm": 6.90625, "learning_rate": 5.248833592534993e-06, "loss": 1.08359432, "memory(GiB)": 112.35, "step": 1350, "train_speed(iter/s)": 0.232522 }, { "acc": 0.72031331, "epoch": 0.03172982983128946, "grad_norm": 9.0625, "learning_rate": 5.287713841368585e-06, "loss": 1.05539494, "memory(GiB)": 112.35, "step": 1360, "train_speed(iter/s)": 0.233324 }, { "acc": 0.73889894, "epoch": 0.031963137403578354, "grad_norm": 7.21875, "learning_rate": 5.326594090202177e-06, "loss": 0.98478098, "memory(GiB)": 112.35, "step": 1370, "train_speed(iter/s)": 0.234139 }, { "acc": 0.74943533, "epoch": 0.032196444975867246, "grad_norm": 5.4375, "learning_rate": 5.36547433903577e-06, "loss": 0.94105816, "memory(GiB)": 112.35, "step": 1380, "train_speed(iter/s)": 0.234897 }, { "acc": 0.74231901, "epoch": 0.03242975254815614, "grad_norm": 5.78125, "learning_rate": 5.404354587869362e-06, "loss": 0.97695713, "memory(GiB)": 112.35, "step": 1390, "train_speed(iter/s)": 0.235582 }, { "acc": 0.73235698, "epoch": 0.03266306012044504, "grad_norm": 10.8125, "learning_rate": 5.443234836702956e-06, "loss": 0.99308739, "memory(GiB)": 112.35, "step": 1400, "train_speed(iter/s)": 0.236373 }, { "acc": 0.72734451, "epoch": 0.03289636769273393, "grad_norm": 34.0, "learning_rate": 5.482115085536548e-06, "loss": 1.01753654, "memory(GiB)": 112.35, "step": 1410, "train_speed(iter/s)": 0.237142 }, { "acc": 0.73217077, "epoch": 0.03312967526502282, "grad_norm": 6.5625, "learning_rate": 5.52099533437014e-06, "loss": 1.0209549, "memory(GiB)": 112.35, "step": 1420, "train_speed(iter/s)": 0.237842 }, { "acc": 0.73642378, "epoch": 0.03336298283731171, "grad_norm": 8.0, "learning_rate": 5.559875583203733e-06, "loss": 0.9961915, "memory(GiB)": 112.35, "step": 1430, "train_speed(iter/s)": 0.238494 }, { "acc": 0.74590855, "epoch": 0.033596290409600604, "grad_norm": 6.25, "learning_rate": 5.598755832037326e-06, "loss": 0.9565074, "memory(GiB)": 112.35, "step": 1440, "train_speed(iter/s)": 0.239282 }, { "acc": 0.73592806, "epoch": 0.033829597981889496, "grad_norm": 8.5625, "learning_rate": 5.637636080870919e-06, "loss": 0.98991613, "memory(GiB)": 112.35, "step": 1450, "train_speed(iter/s)": 0.240048 }, { "acc": 0.72957783, "epoch": 0.034062905554178395, "grad_norm": 5.1875, "learning_rate": 5.67651632970451e-06, "loss": 1.02330503, "memory(GiB)": 112.35, "step": 1460, "train_speed(iter/s)": 0.240734 }, { "acc": 0.73291864, "epoch": 0.03429621312646729, "grad_norm": 7.59375, "learning_rate": 5.715396578538103e-06, "loss": 0.99332161, "memory(GiB)": 112.35, "step": 1470, "train_speed(iter/s)": 0.241464 }, { "acc": 0.75711956, "epoch": 0.03452952069875618, "grad_norm": 10.0, "learning_rate": 5.754276827371695e-06, "loss": 0.90812778, "memory(GiB)": 112.35, "step": 1480, "train_speed(iter/s)": 0.242183 }, { "acc": 0.73406906, "epoch": 0.03476282827104507, "grad_norm": 6.125, "learning_rate": 5.793157076205288e-06, "loss": 1.00051651, "memory(GiB)": 112.35, "step": 1490, "train_speed(iter/s)": 0.242854 }, { "acc": 0.74864845, "epoch": 0.03499613584333396, "grad_norm": 11.6875, "learning_rate": 5.8320373250388805e-06, "loss": 0.9269001, "memory(GiB)": 112.35, "step": 1500, "train_speed(iter/s)": 0.243552 }, { "epoch": 0.03499613584333396, "eval_acc": 0.7041837824581557, "eval_loss": 0.9649238586425781, "eval_runtime": 1263.7025, "eval_samples_per_second": 28.481, "eval_steps_per_second": 14.241, "step": 1500 }, { "acc": 0.73199005, "epoch": 0.03522944341562286, "grad_norm": 8.3125, "learning_rate": 5.870917573872474e-06, "loss": 0.98909521, "memory(GiB)": 112.35, "step": 1510, "train_speed(iter/s)": 0.202116 }, { "acc": 0.73852539, "epoch": 0.035462750987911754, "grad_norm": 8.4375, "learning_rate": 5.909797822706066e-06, "loss": 0.95922289, "memory(GiB)": 112.35, "step": 1520, "train_speed(iter/s)": 0.202822 }, { "acc": 0.75149479, "epoch": 0.035696058560200646, "grad_norm": 5.21875, "learning_rate": 5.948678071539658e-06, "loss": 0.93620281, "memory(GiB)": 112.35, "step": 1530, "train_speed(iter/s)": 0.203529 }, { "acc": 0.76064391, "epoch": 0.03592936613248954, "grad_norm": 6.4375, "learning_rate": 5.9875583203732515e-06, "loss": 0.87388725, "memory(GiB)": 112.35, "step": 1540, "train_speed(iter/s)": 0.204226 }, { "acc": 0.73580513, "epoch": 0.03616267370477843, "grad_norm": 6.75, "learning_rate": 6.026438569206844e-06, "loss": 0.98621922, "memory(GiB)": 113.93, "step": 1550, "train_speed(iter/s)": 0.204927 }, { "acc": 0.72287655, "epoch": 0.03639598127706732, "grad_norm": 8.375, "learning_rate": 6.065318818040435e-06, "loss": 1.03312302, "memory(GiB)": 113.93, "step": 1560, "train_speed(iter/s)": 0.205635 }, { "acc": 0.7447782, "epoch": 0.03662928884935622, "grad_norm": 8.3125, "learning_rate": 6.1041990668740285e-06, "loss": 0.9568697, "memory(GiB)": 113.93, "step": 1570, "train_speed(iter/s)": 0.206359 }, { "acc": 0.73666182, "epoch": 0.03686259642164511, "grad_norm": 15.0, "learning_rate": 6.143079315707621e-06, "loss": 0.98750963, "memory(GiB)": 113.93, "step": 1580, "train_speed(iter/s)": 0.207057 }, { "acc": 0.76351557, "epoch": 0.037095903993934004, "grad_norm": 9.1875, "learning_rate": 6.181959564541213e-06, "loss": 0.87661018, "memory(GiB)": 113.93, "step": 1590, "train_speed(iter/s)": 0.207743 }, { "acc": 0.74693308, "epoch": 0.037329211566222896, "grad_norm": 6.40625, "learning_rate": 6.220839813374806e-06, "loss": 0.91910772, "memory(GiB)": 113.93, "step": 1600, "train_speed(iter/s)": 0.20841 }, { "acc": 0.73038244, "epoch": 0.03756251913851179, "grad_norm": 6.3125, "learning_rate": 6.259720062208399e-06, "loss": 1.03058157, "memory(GiB)": 113.93, "step": 1610, "train_speed(iter/s)": 0.209093 }, { "acc": 0.74041862, "epoch": 0.03779582671080068, "grad_norm": 4.75, "learning_rate": 6.298600311041991e-06, "loss": 0.97728148, "memory(GiB)": 113.93, "step": 1620, "train_speed(iter/s)": 0.20979 }, { "acc": 0.73190141, "epoch": 0.03802913428308958, "grad_norm": 6.21875, "learning_rate": 6.337480559875584e-06, "loss": 1.00035477, "memory(GiB)": 113.93, "step": 1630, "train_speed(iter/s)": 0.210369 }, { "acc": 0.74100523, "epoch": 0.03826244185537847, "grad_norm": 8.3125, "learning_rate": 6.376360808709176e-06, "loss": 0.9790329, "memory(GiB)": 113.93, "step": 1640, "train_speed(iter/s)": 0.211052 }, { "acc": 0.74432406, "epoch": 0.03849574942766736, "grad_norm": 9.25, "learning_rate": 6.4152410575427696e-06, "loss": 0.9442791, "memory(GiB)": 113.93, "step": 1650, "train_speed(iter/s)": 0.21167 }, { "acc": 0.72465081, "epoch": 0.038729056999956255, "grad_norm": 5.625, "learning_rate": 6.454121306376361e-06, "loss": 1.01588669, "memory(GiB)": 113.93, "step": 1660, "train_speed(iter/s)": 0.212307 }, { "acc": 0.73395238, "epoch": 0.038962364572245146, "grad_norm": 7.34375, "learning_rate": 6.493001555209953e-06, "loss": 0.97928257, "memory(GiB)": 113.93, "step": 1670, "train_speed(iter/s)": 0.212992 }, { "acc": 0.74781828, "epoch": 0.03919567214453404, "grad_norm": 9.25, "learning_rate": 6.531881804043546e-06, "loss": 0.95643587, "memory(GiB)": 113.93, "step": 1680, "train_speed(iter/s)": 0.213633 }, { "acc": 0.74310112, "epoch": 0.03942897971682294, "grad_norm": 7.625, "learning_rate": 6.570762052877139e-06, "loss": 0.9715971, "memory(GiB)": 113.93, "step": 1690, "train_speed(iter/s)": 0.214277 }, { "acc": 0.73666706, "epoch": 0.03966228728911183, "grad_norm": 8.6875, "learning_rate": 6.609642301710731e-06, "loss": 0.96375885, "memory(GiB)": 113.93, "step": 1700, "train_speed(iter/s)": 0.214902 }, { "acc": 0.74063663, "epoch": 0.03989559486140072, "grad_norm": 8.5625, "learning_rate": 6.648522550544324e-06, "loss": 0.9540266, "memory(GiB)": 113.93, "step": 1710, "train_speed(iter/s)": 0.215542 }, { "acc": 0.73028035, "epoch": 0.04012890243368961, "grad_norm": 5.6875, "learning_rate": 6.687402799377917e-06, "loss": 1.00620832, "memory(GiB)": 113.93, "step": 1720, "train_speed(iter/s)": 0.216156 }, { "acc": 0.7486237, "epoch": 0.040362210005978505, "grad_norm": 7.9375, "learning_rate": 6.726283048211509e-06, "loss": 0.93779144, "memory(GiB)": 113.93, "step": 1730, "train_speed(iter/s)": 0.2168 }, { "acc": 0.73410063, "epoch": 0.0405955175782674, "grad_norm": 5.625, "learning_rate": 6.765163297045102e-06, "loss": 1.0017622, "memory(GiB)": 113.93, "step": 1740, "train_speed(iter/s)": 0.217393 }, { "acc": 0.74687967, "epoch": 0.040828825150556296, "grad_norm": 9.3125, "learning_rate": 6.8040435458786945e-06, "loss": 0.92012205, "memory(GiB)": 113.93, "step": 1750, "train_speed(iter/s)": 0.218017 }, { "acc": 0.74878888, "epoch": 0.04106213272284519, "grad_norm": 6.78125, "learning_rate": 6.842923794712286e-06, "loss": 0.93968182, "memory(GiB)": 113.93, "step": 1760, "train_speed(iter/s)": 0.218615 }, { "acc": 0.74908767, "epoch": 0.04129544029513408, "grad_norm": 6.96875, "learning_rate": 6.881804043545879e-06, "loss": 0.93707829, "memory(GiB)": 113.93, "step": 1770, "train_speed(iter/s)": 0.219234 }, { "acc": 0.74882145, "epoch": 0.04152874786742297, "grad_norm": 8.5, "learning_rate": 6.9206842923794715e-06, "loss": 0.939575, "memory(GiB)": 113.93, "step": 1780, "train_speed(iter/s)": 0.219815 }, { "acc": 0.74497771, "epoch": 0.04176205543971186, "grad_norm": 9.0, "learning_rate": 6.959564541213064e-06, "loss": 0.96148252, "memory(GiB)": 113.93, "step": 1790, "train_speed(iter/s)": 0.220422 }, { "acc": 0.72854424, "epoch": 0.041995363012000755, "grad_norm": 5.28125, "learning_rate": 6.998444790046657e-06, "loss": 0.99456854, "memory(GiB)": 113.93, "step": 1800, "train_speed(iter/s)": 0.221038 }, { "acc": 0.74988499, "epoch": 0.042228670584289654, "grad_norm": 6.0, "learning_rate": 7.037325038880249e-06, "loss": 0.92922001, "memory(GiB)": 113.93, "step": 1810, "train_speed(iter/s)": 0.221664 }, { "acc": 0.72039394, "epoch": 0.042461978156578546, "grad_norm": 9.6875, "learning_rate": 7.0762052877138424e-06, "loss": 1.03519535, "memory(GiB)": 113.93, "step": 1820, "train_speed(iter/s)": 0.222256 }, { "acc": 0.74757175, "epoch": 0.04269528572886744, "grad_norm": 6.78125, "learning_rate": 7.115085536547435e-06, "loss": 0.93627872, "memory(GiB)": 113.93, "step": 1830, "train_speed(iter/s)": 0.222868 }, { "acc": 0.72629972, "epoch": 0.04292859330115633, "grad_norm": 9.9375, "learning_rate": 7.153965785381027e-06, "loss": 0.99472885, "memory(GiB)": 113.93, "step": 1840, "train_speed(iter/s)": 0.223446 }, { "acc": 0.76153107, "epoch": 0.04316190087344522, "grad_norm": 6.59375, "learning_rate": 7.19284603421462e-06, "loss": 0.88526754, "memory(GiB)": 113.93, "step": 1850, "train_speed(iter/s)": 0.224061 }, { "acc": 0.74155784, "epoch": 0.043395208445734114, "grad_norm": 4.5, "learning_rate": 7.231726283048212e-06, "loss": 0.94947701, "memory(GiB)": 113.93, "step": 1860, "train_speed(iter/s)": 0.22459 }, { "acc": 0.74650035, "epoch": 0.04362851601802301, "grad_norm": 7.0625, "learning_rate": 7.270606531881804e-06, "loss": 0.94161625, "memory(GiB)": 113.93, "step": 1870, "train_speed(iter/s)": 0.225192 }, { "acc": 0.74047966, "epoch": 0.043861823590311905, "grad_norm": 6.8125, "learning_rate": 7.309486780715397e-06, "loss": 0.97489223, "memory(GiB)": 113.93, "step": 1880, "train_speed(iter/s)": 0.225787 }, { "acc": 0.72761512, "epoch": 0.0440951311626008, "grad_norm": 6.28125, "learning_rate": 7.3483670295489895e-06, "loss": 1.04280052, "memory(GiB)": 113.93, "step": 1890, "train_speed(iter/s)": 0.226365 }, { "acc": 0.74957743, "epoch": 0.04432843873488969, "grad_norm": 5.03125, "learning_rate": 7.387247278382582e-06, "loss": 0.91257458, "memory(GiB)": 113.93, "step": 1900, "train_speed(iter/s)": 0.226926 }, { "acc": 0.75803919, "epoch": 0.04456174630717858, "grad_norm": 5.0625, "learning_rate": 7.426127527216175e-06, "loss": 0.87559795, "memory(GiB)": 113.93, "step": 1910, "train_speed(iter/s)": 0.227478 }, { "acc": 0.73415823, "epoch": 0.04479505387946747, "grad_norm": 6.40625, "learning_rate": 7.465007776049767e-06, "loss": 0.96939735, "memory(GiB)": 113.93, "step": 1920, "train_speed(iter/s)": 0.228008 }, { "acc": 0.73540893, "epoch": 0.04502836145175637, "grad_norm": 6.6875, "learning_rate": 7.5038880248833605e-06, "loss": 0.9725069, "memory(GiB)": 113.93, "step": 1930, "train_speed(iter/s)": 0.2286 }, { "acc": 0.7391387, "epoch": 0.04526166902404526, "grad_norm": 6.71875, "learning_rate": 7.542768273716953e-06, "loss": 0.96738825, "memory(GiB)": 113.93, "step": 1940, "train_speed(iter/s)": 0.229147 }, { "acc": 0.73418598, "epoch": 0.045494976596334155, "grad_norm": 9.125, "learning_rate": 7.581648522550545e-06, "loss": 0.98213692, "memory(GiB)": 113.93, "step": 1950, "train_speed(iter/s)": 0.229654 }, { "acc": 0.74322481, "epoch": 0.04572828416862305, "grad_norm": 8.0625, "learning_rate": 7.620528771384137e-06, "loss": 0.96111469, "memory(GiB)": 113.93, "step": 1960, "train_speed(iter/s)": 0.230179 }, { "acc": 0.75385466, "epoch": 0.04596159174091194, "grad_norm": 8.6875, "learning_rate": 7.659409020217729e-06, "loss": 0.9105032, "memory(GiB)": 113.93, "step": 1970, "train_speed(iter/s)": 0.230678 }, { "acc": 0.76086102, "epoch": 0.04619489931320083, "grad_norm": 9.4375, "learning_rate": 7.698289269051322e-06, "loss": 0.87767982, "memory(GiB)": 113.93, "step": 1980, "train_speed(iter/s)": 0.231199 }, { "acc": 0.73802357, "epoch": 0.04642820688548973, "grad_norm": 6.375, "learning_rate": 7.737169517884915e-06, "loss": 0.97832022, "memory(GiB)": 113.93, "step": 1990, "train_speed(iter/s)": 0.231722 }, { "acc": 0.75145512, "epoch": 0.04666151445777862, "grad_norm": 9.9375, "learning_rate": 7.776049766718507e-06, "loss": 0.93729334, "memory(GiB)": 113.93, "step": 2000, "train_speed(iter/s)": 0.232232 }, { "epoch": 0.04666151445777862, "eval_acc": 0.7097407542457381, "eval_loss": 0.9388070106506348, "eval_runtime": 1262.8115, "eval_samples_per_second": 28.501, "eval_steps_per_second": 14.251, "step": 2000 }, { "acc": 0.73531675, "epoch": 0.046894822030067514, "grad_norm": 8.25, "learning_rate": 7.8149300155521e-06, "loss": 0.9853364, "memory(GiB)": 113.93, "step": 2010, "train_speed(iter/s)": 0.202531 }, { "acc": 0.74468656, "epoch": 0.047128129602356406, "grad_norm": 5.71875, "learning_rate": 7.853810264385693e-06, "loss": 0.95494814, "memory(GiB)": 113.93, "step": 2020, "train_speed(iter/s)": 0.203097 }, { "acc": 0.73700533, "epoch": 0.0473614371746453, "grad_norm": 6.90625, "learning_rate": 7.892690513219286e-06, "loss": 0.95661955, "memory(GiB)": 113.93, "step": 2030, "train_speed(iter/s)": 0.203619 }, { "acc": 0.73729725, "epoch": 0.04759474474693419, "grad_norm": 6.1875, "learning_rate": 7.931570762052878e-06, "loss": 0.96925774, "memory(GiB)": 113.93, "step": 2040, "train_speed(iter/s)": 0.204123 }, { "acc": 0.7569746, "epoch": 0.04782805231922309, "grad_norm": 7.75, "learning_rate": 7.970451010886471e-06, "loss": 0.89170647, "memory(GiB)": 113.93, "step": 2050, "train_speed(iter/s)": 0.204636 }, { "acc": 0.76195021, "epoch": 0.04806135989151198, "grad_norm": 5.0625, "learning_rate": 8.009331259720062e-06, "loss": 0.86682911, "memory(GiB)": 113.93, "step": 2060, "train_speed(iter/s)": 0.205136 }, { "acc": 0.74411983, "epoch": 0.04829466746380087, "grad_norm": 8.125, "learning_rate": 8.048211508553656e-06, "loss": 0.94727058, "memory(GiB)": 113.93, "step": 2070, "train_speed(iter/s)": 0.205673 }, { "acc": 0.746982, "epoch": 0.048527975036089764, "grad_norm": 5.84375, "learning_rate": 8.087091757387247e-06, "loss": 0.92584314, "memory(GiB)": 113.93, "step": 2080, "train_speed(iter/s)": 0.206113 }, { "acc": 0.75299349, "epoch": 0.048761282608378656, "grad_norm": 8.4375, "learning_rate": 8.12597200622084e-06, "loss": 0.90946913, "memory(GiB)": 113.93, "step": 2090, "train_speed(iter/s)": 0.206629 }, { "acc": 0.73941689, "epoch": 0.04899459018066755, "grad_norm": 12.1875, "learning_rate": 8.164852255054433e-06, "loss": 0.95414791, "memory(GiB)": 113.93, "step": 2100, "train_speed(iter/s)": 0.207094 }, { "acc": 0.74983721, "epoch": 0.04922789775295645, "grad_norm": 5.84375, "learning_rate": 8.203732503888025e-06, "loss": 0.92014599, "memory(GiB)": 113.93, "step": 2110, "train_speed(iter/s)": 0.207566 }, { "acc": 0.72483044, "epoch": 0.04946120532524534, "grad_norm": 9.6875, "learning_rate": 8.242612752721618e-06, "loss": 1.01256599, "memory(GiB)": 113.93, "step": 2120, "train_speed(iter/s)": 0.208056 }, { "acc": 0.73585343, "epoch": 0.04969451289753423, "grad_norm": 6.21875, "learning_rate": 8.281493001555211e-06, "loss": 0.96261358, "memory(GiB)": 113.93, "step": 2130, "train_speed(iter/s)": 0.208581 }, { "acc": 0.75417643, "epoch": 0.04992782046982312, "grad_norm": 8.0625, "learning_rate": 8.320373250388803e-06, "loss": 0.91494999, "memory(GiB)": 113.93, "step": 2140, "train_speed(iter/s)": 0.209074 }, { "acc": 0.76283751, "epoch": 0.050161128042112015, "grad_norm": 7.34375, "learning_rate": 8.359253499222396e-06, "loss": 0.86959171, "memory(GiB)": 113.93, "step": 2150, "train_speed(iter/s)": 0.209578 }, { "acc": 0.75727921, "epoch": 0.050394435614400906, "grad_norm": 9.125, "learning_rate": 8.398133748055989e-06, "loss": 0.9050848, "memory(GiB)": 113.93, "step": 2160, "train_speed(iter/s)": 0.210045 }, { "acc": 0.74644947, "epoch": 0.050627743186689805, "grad_norm": 10.1875, "learning_rate": 8.43701399688958e-06, "loss": 0.93587589, "memory(GiB)": 113.93, "step": 2170, "train_speed(iter/s)": 0.210574 }, { "acc": 0.73937345, "epoch": 0.0508610507589787, "grad_norm": 6.0625, "learning_rate": 8.475894245723174e-06, "loss": 0.97366419, "memory(GiB)": 113.93, "step": 2180, "train_speed(iter/s)": 0.211089 }, { "acc": 0.76942945, "epoch": 0.05109435833126759, "grad_norm": 8.4375, "learning_rate": 8.514774494556765e-06, "loss": 0.83894501, "memory(GiB)": 113.93, "step": 2190, "train_speed(iter/s)": 0.211551 }, { "acc": 0.735604, "epoch": 0.05132766590355648, "grad_norm": 11.4375, "learning_rate": 8.553654743390358e-06, "loss": 0.9935276, "memory(GiB)": 113.93, "step": 2200, "train_speed(iter/s)": 0.212051 }, { "acc": 0.7588975, "epoch": 0.05156097347584537, "grad_norm": 6.1875, "learning_rate": 8.592534992223951e-06, "loss": 0.88080206, "memory(GiB)": 113.93, "step": 2210, "train_speed(iter/s)": 0.212502 }, { "acc": 0.72717614, "epoch": 0.05179428104813427, "grad_norm": 8.875, "learning_rate": 8.631415241057543e-06, "loss": 1.00440168, "memory(GiB)": 113.93, "step": 2220, "train_speed(iter/s)": 0.212966 }, { "acc": 0.72449689, "epoch": 0.052027588620423164, "grad_norm": 7.65625, "learning_rate": 8.670295489891136e-06, "loss": 1.02544794, "memory(GiB)": 113.93, "step": 2230, "train_speed(iter/s)": 0.213449 }, { "acc": 0.75859423, "epoch": 0.052260896192712056, "grad_norm": 12.125, "learning_rate": 8.70917573872473e-06, "loss": 0.90177155, "memory(GiB)": 113.93, "step": 2240, "train_speed(iter/s)": 0.213906 }, { "acc": 0.73521795, "epoch": 0.05249420376500095, "grad_norm": 5.46875, "learning_rate": 8.74805598755832e-06, "loss": 0.99730349, "memory(GiB)": 113.93, "step": 2250, "train_speed(iter/s)": 0.214364 }, { "acc": 0.74286022, "epoch": 0.05272751133728984, "grad_norm": 5.1875, "learning_rate": 8.786936236391914e-06, "loss": 0.9500061, "memory(GiB)": 113.93, "step": 2260, "train_speed(iter/s)": 0.214831 }, { "acc": 0.72509484, "epoch": 0.05296081890957873, "grad_norm": 5.65625, "learning_rate": 8.825816485225505e-06, "loss": 1.00917435, "memory(GiB)": 113.93, "step": 2270, "train_speed(iter/s)": 0.215325 }, { "acc": 0.74981842, "epoch": 0.05319412648186763, "grad_norm": 8.5, "learning_rate": 8.864696734059099e-06, "loss": 0.90823412, "memory(GiB)": 113.93, "step": 2280, "train_speed(iter/s)": 0.215809 }, { "acc": 0.74728265, "epoch": 0.05342743405415652, "grad_norm": 6.46875, "learning_rate": 8.903576982892692e-06, "loss": 0.95744038, "memory(GiB)": 113.93, "step": 2290, "train_speed(iter/s)": 0.21626 }, { "acc": 0.74729271, "epoch": 0.053660741626445414, "grad_norm": 8.625, "learning_rate": 8.942457231726283e-06, "loss": 0.93564587, "memory(GiB)": 113.93, "step": 2300, "train_speed(iter/s)": 0.216736 }, { "acc": 0.74897308, "epoch": 0.053894049198734306, "grad_norm": 9.125, "learning_rate": 8.981337480559876e-06, "loss": 0.914293, "memory(GiB)": 113.93, "step": 2310, "train_speed(iter/s)": 0.217175 }, { "acc": 0.74979, "epoch": 0.0541273567710232, "grad_norm": 10.0625, "learning_rate": 9.02021772939347e-06, "loss": 0.91987686, "memory(GiB)": 113.93, "step": 2320, "train_speed(iter/s)": 0.217653 }, { "acc": 0.76758165, "epoch": 0.05436066434331209, "grad_norm": 7.6875, "learning_rate": 9.059097978227061e-06, "loss": 0.87134476, "memory(GiB)": 113.93, "step": 2330, "train_speed(iter/s)": 0.218071 }, { "acc": 0.75257339, "epoch": 0.05459397191560099, "grad_norm": 6.03125, "learning_rate": 9.097978227060654e-06, "loss": 0.91587467, "memory(GiB)": 113.93, "step": 2340, "train_speed(iter/s)": 0.218525 }, { "acc": 0.73905735, "epoch": 0.05482727948788988, "grad_norm": 9.0625, "learning_rate": 9.136858475894247e-06, "loss": 0.96962147, "memory(GiB)": 113.93, "step": 2350, "train_speed(iter/s)": 0.218952 }, { "acc": 0.75100336, "epoch": 0.05506058706017877, "grad_norm": 10.9375, "learning_rate": 9.175738724727839e-06, "loss": 0.92670422, "memory(GiB)": 113.93, "step": 2360, "train_speed(iter/s)": 0.219336 }, { "acc": 0.75260715, "epoch": 0.055293894632467665, "grad_norm": 5.65625, "learning_rate": 9.21461897356143e-06, "loss": 0.9232728, "memory(GiB)": 113.93, "step": 2370, "train_speed(iter/s)": 0.219758 }, { "acc": 0.74801059, "epoch": 0.05552720220475656, "grad_norm": 7.125, "learning_rate": 9.253499222395023e-06, "loss": 0.92012291, "memory(GiB)": 113.93, "step": 2380, "train_speed(iter/s)": 0.220201 }, { "acc": 0.7247982, "epoch": 0.05576050977704545, "grad_norm": 5.9375, "learning_rate": 9.292379471228617e-06, "loss": 1.03374653, "memory(GiB)": 113.93, "step": 2390, "train_speed(iter/s)": 0.220597 }, { "acc": 0.72924385, "epoch": 0.05599381734933435, "grad_norm": 7.28125, "learning_rate": 9.33125972006221e-06, "loss": 0.98637619, "memory(GiB)": 113.93, "step": 2400, "train_speed(iter/s)": 0.22105 }, { "acc": 0.73365808, "epoch": 0.05622712492162324, "grad_norm": 6.28125, "learning_rate": 9.370139968895801e-06, "loss": 0.97255344, "memory(GiB)": 113.93, "step": 2410, "train_speed(iter/s)": 0.221467 }, { "acc": 0.75538158, "epoch": 0.05646043249391213, "grad_norm": 7.21875, "learning_rate": 9.409020217729394e-06, "loss": 0.87520905, "memory(GiB)": 113.93, "step": 2420, "train_speed(iter/s)": 0.221929 }, { "acc": 0.73517923, "epoch": 0.05669374006620102, "grad_norm": 6.96875, "learning_rate": 9.447900466562988e-06, "loss": 0.97482023, "memory(GiB)": 113.93, "step": 2430, "train_speed(iter/s)": 0.222341 }, { "acc": 0.74999905, "epoch": 0.056927047638489915, "grad_norm": 10.625, "learning_rate": 9.486780715396579e-06, "loss": 0.93086433, "memory(GiB)": 113.93, "step": 2440, "train_speed(iter/s)": 0.222758 }, { "acc": 0.72462254, "epoch": 0.05716035521077881, "grad_norm": 6.15625, "learning_rate": 9.525660964230172e-06, "loss": 1.04345741, "memory(GiB)": 113.93, "step": 2450, "train_speed(iter/s)": 0.223185 }, { "acc": 0.75127659, "epoch": 0.057393662783067706, "grad_norm": 7.0, "learning_rate": 9.564541213063765e-06, "loss": 0.91554146, "memory(GiB)": 113.93, "step": 2460, "train_speed(iter/s)": 0.22358 }, { "acc": 0.75875664, "epoch": 0.0576269703553566, "grad_norm": 8.1875, "learning_rate": 9.603421461897357e-06, "loss": 0.87615499, "memory(GiB)": 113.93, "step": 2470, "train_speed(iter/s)": 0.223974 }, { "acc": 0.7406374, "epoch": 0.05786027792764549, "grad_norm": 5.125, "learning_rate": 9.642301710730948e-06, "loss": 0.97383041, "memory(GiB)": 113.93, "step": 2480, "train_speed(iter/s)": 0.224432 }, { "acc": 0.73948112, "epoch": 0.05809358549993438, "grad_norm": 7.34375, "learning_rate": 9.681181959564542e-06, "loss": 0.97291508, "memory(GiB)": 113.93, "step": 2490, "train_speed(iter/s)": 0.224875 }, { "acc": 0.74801497, "epoch": 0.058326893072223274, "grad_norm": 17.625, "learning_rate": 9.720062208398135e-06, "loss": 0.94796829, "memory(GiB)": 113.93, "step": 2500, "train_speed(iter/s)": 0.225296 }, { "epoch": 0.058326893072223274, "eval_acc": 0.7136444454483557, "eval_loss": 0.92160964012146, "eval_runtime": 1263.4972, "eval_samples_per_second": 28.485, "eval_steps_per_second": 14.243, "step": 2500 }, { "acc": 0.73342862, "epoch": 0.058560200644512166, "grad_norm": 4.8125, "learning_rate": 9.758942457231726e-06, "loss": 0.9874877, "memory(GiB)": 113.93, "step": 2510, "train_speed(iter/s)": 0.202285 }, { "acc": 0.75045805, "epoch": 0.058793508216801064, "grad_norm": 9.25, "learning_rate": 9.79782270606532e-06, "loss": 0.90973644, "memory(GiB)": 113.93, "step": 2520, "train_speed(iter/s)": 0.20266 }, { "acc": 0.74481392, "epoch": 0.059026815789089956, "grad_norm": 5.625, "learning_rate": 9.836702954898913e-06, "loss": 0.94075699, "memory(GiB)": 113.93, "step": 2530, "train_speed(iter/s)": 0.203052 }, { "acc": 0.75897999, "epoch": 0.05926012336137885, "grad_norm": 7.21875, "learning_rate": 9.875583203732506e-06, "loss": 0.88116951, "memory(GiB)": 113.93, "step": 2540, "train_speed(iter/s)": 0.203427 }, { "acc": 0.75533948, "epoch": 0.05949343093366774, "grad_norm": 7.34375, "learning_rate": 9.914463452566097e-06, "loss": 0.91413612, "memory(GiB)": 113.93, "step": 2550, "train_speed(iter/s)": 0.203845 }, { "acc": 0.75480471, "epoch": 0.05972673850595663, "grad_norm": 11.1875, "learning_rate": 9.95334370139969e-06, "loss": 0.88989658, "memory(GiB)": 113.93, "step": 2560, "train_speed(iter/s)": 0.204269 }, { "acc": 0.74719419, "epoch": 0.059960046078245524, "grad_norm": 5.0625, "learning_rate": 9.992223950233282e-06, "loss": 0.92936649, "memory(GiB)": 113.93, "step": 2570, "train_speed(iter/s)": 0.204664 }, { "acc": 0.73482113, "epoch": 0.06019335365053442, "grad_norm": 5.28125, "learning_rate": 9.999999771600465e-06, "loss": 0.96768579, "memory(GiB)": 113.93, "step": 2580, "train_speed(iter/s)": 0.205099 }, { "acc": 0.76646805, "epoch": 0.060426661222823315, "grad_norm": 7.375, "learning_rate": 9.999998843727385e-06, "loss": 0.83013287, "memory(GiB)": 113.93, "step": 2590, "train_speed(iter/s)": 0.205537 }, { "acc": 0.76077948, "epoch": 0.06065996879511221, "grad_norm": 5.65625, "learning_rate": 9.999997202105923e-06, "loss": 0.87780886, "memory(GiB)": 113.93, "step": 2600, "train_speed(iter/s)": 0.205939 }, { "acc": 0.76506357, "epoch": 0.0608932763674011, "grad_norm": 7.78125, "learning_rate": 9.999994846736312e-06, "loss": 0.8675107, "memory(GiB)": 113.93, "step": 2610, "train_speed(iter/s)": 0.206346 }, { "acc": 0.73746605, "epoch": 0.06112658393968999, "grad_norm": 6.1875, "learning_rate": 9.99999177761889e-06, "loss": 0.98223839, "memory(GiB)": 113.93, "step": 2620, "train_speed(iter/s)": 0.206705 }, { "acc": 0.75281706, "epoch": 0.06135989151197888, "grad_norm": 5.5, "learning_rate": 9.999987994754094e-06, "loss": 0.90281296, "memory(GiB)": 113.93, "step": 2630, "train_speed(iter/s)": 0.207092 }, { "acc": 0.75662251, "epoch": 0.06159319908426778, "grad_norm": 5.375, "learning_rate": 9.999983498142464e-06, "loss": 0.88488331, "memory(GiB)": 113.93, "step": 2640, "train_speed(iter/s)": 0.207466 }, { "acc": 0.74789619, "epoch": 0.06182650665655667, "grad_norm": 7.78125, "learning_rate": 9.999978287784642e-06, "loss": 0.92577553, "memory(GiB)": 113.93, "step": 2650, "train_speed(iter/s)": 0.207881 }, { "acc": 0.7367178, "epoch": 0.062059814228845565, "grad_norm": 6.28125, "learning_rate": 9.999972363681371e-06, "loss": 0.95870752, "memory(GiB)": 113.93, "step": 2660, "train_speed(iter/s)": 0.20827 }, { "acc": 0.754388, "epoch": 0.06229312180113446, "grad_norm": 8.375, "learning_rate": 9.9999657258335e-06, "loss": 0.91254597, "memory(GiB)": 113.93, "step": 2670, "train_speed(iter/s)": 0.208661 }, { "acc": 0.74269323, "epoch": 0.06252642937342336, "grad_norm": 6.4375, "learning_rate": 9.999958374241974e-06, "loss": 0.93991718, "memory(GiB)": 113.93, "step": 2680, "train_speed(iter/s)": 0.209065 }, { "acc": 0.75530109, "epoch": 0.06275973694571224, "grad_norm": 12.0, "learning_rate": 9.99995030890784e-06, "loss": 0.89492931, "memory(GiB)": 113.93, "step": 2690, "train_speed(iter/s)": 0.209474 }, { "acc": 0.7680037, "epoch": 0.06299304451800114, "grad_norm": 9.5625, "learning_rate": 9.999941529832254e-06, "loss": 0.85669937, "memory(GiB)": 113.93, "step": 2700, "train_speed(iter/s)": 0.209873 }, { "acc": 0.75060863, "epoch": 0.06322635209029003, "grad_norm": 6.09375, "learning_rate": 9.999932037016466e-06, "loss": 0.91923189, "memory(GiB)": 113.93, "step": 2710, "train_speed(iter/s)": 0.210281 }, { "acc": 0.76650629, "epoch": 0.06345965966257892, "grad_norm": 7.125, "learning_rate": 9.999921830461833e-06, "loss": 0.85965843, "memory(GiB)": 113.93, "step": 2720, "train_speed(iter/s)": 0.21063 }, { "acc": 0.76216908, "epoch": 0.06369296723486782, "grad_norm": 4.96875, "learning_rate": 9.99991091016981e-06, "loss": 0.86604691, "memory(GiB)": 113.93, "step": 2730, "train_speed(iter/s)": 0.211024 }, { "acc": 0.74518843, "epoch": 0.06392627480715671, "grad_norm": 5.8125, "learning_rate": 9.99989927614196e-06, "loss": 0.93415737, "memory(GiB)": 113.93, "step": 2740, "train_speed(iter/s)": 0.211406 }, { "acc": 0.75024357, "epoch": 0.0641595823794456, "grad_norm": 8.4375, "learning_rate": 9.999886928379939e-06, "loss": 0.95769053, "memory(GiB)": 113.93, "step": 2750, "train_speed(iter/s)": 0.211779 }, { "acc": 0.72975616, "epoch": 0.06439288995173449, "grad_norm": 7.6875, "learning_rate": 9.99987386688551e-06, "loss": 1.0179884, "memory(GiB)": 113.93, "step": 2760, "train_speed(iter/s)": 0.212192 }, { "acc": 0.76447496, "epoch": 0.06462619752402339, "grad_norm": 5.96875, "learning_rate": 9.99986009166054e-06, "loss": 0.84103346, "memory(GiB)": 113.93, "step": 2770, "train_speed(iter/s)": 0.212559 }, { "acc": 0.76405754, "epoch": 0.06485950509631228, "grad_norm": 4.84375, "learning_rate": 9.999845602706995e-06, "loss": 0.84612503, "memory(GiB)": 113.93, "step": 2780, "train_speed(iter/s)": 0.212948 }, { "acc": 0.75079432, "epoch": 0.06509281266860117, "grad_norm": 4.65625, "learning_rate": 9.999830400026941e-06, "loss": 0.91033974, "memory(GiB)": 113.93, "step": 2790, "train_speed(iter/s)": 0.213325 }, { "acc": 0.74466982, "epoch": 0.06532612024089007, "grad_norm": 4.4375, "learning_rate": 9.999814483622552e-06, "loss": 0.9435277, "memory(GiB)": 113.93, "step": 2800, "train_speed(iter/s)": 0.213688 }, { "acc": 0.73456364, "epoch": 0.06555942781317896, "grad_norm": 7.59375, "learning_rate": 9.999797853496097e-06, "loss": 0.96198997, "memory(GiB)": 113.93, "step": 2810, "train_speed(iter/s)": 0.214054 }, { "acc": 0.74370036, "epoch": 0.06579273538546786, "grad_norm": 6.75, "learning_rate": 9.999780509649952e-06, "loss": 0.96928806, "memory(GiB)": 113.93, "step": 2820, "train_speed(iter/s)": 0.21443 }, { "acc": 0.74688067, "epoch": 0.06602604295775674, "grad_norm": 5.25, "learning_rate": 9.99976245208659e-06, "loss": 0.9240942, "memory(GiB)": 113.93, "step": 2830, "train_speed(iter/s)": 0.214811 }, { "acc": 0.76717963, "epoch": 0.06625935053004564, "grad_norm": 6.875, "learning_rate": 9.99974368080859e-06, "loss": 0.84208164, "memory(GiB)": 113.93, "step": 2840, "train_speed(iter/s)": 0.215185 }, { "acc": 0.75557346, "epoch": 0.06649265810233454, "grad_norm": 7.21875, "learning_rate": 9.999724195818634e-06, "loss": 0.90467901, "memory(GiB)": 113.93, "step": 2850, "train_speed(iter/s)": 0.215566 }, { "acc": 0.74978905, "epoch": 0.06672596567462342, "grad_norm": 8.4375, "learning_rate": 9.999703997119501e-06, "loss": 0.91198101, "memory(GiB)": 113.93, "step": 2860, "train_speed(iter/s)": 0.215912 }, { "acc": 0.73471365, "epoch": 0.06695927324691232, "grad_norm": 7.46875, "learning_rate": 9.999683084714074e-06, "loss": 0.96304169, "memory(GiB)": 113.94, "step": 2870, "train_speed(iter/s)": 0.216295 }, { "acc": 0.76548772, "epoch": 0.06719258081920121, "grad_norm": 6.9375, "learning_rate": 9.999661458605339e-06, "loss": 0.87100945, "memory(GiB)": 113.94, "step": 2880, "train_speed(iter/s)": 0.216664 }, { "acc": 0.7508503, "epoch": 0.06742588839149011, "grad_norm": 5.46875, "learning_rate": 9.999639118796384e-06, "loss": 0.91987743, "memory(GiB)": 113.94, "step": 2890, "train_speed(iter/s)": 0.21703 }, { "acc": 0.76183619, "epoch": 0.06765919596377899, "grad_norm": 11.6875, "learning_rate": 9.999616065290396e-06, "loss": 0.87902603, "memory(GiB)": 113.94, "step": 2900, "train_speed(iter/s)": 0.217389 }, { "acc": 0.75862818, "epoch": 0.06789250353606789, "grad_norm": 6.46875, "learning_rate": 9.999592298090669e-06, "loss": 0.85631313, "memory(GiB)": 113.94, "step": 2910, "train_speed(iter/s)": 0.217726 }, { "acc": 0.7673759, "epoch": 0.06812581110835679, "grad_norm": 5.59375, "learning_rate": 9.999567817200592e-06, "loss": 0.8424716, "memory(GiB)": 113.94, "step": 2920, "train_speed(iter/s)": 0.218102 }, { "acc": 0.7607398, "epoch": 0.06835911868064568, "grad_norm": 5.15625, "learning_rate": 9.999542622623661e-06, "loss": 0.8682909, "memory(GiB)": 113.94, "step": 2930, "train_speed(iter/s)": 0.218439 }, { "acc": 0.76228185, "epoch": 0.06859242625293457, "grad_norm": 8.625, "learning_rate": 9.999516714363475e-06, "loss": 0.87924309, "memory(GiB)": 113.94, "step": 2940, "train_speed(iter/s)": 0.21878 }, { "acc": 0.74948287, "epoch": 0.06882573382522346, "grad_norm": 6.40625, "learning_rate": 9.99949009242373e-06, "loss": 0.92974014, "memory(GiB)": 113.94, "step": 2950, "train_speed(iter/s)": 0.219126 }, { "acc": 0.76307225, "epoch": 0.06905904139751236, "grad_norm": 8.4375, "learning_rate": 9.999462756808227e-06, "loss": 0.86307697, "memory(GiB)": 113.94, "step": 2960, "train_speed(iter/s)": 0.219454 }, { "acc": 0.76721716, "epoch": 0.06929234896980126, "grad_norm": 10.5625, "learning_rate": 9.999434707520867e-06, "loss": 0.85319052, "memory(GiB)": 113.94, "step": 2970, "train_speed(iter/s)": 0.219801 }, { "acc": 0.75422001, "epoch": 0.06952565654209014, "grad_norm": 5.25, "learning_rate": 9.999405944565654e-06, "loss": 0.91344976, "memory(GiB)": 113.94, "step": 2980, "train_speed(iter/s)": 0.220147 }, { "acc": 0.74894171, "epoch": 0.06975896411437904, "grad_norm": 7.46875, "learning_rate": 9.999376467946695e-06, "loss": 0.92173023, "memory(GiB)": 113.94, "step": 2990, "train_speed(iter/s)": 0.220504 }, { "acc": 0.74609923, "epoch": 0.06999227168666793, "grad_norm": 5.28125, "learning_rate": 9.999346277668198e-06, "loss": 0.908741, "memory(GiB)": 113.94, "step": 3000, "train_speed(iter/s)": 0.220849 }, { "epoch": 0.06999227168666793, "eval_acc": 0.7174498788314903, "eval_loss": 0.9058911800384521, "eval_runtime": 1263.0362, "eval_samples_per_second": 28.496, "eval_steps_per_second": 14.248, "step": 3000 }, { "acc": 0.76210346, "epoch": 0.07022557925895682, "grad_norm": 4.8125, "learning_rate": 9.999315373734472e-06, "loss": 0.8618515, "memory(GiB)": 117.13, "step": 3010, "train_speed(iter/s)": 0.202022 }, { "acc": 0.7602829, "epoch": 0.07045888683124572, "grad_norm": 8.375, "learning_rate": 9.999283756149932e-06, "loss": 0.86991014, "memory(GiB)": 117.13, "step": 3020, "train_speed(iter/s)": 0.202355 }, { "acc": 0.76280308, "epoch": 0.07069219440353461, "grad_norm": 5.0, "learning_rate": 9.999251424919083e-06, "loss": 0.89072361, "memory(GiB)": 117.13, "step": 3030, "train_speed(iter/s)": 0.202715 }, { "acc": 0.76693077, "epoch": 0.07092550197582351, "grad_norm": 5.625, "learning_rate": 9.999218380046548e-06, "loss": 0.85743275, "memory(GiB)": 117.13, "step": 3040, "train_speed(iter/s)": 0.203062 }, { "acc": 0.7543458, "epoch": 0.07115880954811239, "grad_norm": 5.6875, "learning_rate": 9.99918462153704e-06, "loss": 0.88218651, "memory(GiB)": 117.13, "step": 3050, "train_speed(iter/s)": 0.203403 }, { "acc": 0.74904919, "epoch": 0.07139211712040129, "grad_norm": 5.96875, "learning_rate": 9.999150149395383e-06, "loss": 0.94129143, "memory(GiB)": 117.13, "step": 3060, "train_speed(iter/s)": 0.203761 }, { "acc": 0.73940239, "epoch": 0.07162542469269018, "grad_norm": 6.21875, "learning_rate": 9.99911496362649e-06, "loss": 0.9520813, "memory(GiB)": 117.13, "step": 3070, "train_speed(iter/s)": 0.204102 }, { "acc": 0.74238367, "epoch": 0.07185873226497907, "grad_norm": 6.40625, "learning_rate": 9.99907906423539e-06, "loss": 0.97201996, "memory(GiB)": 117.13, "step": 3080, "train_speed(iter/s)": 0.204451 }, { "acc": 0.74634147, "epoch": 0.07209203983726797, "grad_norm": 6.53125, "learning_rate": 9.999042451227208e-06, "loss": 0.92671185, "memory(GiB)": 117.13, "step": 3090, "train_speed(iter/s)": 0.204801 }, { "acc": 0.76068754, "epoch": 0.07232534740955686, "grad_norm": 8.125, "learning_rate": 9.999005124607167e-06, "loss": 0.88492603, "memory(GiB)": 117.13, "step": 3100, "train_speed(iter/s)": 0.205142 }, { "acc": 0.77721863, "epoch": 0.07255865498184576, "grad_norm": 18.25, "learning_rate": 9.998967084380596e-06, "loss": 0.83444014, "memory(GiB)": 117.13, "step": 3110, "train_speed(iter/s)": 0.205515 }, { "acc": 0.75386047, "epoch": 0.07279196255413464, "grad_norm": 8.25, "learning_rate": 9.998928330552925e-06, "loss": 0.91596088, "memory(GiB)": 117.13, "step": 3120, "train_speed(iter/s)": 0.205857 }, { "acc": 0.74797945, "epoch": 0.07302527012642354, "grad_norm": 10.5, "learning_rate": 9.998888863129688e-06, "loss": 0.9316391, "memory(GiB)": 117.13, "step": 3130, "train_speed(iter/s)": 0.206207 }, { "acc": 0.74575772, "epoch": 0.07325857769871244, "grad_norm": 5.34375, "learning_rate": 9.998848682116518e-06, "loss": 0.95001354, "memory(GiB)": 117.13, "step": 3140, "train_speed(iter/s)": 0.206558 }, { "acc": 0.76075306, "epoch": 0.07349188527100133, "grad_norm": 6.96875, "learning_rate": 9.998807787519151e-06, "loss": 0.85484276, "memory(GiB)": 117.13, "step": 3150, "train_speed(iter/s)": 0.206886 }, { "acc": 0.77597332, "epoch": 0.07372519284329022, "grad_norm": 6.65625, "learning_rate": 9.998766179343425e-06, "loss": 0.81057844, "memory(GiB)": 117.13, "step": 3160, "train_speed(iter/s)": 0.207212 }, { "acc": 0.75207553, "epoch": 0.07395850041557911, "grad_norm": 5.96875, "learning_rate": 9.998723857595278e-06, "loss": 0.90646782, "memory(GiB)": 117.13, "step": 3170, "train_speed(iter/s)": 0.207553 }, { "acc": 0.75076399, "epoch": 0.07419180798786801, "grad_norm": 4.84375, "learning_rate": 9.998680822280752e-06, "loss": 0.91785688, "memory(GiB)": 117.13, "step": 3180, "train_speed(iter/s)": 0.207877 }, { "acc": 0.7491312, "epoch": 0.0744251155601569, "grad_norm": 6.875, "learning_rate": 9.998637073405992e-06, "loss": 0.9490078, "memory(GiB)": 117.13, "step": 3190, "train_speed(iter/s)": 0.208216 }, { "acc": 0.76558161, "epoch": 0.07465842313244579, "grad_norm": 5.96875, "learning_rate": 9.998592610977241e-06, "loss": 0.85403881, "memory(GiB)": 117.13, "step": 3200, "train_speed(iter/s)": 0.208556 }, { "acc": 0.7579298, "epoch": 0.07489173070473469, "grad_norm": 4.90625, "learning_rate": 9.998547435000847e-06, "loss": 0.88162756, "memory(GiB)": 117.13, "step": 3210, "train_speed(iter/s)": 0.208859 }, { "acc": 0.75007544, "epoch": 0.07512503827702358, "grad_norm": 5.53125, "learning_rate": 9.998501545483259e-06, "loss": 0.91202097, "memory(GiB)": 117.13, "step": 3220, "train_speed(iter/s)": 0.209158 }, { "acc": 0.74673915, "epoch": 0.07535834584931247, "grad_norm": 5.28125, "learning_rate": 9.998454942431029e-06, "loss": 0.91447458, "memory(GiB)": 117.13, "step": 3230, "train_speed(iter/s)": 0.209453 }, { "acc": 0.75813031, "epoch": 0.07559165342160136, "grad_norm": 7.21875, "learning_rate": 9.998407625850806e-06, "loss": 0.88690529, "memory(GiB)": 117.13, "step": 3240, "train_speed(iter/s)": 0.209766 }, { "acc": 0.75772085, "epoch": 0.07582496099389026, "grad_norm": 6.34375, "learning_rate": 9.998359595749346e-06, "loss": 0.8959444, "memory(GiB)": 117.13, "step": 3250, "train_speed(iter/s)": 0.210102 }, { "acc": 0.74702339, "epoch": 0.07605826856617916, "grad_norm": 6.53125, "learning_rate": 9.998310852133506e-06, "loss": 0.94382629, "memory(GiB)": 117.13, "step": 3260, "train_speed(iter/s)": 0.210403 }, { "acc": 0.74971991, "epoch": 0.07629157613846804, "grad_norm": 8.5625, "learning_rate": 9.998261395010246e-06, "loss": 0.90288601, "memory(GiB)": 117.13, "step": 3270, "train_speed(iter/s)": 0.210704 }, { "acc": 0.74937439, "epoch": 0.07652488371075694, "grad_norm": 5.84375, "learning_rate": 9.998211224386623e-06, "loss": 0.92901402, "memory(GiB)": 117.13, "step": 3280, "train_speed(iter/s)": 0.211027 }, { "acc": 0.74345322, "epoch": 0.07675819128304583, "grad_norm": 7.03125, "learning_rate": 9.998160340269799e-06, "loss": 0.95880556, "memory(GiB)": 117.13, "step": 3290, "train_speed(iter/s)": 0.211348 }, { "acc": 0.75362153, "epoch": 0.07699149885533473, "grad_norm": 23.625, "learning_rate": 9.998108742667038e-06, "loss": 0.89722214, "memory(GiB)": 117.13, "step": 3300, "train_speed(iter/s)": 0.211665 }, { "acc": 0.76416769, "epoch": 0.07722480642762361, "grad_norm": 5.90625, "learning_rate": 9.998056431585707e-06, "loss": 0.87462025, "memory(GiB)": 117.13, "step": 3310, "train_speed(iter/s)": 0.211995 }, { "acc": 0.7516809, "epoch": 0.07745811399991251, "grad_norm": 7.15625, "learning_rate": 9.998003407033271e-06, "loss": 0.91020813, "memory(GiB)": 117.13, "step": 3320, "train_speed(iter/s)": 0.212287 }, { "acc": 0.74609051, "epoch": 0.07769142157220141, "grad_norm": 6.5625, "learning_rate": 9.997949669017302e-06, "loss": 0.92258587, "memory(GiB)": 117.13, "step": 3330, "train_speed(iter/s)": 0.212604 }, { "acc": 0.77119193, "epoch": 0.07792472914449029, "grad_norm": 4.15625, "learning_rate": 9.997895217545468e-06, "loss": 0.82188301, "memory(GiB)": 117.13, "step": 3340, "train_speed(iter/s)": 0.212874 }, { "acc": 0.76285582, "epoch": 0.07815803671677919, "grad_norm": 6.28125, "learning_rate": 9.997840052625546e-06, "loss": 0.8615324, "memory(GiB)": 117.13, "step": 3350, "train_speed(iter/s)": 0.213166 }, { "acc": 0.74604807, "epoch": 0.07839134428906808, "grad_norm": 5.09375, "learning_rate": 9.997784174265407e-06, "loss": 0.9481144, "memory(GiB)": 117.13, "step": 3360, "train_speed(iter/s)": 0.213486 }, { "acc": 0.74088101, "epoch": 0.07862465186135698, "grad_norm": 5.65625, "learning_rate": 9.99772758247303e-06, "loss": 0.94895153, "memory(GiB)": 117.13, "step": 3370, "train_speed(iter/s)": 0.213796 }, { "acc": 0.74408126, "epoch": 0.07885795943364587, "grad_norm": 9.5, "learning_rate": 9.99767027725649e-06, "loss": 0.96670113, "memory(GiB)": 117.13, "step": 3380, "train_speed(iter/s)": 0.214116 }, { "acc": 0.76560807, "epoch": 0.07909126700593476, "grad_norm": 5.4375, "learning_rate": 9.997612258623972e-06, "loss": 0.85278721, "memory(GiB)": 117.13, "step": 3390, "train_speed(iter/s)": 0.214433 }, { "acc": 0.73402705, "epoch": 0.07932457457822366, "grad_norm": 5.875, "learning_rate": 9.997553526583755e-06, "loss": 0.9818717, "memory(GiB)": 117.13, "step": 3400, "train_speed(iter/s)": 0.214759 }, { "acc": 0.74461875, "epoch": 0.07955788215051254, "grad_norm": 8.5, "learning_rate": 9.997494081144224e-06, "loss": 0.94067936, "memory(GiB)": 117.13, "step": 3410, "train_speed(iter/s)": 0.215081 }, { "acc": 0.75828438, "epoch": 0.07979118972280144, "grad_norm": 9.4375, "learning_rate": 9.997433922313863e-06, "loss": 0.86931877, "memory(GiB)": 117.13, "step": 3420, "train_speed(iter/s)": 0.215374 }, { "acc": 0.75122313, "epoch": 0.08002449729509033, "grad_norm": 7.0, "learning_rate": 9.997373050101265e-06, "loss": 0.92490044, "memory(GiB)": 117.13, "step": 3430, "train_speed(iter/s)": 0.215678 }, { "acc": 0.74527292, "epoch": 0.08025780486737923, "grad_norm": 9.375, "learning_rate": 9.997311464515113e-06, "loss": 0.95839014, "memory(GiB)": 117.13, "step": 3440, "train_speed(iter/s)": 0.21597 }, { "acc": 0.76718817, "epoch": 0.08049111243966812, "grad_norm": 14.6875, "learning_rate": 9.997249165564203e-06, "loss": 0.83571854, "memory(GiB)": 117.13, "step": 3450, "train_speed(iter/s)": 0.216299 }, { "acc": 0.74820533, "epoch": 0.08072442001195701, "grad_norm": 5.59375, "learning_rate": 9.997186153257425e-06, "loss": 0.94048061, "memory(GiB)": 117.13, "step": 3460, "train_speed(iter/s)": 0.216595 }, { "acc": 0.75320921, "epoch": 0.08095772758424591, "grad_norm": 8.375, "learning_rate": 9.997122427603777e-06, "loss": 0.86001663, "memory(GiB)": 117.13, "step": 3470, "train_speed(iter/s)": 0.216885 }, { "acc": 0.75159769, "epoch": 0.0811910351565348, "grad_norm": 5.84375, "learning_rate": 9.997057988612351e-06, "loss": 0.88844814, "memory(GiB)": 117.13, "step": 3480, "train_speed(iter/s)": 0.217171 }, { "acc": 0.73871365, "epoch": 0.08142434272882369, "grad_norm": 5.5, "learning_rate": 9.996992836292352e-06, "loss": 0.97896032, "memory(GiB)": 117.13, "step": 3490, "train_speed(iter/s)": 0.217476 }, { "acc": 0.76496325, "epoch": 0.08165765030111259, "grad_norm": 6.84375, "learning_rate": 9.996926970653076e-06, "loss": 0.85766716, "memory(GiB)": 117.13, "step": 3500, "train_speed(iter/s)": 0.217788 }, { "epoch": 0.08165765030111259, "eval_acc": 0.7195258777859883, "eval_loss": 0.8972460627555847, "eval_runtime": 1264.1831, "eval_samples_per_second": 28.47, "eval_steps_per_second": 14.235, "step": 3500 }, { "acc": 0.76662865, "epoch": 0.08189095787340148, "grad_norm": 4.9375, "learning_rate": 9.996860391703925e-06, "loss": 0.83215084, "memory(GiB)": 117.13, "step": 3510, "train_speed(iter/s)": 0.201894 }, { "acc": 0.76249638, "epoch": 0.08212426544569038, "grad_norm": 7.28125, "learning_rate": 9.996793099454407e-06, "loss": 0.88059654, "memory(GiB)": 117.13, "step": 3520, "train_speed(iter/s)": 0.202185 }, { "acc": 0.76358109, "epoch": 0.08235757301797926, "grad_norm": 5.75, "learning_rate": 9.996725093914125e-06, "loss": 0.86032219, "memory(GiB)": 117.13, "step": 3530, "train_speed(iter/s)": 0.202486 }, { "acc": 0.75037994, "epoch": 0.08259088059026816, "grad_norm": 6.84375, "learning_rate": 9.996656375092786e-06, "loss": 0.91713352, "memory(GiB)": 117.13, "step": 3540, "train_speed(iter/s)": 0.202791 }, { "acc": 0.76051826, "epoch": 0.08282418816255704, "grad_norm": 8.0, "learning_rate": 9.996586943000203e-06, "loss": 0.88210812, "memory(GiB)": 117.13, "step": 3550, "train_speed(iter/s)": 0.203094 }, { "acc": 0.75420084, "epoch": 0.08305749573484594, "grad_norm": 8.375, "learning_rate": 9.996516797646285e-06, "loss": 0.8841979, "memory(GiB)": 117.13, "step": 3560, "train_speed(iter/s)": 0.20339 }, { "acc": 0.74496236, "epoch": 0.08329080330713484, "grad_norm": 7.71875, "learning_rate": 9.996445939041043e-06, "loss": 0.93399353, "memory(GiB)": 117.13, "step": 3570, "train_speed(iter/s)": 0.203695 }, { "acc": 0.75147638, "epoch": 0.08352411087942373, "grad_norm": 7.90625, "learning_rate": 9.996374367194599e-06, "loss": 0.9279705, "memory(GiB)": 117.13, "step": 3580, "train_speed(iter/s)": 0.203991 }, { "acc": 0.73116684, "epoch": 0.08375741845171263, "grad_norm": 6.40625, "learning_rate": 9.996302082117162e-06, "loss": 1.01277275, "memory(GiB)": 117.13, "step": 3590, "train_speed(iter/s)": 0.204285 }, { "acc": 0.75076885, "epoch": 0.08399072602400151, "grad_norm": 5.5, "learning_rate": 9.996229083819055e-06, "loss": 0.88694172, "memory(GiB)": 117.13, "step": 3600, "train_speed(iter/s)": 0.204601 }, { "acc": 0.75149937, "epoch": 0.08422403359629041, "grad_norm": 5.59375, "learning_rate": 9.996155372310699e-06, "loss": 0.90187569, "memory(GiB)": 117.13, "step": 3610, "train_speed(iter/s)": 0.204903 }, { "acc": 0.74182329, "epoch": 0.08445734116857931, "grad_norm": 7.75, "learning_rate": 9.996080947602615e-06, "loss": 0.94764538, "memory(GiB)": 117.13, "step": 3620, "train_speed(iter/s)": 0.205208 }, { "acc": 0.76639605, "epoch": 0.0846906487408682, "grad_norm": 9.25, "learning_rate": 9.996005809705428e-06, "loss": 0.84041519, "memory(GiB)": 117.13, "step": 3630, "train_speed(iter/s)": 0.205479 }, { "acc": 0.74680309, "epoch": 0.08492395631315709, "grad_norm": 7.78125, "learning_rate": 9.99592995862986e-06, "loss": 0.93639717, "memory(GiB)": 117.13, "step": 3640, "train_speed(iter/s)": 0.205784 }, { "acc": 0.77147326, "epoch": 0.08515726388544598, "grad_norm": 4.3125, "learning_rate": 9.995853394386743e-06, "loss": 0.82768078, "memory(GiB)": 117.13, "step": 3650, "train_speed(iter/s)": 0.206083 }, { "acc": 0.77721438, "epoch": 0.08539057145773488, "grad_norm": 7.34375, "learning_rate": 9.995776116987006e-06, "loss": 0.83602409, "memory(GiB)": 117.13, "step": 3660, "train_speed(iter/s)": 0.206357 }, { "acc": 0.75351801, "epoch": 0.08562387903002378, "grad_norm": 6.8125, "learning_rate": 9.995698126441678e-06, "loss": 0.90112209, "memory(GiB)": 117.13, "step": 3670, "train_speed(iter/s)": 0.206636 }, { "acc": 0.74808969, "epoch": 0.08585718660231266, "grad_norm": 5.3125, "learning_rate": 9.995619422761896e-06, "loss": 0.92840281, "memory(GiB)": 117.13, "step": 3680, "train_speed(iter/s)": 0.206918 }, { "acc": 0.7723721, "epoch": 0.08609049417460156, "grad_norm": 6.59375, "learning_rate": 9.995540005958891e-06, "loss": 0.81368618, "memory(GiB)": 117.13, "step": 3690, "train_speed(iter/s)": 0.207209 }, { "acc": 0.76420546, "epoch": 0.08632380174689044, "grad_norm": 7.46875, "learning_rate": 9.995459876044e-06, "loss": 0.85773888, "memory(GiB)": 117.13, "step": 3700, "train_speed(iter/s)": 0.207498 }, { "acc": 0.7516161, "epoch": 0.08655710931917934, "grad_norm": 4.875, "learning_rate": 9.995379033028666e-06, "loss": 0.90806656, "memory(GiB)": 117.13, "step": 3710, "train_speed(iter/s)": 0.207787 }, { "acc": 0.73606005, "epoch": 0.08679041689146823, "grad_norm": 6.78125, "learning_rate": 9.995297476924424e-06, "loss": 0.97946396, "memory(GiB)": 117.13, "step": 3720, "train_speed(iter/s)": 0.208071 }, { "acc": 0.7413105, "epoch": 0.08702372446375713, "grad_norm": 7.4375, "learning_rate": 9.99521520774292e-06, "loss": 0.95148058, "memory(GiB)": 117.13, "step": 3730, "train_speed(iter/s)": 0.208349 }, { "acc": 0.76187735, "epoch": 0.08725703203604603, "grad_norm": 6.0625, "learning_rate": 9.995132225495896e-06, "loss": 0.87295303, "memory(GiB)": 117.13, "step": 3740, "train_speed(iter/s)": 0.208636 }, { "acc": 0.75933933, "epoch": 0.08749033960833491, "grad_norm": 4.59375, "learning_rate": 9.995048530195198e-06, "loss": 0.86997223, "memory(GiB)": 117.13, "step": 3750, "train_speed(iter/s)": 0.208927 }, { "acc": 0.7408792, "epoch": 0.08772364718062381, "grad_norm": 10.8125, "learning_rate": 9.99496412185277e-06, "loss": 0.97568855, "memory(GiB)": 117.13, "step": 3760, "train_speed(iter/s)": 0.209201 }, { "acc": 0.75821385, "epoch": 0.0879569547529127, "grad_norm": 4.9375, "learning_rate": 9.994879000480668e-06, "loss": 0.89489002, "memory(GiB)": 117.13, "step": 3770, "train_speed(iter/s)": 0.20947 }, { "acc": 0.75803394, "epoch": 0.0881902623252016, "grad_norm": 8.1875, "learning_rate": 9.994793166091039e-06, "loss": 0.89759312, "memory(GiB)": 117.13, "step": 3780, "train_speed(iter/s)": 0.209749 }, { "acc": 0.75399165, "epoch": 0.08842356989749049, "grad_norm": 4.5, "learning_rate": 9.994706618696137e-06, "loss": 0.89545727, "memory(GiB)": 117.13, "step": 3790, "train_speed(iter/s)": 0.210021 }, { "acc": 0.7794095, "epoch": 0.08865687746977938, "grad_norm": 7.84375, "learning_rate": 9.994619358308316e-06, "loss": 0.80540943, "memory(GiB)": 117.13, "step": 3800, "train_speed(iter/s)": 0.210297 }, { "acc": 0.75469561, "epoch": 0.08889018504206828, "grad_norm": 9.3125, "learning_rate": 9.994531384940032e-06, "loss": 0.88198318, "memory(GiB)": 117.13, "step": 3810, "train_speed(iter/s)": 0.210545 }, { "acc": 0.76181564, "epoch": 0.08912349261435716, "grad_norm": 5.34375, "learning_rate": 9.994442698603844e-06, "loss": 0.87399759, "memory(GiB)": 117.13, "step": 3820, "train_speed(iter/s)": 0.210811 }, { "acc": 0.75165758, "epoch": 0.08935680018664606, "grad_norm": 6.09375, "learning_rate": 9.99435329931241e-06, "loss": 0.91721411, "memory(GiB)": 117.13, "step": 3830, "train_speed(iter/s)": 0.211087 }, { "acc": 0.78487425, "epoch": 0.08959010775893494, "grad_norm": 6.625, "learning_rate": 9.994263187078496e-06, "loss": 0.77421608, "memory(GiB)": 117.13, "step": 3840, "train_speed(iter/s)": 0.211363 }, { "acc": 0.77620182, "epoch": 0.08982341533122384, "grad_norm": 5.21875, "learning_rate": 9.994172361914962e-06, "loss": 0.80532532, "memory(GiB)": 117.13, "step": 3850, "train_speed(iter/s)": 0.211638 }, { "acc": 0.76540213, "epoch": 0.09005672290351274, "grad_norm": 5.03125, "learning_rate": 9.994080823834775e-06, "loss": 0.86148052, "memory(GiB)": 117.26, "step": 3860, "train_speed(iter/s)": 0.211918 }, { "acc": 0.76003137, "epoch": 0.09029003047580163, "grad_norm": 6.0, "learning_rate": 9.993988572851e-06, "loss": 0.88044262, "memory(GiB)": 117.26, "step": 3870, "train_speed(iter/s)": 0.2122 }, { "acc": 0.76154003, "epoch": 0.09052333804809053, "grad_norm": 5.75, "learning_rate": 9.993895608976806e-06, "loss": 0.88937092, "memory(GiB)": 117.26, "step": 3880, "train_speed(iter/s)": 0.212458 }, { "acc": 0.74300785, "epoch": 0.09075664562037941, "grad_norm": 8.4375, "learning_rate": 9.993801932225466e-06, "loss": 0.9540123, "memory(GiB)": 117.26, "step": 3890, "train_speed(iter/s)": 0.212741 }, { "acc": 0.76823215, "epoch": 0.09098995319266831, "grad_norm": 5.40625, "learning_rate": 9.993707542610351e-06, "loss": 0.8575367, "memory(GiB)": 117.26, "step": 3900, "train_speed(iter/s)": 0.213004 }, { "acc": 0.74470196, "epoch": 0.09122326076495721, "grad_norm": 7.625, "learning_rate": 9.993612440144935e-06, "loss": 0.9225111, "memory(GiB)": 117.26, "step": 3910, "train_speed(iter/s)": 0.213276 }, { "acc": 0.75410318, "epoch": 0.0914565683372461, "grad_norm": 6.03125, "learning_rate": 9.993516624842792e-06, "loss": 0.88778362, "memory(GiB)": 117.26, "step": 3920, "train_speed(iter/s)": 0.213539 }, { "acc": 0.74215426, "epoch": 0.09168987590953499, "grad_norm": 14.4375, "learning_rate": 9.993420096717603e-06, "loss": 0.93417568, "memory(GiB)": 117.26, "step": 3930, "train_speed(iter/s)": 0.213817 }, { "acc": 0.7749835, "epoch": 0.09192318348182388, "grad_norm": 6.71875, "learning_rate": 9.993322855783146e-06, "loss": 0.81192265, "memory(GiB)": 117.26, "step": 3940, "train_speed(iter/s)": 0.214069 }, { "acc": 0.74965153, "epoch": 0.09215649105411278, "grad_norm": 8.875, "learning_rate": 9.993224902053302e-06, "loss": 0.90446196, "memory(GiB)": 117.26, "step": 3950, "train_speed(iter/s)": 0.214348 }, { "acc": 0.76766262, "epoch": 0.09238979862640166, "grad_norm": 4.71875, "learning_rate": 9.993126235542053e-06, "loss": 0.85899258, "memory(GiB)": 117.26, "step": 3960, "train_speed(iter/s)": 0.214614 }, { "acc": 0.76388664, "epoch": 0.09262310619869056, "grad_norm": 13.25, "learning_rate": 9.993026856263486e-06, "loss": 0.86655865, "memory(GiB)": 117.26, "step": 3970, "train_speed(iter/s)": 0.214882 }, { "acc": 0.78647819, "epoch": 0.09285641377097946, "grad_norm": 5.71875, "learning_rate": 9.992926764231784e-06, "loss": 0.76948757, "memory(GiB)": 117.26, "step": 3980, "train_speed(iter/s)": 0.215155 }, { "acc": 0.76524334, "epoch": 0.09308972134326834, "grad_norm": 5.8125, "learning_rate": 9.992825959461237e-06, "loss": 0.86004267, "memory(GiB)": 117.26, "step": 3990, "train_speed(iter/s)": 0.21543 }, { "acc": 0.7678196, "epoch": 0.09332302891555724, "grad_norm": 6.40625, "learning_rate": 9.992724441966234e-06, "loss": 0.84719696, "memory(GiB)": 117.26, "step": 4000, "train_speed(iter/s)": 0.215685 }, { "epoch": 0.09332302891555724, "eval_acc": 0.7213637346361234, "eval_loss": 0.8891978859901428, "eval_runtime": 1264.113, "eval_samples_per_second": 28.471, "eval_steps_per_second": 14.236, "step": 4000 }, { "acc": 0.76412001, "epoch": 0.09355633648784613, "grad_norm": 5.25, "learning_rate": 9.99262221176127e-06, "loss": 0.84074697, "memory(GiB)": 117.26, "step": 4010, "train_speed(iter/s)": 0.201908 }, { "acc": 0.75619836, "epoch": 0.09378964406013503, "grad_norm": 7.8125, "learning_rate": 9.992519268860934e-06, "loss": 0.90627365, "memory(GiB)": 117.26, "step": 4020, "train_speed(iter/s)": 0.202174 }, { "acc": 0.76248636, "epoch": 0.09402295163242393, "grad_norm": 4.65625, "learning_rate": 9.992415613279922e-06, "loss": 0.87103119, "memory(GiB)": 117.26, "step": 4030, "train_speed(iter/s)": 0.202438 }, { "acc": 0.77371283, "epoch": 0.09425625920471281, "grad_norm": 6.53125, "learning_rate": 9.992311245033033e-06, "loss": 0.81645164, "memory(GiB)": 117.26, "step": 4040, "train_speed(iter/s)": 0.202678 }, { "acc": 0.75443277, "epoch": 0.09448956677700171, "grad_norm": 4.34375, "learning_rate": 9.992206164135163e-06, "loss": 0.88794823, "memory(GiB)": 117.26, "step": 4050, "train_speed(iter/s)": 0.202951 }, { "acc": 0.76103091, "epoch": 0.0947228743492906, "grad_norm": 7.46875, "learning_rate": 9.992100370601313e-06, "loss": 0.86247635, "memory(GiB)": 117.26, "step": 4060, "train_speed(iter/s)": 0.203216 }, { "acc": 0.76392231, "epoch": 0.0949561819215795, "grad_norm": 6.03125, "learning_rate": 9.991993864446585e-06, "loss": 0.84815683, "memory(GiB)": 117.26, "step": 4070, "train_speed(iter/s)": 0.203487 }, { "acc": 0.73951268, "epoch": 0.09518948949386838, "grad_norm": 5.875, "learning_rate": 9.991886645686184e-06, "loss": 0.95514603, "memory(GiB)": 117.26, "step": 4080, "train_speed(iter/s)": 0.203738 }, { "acc": 0.76140833, "epoch": 0.09542279706615728, "grad_norm": 6.5, "learning_rate": 9.991778714335415e-06, "loss": 0.842805, "memory(GiB)": 117.26, "step": 4090, "train_speed(iter/s)": 0.203994 }, { "acc": 0.75136814, "epoch": 0.09565610463844618, "grad_norm": 6.75, "learning_rate": 9.991670070409684e-06, "loss": 0.91178036, "memory(GiB)": 117.26, "step": 4100, "train_speed(iter/s)": 0.204262 }, { "acc": 0.76194067, "epoch": 0.09588941221073506, "grad_norm": 15.625, "learning_rate": 9.991560713924501e-06, "loss": 0.89563017, "memory(GiB)": 117.26, "step": 4110, "train_speed(iter/s)": 0.204514 }, { "acc": 0.77262659, "epoch": 0.09612271978302396, "grad_norm": 6.78125, "learning_rate": 9.991450644895476e-06, "loss": 0.83956661, "memory(GiB)": 117.26, "step": 4120, "train_speed(iter/s)": 0.204771 }, { "acc": 0.75700169, "epoch": 0.09635602735531285, "grad_norm": 6.78125, "learning_rate": 9.99133986333832e-06, "loss": 0.88781214, "memory(GiB)": 117.26, "step": 4130, "train_speed(iter/s)": 0.205032 }, { "acc": 0.75085874, "epoch": 0.09658933492760174, "grad_norm": 6.65625, "learning_rate": 9.99122836926885e-06, "loss": 0.92638836, "memory(GiB)": 117.26, "step": 4140, "train_speed(iter/s)": 0.205285 }, { "acc": 0.76435242, "epoch": 0.09682264249989064, "grad_norm": 6.75, "learning_rate": 9.991116162702981e-06, "loss": 0.84227076, "memory(GiB)": 117.26, "step": 4150, "train_speed(iter/s)": 0.205554 }, { "acc": 0.76795573, "epoch": 0.09705595007217953, "grad_norm": 5.625, "learning_rate": 9.991003243656728e-06, "loss": 0.82411375, "memory(GiB)": 117.26, "step": 4160, "train_speed(iter/s)": 0.205812 }, { "acc": 0.7445519, "epoch": 0.09728925764446843, "grad_norm": 5.96875, "learning_rate": 9.990889612146213e-06, "loss": 0.94919357, "memory(GiB)": 117.26, "step": 4170, "train_speed(iter/s)": 0.206059 }, { "acc": 0.76449804, "epoch": 0.09752256521675731, "grad_norm": 9.1875, "learning_rate": 9.990775268187654e-06, "loss": 0.85302496, "memory(GiB)": 117.26, "step": 4180, "train_speed(iter/s)": 0.206325 }, { "acc": 0.76151171, "epoch": 0.09775587278904621, "grad_norm": 12.75, "learning_rate": 9.990660211797378e-06, "loss": 0.85370407, "memory(GiB)": 117.26, "step": 4190, "train_speed(iter/s)": 0.206583 }, { "acc": 0.74476466, "epoch": 0.0979891803613351, "grad_norm": 9.875, "learning_rate": 9.990544442991805e-06, "loss": 0.94560623, "memory(GiB)": 117.26, "step": 4200, "train_speed(iter/s)": 0.206844 }, { "acc": 0.75480165, "epoch": 0.098222487933624, "grad_norm": 6.125, "learning_rate": 9.99042796178746e-06, "loss": 0.89572659, "memory(GiB)": 117.26, "step": 4210, "train_speed(iter/s)": 0.207103 }, { "acc": 0.7533886, "epoch": 0.0984557955059129, "grad_norm": 5.59375, "learning_rate": 9.990310768200977e-06, "loss": 0.89642124, "memory(GiB)": 117.26, "step": 4220, "train_speed(iter/s)": 0.20734 }, { "acc": 0.73566222, "epoch": 0.09868910307820178, "grad_norm": 5.3125, "learning_rate": 9.99019286224908e-06, "loss": 0.96868382, "memory(GiB)": 117.26, "step": 4230, "train_speed(iter/s)": 0.20755 }, { "acc": 0.76720333, "epoch": 0.09892241065049068, "grad_norm": 7.96875, "learning_rate": 9.990074243948602e-06, "loss": 0.85299377, "memory(GiB)": 117.26, "step": 4240, "train_speed(iter/s)": 0.207791 }, { "acc": 0.76671886, "epoch": 0.09915571822277956, "grad_norm": 6.6875, "learning_rate": 9.989954913316476e-06, "loss": 0.84508743, "memory(GiB)": 117.26, "step": 4250, "train_speed(iter/s)": 0.208048 }, { "acc": 0.77009649, "epoch": 0.09938902579506846, "grad_norm": 8.625, "learning_rate": 9.989834870369735e-06, "loss": 0.85040188, "memory(GiB)": 117.26, "step": 4260, "train_speed(iter/s)": 0.208293 }, { "acc": 0.7688715, "epoch": 0.09962233336735736, "grad_norm": 5.21875, "learning_rate": 9.989714115125515e-06, "loss": 0.83766098, "memory(GiB)": 117.26, "step": 4270, "train_speed(iter/s)": 0.208535 }, { "acc": 0.76275082, "epoch": 0.09985564093964625, "grad_norm": 9.125, "learning_rate": 9.989592647601056e-06, "loss": 0.86342697, "memory(GiB)": 117.26, "step": 4280, "train_speed(iter/s)": 0.208775 }, { "acc": 0.754038, "epoch": 0.10008894851193514, "grad_norm": 7.6875, "learning_rate": 9.989470467813696e-06, "loss": 0.90022526, "memory(GiB)": 117.26, "step": 4290, "train_speed(iter/s)": 0.208998 }, { "acc": 0.75553808, "epoch": 0.10032225608422403, "grad_norm": 7.75, "learning_rate": 9.989347575780874e-06, "loss": 0.866786, "memory(GiB)": 117.26, "step": 4300, "train_speed(iter/s)": 0.209248 }, { "acc": 0.74242492, "epoch": 0.10055556365651293, "grad_norm": 6.59375, "learning_rate": 9.989223971520136e-06, "loss": 0.93780308, "memory(GiB)": 117.26, "step": 4310, "train_speed(iter/s)": 0.209496 }, { "acc": 0.76752081, "epoch": 0.10078887122880181, "grad_norm": 5.6875, "learning_rate": 9.989099655049128e-06, "loss": 0.8470974, "memory(GiB)": 117.26, "step": 4320, "train_speed(iter/s)": 0.209754 }, { "acc": 0.74966879, "epoch": 0.10102217880109071, "grad_norm": 9.1875, "learning_rate": 9.98897462638559e-06, "loss": 0.92223148, "memory(GiB)": 117.26, "step": 4330, "train_speed(iter/s)": 0.209988 }, { "acc": 0.75483513, "epoch": 0.10125548637337961, "grad_norm": 5.0625, "learning_rate": 9.988848885547376e-06, "loss": 0.89337826, "memory(GiB)": 117.26, "step": 4340, "train_speed(iter/s)": 0.210229 }, { "acc": 0.77093763, "epoch": 0.1014887939456685, "grad_norm": 9.1875, "learning_rate": 9.988722432552431e-06, "loss": 0.81348486, "memory(GiB)": 117.26, "step": 4350, "train_speed(iter/s)": 0.210466 }, { "acc": 0.74523001, "epoch": 0.1017221015179574, "grad_norm": 11.125, "learning_rate": 9.988595267418809e-06, "loss": 0.90048809, "memory(GiB)": 117.26, "step": 4360, "train_speed(iter/s)": 0.210712 }, { "acc": 0.75970225, "epoch": 0.10195540909024628, "grad_norm": 5.625, "learning_rate": 9.988467390164662e-06, "loss": 0.85486603, "memory(GiB)": 117.26, "step": 4370, "train_speed(iter/s)": 0.210936 }, { "acc": 0.75850015, "epoch": 0.10218871666253518, "grad_norm": 4.53125, "learning_rate": 9.988338800808245e-06, "loss": 0.89363995, "memory(GiB)": 117.26, "step": 4380, "train_speed(iter/s)": 0.211182 }, { "acc": 0.75912037, "epoch": 0.10242202423482408, "grad_norm": 5.78125, "learning_rate": 9.988209499367911e-06, "loss": 0.87021446, "memory(GiB)": 117.26, "step": 4390, "train_speed(iter/s)": 0.211431 }, { "acc": 0.76318393, "epoch": 0.10265533180711296, "grad_norm": 5.0625, "learning_rate": 9.988079485862121e-06, "loss": 0.86028423, "memory(GiB)": 117.26, "step": 4400, "train_speed(iter/s)": 0.211655 }, { "acc": 0.76449718, "epoch": 0.10288863937940186, "grad_norm": 5.84375, "learning_rate": 9.987948760309434e-06, "loss": 0.84147358, "memory(GiB)": 117.26, "step": 4410, "train_speed(iter/s)": 0.211902 }, { "acc": 0.77095308, "epoch": 0.10312194695169075, "grad_norm": 3.609375, "learning_rate": 9.987817322728509e-06, "loss": 0.84317484, "memory(GiB)": 117.26, "step": 4420, "train_speed(iter/s)": 0.212143 }, { "acc": 0.76160107, "epoch": 0.10335525452397964, "grad_norm": 4.96875, "learning_rate": 9.98768517313811e-06, "loss": 0.87322044, "memory(GiB)": 117.26, "step": 4430, "train_speed(iter/s)": 0.212372 }, { "acc": 0.74773016, "epoch": 0.10358856209626854, "grad_norm": 7.59375, "learning_rate": 9.987552311557103e-06, "loss": 0.94118099, "memory(GiB)": 117.26, "step": 4440, "train_speed(iter/s)": 0.212607 }, { "acc": 0.77219372, "epoch": 0.10382186966855743, "grad_norm": 5.375, "learning_rate": 9.987418738004453e-06, "loss": 0.83494358, "memory(GiB)": 117.26, "step": 4450, "train_speed(iter/s)": 0.21283 }, { "acc": 0.72884626, "epoch": 0.10405517724084633, "grad_norm": 5.09375, "learning_rate": 9.987284452499227e-06, "loss": 1.01027079, "memory(GiB)": 117.26, "step": 4460, "train_speed(iter/s)": 0.213075 }, { "acc": 0.75734034, "epoch": 0.10428848481313521, "grad_norm": 12.6875, "learning_rate": 9.987149455060592e-06, "loss": 0.87786808, "memory(GiB)": 117.26, "step": 4470, "train_speed(iter/s)": 0.21332 }, { "acc": 0.75500989, "epoch": 0.10452179238542411, "grad_norm": 4.9375, "learning_rate": 9.987013745707824e-06, "loss": 0.88896713, "memory(GiB)": 117.26, "step": 4480, "train_speed(iter/s)": 0.213546 }, { "acc": 0.77651052, "epoch": 0.104755099957713, "grad_norm": 4.0625, "learning_rate": 9.986877324460288e-06, "loss": 0.81290522, "memory(GiB)": 117.26, "step": 4490, "train_speed(iter/s)": 0.213761 }, { "acc": 0.7629488, "epoch": 0.1049884075300019, "grad_norm": 5.03125, "learning_rate": 9.986740191337467e-06, "loss": 0.87226248, "memory(GiB)": 117.26, "step": 4500, "train_speed(iter/s)": 0.21399 }, { "epoch": 0.1049884075300019, "eval_acc": 0.7225868750786547, "eval_loss": 0.88419109582901, "eval_runtime": 1261.6084, "eval_samples_per_second": 28.528, "eval_steps_per_second": 14.264, "step": 4500 }, { "acc": 0.73781762, "epoch": 0.1052217151022908, "grad_norm": 5.59375, "learning_rate": 9.986602346358932e-06, "loss": 0.97211933, "memory(GiB)": 117.26, "step": 4510, "train_speed(iter/s)": 0.201887 }, { "acc": 0.74730396, "epoch": 0.10545502267457968, "grad_norm": 6.34375, "learning_rate": 9.986463789544359e-06, "loss": 0.92175388, "memory(GiB)": 117.26, "step": 4520, "train_speed(iter/s)": 0.202128 }, { "acc": 0.748983, "epoch": 0.10568833024686858, "grad_norm": 7.9375, "learning_rate": 9.986324520913528e-06, "loss": 0.92216883, "memory(GiB)": 117.26, "step": 4530, "train_speed(iter/s)": 0.202356 }, { "acc": 0.76742783, "epoch": 0.10592163781915746, "grad_norm": 5.59375, "learning_rate": 9.986184540486322e-06, "loss": 0.83385925, "memory(GiB)": 117.26, "step": 4540, "train_speed(iter/s)": 0.202583 }, { "acc": 0.76445155, "epoch": 0.10615494539144636, "grad_norm": 6.90625, "learning_rate": 9.98604384828272e-06, "loss": 0.81289186, "memory(GiB)": 117.26, "step": 4550, "train_speed(iter/s)": 0.202827 }, { "acc": 0.76047554, "epoch": 0.10638825296373526, "grad_norm": 14.5625, "learning_rate": 9.985902444322809e-06, "loss": 0.83031597, "memory(GiB)": 117.26, "step": 4560, "train_speed(iter/s)": 0.203059 }, { "acc": 0.77634478, "epoch": 0.10662156053602415, "grad_norm": 4.59375, "learning_rate": 9.98576032862677e-06, "loss": 0.8340477, "memory(GiB)": 117.26, "step": 4570, "train_speed(iter/s)": 0.203269 }, { "acc": 0.75437384, "epoch": 0.10685486810831304, "grad_norm": 6.59375, "learning_rate": 9.985617501214895e-06, "loss": 0.89912443, "memory(GiB)": 117.26, "step": 4580, "train_speed(iter/s)": 0.203512 }, { "acc": 0.76212082, "epoch": 0.10708817568060193, "grad_norm": 5.6875, "learning_rate": 9.985473962107568e-06, "loss": 0.86147919, "memory(GiB)": 117.26, "step": 4590, "train_speed(iter/s)": 0.203744 }, { "acc": 0.75399237, "epoch": 0.10732148325289083, "grad_norm": 6.40625, "learning_rate": 9.985329711325282e-06, "loss": 0.89479609, "memory(GiB)": 117.26, "step": 4600, "train_speed(iter/s)": 0.203975 }, { "acc": 0.7696825, "epoch": 0.10755479082517971, "grad_norm": 6.4375, "learning_rate": 9.985184748888627e-06, "loss": 0.85566216, "memory(GiB)": 117.26, "step": 4610, "train_speed(iter/s)": 0.204217 }, { "acc": 0.76564717, "epoch": 0.10778809839746861, "grad_norm": 10.8125, "learning_rate": 9.985039074818298e-06, "loss": 0.82426109, "memory(GiB)": 117.26, "step": 4620, "train_speed(iter/s)": 0.20444 }, { "acc": 0.76383858, "epoch": 0.10802140596975751, "grad_norm": 7.78125, "learning_rate": 9.98489268913509e-06, "loss": 0.87438736, "memory(GiB)": 117.26, "step": 4630, "train_speed(iter/s)": 0.204669 }, { "acc": 0.75862565, "epoch": 0.1082547135420464, "grad_norm": 5.3125, "learning_rate": 9.984745591859899e-06, "loss": 0.8994606, "memory(GiB)": 117.26, "step": 4640, "train_speed(iter/s)": 0.204886 }, { "acc": 0.76976395, "epoch": 0.1084880211143353, "grad_norm": 11.6875, "learning_rate": 9.98459778301372e-06, "loss": 0.84485207, "memory(GiB)": 117.26, "step": 4650, "train_speed(iter/s)": 0.205111 }, { "acc": 0.76308441, "epoch": 0.10872132868662418, "grad_norm": 8.1875, "learning_rate": 9.984449262617659e-06, "loss": 0.84300032, "memory(GiB)": 117.26, "step": 4660, "train_speed(iter/s)": 0.205352 }, { "acc": 0.74991522, "epoch": 0.10895463625891308, "grad_norm": 5.4375, "learning_rate": 9.984300030692913e-06, "loss": 0.91463776, "memory(GiB)": 117.26, "step": 4670, "train_speed(iter/s)": 0.205595 }, { "acc": 0.74887056, "epoch": 0.10918794383120198, "grad_norm": 7.4375, "learning_rate": 9.984150087260784e-06, "loss": 0.90414076, "memory(GiB)": 117.26, "step": 4680, "train_speed(iter/s)": 0.205824 }, { "acc": 0.75217257, "epoch": 0.10942125140349086, "grad_norm": 8.0625, "learning_rate": 9.983999432342679e-06, "loss": 0.90018339, "memory(GiB)": 117.26, "step": 4690, "train_speed(iter/s)": 0.206039 }, { "acc": 0.75878348, "epoch": 0.10965455897577976, "grad_norm": 6.4375, "learning_rate": 9.983848065960103e-06, "loss": 0.88386631, "memory(GiB)": 117.26, "step": 4700, "train_speed(iter/s)": 0.206269 }, { "acc": 0.78065205, "epoch": 0.10988786654806865, "grad_norm": 8.25, "learning_rate": 9.983695988134662e-06, "loss": 0.80149155, "memory(GiB)": 117.26, "step": 4710, "train_speed(iter/s)": 0.206478 }, { "acc": 0.75370417, "epoch": 0.11012117412035755, "grad_norm": 5.28125, "learning_rate": 9.983543198888069e-06, "loss": 0.9132328, "memory(GiB)": 117.26, "step": 4720, "train_speed(iter/s)": 0.206717 }, { "acc": 0.74119639, "epoch": 0.11035448169264643, "grad_norm": 5.8125, "learning_rate": 9.98338969824213e-06, "loss": 0.95304585, "memory(GiB)": 117.26, "step": 4730, "train_speed(iter/s)": 0.206938 }, { "acc": 0.73715696, "epoch": 0.11058778926493533, "grad_norm": 7.4375, "learning_rate": 9.98323548621876e-06, "loss": 0.97094479, "memory(GiB)": 117.26, "step": 4740, "train_speed(iter/s)": 0.207159 }, { "acc": 0.73141956, "epoch": 0.11082109683722423, "grad_norm": 8.6875, "learning_rate": 9.983080562839971e-06, "loss": 0.9959837, "memory(GiB)": 117.26, "step": 4750, "train_speed(iter/s)": 0.207376 }, { "acc": 0.75815339, "epoch": 0.11105440440951311, "grad_norm": 5.25, "learning_rate": 9.982924928127881e-06, "loss": 0.85841503, "memory(GiB)": 117.26, "step": 4760, "train_speed(iter/s)": 0.207601 }, { "acc": 0.786269, "epoch": 0.11128771198180201, "grad_norm": 5.75, "learning_rate": 9.982768582104705e-06, "loss": 0.78536749, "memory(GiB)": 117.26, "step": 4770, "train_speed(iter/s)": 0.207831 }, { "acc": 0.75228901, "epoch": 0.1115210195540909, "grad_norm": 4.75, "learning_rate": 9.98261152479276e-06, "loss": 0.9109827, "memory(GiB)": 117.26, "step": 4780, "train_speed(iter/s)": 0.208045 }, { "acc": 0.77886076, "epoch": 0.1117543271263798, "grad_norm": 7.34375, "learning_rate": 9.982453756214467e-06, "loss": 0.80876541, "memory(GiB)": 117.26, "step": 4790, "train_speed(iter/s)": 0.208256 }, { "acc": 0.76863036, "epoch": 0.1119876346986687, "grad_norm": 10.0625, "learning_rate": 9.982295276392349e-06, "loss": 0.8519475, "memory(GiB)": 117.26, "step": 4800, "train_speed(iter/s)": 0.208483 }, { "acc": 0.74021883, "epoch": 0.11222094227095758, "grad_norm": 5.875, "learning_rate": 9.982136085349028e-06, "loss": 0.97321672, "memory(GiB)": 117.26, "step": 4810, "train_speed(iter/s)": 0.208685 }, { "acc": 0.75231524, "epoch": 0.11245424984324648, "grad_norm": 6.5625, "learning_rate": 9.981976183107227e-06, "loss": 0.89958773, "memory(GiB)": 117.26, "step": 4820, "train_speed(iter/s)": 0.208911 }, { "acc": 0.75560327, "epoch": 0.11268755741553536, "grad_norm": 6.8125, "learning_rate": 9.981815569689774e-06, "loss": 0.8913147, "memory(GiB)": 117.26, "step": 4830, "train_speed(iter/s)": 0.209136 }, { "acc": 0.75714111, "epoch": 0.11292086498782426, "grad_norm": 6.53125, "learning_rate": 9.981654245119594e-06, "loss": 0.89320059, "memory(GiB)": 117.26, "step": 4840, "train_speed(iter/s)": 0.209362 }, { "acc": 0.73951216, "epoch": 0.11315417256011315, "grad_norm": 5.40625, "learning_rate": 9.98149220941972e-06, "loss": 0.94522648, "memory(GiB)": 117.26, "step": 4850, "train_speed(iter/s)": 0.209581 }, { "acc": 0.76282744, "epoch": 0.11338748013240205, "grad_norm": 5.90625, "learning_rate": 9.981329462613278e-06, "loss": 0.87296104, "memory(GiB)": 117.26, "step": 4860, "train_speed(iter/s)": 0.20981 }, { "acc": 0.74168768, "epoch": 0.11362078770469095, "grad_norm": 10.5625, "learning_rate": 9.981166004723504e-06, "loss": 0.9625988, "memory(GiB)": 117.26, "step": 4870, "train_speed(iter/s)": 0.210029 }, { "acc": 0.75198603, "epoch": 0.11385409527697983, "grad_norm": 4.96875, "learning_rate": 9.981001835773729e-06, "loss": 0.89658346, "memory(GiB)": 117.26, "step": 4880, "train_speed(iter/s)": 0.210259 }, { "acc": 0.76066141, "epoch": 0.11408740284926873, "grad_norm": 8.0, "learning_rate": 9.98083695578739e-06, "loss": 0.88691692, "memory(GiB)": 117.26, "step": 4890, "train_speed(iter/s)": 0.210479 }, { "acc": 0.76857014, "epoch": 0.11432071042155761, "grad_norm": 5.625, "learning_rate": 9.980671364788022e-06, "loss": 0.84836664, "memory(GiB)": 117.26, "step": 4900, "train_speed(iter/s)": 0.210698 }, { "acc": 0.72234068, "epoch": 0.11455401799384651, "grad_norm": 6.4375, "learning_rate": 9.980505062799262e-06, "loss": 0.99850521, "memory(GiB)": 117.26, "step": 4910, "train_speed(iter/s)": 0.210916 }, { "acc": 0.78880205, "epoch": 0.11478732556613541, "grad_norm": 12.9375, "learning_rate": 9.980338049844854e-06, "loss": 0.7666151, "memory(GiB)": 117.26, "step": 4920, "train_speed(iter/s)": 0.211139 }, { "acc": 0.74182463, "epoch": 0.1150206331384243, "grad_norm": 5.375, "learning_rate": 9.980170325948633e-06, "loss": 0.96035089, "memory(GiB)": 117.26, "step": 4930, "train_speed(iter/s)": 0.211346 }, { "acc": 0.78005672, "epoch": 0.1152539407107132, "grad_norm": 6.59375, "learning_rate": 9.980001891134548e-06, "loss": 0.81281548, "memory(GiB)": 117.26, "step": 4940, "train_speed(iter/s)": 0.211553 }, { "acc": 0.76335382, "epoch": 0.11548724828300208, "grad_norm": 9.0625, "learning_rate": 9.979832745426637e-06, "loss": 0.8605381, "memory(GiB)": 117.26, "step": 4950, "train_speed(iter/s)": 0.211755 }, { "acc": 0.76253567, "epoch": 0.11572055585529098, "grad_norm": 7.625, "learning_rate": 9.97966288884905e-06, "loss": 0.89806976, "memory(GiB)": 117.26, "step": 4960, "train_speed(iter/s)": 0.211967 }, { "acc": 0.75646868, "epoch": 0.11595386342757986, "grad_norm": 5.46875, "learning_rate": 9.979492321426032e-06, "loss": 0.88856773, "memory(GiB)": 117.26, "step": 4970, "train_speed(iter/s)": 0.212177 }, { "acc": 0.77580862, "epoch": 0.11618717099986876, "grad_norm": 5.625, "learning_rate": 9.97932104318193e-06, "loss": 0.78144164, "memory(GiB)": 117.26, "step": 4980, "train_speed(iter/s)": 0.212389 }, { "acc": 0.76104336, "epoch": 0.11642047857215766, "grad_norm": 5.75, "learning_rate": 9.979149054141197e-06, "loss": 0.87449379, "memory(GiB)": 117.26, "step": 4990, "train_speed(iter/s)": 0.212604 }, { "acc": 0.74931889, "epoch": 0.11665378614444655, "grad_norm": 6.125, "learning_rate": 9.978976354328383e-06, "loss": 0.96509285, "memory(GiB)": 117.26, "step": 5000, "train_speed(iter/s)": 0.212833 }, { "epoch": 0.11665378614444655, "eval_acc": 0.7237651621980065, "eval_loss": 0.8791925311088562, "eval_runtime": 1263.564, "eval_samples_per_second": 28.484, "eval_steps_per_second": 14.242, "step": 5000 }, { "acc": 0.76495252, "epoch": 0.11688709371673545, "grad_norm": 8.5, "learning_rate": 9.97880294376814e-06, "loss": 0.85412045, "memory(GiB)": 117.26, "step": 5010, "train_speed(iter/s)": 0.201961 }, { "acc": 0.75546417, "epoch": 0.11712040128902433, "grad_norm": 6.28125, "learning_rate": 9.978628822485224e-06, "loss": 0.90189953, "memory(GiB)": 117.26, "step": 5020, "train_speed(iter/s)": 0.202169 }, { "acc": 0.74391336, "epoch": 0.11735370886131323, "grad_norm": 5.75, "learning_rate": 9.978453990504488e-06, "loss": 0.92395954, "memory(GiB)": 117.26, "step": 5030, "train_speed(iter/s)": 0.202388 }, { "acc": 0.73180037, "epoch": 0.11758701643360213, "grad_norm": 7.15625, "learning_rate": 9.978278447850894e-06, "loss": 0.99405308, "memory(GiB)": 117.26, "step": 5040, "train_speed(iter/s)": 0.202601 }, { "acc": 0.74025183, "epoch": 0.11782032400589101, "grad_norm": 6.1875, "learning_rate": 9.978102194549498e-06, "loss": 0.95806866, "memory(GiB)": 117.26, "step": 5050, "train_speed(iter/s)": 0.202792 }, { "acc": 0.76137457, "epoch": 0.11805363157817991, "grad_norm": 5.9375, "learning_rate": 9.977925230625455e-06, "loss": 0.85161057, "memory(GiB)": 117.26, "step": 5060, "train_speed(iter/s)": 0.202992 }, { "acc": 0.77407813, "epoch": 0.1182869391504688, "grad_norm": 6.125, "learning_rate": 9.977747556104036e-06, "loss": 0.84397354, "memory(GiB)": 117.26, "step": 5070, "train_speed(iter/s)": 0.203203 }, { "acc": 0.75392308, "epoch": 0.1185202467227577, "grad_norm": 7.21875, "learning_rate": 9.9775691710106e-06, "loss": 0.8861392, "memory(GiB)": 117.26, "step": 5080, "train_speed(iter/s)": 0.203402 }, { "acc": 0.74401016, "epoch": 0.11875355429504658, "grad_norm": 5.625, "learning_rate": 9.977390075370607e-06, "loss": 0.91689663, "memory(GiB)": 117.26, "step": 5090, "train_speed(iter/s)": 0.203606 }, { "acc": 0.76963558, "epoch": 0.11898686186733548, "grad_norm": 6.3125, "learning_rate": 9.97721026920963e-06, "loss": 0.83954287, "memory(GiB)": 117.26, "step": 5100, "train_speed(iter/s)": 0.203815 }, { "acc": 0.73818998, "epoch": 0.11922016943962438, "grad_norm": 6.375, "learning_rate": 9.977029752553331e-06, "loss": 0.96415129, "memory(GiB)": 117.26, "step": 5110, "train_speed(iter/s)": 0.20403 }, { "acc": 0.74603262, "epoch": 0.11945347701191326, "grad_norm": 5.21875, "learning_rate": 9.97684852542748e-06, "loss": 0.92159462, "memory(GiB)": 117.26, "step": 5120, "train_speed(iter/s)": 0.204238 }, { "acc": 0.75029335, "epoch": 0.11968678458420216, "grad_norm": 6.21875, "learning_rate": 9.976666587857951e-06, "loss": 0.89370575, "memory(GiB)": 117.26, "step": 5130, "train_speed(iter/s)": 0.204452 }, { "acc": 0.75669193, "epoch": 0.11992009215649105, "grad_norm": 6.3125, "learning_rate": 9.97648393987071e-06, "loss": 0.87409286, "memory(GiB)": 117.26, "step": 5140, "train_speed(iter/s)": 0.204642 }, { "acc": 0.76010904, "epoch": 0.12015339972877995, "grad_norm": 8.4375, "learning_rate": 9.976300581491833e-06, "loss": 0.86449699, "memory(GiB)": 117.26, "step": 5150, "train_speed(iter/s)": 0.204839 }, { "acc": 0.74232459, "epoch": 0.12038670730106885, "grad_norm": 5.84375, "learning_rate": 9.976116512747493e-06, "loss": 0.95563736, "memory(GiB)": 117.26, "step": 5160, "train_speed(iter/s)": 0.20503 }, { "acc": 0.76094551, "epoch": 0.12062001487335773, "grad_norm": 6.5, "learning_rate": 9.975931733663966e-06, "loss": 0.85187588, "memory(GiB)": 117.26, "step": 5170, "train_speed(iter/s)": 0.20524 }, { "acc": 0.76606655, "epoch": 0.12085332244564663, "grad_norm": 4.25, "learning_rate": 9.97574624426763e-06, "loss": 0.82849426, "memory(GiB)": 117.26, "step": 5180, "train_speed(iter/s)": 0.205451 }, { "acc": 0.74569139, "epoch": 0.12108663001793551, "grad_norm": 5.71875, "learning_rate": 9.975560044584964e-06, "loss": 0.92549343, "memory(GiB)": 117.26, "step": 5190, "train_speed(iter/s)": 0.205643 }, { "acc": 0.75343733, "epoch": 0.12131993759022441, "grad_norm": 10.6875, "learning_rate": 9.975373134642545e-06, "loss": 0.90239754, "memory(GiB)": 117.26, "step": 5200, "train_speed(iter/s)": 0.205836 }, { "acc": 0.7536705, "epoch": 0.12155324516251331, "grad_norm": 5.6875, "learning_rate": 9.975185514467058e-06, "loss": 0.91035576, "memory(GiB)": 117.26, "step": 5210, "train_speed(iter/s)": 0.206045 }, { "acc": 0.73881683, "epoch": 0.1217865527348022, "grad_norm": 9.625, "learning_rate": 9.974997184085285e-06, "loss": 0.95173883, "memory(GiB)": 117.26, "step": 5220, "train_speed(iter/s)": 0.20626 }, { "acc": 0.77404766, "epoch": 0.1220198603070911, "grad_norm": 5.78125, "learning_rate": 9.974808143524107e-06, "loss": 0.81718616, "memory(GiB)": 117.26, "step": 5230, "train_speed(iter/s)": 0.206465 }, { "acc": 0.74286504, "epoch": 0.12225316787937998, "grad_norm": 5.71875, "learning_rate": 9.974618392810513e-06, "loss": 0.96385307, "memory(GiB)": 117.26, "step": 5240, "train_speed(iter/s)": 0.206663 }, { "acc": 0.77244081, "epoch": 0.12248647545166888, "grad_norm": 5.875, "learning_rate": 9.974427931971588e-06, "loss": 0.83910236, "memory(GiB)": 117.26, "step": 5250, "train_speed(iter/s)": 0.206871 }, { "acc": 0.76794581, "epoch": 0.12271978302395777, "grad_norm": 5.21875, "learning_rate": 9.97423676103452e-06, "loss": 0.8369154, "memory(GiB)": 117.26, "step": 5260, "train_speed(iter/s)": 0.207064 }, { "acc": 0.74856863, "epoch": 0.12295309059624666, "grad_norm": 6.0, "learning_rate": 9.974044880026602e-06, "loss": 0.92070465, "memory(GiB)": 117.26, "step": 5270, "train_speed(iter/s)": 0.207259 }, { "acc": 0.73694801, "epoch": 0.12318639816853556, "grad_norm": 5.6875, "learning_rate": 9.97385228897522e-06, "loss": 0.96520367, "memory(GiB)": 117.26, "step": 5280, "train_speed(iter/s)": 0.207469 }, { "acc": 0.78544483, "epoch": 0.12341970574082445, "grad_norm": 16.25, "learning_rate": 9.97365898790787e-06, "loss": 0.76588273, "memory(GiB)": 117.26, "step": 5290, "train_speed(iter/s)": 0.207666 }, { "acc": 0.76067019, "epoch": 0.12365301331311335, "grad_norm": 5.15625, "learning_rate": 9.973464976852144e-06, "loss": 0.85567551, "memory(GiB)": 117.26, "step": 5300, "train_speed(iter/s)": 0.207867 }, { "acc": 0.75521135, "epoch": 0.12388632088540223, "grad_norm": 7.15625, "learning_rate": 9.973270255835737e-06, "loss": 0.89592381, "memory(GiB)": 117.26, "step": 5310, "train_speed(iter/s)": 0.208066 }, { "acc": 0.7576539, "epoch": 0.12411962845769113, "grad_norm": 11.1875, "learning_rate": 9.973074824886446e-06, "loss": 0.90161066, "memory(GiB)": 117.26, "step": 5320, "train_speed(iter/s)": 0.208274 }, { "acc": 0.73488312, "epoch": 0.12435293602998003, "grad_norm": 5.96875, "learning_rate": 9.972878684032169e-06, "loss": 0.99012794, "memory(GiB)": 117.26, "step": 5330, "train_speed(iter/s)": 0.208478 }, { "acc": 0.75820341, "epoch": 0.12458624360226891, "grad_norm": 7.65625, "learning_rate": 9.972681833300903e-06, "loss": 0.97679787, "memory(GiB)": 117.26, "step": 5340, "train_speed(iter/s)": 0.208687 }, { "acc": 0.750632, "epoch": 0.12481955117455781, "grad_norm": 5.375, "learning_rate": 9.972484272720751e-06, "loss": 0.88920898, "memory(GiB)": 117.26, "step": 5350, "train_speed(iter/s)": 0.208886 }, { "acc": 0.75719719, "epoch": 0.1250528587468467, "grad_norm": 5.46875, "learning_rate": 9.972286002319913e-06, "loss": 0.90022354, "memory(GiB)": 117.26, "step": 5360, "train_speed(iter/s)": 0.209084 }, { "acc": 0.78128004, "epoch": 0.12528616631913558, "grad_norm": 5.0625, "learning_rate": 9.972087022126693e-06, "loss": 0.80217743, "memory(GiB)": 117.26, "step": 5370, "train_speed(iter/s)": 0.209272 }, { "acc": 0.77177401, "epoch": 0.12551947389142448, "grad_norm": 7.625, "learning_rate": 9.971887332169494e-06, "loss": 0.81486521, "memory(GiB)": 117.26, "step": 5380, "train_speed(iter/s)": 0.209467 }, { "acc": 0.75152845, "epoch": 0.12575278146371338, "grad_norm": 8.125, "learning_rate": 9.971686932476825e-06, "loss": 0.89488583, "memory(GiB)": 117.26, "step": 5390, "train_speed(iter/s)": 0.209668 }, { "acc": 0.75611267, "epoch": 0.12598608903600228, "grad_norm": 4.34375, "learning_rate": 9.971485823077288e-06, "loss": 0.87171736, "memory(GiB)": 117.26, "step": 5400, "train_speed(iter/s)": 0.209854 }, { "acc": 0.74898229, "epoch": 0.12621939660829118, "grad_norm": 7.3125, "learning_rate": 9.971284003999595e-06, "loss": 0.92085381, "memory(GiB)": 117.26, "step": 5410, "train_speed(iter/s)": 0.210048 }, { "acc": 0.76406021, "epoch": 0.12645270418058005, "grad_norm": 5.625, "learning_rate": 9.971081475272555e-06, "loss": 0.85575199, "memory(GiB)": 117.26, "step": 5420, "train_speed(iter/s)": 0.210246 }, { "acc": 0.77844558, "epoch": 0.12668601175286895, "grad_norm": 5.34375, "learning_rate": 9.97087823692508e-06, "loss": 0.79132781, "memory(GiB)": 117.26, "step": 5430, "train_speed(iter/s)": 0.210438 }, { "acc": 0.76380596, "epoch": 0.12691931932515785, "grad_norm": 6.28125, "learning_rate": 9.970674288986178e-06, "loss": 0.87061825, "memory(GiB)": 117.26, "step": 5440, "train_speed(iter/s)": 0.210631 }, { "acc": 0.77341371, "epoch": 0.12715262689744675, "grad_norm": 7.0625, "learning_rate": 9.970469631484967e-06, "loss": 0.79120307, "memory(GiB)": 117.26, "step": 5450, "train_speed(iter/s)": 0.210821 }, { "acc": 0.77013798, "epoch": 0.12738593446973565, "grad_norm": 7.96875, "learning_rate": 9.970264264450659e-06, "loss": 0.82661667, "memory(GiB)": 117.26, "step": 5460, "train_speed(iter/s)": 0.21101 }, { "acc": 0.77013121, "epoch": 0.12761924204202452, "grad_norm": 5.0, "learning_rate": 9.970058187912572e-06, "loss": 0.86741314, "memory(GiB)": 117.26, "step": 5470, "train_speed(iter/s)": 0.211202 }, { "acc": 0.74863768, "epoch": 0.12785254961431342, "grad_norm": 6.8125, "learning_rate": 9.969851401900122e-06, "loss": 0.91290703, "memory(GiB)": 117.26, "step": 5480, "train_speed(iter/s)": 0.211382 }, { "acc": 0.74249878, "epoch": 0.12808585718660231, "grad_norm": 5.34375, "learning_rate": 9.969643906442828e-06, "loss": 0.92488508, "memory(GiB)": 117.26, "step": 5490, "train_speed(iter/s)": 0.211577 }, { "acc": 0.76028767, "epoch": 0.1283191647588912, "grad_norm": 7.21875, "learning_rate": 9.96943570157031e-06, "loss": 0.87617188, "memory(GiB)": 117.26, "step": 5500, "train_speed(iter/s)": 0.211769 }, { "epoch": 0.1283191647588912, "eval_acc": 0.7247441908492767, "eval_loss": 0.8759089708328247, "eval_runtime": 1264.5459, "eval_samples_per_second": 28.462, "eval_steps_per_second": 14.231, "step": 5500 }, { "acc": 0.74221888, "epoch": 0.1285524723311801, "grad_norm": 4.71875, "learning_rate": 9.969226787312288e-06, "loss": 0.94780912, "memory(GiB)": 117.26, "step": 5510, "train_speed(iter/s)": 0.201933 }, { "acc": 0.74198723, "epoch": 0.12878577990346898, "grad_norm": 7.4375, "learning_rate": 9.969017163698587e-06, "loss": 0.95109205, "memory(GiB)": 117.26, "step": 5520, "train_speed(iter/s)": 0.202135 }, { "acc": 0.73248787, "epoch": 0.12901908747575788, "grad_norm": 6.6875, "learning_rate": 9.96880683075913e-06, "loss": 1.0025753, "memory(GiB)": 117.26, "step": 5530, "train_speed(iter/s)": 0.202324 }, { "acc": 0.76849861, "epoch": 0.12925239504804678, "grad_norm": 10.9375, "learning_rate": 9.96859578852394e-06, "loss": 0.833638, "memory(GiB)": 117.26, "step": 5540, "train_speed(iter/s)": 0.202506 }, { "acc": 0.75560169, "epoch": 0.12948570262033568, "grad_norm": 6.46875, "learning_rate": 9.968384037023147e-06, "loss": 0.88218699, "memory(GiB)": 117.26, "step": 5550, "train_speed(iter/s)": 0.202692 }, { "acc": 0.77715511, "epoch": 0.12971901019262455, "grad_norm": 4.15625, "learning_rate": 9.968171576286973e-06, "loss": 0.7928422, "memory(GiB)": 117.26, "step": 5560, "train_speed(iter/s)": 0.202874 }, { "acc": 0.780762, "epoch": 0.12995231776491345, "grad_norm": 6.78125, "learning_rate": 9.96795840634575e-06, "loss": 0.78349032, "memory(GiB)": 117.26, "step": 5570, "train_speed(iter/s)": 0.203067 }, { "acc": 0.73517962, "epoch": 0.13018562533720235, "grad_norm": 6.625, "learning_rate": 9.96774452722991e-06, "loss": 0.9921154, "memory(GiB)": 117.26, "step": 5580, "train_speed(iter/s)": 0.203266 }, { "acc": 0.765696, "epoch": 0.13041893290949125, "grad_norm": 9.625, "learning_rate": 9.967529938969981e-06, "loss": 0.87698326, "memory(GiB)": 117.26, "step": 5590, "train_speed(iter/s)": 0.203453 }, { "acc": 0.76675014, "epoch": 0.13065224048178015, "grad_norm": 7.0, "learning_rate": 9.967314641596595e-06, "loss": 0.83034935, "memory(GiB)": 117.26, "step": 5600, "train_speed(iter/s)": 0.203643 }, { "acc": 0.75825205, "epoch": 0.13088554805406902, "grad_norm": 6.34375, "learning_rate": 9.967098635140489e-06, "loss": 0.87165432, "memory(GiB)": 117.26, "step": 5610, "train_speed(iter/s)": 0.203839 }, { "acc": 0.7604888, "epoch": 0.13111885562635792, "grad_norm": 6.0, "learning_rate": 9.966881919632494e-06, "loss": 0.87833328, "memory(GiB)": 117.26, "step": 5620, "train_speed(iter/s)": 0.204033 }, { "acc": 0.748596, "epoch": 0.13135216319864682, "grad_norm": 5.1875, "learning_rate": 9.966664495103548e-06, "loss": 0.91375675, "memory(GiB)": 117.26, "step": 5630, "train_speed(iter/s)": 0.204219 }, { "acc": 0.76087523, "epoch": 0.13158547077093571, "grad_norm": 5.5625, "learning_rate": 9.96644636158469e-06, "loss": 0.87532892, "memory(GiB)": 117.26, "step": 5640, "train_speed(iter/s)": 0.204393 }, { "acc": 0.75693159, "epoch": 0.1318187783432246, "grad_norm": 15.875, "learning_rate": 9.966227519107054e-06, "loss": 0.89432802, "memory(GiB)": 117.26, "step": 5650, "train_speed(iter/s)": 0.204585 }, { "acc": 0.75121632, "epoch": 0.13205208591551348, "grad_norm": 5.375, "learning_rate": 9.966007967701884e-06, "loss": 0.92458391, "memory(GiB)": 117.26, "step": 5660, "train_speed(iter/s)": 0.204761 }, { "acc": 0.75425925, "epoch": 0.13228539348780238, "grad_norm": 8.125, "learning_rate": 9.965787707400521e-06, "loss": 0.89481945, "memory(GiB)": 117.26, "step": 5670, "train_speed(iter/s)": 0.204945 }, { "acc": 0.77305632, "epoch": 0.13251870106009128, "grad_norm": 5.03125, "learning_rate": 9.965566738234403e-06, "loss": 0.81543274, "memory(GiB)": 117.26, "step": 5680, "train_speed(iter/s)": 0.205131 }, { "acc": 0.76543159, "epoch": 0.13275200863238018, "grad_norm": 6.125, "learning_rate": 9.965345060235075e-06, "loss": 0.84962788, "memory(GiB)": 117.26, "step": 5690, "train_speed(iter/s)": 0.205309 }, { "acc": 0.74899764, "epoch": 0.13298531620466908, "grad_norm": 6.5, "learning_rate": 9.965122673434182e-06, "loss": 0.92564602, "memory(GiB)": 117.26, "step": 5700, "train_speed(iter/s)": 0.205496 }, { "acc": 0.77601094, "epoch": 0.13321862377695795, "grad_norm": 7.84375, "learning_rate": 9.964899577863472e-06, "loss": 0.80912209, "memory(GiB)": 117.26, "step": 5710, "train_speed(iter/s)": 0.205675 }, { "acc": 0.73249235, "epoch": 0.13345193134924685, "grad_norm": 5.90625, "learning_rate": 9.964675773554789e-06, "loss": 0.98017988, "memory(GiB)": 117.26, "step": 5720, "train_speed(iter/s)": 0.205853 }, { "acc": 0.74499979, "epoch": 0.13368523892153575, "grad_norm": 5.6875, "learning_rate": 9.96445126054008e-06, "loss": 0.93364029, "memory(GiB)": 117.26, "step": 5730, "train_speed(iter/s)": 0.206038 }, { "acc": 0.75207715, "epoch": 0.13391854649382465, "grad_norm": 5.96875, "learning_rate": 9.964226038851397e-06, "loss": 0.87914734, "memory(GiB)": 117.26, "step": 5740, "train_speed(iter/s)": 0.206223 }, { "acc": 0.75214834, "epoch": 0.13415185406611355, "grad_norm": 6.5625, "learning_rate": 9.964000108520889e-06, "loss": 0.89662056, "memory(GiB)": 117.26, "step": 5750, "train_speed(iter/s)": 0.206401 }, { "acc": 0.77131853, "epoch": 0.13438516163840242, "grad_norm": 5.34375, "learning_rate": 9.963773469580806e-06, "loss": 0.84501877, "memory(GiB)": 117.26, "step": 5760, "train_speed(iter/s)": 0.206583 }, { "acc": 0.75995574, "epoch": 0.13461846921069132, "grad_norm": 6.15625, "learning_rate": 9.963546122063504e-06, "loss": 0.87854509, "memory(GiB)": 117.26, "step": 5770, "train_speed(iter/s)": 0.206771 }, { "acc": 0.77891011, "epoch": 0.13485177678298021, "grad_norm": 8.8125, "learning_rate": 9.963318066001433e-06, "loss": 0.8121933, "memory(GiB)": 117.26, "step": 5780, "train_speed(iter/s)": 0.206944 }, { "acc": 0.74783754, "epoch": 0.1350850843552691, "grad_norm": 9.0625, "learning_rate": 9.963089301427152e-06, "loss": 0.91349182, "memory(GiB)": 117.26, "step": 5790, "train_speed(iter/s)": 0.207136 }, { "acc": 0.75257044, "epoch": 0.13531839192755798, "grad_norm": 5.03125, "learning_rate": 9.962859828373315e-06, "loss": 0.90391273, "memory(GiB)": 117.26, "step": 5800, "train_speed(iter/s)": 0.207317 }, { "acc": 0.77451558, "epoch": 0.13555169949984688, "grad_norm": 8.25, "learning_rate": 9.96262964687268e-06, "loss": 0.81084442, "memory(GiB)": 117.26, "step": 5810, "train_speed(iter/s)": 0.207503 }, { "acc": 0.76609716, "epoch": 0.13578500707213578, "grad_norm": 8.6875, "learning_rate": 9.9623987569581e-06, "loss": 0.86032953, "memory(GiB)": 117.26, "step": 5820, "train_speed(iter/s)": 0.207686 }, { "acc": 0.76087742, "epoch": 0.13601831464442468, "grad_norm": 4.53125, "learning_rate": 9.962167158662543e-06, "loss": 0.8788414, "memory(GiB)": 117.26, "step": 5830, "train_speed(iter/s)": 0.20787 }, { "acc": 0.76801157, "epoch": 0.13625162221671358, "grad_norm": 8.125, "learning_rate": 9.961934852019066e-06, "loss": 0.86093464, "memory(GiB)": 117.26, "step": 5840, "train_speed(iter/s)": 0.208049 }, { "acc": 0.73029947, "epoch": 0.13648492978900245, "grad_norm": 4.71875, "learning_rate": 9.96170183706083e-06, "loss": 0.99479952, "memory(GiB)": 117.26, "step": 5850, "train_speed(iter/s)": 0.208243 }, { "acc": 0.74154463, "epoch": 0.13671823736129135, "grad_norm": 8.0625, "learning_rate": 9.961468113821096e-06, "loss": 0.95223618, "memory(GiB)": 117.26, "step": 5860, "train_speed(iter/s)": 0.208421 }, { "acc": 0.78498073, "epoch": 0.13695154493358025, "grad_norm": 4.84375, "learning_rate": 9.96123368233323e-06, "loss": 0.77928801, "memory(GiB)": 117.26, "step": 5870, "train_speed(iter/s)": 0.20861 }, { "acc": 0.74769692, "epoch": 0.13718485250586915, "grad_norm": 6.03125, "learning_rate": 9.9609985426307e-06, "loss": 0.92958603, "memory(GiB)": 117.26, "step": 5880, "train_speed(iter/s)": 0.208785 }, { "acc": 0.78302388, "epoch": 0.13741816007815805, "grad_norm": 5.46875, "learning_rate": 9.960762694747068e-06, "loss": 0.7901701, "memory(GiB)": 117.26, "step": 5890, "train_speed(iter/s)": 0.208966 }, { "acc": 0.76675072, "epoch": 0.13765146765044692, "grad_norm": 6.5625, "learning_rate": 9.960526138716e-06, "loss": 0.84309778, "memory(GiB)": 117.26, "step": 5900, "train_speed(iter/s)": 0.209155 }, { "acc": 0.75567636, "epoch": 0.13788477522273582, "grad_norm": 14.1875, "learning_rate": 9.960288874571271e-06, "loss": 0.87544594, "memory(GiB)": 117.26, "step": 5910, "train_speed(iter/s)": 0.209339 }, { "acc": 0.7616755, "epoch": 0.13811808279502472, "grad_norm": 4.78125, "learning_rate": 9.960050902346743e-06, "loss": 0.84467897, "memory(GiB)": 117.26, "step": 5920, "train_speed(iter/s)": 0.20951 }, { "acc": 0.76681614, "epoch": 0.13835139036731361, "grad_norm": 5.5625, "learning_rate": 9.959812222076391e-06, "loss": 0.85082617, "memory(GiB)": 117.26, "step": 5930, "train_speed(iter/s)": 0.209694 }, { "acc": 0.74857068, "epoch": 0.1385846979396025, "grad_norm": 5.75, "learning_rate": 9.959572833794283e-06, "loss": 0.89518185, "memory(GiB)": 117.26, "step": 5940, "train_speed(iter/s)": 0.209874 }, { "acc": 0.76660008, "epoch": 0.13881800551189138, "grad_norm": 5.0625, "learning_rate": 9.959332737534597e-06, "loss": 0.85159597, "memory(GiB)": 117.26, "step": 5950, "train_speed(iter/s)": 0.210056 }, { "acc": 0.7717967, "epoch": 0.13905131308418028, "grad_norm": 5.75, "learning_rate": 9.959091933331601e-06, "loss": 0.82726517, "memory(GiB)": 117.26, "step": 5960, "train_speed(iter/s)": 0.210231 }, { "acc": 0.76094427, "epoch": 0.13928462065646918, "grad_norm": 5.25, "learning_rate": 9.958850421219675e-06, "loss": 0.8662014, "memory(GiB)": 117.26, "step": 5970, "train_speed(iter/s)": 0.210403 }, { "acc": 0.7369442, "epoch": 0.13951792822875808, "grad_norm": 7.65625, "learning_rate": 9.958608201233288e-06, "loss": 0.96486263, "memory(GiB)": 117.26, "step": 5980, "train_speed(iter/s)": 0.210578 }, { "acc": 0.75027895, "epoch": 0.13975123580104698, "grad_norm": 7.0, "learning_rate": 9.958365273407023e-06, "loss": 0.90262318, "memory(GiB)": 117.26, "step": 5990, "train_speed(iter/s)": 0.210732 }, { "acc": 0.74614449, "epoch": 0.13998454337333585, "grad_norm": 15.0625, "learning_rate": 9.958121637775554e-06, "loss": 0.93252382, "memory(GiB)": 117.26, "step": 6000, "train_speed(iter/s)": 0.210908 }, { "epoch": 0.13998454337333585, "eval_acc": 0.7254156999538559, "eval_loss": 0.8730215430259705, "eval_runtime": 1263.6519, "eval_samples_per_second": 28.482, "eval_steps_per_second": 14.241, "step": 6000 }, { "acc": 0.75072947, "epoch": 0.14021785094562475, "grad_norm": 6.875, "learning_rate": 9.957877294373665e-06, "loss": 0.92445507, "memory(GiB)": 117.26, "step": 6010, "train_speed(iter/s)": 0.201939 }, { "acc": 0.75903053, "epoch": 0.14045115851791365, "grad_norm": 7.8125, "learning_rate": 9.957632243236231e-06, "loss": 0.88661766, "memory(GiB)": 117.26, "step": 6020, "train_speed(iter/s)": 0.202109 }, { "acc": 0.7490994, "epoch": 0.14068446609020255, "grad_norm": 6.71875, "learning_rate": 9.957386484398233e-06, "loss": 0.90530672, "memory(GiB)": 117.26, "step": 6030, "train_speed(iter/s)": 0.202286 }, { "acc": 0.75356007, "epoch": 0.14091777366249145, "grad_norm": 4.9375, "learning_rate": 9.957140017894754e-06, "loss": 0.89122963, "memory(GiB)": 117.26, "step": 6040, "train_speed(iter/s)": 0.202461 }, { "acc": 0.77132211, "epoch": 0.14115108123478032, "grad_norm": 5.1875, "learning_rate": 9.956892843760979e-06, "loss": 0.81775093, "memory(GiB)": 117.26, "step": 6050, "train_speed(iter/s)": 0.20263 }, { "acc": 0.74122229, "epoch": 0.14138438880706922, "grad_norm": 8.4375, "learning_rate": 9.956644962032192e-06, "loss": 0.96717453, "memory(GiB)": 117.26, "step": 6060, "train_speed(iter/s)": 0.202811 }, { "acc": 0.74958901, "epoch": 0.14161769637935812, "grad_norm": 8.8125, "learning_rate": 9.956396372743775e-06, "loss": 0.88429737, "memory(GiB)": 117.26, "step": 6070, "train_speed(iter/s)": 0.20298 }, { "acc": 0.80719585, "epoch": 0.14185100395164701, "grad_norm": 6.9375, "learning_rate": 9.956147075931215e-06, "loss": 0.67036023, "memory(GiB)": 117.26, "step": 6080, "train_speed(iter/s)": 0.203147 }, { "acc": 0.76072383, "epoch": 0.14208431152393589, "grad_norm": 5.8125, "learning_rate": 9.955897071630101e-06, "loss": 0.85584431, "memory(GiB)": 117.26, "step": 6090, "train_speed(iter/s)": 0.203311 }, { "acc": 0.75450392, "epoch": 0.14231761909622478, "grad_norm": 5.3125, "learning_rate": 9.955646359876118e-06, "loss": 0.89288139, "memory(GiB)": 117.26, "step": 6100, "train_speed(iter/s)": 0.203482 }, { "acc": 0.75096741, "epoch": 0.14255092666851368, "grad_norm": 6.65625, "learning_rate": 9.955394940705057e-06, "loss": 0.90254669, "memory(GiB)": 117.26, "step": 6110, "train_speed(iter/s)": 0.203648 }, { "acc": 0.76784148, "epoch": 0.14278423424080258, "grad_norm": 12.9375, "learning_rate": 9.95514281415281e-06, "loss": 0.83598175, "memory(GiB)": 117.26, "step": 6120, "train_speed(iter/s)": 0.203816 }, { "acc": 0.7782548, "epoch": 0.14301754181309148, "grad_norm": 6.25, "learning_rate": 9.954889980255363e-06, "loss": 0.80113516, "memory(GiB)": 117.26, "step": 6130, "train_speed(iter/s)": 0.203995 }, { "acc": 0.75364347, "epoch": 0.14325084938538035, "grad_norm": 4.875, "learning_rate": 9.954636439048813e-06, "loss": 0.91740513, "memory(GiB)": 117.26, "step": 6140, "train_speed(iter/s)": 0.204154 }, { "acc": 0.76953945, "epoch": 0.14348415695766925, "grad_norm": 10.375, "learning_rate": 9.95438219056935e-06, "loss": 0.8324584, "memory(GiB)": 117.26, "step": 6150, "train_speed(iter/s)": 0.204316 }, { "acc": 0.75888433, "epoch": 0.14371746452995815, "grad_norm": 4.28125, "learning_rate": 9.954127234853267e-06, "loss": 0.8856554, "memory(GiB)": 117.26, "step": 6160, "train_speed(iter/s)": 0.204485 }, { "acc": 0.76844521, "epoch": 0.14395077210224705, "grad_norm": 4.875, "learning_rate": 9.953871571936962e-06, "loss": 0.84255247, "memory(GiB)": 117.26, "step": 6170, "train_speed(iter/s)": 0.204659 }, { "acc": 0.78293333, "epoch": 0.14418407967453595, "grad_norm": 5.9375, "learning_rate": 9.953615201856928e-06, "loss": 0.78041797, "memory(GiB)": 117.26, "step": 6180, "train_speed(iter/s)": 0.204832 }, { "acc": 0.7618185, "epoch": 0.14441738724682482, "grad_norm": 5.78125, "learning_rate": 9.953358124649764e-06, "loss": 0.84786835, "memory(GiB)": 117.26, "step": 6190, "train_speed(iter/s)": 0.205001 }, { "acc": 0.73889437, "epoch": 0.14465069481911372, "grad_norm": 5.4375, "learning_rate": 9.953100340352166e-06, "loss": 0.97422285, "memory(GiB)": 117.26, "step": 6200, "train_speed(iter/s)": 0.205168 }, { "acc": 0.75654068, "epoch": 0.14488400239140262, "grad_norm": 5.84375, "learning_rate": 9.952841849000935e-06, "loss": 0.90893421, "memory(GiB)": 117.26, "step": 6210, "train_speed(iter/s)": 0.205335 }, { "acc": 0.75758886, "epoch": 0.14511730996369152, "grad_norm": 6.71875, "learning_rate": 9.952582650632967e-06, "loss": 0.9100935, "memory(GiB)": 117.26, "step": 6220, "train_speed(iter/s)": 0.205497 }, { "acc": 0.75849752, "epoch": 0.14535061753598041, "grad_norm": 8.4375, "learning_rate": 9.952322745285266e-06, "loss": 0.86074123, "memory(GiB)": 117.26, "step": 6230, "train_speed(iter/s)": 0.205663 }, { "acc": 0.77823963, "epoch": 0.14558392510826929, "grad_norm": 4.28125, "learning_rate": 9.95206213299493e-06, "loss": 0.81654539, "memory(GiB)": 117.26, "step": 6240, "train_speed(iter/s)": 0.205834 }, { "acc": 0.75980802, "epoch": 0.14581723268055818, "grad_norm": 4.875, "learning_rate": 9.951800813799164e-06, "loss": 0.88594036, "memory(GiB)": 117.26, "step": 6250, "train_speed(iter/s)": 0.206008 }, { "acc": 0.75569377, "epoch": 0.14605054025284708, "grad_norm": 6.21875, "learning_rate": 9.95153878773527e-06, "loss": 0.87857952, "memory(GiB)": 117.26, "step": 6260, "train_speed(iter/s)": 0.206184 }, { "acc": 0.75523491, "epoch": 0.14628384782513598, "grad_norm": 5.125, "learning_rate": 9.951276054840654e-06, "loss": 0.87323513, "memory(GiB)": 117.26, "step": 6270, "train_speed(iter/s)": 0.206362 }, { "acc": 0.74887829, "epoch": 0.14651715539742488, "grad_norm": 6.4375, "learning_rate": 9.951012615152816e-06, "loss": 0.92730255, "memory(GiB)": 117.26, "step": 6280, "train_speed(iter/s)": 0.20654 }, { "acc": 0.77444134, "epoch": 0.14675046296971375, "grad_norm": 7.9375, "learning_rate": 9.950748468709368e-06, "loss": 0.80300159, "memory(GiB)": 117.26, "step": 6290, "train_speed(iter/s)": 0.206707 }, { "acc": 0.74920073, "epoch": 0.14698377054200265, "grad_norm": 5.9375, "learning_rate": 9.950483615548014e-06, "loss": 0.92633486, "memory(GiB)": 117.26, "step": 6300, "train_speed(iter/s)": 0.206878 }, { "acc": 0.75720644, "epoch": 0.14721707811429155, "grad_norm": 12.125, "learning_rate": 9.950218055706563e-06, "loss": 0.89268179, "memory(GiB)": 117.26, "step": 6310, "train_speed(iter/s)": 0.20705 }, { "acc": 0.75816936, "epoch": 0.14745038568658045, "grad_norm": 5.125, "learning_rate": 9.94995178922292e-06, "loss": 0.90541296, "memory(GiB)": 117.26, "step": 6320, "train_speed(iter/s)": 0.207218 }, { "acc": 0.75076075, "epoch": 0.14768369325886932, "grad_norm": 7.34375, "learning_rate": 9.949684816135098e-06, "loss": 0.9223877, "memory(GiB)": 117.26, "step": 6330, "train_speed(iter/s)": 0.207387 }, { "acc": 0.74651184, "epoch": 0.14791700083115822, "grad_norm": 6.21875, "learning_rate": 9.949417136481207e-06, "loss": 0.91885986, "memory(GiB)": 117.26, "step": 6340, "train_speed(iter/s)": 0.207558 }, { "acc": 0.75872321, "epoch": 0.14815030840344712, "grad_norm": 4.375, "learning_rate": 9.94914875029946e-06, "loss": 0.8842907, "memory(GiB)": 117.26, "step": 6350, "train_speed(iter/s)": 0.207724 }, { "acc": 0.76133738, "epoch": 0.14838361597573602, "grad_norm": 6.46875, "learning_rate": 9.948879657628164e-06, "loss": 0.87083912, "memory(GiB)": 117.26, "step": 6360, "train_speed(iter/s)": 0.207886 }, { "acc": 0.73931665, "epoch": 0.14861692354802492, "grad_norm": 5.78125, "learning_rate": 9.948609858505734e-06, "loss": 0.96105928, "memory(GiB)": 117.26, "step": 6370, "train_speed(iter/s)": 0.208052 }, { "acc": 0.74765229, "epoch": 0.1488502311203138, "grad_norm": 4.875, "learning_rate": 9.948339352970683e-06, "loss": 0.93468962, "memory(GiB)": 117.26, "step": 6380, "train_speed(iter/s)": 0.208227 }, { "acc": 0.75981722, "epoch": 0.14908353869260269, "grad_norm": 8.5625, "learning_rate": 9.948068141061631e-06, "loss": 0.85532331, "memory(GiB)": 117.26, "step": 6390, "train_speed(iter/s)": 0.208382 }, { "acc": 0.74079142, "epoch": 0.14931684626489158, "grad_norm": 7.5, "learning_rate": 9.947796222817286e-06, "loss": 0.95135565, "memory(GiB)": 117.26, "step": 6400, "train_speed(iter/s)": 0.208556 }, { "acc": 0.75449347, "epoch": 0.14955015383718048, "grad_norm": 6.5, "learning_rate": 9.94752359827647e-06, "loss": 0.891049, "memory(GiB)": 117.26, "step": 6410, "train_speed(iter/s)": 0.208721 }, { "acc": 0.77301941, "epoch": 0.14978346140946938, "grad_norm": 16.875, "learning_rate": 9.947250267478094e-06, "loss": 0.81918297, "memory(GiB)": 117.26, "step": 6420, "train_speed(iter/s)": 0.208892 }, { "acc": 0.75058451, "epoch": 0.15001676898175825, "grad_norm": 4.8125, "learning_rate": 9.946976230461183e-06, "loss": 0.91223679, "memory(GiB)": 117.26, "step": 6430, "train_speed(iter/s)": 0.209056 }, { "acc": 0.77739282, "epoch": 0.15025007655404715, "grad_norm": 5.9375, "learning_rate": 9.946701487264851e-06, "loss": 0.80219898, "memory(GiB)": 117.26, "step": 6440, "train_speed(iter/s)": 0.209225 }, { "acc": 0.75986276, "epoch": 0.15048338412633605, "grad_norm": 5.25, "learning_rate": 9.946426037928319e-06, "loss": 0.86433973, "memory(GiB)": 117.26, "step": 6450, "train_speed(iter/s)": 0.209379 }, { "acc": 0.7497715, "epoch": 0.15071669169862495, "grad_norm": 9.4375, "learning_rate": 9.946149882490907e-06, "loss": 0.90357056, "memory(GiB)": 117.26, "step": 6460, "train_speed(iter/s)": 0.209542 }, { "acc": 0.76728077, "epoch": 0.15094999927091385, "grad_norm": 5.03125, "learning_rate": 9.945873020992036e-06, "loss": 0.86253672, "memory(GiB)": 117.26, "step": 6470, "train_speed(iter/s)": 0.209695 }, { "acc": 0.76882696, "epoch": 0.15118330684320272, "grad_norm": 6.21875, "learning_rate": 9.945595453471228e-06, "loss": 0.84867611, "memory(GiB)": 117.26, "step": 6480, "train_speed(iter/s)": 0.209853 }, { "acc": 0.76078196, "epoch": 0.15141661441549162, "grad_norm": 5.0625, "learning_rate": 9.945317179968105e-06, "loss": 0.86187792, "memory(GiB)": 117.26, "step": 6490, "train_speed(iter/s)": 0.210008 }, { "acc": 0.73704739, "epoch": 0.15164992198778052, "grad_norm": 5.3125, "learning_rate": 9.945038200522392e-06, "loss": 0.96669559, "memory(GiB)": 117.26, "step": 6500, "train_speed(iter/s)": 0.210169 }, { "epoch": 0.15164992198778052, "eval_acc": 0.7261101197486923, "eval_loss": 0.8700303435325623, "eval_runtime": 1262.6714, "eval_samples_per_second": 28.504, "eval_steps_per_second": 14.252, "step": 6500 }, { "acc": 0.7568408, "epoch": 0.15188322956006942, "grad_norm": 5.0, "learning_rate": 9.944758515173912e-06, "loss": 0.88513393, "memory(GiB)": 117.26, "step": 6510, "train_speed(iter/s)": 0.20192 }, { "acc": 0.76080866, "epoch": 0.15211653713235831, "grad_norm": 5.59375, "learning_rate": 9.944478123962592e-06, "loss": 0.85919609, "memory(GiB)": 117.26, "step": 6520, "train_speed(iter/s)": 0.202093 }, { "acc": 0.75912066, "epoch": 0.15234984470464719, "grad_norm": 4.875, "learning_rate": 9.944197026928454e-06, "loss": 0.87821732, "memory(GiB)": 117.26, "step": 6530, "train_speed(iter/s)": 0.202261 }, { "acc": 0.7715127, "epoch": 0.15258315227693608, "grad_norm": 4.96875, "learning_rate": 9.943915224111627e-06, "loss": 0.85775757, "memory(GiB)": 117.26, "step": 6540, "train_speed(iter/s)": 0.20243 }, { "acc": 0.75159321, "epoch": 0.15281645984922498, "grad_norm": 4.6875, "learning_rate": 9.943632715552338e-06, "loss": 0.91012135, "memory(GiB)": 117.26, "step": 6550, "train_speed(iter/s)": 0.202591 }, { "acc": 0.76549492, "epoch": 0.15304976742151388, "grad_norm": 13.1875, "learning_rate": 9.943349501290916e-06, "loss": 0.83716812, "memory(GiB)": 117.26, "step": 6560, "train_speed(iter/s)": 0.202755 }, { "acc": 0.79378567, "epoch": 0.15328307499380278, "grad_norm": 5.625, "learning_rate": 9.943065581367788e-06, "loss": 0.75107126, "memory(GiB)": 117.26, "step": 6570, "train_speed(iter/s)": 0.202901 }, { "acc": 0.76819811, "epoch": 0.15351638256609165, "grad_norm": 7.125, "learning_rate": 9.942780955823485e-06, "loss": 0.84128418, "memory(GiB)": 117.26, "step": 6580, "train_speed(iter/s)": 0.203063 }, { "acc": 0.75935812, "epoch": 0.15374969013838055, "grad_norm": 6.34375, "learning_rate": 9.942495624698636e-06, "loss": 0.89375591, "memory(GiB)": 117.26, "step": 6590, "train_speed(iter/s)": 0.203217 }, { "acc": 0.73894939, "epoch": 0.15398299771066945, "grad_norm": 7.21875, "learning_rate": 9.942209588033973e-06, "loss": 0.95285892, "memory(GiB)": 117.26, "step": 6600, "train_speed(iter/s)": 0.203374 }, { "acc": 0.76413679, "epoch": 0.15421630528295835, "grad_norm": 4.9375, "learning_rate": 9.941922845870326e-06, "loss": 0.84897823, "memory(GiB)": 117.26, "step": 6610, "train_speed(iter/s)": 0.203531 }, { "acc": 0.75625834, "epoch": 0.15444961285524722, "grad_norm": 9.25, "learning_rate": 9.941635398248628e-06, "loss": 0.87758846, "memory(GiB)": 117.26, "step": 6620, "train_speed(iter/s)": 0.203692 }, { "acc": 0.75790167, "epoch": 0.15468292042753612, "grad_norm": 7.40625, "learning_rate": 9.941347245209914e-06, "loss": 0.88962317, "memory(GiB)": 117.26, "step": 6630, "train_speed(iter/s)": 0.203845 }, { "acc": 0.75927038, "epoch": 0.15491622799982502, "grad_norm": 5.4375, "learning_rate": 9.941058386795314e-06, "loss": 0.869909, "memory(GiB)": 117.26, "step": 6640, "train_speed(iter/s)": 0.203993 }, { "acc": 0.76332517, "epoch": 0.15514953557211392, "grad_norm": 7.875, "learning_rate": 9.940768823046067e-06, "loss": 0.86448622, "memory(GiB)": 117.26, "step": 6650, "train_speed(iter/s)": 0.204147 }, { "acc": 0.77489128, "epoch": 0.15538284314440282, "grad_norm": 5.65625, "learning_rate": 9.940478554003506e-06, "loss": 0.79631014, "memory(GiB)": 117.26, "step": 6660, "train_speed(iter/s)": 0.204298 }, { "acc": 0.75767603, "epoch": 0.1556161507166917, "grad_norm": 6.3125, "learning_rate": 9.940187579709064e-06, "loss": 0.88432436, "memory(GiB)": 117.26, "step": 6670, "train_speed(iter/s)": 0.204463 }, { "acc": 0.7435533, "epoch": 0.15584945828898059, "grad_norm": 5.0625, "learning_rate": 9.939895900204281e-06, "loss": 0.94854183, "memory(GiB)": 117.26, "step": 6680, "train_speed(iter/s)": 0.204623 }, { "acc": 0.76561785, "epoch": 0.15608276586126948, "grad_norm": 4.78125, "learning_rate": 9.939603515530796e-06, "loss": 0.86235189, "memory(GiB)": 117.26, "step": 6690, "train_speed(iter/s)": 0.204783 }, { "acc": 0.74072647, "epoch": 0.15631607343355838, "grad_norm": 7.21875, "learning_rate": 9.939310425730342e-06, "loss": 0.94523354, "memory(GiB)": 117.26, "step": 6700, "train_speed(iter/s)": 0.20494 }, { "acc": 0.77434101, "epoch": 0.15654938100584728, "grad_norm": 5.15625, "learning_rate": 9.939016630844758e-06, "loss": 0.80618992, "memory(GiB)": 117.26, "step": 6710, "train_speed(iter/s)": 0.205106 }, { "acc": 0.76513071, "epoch": 0.15678268857813615, "grad_norm": 7.09375, "learning_rate": 9.938722130915988e-06, "loss": 0.85209513, "memory(GiB)": 117.26, "step": 6720, "train_speed(iter/s)": 0.205263 }, { "acc": 0.76825109, "epoch": 0.15701599615042505, "grad_norm": 4.0625, "learning_rate": 9.938426925986066e-06, "loss": 0.84857235, "memory(GiB)": 117.26, "step": 6730, "train_speed(iter/s)": 0.205412 }, { "acc": 0.7607286, "epoch": 0.15724930372271395, "grad_norm": 5.5, "learning_rate": 9.938131016097137e-06, "loss": 0.8792285, "memory(GiB)": 117.26, "step": 6740, "train_speed(iter/s)": 0.205562 }, { "acc": 0.75114679, "epoch": 0.15748261129500285, "grad_norm": 5.6875, "learning_rate": 9.937834401291437e-06, "loss": 0.92774343, "memory(GiB)": 117.26, "step": 6750, "train_speed(iter/s)": 0.205721 }, { "acc": 0.74006433, "epoch": 0.15771591886729175, "grad_norm": 10.375, "learning_rate": 9.937537081611313e-06, "loss": 0.94107227, "memory(GiB)": 117.26, "step": 6760, "train_speed(iter/s)": 0.205883 }, { "acc": 0.76003213, "epoch": 0.15794922643958062, "grad_norm": 6.59375, "learning_rate": 9.937239057099205e-06, "loss": 0.87540588, "memory(GiB)": 117.26, "step": 6770, "train_speed(iter/s)": 0.206042 }, { "acc": 0.77193604, "epoch": 0.15818253401186952, "grad_norm": 5.75, "learning_rate": 9.936940327797655e-06, "loss": 0.82196531, "memory(GiB)": 117.26, "step": 6780, "train_speed(iter/s)": 0.206197 }, { "acc": 0.77463455, "epoch": 0.15841584158415842, "grad_norm": 5.03125, "learning_rate": 9.936640893749308e-06, "loss": 0.79829493, "memory(GiB)": 117.26, "step": 6790, "train_speed(iter/s)": 0.206351 }, { "acc": 0.75294509, "epoch": 0.15864914915644732, "grad_norm": 5.0625, "learning_rate": 9.936340754996906e-06, "loss": 0.90441732, "memory(GiB)": 117.26, "step": 6800, "train_speed(iter/s)": 0.20651 }, { "acc": 0.74839764, "epoch": 0.15888245672873622, "grad_norm": 6.03125, "learning_rate": 9.936039911583298e-06, "loss": 0.92608595, "memory(GiB)": 117.26, "step": 6810, "train_speed(iter/s)": 0.206668 }, { "acc": 0.76310005, "epoch": 0.1591157643010251, "grad_norm": 5.5625, "learning_rate": 9.935738363551424e-06, "loss": 0.8414053, "memory(GiB)": 117.26, "step": 6820, "train_speed(iter/s)": 0.206826 }, { "acc": 0.74515209, "epoch": 0.15934907187331399, "grad_norm": 9.6875, "learning_rate": 9.935436110944335e-06, "loss": 0.94390917, "memory(GiB)": 117.26, "step": 6830, "train_speed(iter/s)": 0.20697 }, { "acc": 0.74745922, "epoch": 0.15958237944560288, "grad_norm": 7.5, "learning_rate": 9.935133153805172e-06, "loss": 0.90526886, "memory(GiB)": 117.26, "step": 6840, "train_speed(iter/s)": 0.207128 }, { "acc": 0.75993552, "epoch": 0.15981568701789178, "grad_norm": 5.75, "learning_rate": 9.934829492177187e-06, "loss": 0.88849306, "memory(GiB)": 117.26, "step": 6850, "train_speed(iter/s)": 0.207283 }, { "acc": 0.72869906, "epoch": 0.16004899459018065, "grad_norm": 7.375, "learning_rate": 9.934525126103725e-06, "loss": 0.9820755, "memory(GiB)": 117.26, "step": 6860, "train_speed(iter/s)": 0.207441 }, { "acc": 0.75347347, "epoch": 0.16028230216246955, "grad_norm": 6.59375, "learning_rate": 9.934220055628233e-06, "loss": 0.88754311, "memory(GiB)": 117.26, "step": 6870, "train_speed(iter/s)": 0.207603 }, { "acc": 0.76036777, "epoch": 0.16051560973475845, "grad_norm": 5.09375, "learning_rate": 9.933914280794266e-06, "loss": 0.84329424, "memory(GiB)": 117.26, "step": 6880, "train_speed(iter/s)": 0.207758 }, { "acc": 0.75529871, "epoch": 0.16074891730704735, "grad_norm": 32.5, "learning_rate": 9.933607801645464e-06, "loss": 0.90511074, "memory(GiB)": 117.26, "step": 6890, "train_speed(iter/s)": 0.207901 }, { "acc": 0.75234146, "epoch": 0.16098222487933625, "grad_norm": 7.5625, "learning_rate": 9.933300618225584e-06, "loss": 0.90452175, "memory(GiB)": 117.26, "step": 6900, "train_speed(iter/s)": 0.208048 }, { "acc": 0.77234688, "epoch": 0.16121553245162512, "grad_norm": 6.4375, "learning_rate": 9.932992730578473e-06, "loss": 0.82972193, "memory(GiB)": 117.26, "step": 6910, "train_speed(iter/s)": 0.208197 }, { "acc": 0.78141584, "epoch": 0.16144884002391402, "grad_norm": 5.78125, "learning_rate": 9.932684138748083e-06, "loss": 0.79875937, "memory(GiB)": 117.26, "step": 6920, "train_speed(iter/s)": 0.20836 }, { "acc": 0.75423717, "epoch": 0.16168214759620292, "grad_norm": 6.4375, "learning_rate": 9.932374842778466e-06, "loss": 0.9044302, "memory(GiB)": 117.26, "step": 6930, "train_speed(iter/s)": 0.208513 }, { "acc": 0.76593361, "epoch": 0.16191545516849182, "grad_norm": 5.96875, "learning_rate": 9.932064842713773e-06, "loss": 0.8453289, "memory(GiB)": 117.26, "step": 6940, "train_speed(iter/s)": 0.208665 }, { "acc": 0.76248789, "epoch": 0.16214876274078072, "grad_norm": 8.125, "learning_rate": 9.931754138598256e-06, "loss": 0.84321623, "memory(GiB)": 117.26, "step": 6950, "train_speed(iter/s)": 0.208809 }, { "acc": 0.77165947, "epoch": 0.1623820703130696, "grad_norm": 9.25, "learning_rate": 9.931442730476266e-06, "loss": 0.82758007, "memory(GiB)": 117.26, "step": 6960, "train_speed(iter/s)": 0.208962 }, { "acc": 0.75550756, "epoch": 0.1626153778853585, "grad_norm": 8.875, "learning_rate": 9.931130618392262e-06, "loss": 0.90006495, "memory(GiB)": 117.26, "step": 6970, "train_speed(iter/s)": 0.209114 }, { "acc": 0.73909492, "epoch": 0.16284868545764739, "grad_norm": 7.0, "learning_rate": 9.930817802390794e-06, "loss": 0.95952091, "memory(GiB)": 117.26, "step": 6980, "train_speed(iter/s)": 0.209264 }, { "acc": 0.75279455, "epoch": 0.16308199302993628, "grad_norm": 5.5, "learning_rate": 9.930504282516517e-06, "loss": 0.89881077, "memory(GiB)": 117.26, "step": 6990, "train_speed(iter/s)": 0.209415 }, { "acc": 0.76613784, "epoch": 0.16331530060222518, "grad_norm": 5.09375, "learning_rate": 9.930190058814185e-06, "loss": 0.82914057, "memory(GiB)": 117.26, "step": 7000, "train_speed(iter/s)": 0.20957 }, { "epoch": 0.16331530060222518, "eval_acc": 0.7268952142472225, "eval_loss": 0.8673418760299683, "eval_runtime": 1265.1326, "eval_samples_per_second": 28.448, "eval_steps_per_second": 14.225, "step": 7000 }, { "acc": 0.75962133, "epoch": 0.16354860817451405, "grad_norm": 4.4375, "learning_rate": 9.929875131328655e-06, "loss": 0.88552074, "memory(GiB)": 117.26, "step": 7010, "train_speed(iter/s)": 0.201917 }, { "acc": 0.74996486, "epoch": 0.16378191574680295, "grad_norm": 5.125, "learning_rate": 9.929559500104883e-06, "loss": 0.92237072, "memory(GiB)": 117.26, "step": 7020, "train_speed(iter/s)": 0.202058 }, { "acc": 0.74669847, "epoch": 0.16401522331909185, "grad_norm": 5.53125, "learning_rate": 9.929243165187922e-06, "loss": 0.92104797, "memory(GiB)": 117.26, "step": 7030, "train_speed(iter/s)": 0.20221 }, { "acc": 0.76796293, "epoch": 0.16424853089138075, "grad_norm": 11.75, "learning_rate": 9.928926126622933e-06, "loss": 0.84068232, "memory(GiB)": 117.26, "step": 7040, "train_speed(iter/s)": 0.202357 }, { "acc": 0.75302162, "epoch": 0.16448183846366965, "grad_norm": 5.09375, "learning_rate": 9.928608384455172e-06, "loss": 0.88458652, "memory(GiB)": 117.26, "step": 7050, "train_speed(iter/s)": 0.202502 }, { "acc": 0.75752697, "epoch": 0.16471514603595852, "grad_norm": 44.0, "learning_rate": 9.928289938729996e-06, "loss": 0.89839153, "memory(GiB)": 117.26, "step": 7060, "train_speed(iter/s)": 0.202647 }, { "acc": 0.76852369, "epoch": 0.16494845360824742, "grad_norm": 7.65625, "learning_rate": 9.92797078949286e-06, "loss": 0.84967194, "memory(GiB)": 117.26, "step": 7070, "train_speed(iter/s)": 0.202796 }, { "acc": 0.74692674, "epoch": 0.16518176118053632, "grad_norm": 5.71875, "learning_rate": 9.927650936789329e-06, "loss": 0.90885906, "memory(GiB)": 117.26, "step": 7080, "train_speed(iter/s)": 0.202953 }, { "acc": 0.75364532, "epoch": 0.16541506875282522, "grad_norm": 12.75, "learning_rate": 9.927330380665056e-06, "loss": 0.90330276, "memory(GiB)": 117.26, "step": 7090, "train_speed(iter/s)": 0.20311 }, { "acc": 0.7694891, "epoch": 0.1656483763251141, "grad_norm": 5.375, "learning_rate": 9.927009121165803e-06, "loss": 0.84659195, "memory(GiB)": 117.26, "step": 7100, "train_speed(iter/s)": 0.203264 }, { "acc": 0.76276455, "epoch": 0.165881683897403, "grad_norm": 6.875, "learning_rate": 9.92668715833743e-06, "loss": 0.85104065, "memory(GiB)": 117.26, "step": 7110, "train_speed(iter/s)": 0.203391 }, { "acc": 0.77972989, "epoch": 0.16611499146969189, "grad_norm": 5.65625, "learning_rate": 9.926364492225894e-06, "loss": 0.79778728, "memory(GiB)": 117.26, "step": 7120, "train_speed(iter/s)": 0.203542 }, { "acc": 0.76278415, "epoch": 0.16634829904198078, "grad_norm": 8.3125, "learning_rate": 9.92604112287726e-06, "loss": 0.88268147, "memory(GiB)": 117.26, "step": 7130, "train_speed(iter/s)": 0.203694 }, { "acc": 0.756707, "epoch": 0.16658160661426968, "grad_norm": 6.1875, "learning_rate": 9.925717050337686e-06, "loss": 0.89059353, "memory(GiB)": 117.26, "step": 7140, "train_speed(iter/s)": 0.203844 }, { "acc": 0.75645771, "epoch": 0.16681491418655855, "grad_norm": 8.375, "learning_rate": 9.925392274653435e-06, "loss": 0.87008743, "memory(GiB)": 117.26, "step": 7150, "train_speed(iter/s)": 0.203995 }, { "acc": 0.78567905, "epoch": 0.16704822175884745, "grad_norm": 7.5625, "learning_rate": 9.925066795870868e-06, "loss": 0.79263906, "memory(GiB)": 117.26, "step": 7160, "train_speed(iter/s)": 0.204143 }, { "acc": 0.76977458, "epoch": 0.16728152933113635, "grad_norm": 6.71875, "learning_rate": 9.924740614036445e-06, "loss": 0.81868334, "memory(GiB)": 117.26, "step": 7170, "train_speed(iter/s)": 0.204281 }, { "acc": 0.76125951, "epoch": 0.16751483690342525, "grad_norm": 9.0, "learning_rate": 9.92441372919673e-06, "loss": 0.84621677, "memory(GiB)": 117.26, "step": 7180, "train_speed(iter/s)": 0.204432 }, { "acc": 0.75485568, "epoch": 0.16774814447571415, "grad_norm": 6.0, "learning_rate": 9.924086141398385e-06, "loss": 0.92925434, "memory(GiB)": 117.26, "step": 7190, "train_speed(iter/s)": 0.204578 }, { "acc": 0.77337866, "epoch": 0.16798145204800302, "grad_norm": 5.1875, "learning_rate": 9.923757850688176e-06, "loss": 0.81143856, "memory(GiB)": 117.26, "step": 7200, "train_speed(iter/s)": 0.204726 }, { "acc": 0.76596804, "epoch": 0.16821475962029192, "grad_norm": 5.28125, "learning_rate": 9.923428857112963e-06, "loss": 0.8420332, "memory(GiB)": 117.26, "step": 7210, "train_speed(iter/s)": 0.204874 }, { "acc": 0.77970576, "epoch": 0.16844806719258082, "grad_norm": 4.21875, "learning_rate": 9.923099160719711e-06, "loss": 0.79844537, "memory(GiB)": 117.26, "step": 7220, "train_speed(iter/s)": 0.205021 }, { "acc": 0.76020017, "epoch": 0.16868137476486972, "grad_norm": 5.4375, "learning_rate": 9.922768761555485e-06, "loss": 0.87703323, "memory(GiB)": 117.26, "step": 7230, "train_speed(iter/s)": 0.20517 }, { "acc": 0.748102, "epoch": 0.16891468233715862, "grad_norm": 7.46875, "learning_rate": 9.922437659667448e-06, "loss": 0.92345657, "memory(GiB)": 117.26, "step": 7240, "train_speed(iter/s)": 0.205313 }, { "acc": 0.77762918, "epoch": 0.1691479899094475, "grad_norm": 6.53125, "learning_rate": 9.922105855102864e-06, "loss": 0.81333418, "memory(GiB)": 117.26, "step": 7250, "train_speed(iter/s)": 0.205452 }, { "acc": 0.77396507, "epoch": 0.1693812974817364, "grad_norm": 6.9375, "learning_rate": 9.921773347909098e-06, "loss": 0.80872459, "memory(GiB)": 117.26, "step": 7260, "train_speed(iter/s)": 0.205602 }, { "acc": 0.75821466, "epoch": 0.16961460505402529, "grad_norm": 7.40625, "learning_rate": 9.921440138133619e-06, "loss": 0.88538284, "memory(GiB)": 117.26, "step": 7270, "train_speed(iter/s)": 0.205752 }, { "acc": 0.74574528, "epoch": 0.16984791262631418, "grad_norm": 6.65625, "learning_rate": 9.921106225823988e-06, "loss": 0.94284134, "memory(GiB)": 117.26, "step": 7280, "train_speed(iter/s)": 0.2059 }, { "acc": 0.76249104, "epoch": 0.17008122019860308, "grad_norm": 5.9375, "learning_rate": 9.920771611027875e-06, "loss": 0.85105572, "memory(GiB)": 117.26, "step": 7290, "train_speed(iter/s)": 0.206051 }, { "acc": 0.74841185, "epoch": 0.17031452777089195, "grad_norm": 4.78125, "learning_rate": 9.920436293793043e-06, "loss": 0.93259773, "memory(GiB)": 117.26, "step": 7300, "train_speed(iter/s)": 0.206203 }, { "acc": 0.74501886, "epoch": 0.17054783534318085, "grad_norm": 6.1875, "learning_rate": 9.920100274167359e-06, "loss": 0.9237154, "memory(GiB)": 117.26, "step": 7310, "train_speed(iter/s)": 0.206357 }, { "acc": 0.75909886, "epoch": 0.17078114291546975, "grad_norm": 6.59375, "learning_rate": 9.91976355219879e-06, "loss": 0.87716484, "memory(GiB)": 117.26, "step": 7320, "train_speed(iter/s)": 0.206501 }, { "acc": 0.76013303, "epoch": 0.17101445048775865, "grad_norm": 4.625, "learning_rate": 9.919426127935404e-06, "loss": 0.87902298, "memory(GiB)": 117.26, "step": 7330, "train_speed(iter/s)": 0.206638 }, { "acc": 0.76020713, "epoch": 0.17124775806004755, "grad_norm": 6.0, "learning_rate": 9.919088001425367e-06, "loss": 0.87609043, "memory(GiB)": 117.26, "step": 7340, "train_speed(iter/s)": 0.206787 }, { "acc": 0.77011204, "epoch": 0.17148106563233642, "grad_norm": 6.5, "learning_rate": 9.918749172716946e-06, "loss": 0.83197937, "memory(GiB)": 117.26, "step": 7350, "train_speed(iter/s)": 0.206929 }, { "acc": 0.75667143, "epoch": 0.17171437320462532, "grad_norm": 9.1875, "learning_rate": 9.91840964185851e-06, "loss": 0.88572426, "memory(GiB)": 117.26, "step": 7360, "train_speed(iter/s)": 0.207064 }, { "acc": 0.75572929, "epoch": 0.17194768077691422, "grad_norm": 8.375, "learning_rate": 9.918069408898527e-06, "loss": 0.90169907, "memory(GiB)": 117.26, "step": 7370, "train_speed(iter/s)": 0.207211 }, { "acc": 0.76126418, "epoch": 0.17218098834920312, "grad_norm": 5.375, "learning_rate": 9.917728473885564e-06, "loss": 0.84354057, "memory(GiB)": 117.26, "step": 7380, "train_speed(iter/s)": 0.207356 }, { "acc": 0.77166362, "epoch": 0.172414295921492, "grad_norm": 8.25, "learning_rate": 9.91738683686829e-06, "loss": 0.82383671, "memory(GiB)": 117.26, "step": 7390, "train_speed(iter/s)": 0.207506 }, { "acc": 0.75932446, "epoch": 0.1726476034937809, "grad_norm": 5.09375, "learning_rate": 9.917044497895474e-06, "loss": 0.86952419, "memory(GiB)": 117.26, "step": 7400, "train_speed(iter/s)": 0.207651 }, { "acc": 0.75914521, "epoch": 0.1728809110660698, "grad_norm": 6.21875, "learning_rate": 9.916701457015983e-06, "loss": 0.88645802, "memory(GiB)": 117.26, "step": 7410, "train_speed(iter/s)": 0.207789 }, { "acc": 0.75875597, "epoch": 0.17311421863835869, "grad_norm": 5.28125, "learning_rate": 9.91635771427879e-06, "loss": 0.86274452, "memory(GiB)": 117.26, "step": 7420, "train_speed(iter/s)": 0.207927 }, { "acc": 0.75007381, "epoch": 0.17334752621064758, "grad_norm": 6.125, "learning_rate": 9.91601326973296e-06, "loss": 0.90960789, "memory(GiB)": 117.26, "step": 7430, "train_speed(iter/s)": 0.208075 }, { "acc": 0.77600675, "epoch": 0.17358083378293646, "grad_norm": 6.34375, "learning_rate": 9.915668123427662e-06, "loss": 0.81225548, "memory(GiB)": 117.26, "step": 7440, "train_speed(iter/s)": 0.208218 }, { "acc": 0.76685338, "epoch": 0.17381414135522535, "grad_norm": 4.96875, "learning_rate": 9.91532227541217e-06, "loss": 0.84593754, "memory(GiB)": 117.26, "step": 7450, "train_speed(iter/s)": 0.208364 }, { "acc": 0.75233183, "epoch": 0.17404744892751425, "grad_norm": 6.65625, "learning_rate": 9.91497572573585e-06, "loss": 0.9091568, "memory(GiB)": 117.26, "step": 7460, "train_speed(iter/s)": 0.208511 }, { "acc": 0.76754675, "epoch": 0.17428075649980315, "grad_norm": 5.5, "learning_rate": 9.914628474448173e-06, "loss": 0.85342302, "memory(GiB)": 117.26, "step": 7470, "train_speed(iter/s)": 0.208649 }, { "acc": 0.75809622, "epoch": 0.17451406407209205, "grad_norm": 5.375, "learning_rate": 9.91428052159871e-06, "loss": 0.88533516, "memory(GiB)": 117.26, "step": 7480, "train_speed(iter/s)": 0.20879 }, { "acc": 0.76278791, "epoch": 0.17474737164438092, "grad_norm": 4.78125, "learning_rate": 9.913931867237129e-06, "loss": 0.87194748, "memory(GiB)": 117.26, "step": 7490, "train_speed(iter/s)": 0.208935 }, { "acc": 0.75374289, "epoch": 0.17498067921666982, "grad_norm": 5.75, "learning_rate": 9.913582511413201e-06, "loss": 0.9021884, "memory(GiB)": 117.26, "step": 7500, "train_speed(iter/s)": 0.209087 }, { "epoch": 0.17498067921666982, "eval_acc": 0.7272096715381463, "eval_loss": 0.8652455806732178, "eval_runtime": 1262.5227, "eval_samples_per_second": 28.507, "eval_steps_per_second": 14.254, "step": 7500 }, { "acc": 0.74879465, "epoch": 0.17521398678895872, "grad_norm": 6.78125, "learning_rate": 9.913232454176797e-06, "loss": 0.90917902, "memory(GiB)": 117.26, "step": 7510, "train_speed(iter/s)": 0.201977 }, { "acc": 0.74534988, "epoch": 0.17544729436124762, "grad_norm": 8.125, "learning_rate": 9.912881695577889e-06, "loss": 0.94463596, "memory(GiB)": 117.26, "step": 7520, "train_speed(iter/s)": 0.202122 }, { "acc": 0.76872587, "epoch": 0.17568060193353652, "grad_norm": 6.25, "learning_rate": 9.912530235666546e-06, "loss": 0.83975306, "memory(GiB)": 117.26, "step": 7530, "train_speed(iter/s)": 0.202256 }, { "acc": 0.74917636, "epoch": 0.1759139095058254, "grad_norm": 18.25, "learning_rate": 9.912178074492937e-06, "loss": 0.88424053, "memory(GiB)": 117.26, "step": 7540, "train_speed(iter/s)": 0.202406 }, { "acc": 0.76744471, "epoch": 0.1761472170781143, "grad_norm": 5.875, "learning_rate": 9.911825212107337e-06, "loss": 0.82737856, "memory(GiB)": 117.26, "step": 7550, "train_speed(iter/s)": 0.202544 }, { "acc": 0.76676006, "epoch": 0.1763805246504032, "grad_norm": 5.8125, "learning_rate": 9.911471648560114e-06, "loss": 0.84152098, "memory(GiB)": 117.26, "step": 7560, "train_speed(iter/s)": 0.20268 }, { "acc": 0.75398664, "epoch": 0.17661383222269209, "grad_norm": 5.75, "learning_rate": 9.91111738390174e-06, "loss": 0.9107832, "memory(GiB)": 117.26, "step": 7570, "train_speed(iter/s)": 0.202827 }, { "acc": 0.76582565, "epoch": 0.17684713979498098, "grad_norm": 5.0625, "learning_rate": 9.910762418182786e-06, "loss": 0.83912849, "memory(GiB)": 117.26, "step": 7580, "train_speed(iter/s)": 0.202957 }, { "acc": 0.75821486, "epoch": 0.17708044736726986, "grad_norm": 4.25, "learning_rate": 9.910406751453923e-06, "loss": 0.88100357, "memory(GiB)": 117.26, "step": 7590, "train_speed(iter/s)": 0.203101 }, { "acc": 0.7829556, "epoch": 0.17731375493955875, "grad_norm": 6.375, "learning_rate": 9.910050383765924e-06, "loss": 0.78430152, "memory(GiB)": 117.26, "step": 7600, "train_speed(iter/s)": 0.203248 }, { "acc": 0.74231148, "epoch": 0.17754706251184765, "grad_norm": 8.3125, "learning_rate": 9.909693315169657e-06, "loss": 0.93944874, "memory(GiB)": 117.26, "step": 7610, "train_speed(iter/s)": 0.203383 }, { "acc": 0.75942278, "epoch": 0.17778037008413655, "grad_norm": 6.0, "learning_rate": 9.909335545716097e-06, "loss": 0.87185259, "memory(GiB)": 117.26, "step": 7620, "train_speed(iter/s)": 0.203523 }, { "acc": 0.7480834, "epoch": 0.17801367765642542, "grad_norm": 6.75, "learning_rate": 9.908977075456314e-06, "loss": 0.90164347, "memory(GiB)": 117.26, "step": 7630, "train_speed(iter/s)": 0.20366 }, { "acc": 0.77153473, "epoch": 0.17824698522871432, "grad_norm": 6.96875, "learning_rate": 9.90861790444148e-06, "loss": 0.82070885, "memory(GiB)": 117.26, "step": 7640, "train_speed(iter/s)": 0.203804 }, { "acc": 0.75881901, "epoch": 0.17848029280100322, "grad_norm": 5.96875, "learning_rate": 9.908258032722865e-06, "loss": 0.87284966, "memory(GiB)": 117.26, "step": 7650, "train_speed(iter/s)": 0.203937 }, { "acc": 0.77172585, "epoch": 0.17871360037329212, "grad_norm": 6.5, "learning_rate": 9.907897460351842e-06, "loss": 0.83263054, "memory(GiB)": 117.26, "step": 7660, "train_speed(iter/s)": 0.204076 }, { "acc": 0.75512171, "epoch": 0.17894690794558102, "grad_norm": 6.9375, "learning_rate": 9.907536187379883e-06, "loss": 0.8950429, "memory(GiB)": 117.26, "step": 7670, "train_speed(iter/s)": 0.204205 }, { "acc": 0.76720524, "epoch": 0.1791802155178699, "grad_norm": 5.5, "learning_rate": 9.907174213858556e-06, "loss": 0.85235405, "memory(GiB)": 117.26, "step": 7680, "train_speed(iter/s)": 0.204343 }, { "acc": 0.77257328, "epoch": 0.1794135230901588, "grad_norm": 5.375, "learning_rate": 9.906811539839539e-06, "loss": 0.82497387, "memory(GiB)": 117.26, "step": 7690, "train_speed(iter/s)": 0.204478 }, { "acc": 0.76002412, "epoch": 0.1796468306624477, "grad_norm": 6.875, "learning_rate": 9.9064481653746e-06, "loss": 0.89356136, "memory(GiB)": 117.26, "step": 7700, "train_speed(iter/s)": 0.204611 }, { "acc": 0.77156291, "epoch": 0.1798801382347366, "grad_norm": 5.28125, "learning_rate": 9.906084090515609e-06, "loss": 0.82954645, "memory(GiB)": 117.26, "step": 7710, "train_speed(iter/s)": 0.20475 }, { "acc": 0.76320467, "epoch": 0.18011344580702549, "grad_norm": 5.5625, "learning_rate": 9.90571931531454e-06, "loss": 0.87626915, "memory(GiB)": 117.26, "step": 7720, "train_speed(iter/s)": 0.204879 }, { "acc": 0.75608311, "epoch": 0.18034675337931436, "grad_norm": 5.5625, "learning_rate": 9.905353839823463e-06, "loss": 0.88252115, "memory(GiB)": 117.26, "step": 7730, "train_speed(iter/s)": 0.205026 }, { "acc": 0.76085534, "epoch": 0.18058006095160326, "grad_norm": 4.71875, "learning_rate": 9.904987664094553e-06, "loss": 0.85322914, "memory(GiB)": 117.26, "step": 7740, "train_speed(iter/s)": 0.205153 }, { "acc": 0.76637621, "epoch": 0.18081336852389215, "grad_norm": 6.15625, "learning_rate": 9.904620788180076e-06, "loss": 0.84426308, "memory(GiB)": 117.26, "step": 7750, "train_speed(iter/s)": 0.205297 }, { "acc": 0.76200542, "epoch": 0.18104667609618105, "grad_norm": 6.46875, "learning_rate": 9.904253212132406e-06, "loss": 0.86970015, "memory(GiB)": 117.26, "step": 7760, "train_speed(iter/s)": 0.205437 }, { "acc": 0.7645545, "epoch": 0.18127998366846995, "grad_norm": 6.09375, "learning_rate": 9.903884936004017e-06, "loss": 0.84966488, "memory(GiB)": 117.26, "step": 7770, "train_speed(iter/s)": 0.205576 }, { "acc": 0.76717358, "epoch": 0.18151329124075882, "grad_norm": 8.125, "learning_rate": 9.903515959847477e-06, "loss": 0.83945045, "memory(GiB)": 117.26, "step": 7780, "train_speed(iter/s)": 0.205713 }, { "acc": 0.77415552, "epoch": 0.18174659881304772, "grad_norm": 9.3125, "learning_rate": 9.903146283715459e-06, "loss": 0.8211132, "memory(GiB)": 117.26, "step": 7790, "train_speed(iter/s)": 0.20585 }, { "acc": 0.75150852, "epoch": 0.18197990638533662, "grad_norm": 6.46875, "learning_rate": 9.902775907660733e-06, "loss": 0.9031805, "memory(GiB)": 117.26, "step": 7800, "train_speed(iter/s)": 0.205981 }, { "acc": 0.75489626, "epoch": 0.18221321395762552, "grad_norm": 5.34375, "learning_rate": 9.90240483173617e-06, "loss": 0.88254786, "memory(GiB)": 117.26, "step": 7810, "train_speed(iter/s)": 0.206116 }, { "acc": 0.75053511, "epoch": 0.18244652152991442, "grad_norm": 5.625, "learning_rate": 9.902033055994739e-06, "loss": 0.91392794, "memory(GiB)": 117.26, "step": 7820, "train_speed(iter/s)": 0.206242 }, { "acc": 0.75073271, "epoch": 0.1826798291022033, "grad_norm": 5.53125, "learning_rate": 9.901660580489517e-06, "loss": 0.89257965, "memory(GiB)": 117.26, "step": 7830, "train_speed(iter/s)": 0.206372 }, { "acc": 0.77990479, "epoch": 0.1829131366744922, "grad_norm": 5.09375, "learning_rate": 9.90128740527367e-06, "loss": 0.7860126, "memory(GiB)": 117.26, "step": 7840, "train_speed(iter/s)": 0.206509 }, { "acc": 0.76360044, "epoch": 0.1831464442467811, "grad_norm": 7.03125, "learning_rate": 9.900913530400469e-06, "loss": 0.86977549, "memory(GiB)": 117.26, "step": 7850, "train_speed(iter/s)": 0.206644 }, { "acc": 0.76778603, "epoch": 0.18337975181906999, "grad_norm": 4.78125, "learning_rate": 9.900538955923287e-06, "loss": 0.83034687, "memory(GiB)": 117.26, "step": 7860, "train_speed(iter/s)": 0.206775 }, { "acc": 0.77316771, "epoch": 0.18361305939135886, "grad_norm": 4.90625, "learning_rate": 9.900163681895591e-06, "loss": 0.85764179, "memory(GiB)": 117.26, "step": 7870, "train_speed(iter/s)": 0.206905 }, { "acc": 0.75362692, "epoch": 0.18384636696364776, "grad_norm": 4.96875, "learning_rate": 9.899787708370954e-06, "loss": 0.89520912, "memory(GiB)": 117.26, "step": 7880, "train_speed(iter/s)": 0.207034 }, { "acc": 0.77855029, "epoch": 0.18407967453593665, "grad_norm": 6.3125, "learning_rate": 9.899411035403044e-06, "loss": 0.78841982, "memory(GiB)": 117.26, "step": 7890, "train_speed(iter/s)": 0.207169 }, { "acc": 0.76596365, "epoch": 0.18431298210822555, "grad_norm": 5.8125, "learning_rate": 9.899033663045632e-06, "loss": 0.86087551, "memory(GiB)": 117.26, "step": 7900, "train_speed(iter/s)": 0.207308 }, { "acc": 0.74994364, "epoch": 0.18454628968051445, "grad_norm": 5.59375, "learning_rate": 9.898655591352589e-06, "loss": 0.92803307, "memory(GiB)": 117.26, "step": 7910, "train_speed(iter/s)": 0.207441 }, { "acc": 0.75487556, "epoch": 0.18477959725280332, "grad_norm": 5.375, "learning_rate": 9.898276820377882e-06, "loss": 0.90885067, "memory(GiB)": 117.26, "step": 7920, "train_speed(iter/s)": 0.207575 }, { "acc": 0.76914301, "epoch": 0.18501290482509222, "grad_norm": 6.3125, "learning_rate": 9.897897350175583e-06, "loss": 0.83762236, "memory(GiB)": 117.26, "step": 7930, "train_speed(iter/s)": 0.207702 }, { "acc": 0.75061235, "epoch": 0.18524621239738112, "grad_norm": 6.75, "learning_rate": 9.897517180799858e-06, "loss": 0.91876888, "memory(GiB)": 117.26, "step": 7940, "train_speed(iter/s)": 0.207837 }, { "acc": 0.75675759, "epoch": 0.18547951996967002, "grad_norm": 5.5625, "learning_rate": 9.89713631230498e-06, "loss": 0.85782814, "memory(GiB)": 117.26, "step": 7950, "train_speed(iter/s)": 0.207975 }, { "acc": 0.76141992, "epoch": 0.18571282754195892, "grad_norm": 5.25, "learning_rate": 9.896754744745315e-06, "loss": 0.86181946, "memory(GiB)": 117.26, "step": 7960, "train_speed(iter/s)": 0.208099 }, { "acc": 0.76770515, "epoch": 0.1859461351142478, "grad_norm": 5.53125, "learning_rate": 9.896372478175336e-06, "loss": 0.84402161, "memory(GiB)": 117.26, "step": 7970, "train_speed(iter/s)": 0.208236 }, { "acc": 0.77641811, "epoch": 0.1861794426865367, "grad_norm": 4.75, "learning_rate": 9.895989512649605e-06, "loss": 0.79544768, "memory(GiB)": 117.26, "step": 7980, "train_speed(iter/s)": 0.20837 }, { "acc": 0.77689528, "epoch": 0.1864127502588256, "grad_norm": 5.21875, "learning_rate": 9.895605848222794e-06, "loss": 0.79866366, "memory(GiB)": 117.26, "step": 7990, "train_speed(iter/s)": 0.2085 }, { "acc": 0.7672595, "epoch": 0.1866460578311145, "grad_norm": 9.0, "learning_rate": 9.89522148494967e-06, "loss": 0.86895657, "memory(GiB)": 117.26, "step": 8000, "train_speed(iter/s)": 0.208639 }, { "epoch": 0.1866460578311145, "eval_acc": 0.7277959915972624, "eval_loss": 0.8637912273406982, "eval_runtime": 1261.685, "eval_samples_per_second": 28.526, "eval_steps_per_second": 14.263, "step": 8000 }, { "acc": 0.76980562, "epoch": 0.18687936540340339, "grad_norm": 8.125, "learning_rate": 9.894836422885101e-06, "loss": 0.8377346, "memory(GiB)": 117.26, "step": 8010, "train_speed(iter/s)": 0.201998 }, { "acc": 0.75839, "epoch": 0.18711267297569226, "grad_norm": 6.78125, "learning_rate": 9.894450662084055e-06, "loss": 0.87542973, "memory(GiB)": 117.26, "step": 8020, "train_speed(iter/s)": 0.202126 }, { "acc": 0.77061834, "epoch": 0.18734598054798116, "grad_norm": 8.5, "learning_rate": 9.8940642026016e-06, "loss": 0.84420357, "memory(GiB)": 117.26, "step": 8030, "train_speed(iter/s)": 0.202259 }, { "acc": 0.76137247, "epoch": 0.18757928812027005, "grad_norm": 6.46875, "learning_rate": 9.8936770444929e-06, "loss": 0.88222523, "memory(GiB)": 117.26, "step": 8040, "train_speed(iter/s)": 0.202395 }, { "acc": 0.76722088, "epoch": 0.18781259569255895, "grad_norm": 5.71875, "learning_rate": 9.893289187813224e-06, "loss": 0.82576332, "memory(GiB)": 117.26, "step": 8050, "train_speed(iter/s)": 0.202527 }, { "acc": 0.77460055, "epoch": 0.18804590326484785, "grad_norm": 7.21875, "learning_rate": 9.892900632617939e-06, "loss": 0.82546015, "memory(GiB)": 117.26, "step": 8060, "train_speed(iter/s)": 0.202656 }, { "acc": 0.76136332, "epoch": 0.18827921083713672, "grad_norm": 8.625, "learning_rate": 9.892511378962509e-06, "loss": 0.85382442, "memory(GiB)": 117.26, "step": 8070, "train_speed(iter/s)": 0.202785 }, { "acc": 0.77842703, "epoch": 0.18851251840942562, "grad_norm": 5.6875, "learning_rate": 9.892121426902502e-06, "loss": 0.82105141, "memory(GiB)": 117.26, "step": 8080, "train_speed(iter/s)": 0.202912 }, { "acc": 0.76546736, "epoch": 0.18874582598171452, "grad_norm": 6.6875, "learning_rate": 9.891730776493579e-06, "loss": 0.82770977, "memory(GiB)": 117.26, "step": 8090, "train_speed(iter/s)": 0.203034 }, { "acc": 0.78821645, "epoch": 0.18897913355400342, "grad_norm": 7.8125, "learning_rate": 9.891339427791513e-06, "loss": 0.74701271, "memory(GiB)": 117.26, "step": 8100, "train_speed(iter/s)": 0.203165 }, { "acc": 0.77000942, "epoch": 0.18921244112629232, "grad_norm": 5.71875, "learning_rate": 9.890947380852163e-06, "loss": 0.81139717, "memory(GiB)": 117.26, "step": 8110, "train_speed(iter/s)": 0.203297 }, { "acc": 0.78438883, "epoch": 0.1894457486985812, "grad_norm": 8.0, "learning_rate": 9.890554635731496e-06, "loss": 0.78258333, "memory(GiB)": 117.26, "step": 8120, "train_speed(iter/s)": 0.203433 }, { "acc": 0.76116095, "epoch": 0.1896790562708701, "grad_norm": 12.9375, "learning_rate": 9.890161192485573e-06, "loss": 0.88209429, "memory(GiB)": 117.26, "step": 8130, "train_speed(iter/s)": 0.203558 }, { "acc": 0.77423377, "epoch": 0.189912363843159, "grad_norm": 5.6875, "learning_rate": 9.889767051170563e-06, "loss": 0.8021966, "memory(GiB)": 117.26, "step": 8140, "train_speed(iter/s)": 0.203678 }, { "acc": 0.77997742, "epoch": 0.1901456714154479, "grad_norm": 6.75, "learning_rate": 9.889372211842726e-06, "loss": 0.78851585, "memory(GiB)": 117.26, "step": 8150, "train_speed(iter/s)": 0.203808 }, { "acc": 0.76278243, "epoch": 0.19037897898773676, "grad_norm": 7.21875, "learning_rate": 9.888976674558426e-06, "loss": 0.85657482, "memory(GiB)": 117.26, "step": 8160, "train_speed(iter/s)": 0.203941 }, { "acc": 0.75297365, "epoch": 0.19061228656002566, "grad_norm": 5.0, "learning_rate": 9.888580439374126e-06, "loss": 0.89441032, "memory(GiB)": 117.26, "step": 8170, "train_speed(iter/s)": 0.204069 }, { "acc": 0.76671324, "epoch": 0.19084559413231456, "grad_norm": 5.8125, "learning_rate": 9.888183506346389e-06, "loss": 0.85098372, "memory(GiB)": 117.26, "step": 8180, "train_speed(iter/s)": 0.204197 }, { "acc": 0.76597271, "epoch": 0.19107890170460345, "grad_norm": 5.78125, "learning_rate": 9.887785875531875e-06, "loss": 0.84940281, "memory(GiB)": 117.26, "step": 8190, "train_speed(iter/s)": 0.204329 }, { "acc": 0.76571255, "epoch": 0.19131220927689235, "grad_norm": 6.0, "learning_rate": 9.887387546987349e-06, "loss": 0.85268974, "memory(GiB)": 117.26, "step": 8200, "train_speed(iter/s)": 0.204455 }, { "acc": 0.76590738, "epoch": 0.19154551684918122, "grad_norm": 5.40625, "learning_rate": 9.886988520769669e-06, "loss": 0.85765514, "memory(GiB)": 117.26, "step": 8210, "train_speed(iter/s)": 0.204593 }, { "acc": 0.77192249, "epoch": 0.19177882442147012, "grad_norm": 8.6875, "learning_rate": 9.886588796935797e-06, "loss": 0.83547783, "memory(GiB)": 117.26, "step": 8220, "train_speed(iter/s)": 0.204719 }, { "acc": 0.73231807, "epoch": 0.19201213199375902, "grad_norm": 5.53125, "learning_rate": 9.886188375542795e-06, "loss": 0.97665491, "memory(GiB)": 117.26, "step": 8230, "train_speed(iter/s)": 0.20484 }, { "acc": 0.77011299, "epoch": 0.19224543956604792, "grad_norm": 5.28125, "learning_rate": 9.885787256647822e-06, "loss": 0.82477493, "memory(GiB)": 117.26, "step": 8240, "train_speed(iter/s)": 0.204971 }, { "acc": 0.77445898, "epoch": 0.19247874713833682, "grad_norm": 5.0625, "learning_rate": 9.885385440308137e-06, "loss": 0.79150438, "memory(GiB)": 117.26, "step": 8250, "train_speed(iter/s)": 0.205094 }, { "acc": 0.7685101, "epoch": 0.1927120547106257, "grad_norm": 13.6875, "learning_rate": 9.8849829265811e-06, "loss": 0.84411364, "memory(GiB)": 117.26, "step": 8260, "train_speed(iter/s)": 0.205219 }, { "acc": 0.75781097, "epoch": 0.1929453622829146, "grad_norm": 10.5625, "learning_rate": 9.884579715524168e-06, "loss": 0.86267986, "memory(GiB)": 117.26, "step": 8270, "train_speed(iter/s)": 0.205349 }, { "acc": 0.75872669, "epoch": 0.1931786698552035, "grad_norm": 8.6875, "learning_rate": 9.884175807194902e-06, "loss": 0.88134193, "memory(GiB)": 117.26, "step": 8280, "train_speed(iter/s)": 0.205483 }, { "acc": 0.76485038, "epoch": 0.1934119774274924, "grad_norm": 4.28125, "learning_rate": 9.883771201650958e-06, "loss": 0.84986229, "memory(GiB)": 117.26, "step": 8290, "train_speed(iter/s)": 0.205616 }, { "acc": 0.78051891, "epoch": 0.1936452849997813, "grad_norm": 5.90625, "learning_rate": 9.883365898950094e-06, "loss": 0.79819059, "memory(GiB)": 117.26, "step": 8300, "train_speed(iter/s)": 0.205746 }, { "acc": 0.75382128, "epoch": 0.19387859257207016, "grad_norm": 4.8125, "learning_rate": 9.882959899150166e-06, "loss": 0.89902287, "memory(GiB)": 117.26, "step": 8310, "train_speed(iter/s)": 0.205865 }, { "acc": 0.76242323, "epoch": 0.19411190014435906, "grad_norm": 7.25, "learning_rate": 9.882553202309131e-06, "loss": 0.84712563, "memory(GiB)": 117.26, "step": 8320, "train_speed(iter/s)": 0.205998 }, { "acc": 0.76327085, "epoch": 0.19434520771664796, "grad_norm": 5.96875, "learning_rate": 9.882145808485045e-06, "loss": 0.8393714, "memory(GiB)": 117.26, "step": 8330, "train_speed(iter/s)": 0.206125 }, { "acc": 0.77577643, "epoch": 0.19457851528893685, "grad_norm": 4.5625, "learning_rate": 9.881737717736063e-06, "loss": 0.8186286, "memory(GiB)": 117.26, "step": 8340, "train_speed(iter/s)": 0.206253 }, { "acc": 0.76705074, "epoch": 0.19481182286122575, "grad_norm": 10.75, "learning_rate": 9.88132893012044e-06, "loss": 0.86286297, "memory(GiB)": 117.26, "step": 8350, "train_speed(iter/s)": 0.206386 }, { "acc": 0.74571476, "epoch": 0.19504513043351462, "grad_norm": 11.5, "learning_rate": 9.88091944569653e-06, "loss": 0.93961449, "memory(GiB)": 117.26, "step": 8360, "train_speed(iter/s)": 0.206503 }, { "acc": 0.74246101, "epoch": 0.19527843800580352, "grad_norm": 7.125, "learning_rate": 9.880509264522788e-06, "loss": 0.94773483, "memory(GiB)": 117.26, "step": 8370, "train_speed(iter/s)": 0.206627 }, { "acc": 0.77141762, "epoch": 0.19551174557809242, "grad_norm": 4.8125, "learning_rate": 9.880098386657765e-06, "loss": 0.833953, "memory(GiB)": 117.26, "step": 8380, "train_speed(iter/s)": 0.206751 }, { "acc": 0.76310663, "epoch": 0.19574505315038132, "grad_norm": 4.4375, "learning_rate": 9.879686812160116e-06, "loss": 0.8492094, "memory(GiB)": 117.26, "step": 8390, "train_speed(iter/s)": 0.206883 }, { "acc": 0.75756483, "epoch": 0.1959783607226702, "grad_norm": 5.40625, "learning_rate": 9.87927454108859e-06, "loss": 0.8447773, "memory(GiB)": 117.26, "step": 8400, "train_speed(iter/s)": 0.207011 }, { "acc": 0.76140394, "epoch": 0.1962116682949591, "grad_norm": 5.84375, "learning_rate": 9.878861573502044e-06, "loss": 0.87424936, "memory(GiB)": 117.26, "step": 8410, "train_speed(iter/s)": 0.207133 }, { "acc": 0.75709214, "epoch": 0.196444975867248, "grad_norm": 5.78125, "learning_rate": 9.878447909459423e-06, "loss": 0.88904228, "memory(GiB)": 117.26, "step": 8420, "train_speed(iter/s)": 0.207263 }, { "acc": 0.75347986, "epoch": 0.1966782834395369, "grad_norm": 5.25, "learning_rate": 9.878033549019781e-06, "loss": 0.89583454, "memory(GiB)": 117.26, "step": 8430, "train_speed(iter/s)": 0.207389 }, { "acc": 0.76018286, "epoch": 0.1969115910118258, "grad_norm": 5.09375, "learning_rate": 9.877618492242267e-06, "loss": 0.88538971, "memory(GiB)": 117.26, "step": 8440, "train_speed(iter/s)": 0.207516 }, { "acc": 0.76503086, "epoch": 0.19714489858411466, "grad_norm": 5.28125, "learning_rate": 9.877202739186132e-06, "loss": 0.85580301, "memory(GiB)": 117.26, "step": 8450, "train_speed(iter/s)": 0.207647 }, { "acc": 0.75894556, "epoch": 0.19737820615640356, "grad_norm": 5.65625, "learning_rate": 9.876786289910721e-06, "loss": 0.89989386, "memory(GiB)": 117.26, "step": 8460, "train_speed(iter/s)": 0.207769 }, { "acc": 0.76240826, "epoch": 0.19761151372869246, "grad_norm": 6.0, "learning_rate": 9.876369144475484e-06, "loss": 0.86274757, "memory(GiB)": 117.26, "step": 8470, "train_speed(iter/s)": 0.207891 }, { "acc": 0.74325132, "epoch": 0.19784482130098135, "grad_norm": 6.15625, "learning_rate": 9.875951302939967e-06, "loss": 0.92449026, "memory(GiB)": 117.26, "step": 8480, "train_speed(iter/s)": 0.208017 }, { "acc": 0.76734209, "epoch": 0.19807812887327025, "grad_norm": 6.65625, "learning_rate": 9.87553276536382e-06, "loss": 0.85071125, "memory(GiB)": 117.26, "step": 8490, "train_speed(iter/s)": 0.208133 }, { "acc": 0.75068078, "epoch": 0.19831143644555912, "grad_norm": 5.15625, "learning_rate": 9.875113531806785e-06, "loss": 0.89063568, "memory(GiB)": 117.26, "step": 8500, "train_speed(iter/s)": 0.208262 }, { "epoch": 0.19831143644555912, "eval_acc": 0.7283903788008351, "eval_loss": 0.8621500730514526, "eval_runtime": 1263.2854, "eval_samples_per_second": 28.49, "eval_steps_per_second": 14.245, "step": 8500 }, { "acc": 0.76076174, "epoch": 0.19854474401784802, "grad_norm": 5.78125, "learning_rate": 9.874693602328711e-06, "loss": 0.86771774, "memory(GiB)": 117.26, "step": 8510, "train_speed(iter/s)": 0.202008 }, { "acc": 0.75842085, "epoch": 0.19877805159013692, "grad_norm": 4.6875, "learning_rate": 9.874272976989541e-06, "loss": 0.88655357, "memory(GiB)": 117.26, "step": 8520, "train_speed(iter/s)": 0.202132 }, { "acc": 0.76707444, "epoch": 0.19901135916242582, "grad_norm": 10.0625, "learning_rate": 9.87385165584932e-06, "loss": 0.83357239, "memory(GiB)": 117.26, "step": 8530, "train_speed(iter/s)": 0.202263 }, { "acc": 0.77663488, "epoch": 0.19924466673471472, "grad_norm": 5.46875, "learning_rate": 9.873429638968191e-06, "loss": 0.80003738, "memory(GiB)": 117.26, "step": 8540, "train_speed(iter/s)": 0.202391 }, { "acc": 0.73521061, "epoch": 0.1994779743070036, "grad_norm": 7.125, "learning_rate": 9.873006926406397e-06, "loss": 0.99296751, "memory(GiB)": 117.26, "step": 8550, "train_speed(iter/s)": 0.202514 }, { "acc": 0.74921474, "epoch": 0.1997112818792925, "grad_norm": 5.71875, "learning_rate": 9.872583518224279e-06, "loss": 0.90615845, "memory(GiB)": 117.26, "step": 8560, "train_speed(iter/s)": 0.202636 }, { "acc": 0.76292572, "epoch": 0.1999445894515814, "grad_norm": 10.125, "learning_rate": 9.872159414482279e-06, "loss": 0.86987429, "memory(GiB)": 117.26, "step": 8570, "train_speed(iter/s)": 0.202758 }, { "acc": 0.75107698, "epoch": 0.2001778970238703, "grad_norm": 4.78125, "learning_rate": 9.871734615240938e-06, "loss": 0.91640205, "memory(GiB)": 117.26, "step": 8580, "train_speed(iter/s)": 0.202888 }, { "acc": 0.76004953, "epoch": 0.2004112045961592, "grad_norm": 6.625, "learning_rate": 9.871309120560897e-06, "loss": 0.87472467, "memory(GiB)": 117.26, "step": 8590, "train_speed(iter/s)": 0.203012 }, { "acc": 0.78374968, "epoch": 0.20064451216844806, "grad_norm": 6.71875, "learning_rate": 9.870882930502894e-06, "loss": 0.78140373, "memory(GiB)": 117.26, "step": 8600, "train_speed(iter/s)": 0.203126 }, { "acc": 0.76981058, "epoch": 0.20087781974073696, "grad_norm": 7.03125, "learning_rate": 9.870456045127767e-06, "loss": 0.85243578, "memory(GiB)": 117.26, "step": 8610, "train_speed(iter/s)": 0.203251 }, { "acc": 0.76617465, "epoch": 0.20111112731302586, "grad_norm": 8.6875, "learning_rate": 9.870028464496455e-06, "loss": 0.87566414, "memory(GiB)": 117.26, "step": 8620, "train_speed(iter/s)": 0.203372 }, { "acc": 0.74803514, "epoch": 0.20134443488531475, "grad_norm": 7.0625, "learning_rate": 9.869600188669995e-06, "loss": 0.91765823, "memory(GiB)": 117.26, "step": 8630, "train_speed(iter/s)": 0.203494 }, { "acc": 0.77351012, "epoch": 0.20157774245760363, "grad_norm": 12.0625, "learning_rate": 9.869171217709522e-06, "loss": 0.79958906, "memory(GiB)": 117.26, "step": 8640, "train_speed(iter/s)": 0.203611 }, { "acc": 0.76495771, "epoch": 0.20181105002989252, "grad_norm": 5.65625, "learning_rate": 9.86874155167627e-06, "loss": 0.83221092, "memory(GiB)": 117.26, "step": 8650, "train_speed(iter/s)": 0.203731 }, { "acc": 0.76300888, "epoch": 0.20204435760218142, "grad_norm": 5.40625, "learning_rate": 9.868311190631578e-06, "loss": 0.87261219, "memory(GiB)": 117.26, "step": 8660, "train_speed(iter/s)": 0.203849 }, { "acc": 0.7353982, "epoch": 0.20227766517447032, "grad_norm": 5.1875, "learning_rate": 9.867880134636877e-06, "loss": 0.96507521, "memory(GiB)": 117.26, "step": 8670, "train_speed(iter/s)": 0.203977 }, { "acc": 0.77588253, "epoch": 0.20251097274675922, "grad_norm": 5.96875, "learning_rate": 9.867448383753702e-06, "loss": 0.817729, "memory(GiB)": 117.26, "step": 8680, "train_speed(iter/s)": 0.204094 }, { "acc": 0.76220331, "epoch": 0.2027442803190481, "grad_norm": 7.0, "learning_rate": 9.867015938043685e-06, "loss": 0.87093048, "memory(GiB)": 117.26, "step": 8690, "train_speed(iter/s)": 0.204208 }, { "acc": 0.78435011, "epoch": 0.202977587891337, "grad_norm": 6.0625, "learning_rate": 9.866582797568556e-06, "loss": 0.76281714, "memory(GiB)": 117.26, "step": 8700, "train_speed(iter/s)": 0.204333 }, { "acc": 0.75618954, "epoch": 0.2032108954636259, "grad_norm": 6.21875, "learning_rate": 9.866148962390146e-06, "loss": 0.90392761, "memory(GiB)": 117.26, "step": 8710, "train_speed(iter/s)": 0.204453 }, { "acc": 0.75744371, "epoch": 0.2034442030359148, "grad_norm": 5.75, "learning_rate": 9.865714432570384e-06, "loss": 0.88389168, "memory(GiB)": 117.26, "step": 8720, "train_speed(iter/s)": 0.204577 }, { "acc": 0.754671, "epoch": 0.2036775106082037, "grad_norm": 7.21875, "learning_rate": 9.8652792081713e-06, "loss": 0.91077366, "memory(GiB)": 117.26, "step": 8730, "train_speed(iter/s)": 0.204685 }, { "acc": 0.75669479, "epoch": 0.20391081818049256, "grad_norm": 5.84375, "learning_rate": 9.864843289255026e-06, "loss": 0.89245796, "memory(GiB)": 117.26, "step": 8740, "train_speed(iter/s)": 0.204808 }, { "acc": 0.75149775, "epoch": 0.20414412575278146, "grad_norm": 5.96875, "learning_rate": 9.864406675883784e-06, "loss": 0.91204367, "memory(GiB)": 117.26, "step": 8750, "train_speed(iter/s)": 0.204929 }, { "acc": 0.75486255, "epoch": 0.20437743332507036, "grad_norm": 7.625, "learning_rate": 9.863969368119902e-06, "loss": 0.88566628, "memory(GiB)": 117.26, "step": 8760, "train_speed(iter/s)": 0.205043 }, { "acc": 0.75114489, "epoch": 0.20461074089735926, "grad_norm": 8.4375, "learning_rate": 9.863531366025804e-06, "loss": 0.915378, "memory(GiB)": 117.26, "step": 8770, "train_speed(iter/s)": 0.205171 }, { "acc": 0.75982847, "epoch": 0.20484404846964815, "grad_norm": 7.15625, "learning_rate": 9.863092669664018e-06, "loss": 0.87640762, "memory(GiB)": 117.26, "step": 8780, "train_speed(iter/s)": 0.205291 }, { "acc": 0.76948037, "epoch": 0.20507735604193703, "grad_norm": 4.96875, "learning_rate": 9.862653279097166e-06, "loss": 0.82533913, "memory(GiB)": 117.26, "step": 8790, "train_speed(iter/s)": 0.205408 }, { "acc": 0.74778366, "epoch": 0.20531066361422592, "grad_norm": 9.1875, "learning_rate": 9.86221319438797e-06, "loss": 0.89621792, "memory(GiB)": 117.26, "step": 8800, "train_speed(iter/s)": 0.205533 }, { "acc": 0.75014219, "epoch": 0.20554397118651482, "grad_norm": 5.21875, "learning_rate": 9.861772415599256e-06, "loss": 0.90685873, "memory(GiB)": 117.26, "step": 8810, "train_speed(iter/s)": 0.205651 }, { "acc": 0.75046358, "epoch": 0.20577727875880372, "grad_norm": 5.5, "learning_rate": 9.861330942793939e-06, "loss": 0.90041656, "memory(GiB)": 117.26, "step": 8820, "train_speed(iter/s)": 0.205763 }, { "acc": 0.76049786, "epoch": 0.20601058633109262, "grad_norm": 6.1875, "learning_rate": 9.860888776035043e-06, "loss": 0.87330513, "memory(GiB)": 117.26, "step": 8830, "train_speed(iter/s)": 0.205863 }, { "acc": 0.74571142, "epoch": 0.2062438939033815, "grad_norm": 6.40625, "learning_rate": 9.860445915385687e-06, "loss": 0.91440744, "memory(GiB)": 117.26, "step": 8840, "train_speed(iter/s)": 0.205986 }, { "acc": 0.75439949, "epoch": 0.2064772014756704, "grad_norm": 6.03125, "learning_rate": 9.860002360909086e-06, "loss": 0.87698917, "memory(GiB)": 117.26, "step": 8850, "train_speed(iter/s)": 0.2061 }, { "acc": 0.74465933, "epoch": 0.2067105090479593, "grad_norm": 8.3125, "learning_rate": 9.859558112668563e-06, "loss": 0.93020258, "memory(GiB)": 117.26, "step": 8860, "train_speed(iter/s)": 0.20622 }, { "acc": 0.77959089, "epoch": 0.2069438166202482, "grad_norm": 8.8125, "learning_rate": 9.85911317072753e-06, "loss": 0.81912613, "memory(GiB)": 117.26, "step": 8870, "train_speed(iter/s)": 0.20633 }, { "acc": 0.76712036, "epoch": 0.2071771241925371, "grad_norm": 6.53125, "learning_rate": 9.858667535149503e-06, "loss": 0.83284454, "memory(GiB)": 117.26, "step": 8880, "train_speed(iter/s)": 0.206456 }, { "acc": 0.75889626, "epoch": 0.20741043176482596, "grad_norm": 9.9375, "learning_rate": 9.858221205998097e-06, "loss": 0.85101032, "memory(GiB)": 117.26, "step": 8890, "train_speed(iter/s)": 0.206579 }, { "acc": 0.77509351, "epoch": 0.20764373933711486, "grad_norm": 6.65625, "learning_rate": 9.857774183337025e-06, "loss": 0.80764608, "memory(GiB)": 117.26, "step": 8900, "train_speed(iter/s)": 0.206703 }, { "acc": 0.76948729, "epoch": 0.20787704690940376, "grad_norm": 5.59375, "learning_rate": 9.8573264672301e-06, "loss": 0.82791748, "memory(GiB)": 117.26, "step": 8910, "train_speed(iter/s)": 0.206821 }, { "acc": 0.76785903, "epoch": 0.20811035448169266, "grad_norm": 5.25, "learning_rate": 9.856878057741233e-06, "loss": 0.82622242, "memory(GiB)": 117.26, "step": 8920, "train_speed(iter/s)": 0.206935 }, { "acc": 0.78260317, "epoch": 0.20834366205398153, "grad_norm": 6.78125, "learning_rate": 9.856428954934434e-06, "loss": 0.78127007, "memory(GiB)": 117.26, "step": 8930, "train_speed(iter/s)": 0.207052 }, { "acc": 0.73873916, "epoch": 0.20857696962627043, "grad_norm": 5.75, "learning_rate": 9.855979158873812e-06, "loss": 0.95875378, "memory(GiB)": 117.26, "step": 8940, "train_speed(iter/s)": 0.207176 }, { "acc": 0.78493519, "epoch": 0.20881027719855932, "grad_norm": 5.6875, "learning_rate": 9.855528669623576e-06, "loss": 0.750949, "memory(GiB)": 117.26, "step": 8950, "train_speed(iter/s)": 0.207293 }, { "acc": 0.75603533, "epoch": 0.20904358477084822, "grad_norm": 6.21875, "learning_rate": 9.855077487248034e-06, "loss": 0.89010067, "memory(GiB)": 117.26, "step": 8960, "train_speed(iter/s)": 0.207419 }, { "acc": 0.76982169, "epoch": 0.20927689234313712, "grad_norm": 4.4375, "learning_rate": 9.85462561181159e-06, "loss": 0.82097216, "memory(GiB)": 117.26, "step": 8970, "train_speed(iter/s)": 0.207535 }, { "acc": 0.75990238, "epoch": 0.209510199915426, "grad_norm": 5.65625, "learning_rate": 9.85417304337875e-06, "loss": 0.86552811, "memory(GiB)": 117.26, "step": 8980, "train_speed(iter/s)": 0.207651 }, { "acc": 0.74366322, "epoch": 0.2097435074877149, "grad_norm": 5.125, "learning_rate": 9.85371978201412e-06, "loss": 0.93035965, "memory(GiB)": 117.26, "step": 8990, "train_speed(iter/s)": 0.207771 }, { "acc": 0.76551628, "epoch": 0.2099768150600038, "grad_norm": 5.03125, "learning_rate": 9.8532658277824e-06, "loss": 0.85799885, "memory(GiB)": 117.26, "step": 9000, "train_speed(iter/s)": 0.207891 }, { "epoch": 0.2099768150600038, "eval_acc": 0.7287714707049716, "eval_loss": 0.8602161407470703, "eval_runtime": 1264.0959, "eval_samples_per_second": 28.472, "eval_steps_per_second": 14.236, "step": 9000 }, { "acc": 0.77338619, "epoch": 0.2102101226322927, "grad_norm": 5.25, "learning_rate": 9.852811180748391e-06, "loss": 0.84611912, "memory(GiB)": 117.26, "step": 9010, "train_speed(iter/s)": 0.20199 }, { "acc": 0.75551915, "epoch": 0.2104434302045816, "grad_norm": 6.25, "learning_rate": 9.852355840976996e-06, "loss": 0.88547592, "memory(GiB)": 117.26, "step": 9020, "train_speed(iter/s)": 0.202102 }, { "acc": 0.76707592, "epoch": 0.21067673777687046, "grad_norm": 9.625, "learning_rate": 9.851899808533218e-06, "loss": 0.84760303, "memory(GiB)": 117.26, "step": 9030, "train_speed(iter/s)": 0.20222 }, { "acc": 0.75320301, "epoch": 0.21091004534915936, "grad_norm": 5.375, "learning_rate": 9.851443083482149e-06, "loss": 0.90875149, "memory(GiB)": 117.26, "step": 9040, "train_speed(iter/s)": 0.202334 }, { "acc": 0.7483633, "epoch": 0.21114335292144826, "grad_norm": 6.65625, "learning_rate": 9.850985665888988e-06, "loss": 0.89777012, "memory(GiB)": 117.26, "step": 9050, "train_speed(iter/s)": 0.202444 }, { "acc": 0.75853233, "epoch": 0.21137666049373716, "grad_norm": 6.40625, "learning_rate": 9.850527555819036e-06, "loss": 0.86834946, "memory(GiB)": 117.26, "step": 9060, "train_speed(iter/s)": 0.202553 }, { "acc": 0.76872091, "epoch": 0.21160996806602606, "grad_norm": 5.3125, "learning_rate": 9.850068753337683e-06, "loss": 0.84271317, "memory(GiB)": 117.26, "step": 9070, "train_speed(iter/s)": 0.202662 }, { "acc": 0.75690665, "epoch": 0.21184327563831493, "grad_norm": 5.90625, "learning_rate": 9.849609258510423e-06, "loss": 0.85695324, "memory(GiB)": 117.26, "step": 9080, "train_speed(iter/s)": 0.20278 }, { "acc": 0.7541317, "epoch": 0.21207658321060383, "grad_norm": 9.375, "learning_rate": 9.84914907140285e-06, "loss": 0.88244171, "memory(GiB)": 117.26, "step": 9090, "train_speed(iter/s)": 0.202895 }, { "acc": 0.7579011, "epoch": 0.21230989078289272, "grad_norm": 5.4375, "learning_rate": 9.848688192080657e-06, "loss": 0.8640419, "memory(GiB)": 117.26, "step": 9100, "train_speed(iter/s)": 0.20301 }, { "acc": 0.77100873, "epoch": 0.21254319835518162, "grad_norm": 5.90625, "learning_rate": 9.848226620609634e-06, "loss": 0.84852943, "memory(GiB)": 117.26, "step": 9110, "train_speed(iter/s)": 0.203122 }, { "acc": 0.76369905, "epoch": 0.21277650592747052, "grad_norm": 6.3125, "learning_rate": 9.847764357055669e-06, "loss": 0.8594471, "memory(GiB)": 117.26, "step": 9120, "train_speed(iter/s)": 0.203247 }, { "acc": 0.76762819, "epoch": 0.2130098134997594, "grad_norm": 5.59375, "learning_rate": 9.84730140148475e-06, "loss": 0.84239712, "memory(GiB)": 117.26, "step": 9130, "train_speed(iter/s)": 0.203364 }, { "acc": 0.7475502, "epoch": 0.2132431210720483, "grad_norm": 6.1875, "learning_rate": 9.846837753962964e-06, "loss": 0.91216602, "memory(GiB)": 117.26, "step": 9140, "train_speed(iter/s)": 0.203481 }, { "acc": 0.76038733, "epoch": 0.2134764286443372, "grad_norm": 5.21875, "learning_rate": 9.846373414556495e-06, "loss": 0.86435537, "memory(GiB)": 117.26, "step": 9150, "train_speed(iter/s)": 0.2036 }, { "acc": 0.7598423, "epoch": 0.2137097362166261, "grad_norm": 4.53125, "learning_rate": 9.84590838333163e-06, "loss": 0.86975765, "memory(GiB)": 117.26, "step": 9160, "train_speed(iter/s)": 0.20371 }, { "acc": 0.78203835, "epoch": 0.21394304378891496, "grad_norm": 5.15625, "learning_rate": 9.845442660354752e-06, "loss": 0.78698754, "memory(GiB)": 117.26, "step": 9170, "train_speed(iter/s)": 0.203828 }, { "acc": 0.77016506, "epoch": 0.21417635136120386, "grad_norm": 7.0625, "learning_rate": 9.844976245692341e-06, "loss": 0.83244095, "memory(GiB)": 117.26, "step": 9180, "train_speed(iter/s)": 0.203947 }, { "acc": 0.76222048, "epoch": 0.21440965893349276, "grad_norm": 6.15625, "learning_rate": 9.84450913941098e-06, "loss": 0.83710518, "memory(GiB)": 117.26, "step": 9190, "train_speed(iter/s)": 0.20406 }, { "acc": 0.76509194, "epoch": 0.21464296650578166, "grad_norm": 6.3125, "learning_rate": 9.844041341577344e-06, "loss": 0.83961344, "memory(GiB)": 117.26, "step": 9200, "train_speed(iter/s)": 0.204174 }, { "acc": 0.7399087, "epoch": 0.21487627407807056, "grad_norm": 6.375, "learning_rate": 9.843572852258216e-06, "loss": 0.93852739, "memory(GiB)": 117.26, "step": 9210, "train_speed(iter/s)": 0.204283 }, { "acc": 0.74108911, "epoch": 0.21510958165035943, "grad_norm": 10.25, "learning_rate": 9.843103671520469e-06, "loss": 0.94250774, "memory(GiB)": 117.26, "step": 9220, "train_speed(iter/s)": 0.204396 }, { "acc": 0.75845308, "epoch": 0.21534288922264833, "grad_norm": 6.28125, "learning_rate": 9.842633799431081e-06, "loss": 0.88958778, "memory(GiB)": 117.26, "step": 9230, "train_speed(iter/s)": 0.20451 }, { "acc": 0.7644557, "epoch": 0.21557619679493722, "grad_norm": 7.625, "learning_rate": 9.842163236057123e-06, "loss": 0.86300106, "memory(GiB)": 117.26, "step": 9240, "train_speed(iter/s)": 0.204622 }, { "acc": 0.77312346, "epoch": 0.21580950436722612, "grad_norm": 6.0625, "learning_rate": 9.841691981465771e-06, "loss": 0.81684513, "memory(GiB)": 117.26, "step": 9250, "train_speed(iter/s)": 0.204743 }, { "acc": 0.7573225, "epoch": 0.21604281193951502, "grad_norm": 7.5, "learning_rate": 9.841220035724295e-06, "loss": 0.89299822, "memory(GiB)": 117.26, "step": 9260, "train_speed(iter/s)": 0.204864 }, { "acc": 0.76990786, "epoch": 0.2162761195118039, "grad_norm": 5.34375, "learning_rate": 9.840747398900066e-06, "loss": 0.82335033, "memory(GiB)": 117.26, "step": 9270, "train_speed(iter/s)": 0.204977 }, { "acc": 0.75264125, "epoch": 0.2165094270840928, "grad_norm": 6.4375, "learning_rate": 9.840274071060552e-06, "loss": 0.92649002, "memory(GiB)": 117.26, "step": 9280, "train_speed(iter/s)": 0.205091 }, { "acc": 0.76155519, "epoch": 0.2167427346563817, "grad_norm": 7.25, "learning_rate": 9.839800052273319e-06, "loss": 0.85641518, "memory(GiB)": 117.26, "step": 9290, "train_speed(iter/s)": 0.205201 }, { "acc": 0.77669697, "epoch": 0.2169760422286706, "grad_norm": 4.6875, "learning_rate": 9.839325342606034e-06, "loss": 0.80259094, "memory(GiB)": 117.26, "step": 9300, "train_speed(iter/s)": 0.205309 }, { "acc": 0.75696387, "epoch": 0.2172093498009595, "grad_norm": 5.28125, "learning_rate": 9.838849942126465e-06, "loss": 0.88896065, "memory(GiB)": 117.26, "step": 9310, "train_speed(iter/s)": 0.205419 }, { "acc": 0.7682416, "epoch": 0.21744265737324836, "grad_norm": 5.0, "learning_rate": 9.83837385090247e-06, "loss": 0.84865627, "memory(GiB)": 117.26, "step": 9320, "train_speed(iter/s)": 0.205535 }, { "acc": 0.76478214, "epoch": 0.21767596494553726, "grad_norm": 8.4375, "learning_rate": 9.837897069002014e-06, "loss": 0.87260275, "memory(GiB)": 117.26, "step": 9330, "train_speed(iter/s)": 0.205643 }, { "acc": 0.7567647, "epoch": 0.21790927251782616, "grad_norm": 6.90625, "learning_rate": 9.837419596493158e-06, "loss": 0.90559921, "memory(GiB)": 117.26, "step": 9340, "train_speed(iter/s)": 0.205756 }, { "acc": 0.76952367, "epoch": 0.21814258009011506, "grad_norm": 5.0, "learning_rate": 9.836941433444058e-06, "loss": 0.83648939, "memory(GiB)": 117.26, "step": 9350, "train_speed(iter/s)": 0.20587 }, { "acc": 0.75098333, "epoch": 0.21837588766240396, "grad_norm": 6.0625, "learning_rate": 9.836462579922977e-06, "loss": 0.88260384, "memory(GiB)": 117.26, "step": 9360, "train_speed(iter/s)": 0.205987 }, { "acc": 0.77787137, "epoch": 0.21860919523469283, "grad_norm": 10.875, "learning_rate": 9.835983035998264e-06, "loss": 0.80013981, "memory(GiB)": 117.26, "step": 9370, "train_speed(iter/s)": 0.206104 }, { "acc": 0.77306685, "epoch": 0.21884250280698173, "grad_norm": 5.9375, "learning_rate": 9.835502801738379e-06, "loss": 0.81299639, "memory(GiB)": 117.26, "step": 9380, "train_speed(iter/s)": 0.206214 }, { "acc": 0.75491304, "epoch": 0.21907581037927062, "grad_norm": 9.5, "learning_rate": 9.835021877211873e-06, "loss": 0.87373734, "memory(GiB)": 117.26, "step": 9390, "train_speed(iter/s)": 0.206326 }, { "acc": 0.78643999, "epoch": 0.21930911795155952, "grad_norm": 5.0, "learning_rate": 9.834540262487399e-06, "loss": 0.80302534, "memory(GiB)": 117.26, "step": 9400, "train_speed(iter/s)": 0.206441 }, { "acc": 0.76015272, "epoch": 0.2195424255238484, "grad_norm": 5.15625, "learning_rate": 9.834057957633707e-06, "loss": 0.87113552, "memory(GiB)": 117.26, "step": 9410, "train_speed(iter/s)": 0.206548 }, { "acc": 0.77033644, "epoch": 0.2197757330961373, "grad_norm": 6.75, "learning_rate": 9.833574962719646e-06, "loss": 0.84114103, "memory(GiB)": 117.26, "step": 9420, "train_speed(iter/s)": 0.206651 }, { "acc": 0.74658957, "epoch": 0.2200090406684262, "grad_norm": 8.0625, "learning_rate": 9.833091277814163e-06, "loss": 0.92295256, "memory(GiB)": 117.26, "step": 9430, "train_speed(iter/s)": 0.206765 }, { "acc": 0.77327833, "epoch": 0.2202423482407151, "grad_norm": 6.3125, "learning_rate": 9.832606902986305e-06, "loss": 0.82966223, "memory(GiB)": 117.26, "step": 9440, "train_speed(iter/s)": 0.206878 }, { "acc": 0.75789309, "epoch": 0.220475655813004, "grad_norm": 6.59375, "learning_rate": 9.832121838305214e-06, "loss": 0.86921644, "memory(GiB)": 117.26, "step": 9450, "train_speed(iter/s)": 0.206994 }, { "acc": 0.76153183, "epoch": 0.22070896338529286, "grad_norm": 5.28125, "learning_rate": 9.831636083840135e-06, "loss": 0.87766953, "memory(GiB)": 117.26, "step": 9460, "train_speed(iter/s)": 0.207111 }, { "acc": 0.77497025, "epoch": 0.22094227095758176, "grad_norm": 9.0625, "learning_rate": 9.831149639660409e-06, "loss": 0.79494495, "memory(GiB)": 117.26, "step": 9470, "train_speed(iter/s)": 0.207218 }, { "acc": 0.74823189, "epoch": 0.22117557852987066, "grad_norm": 7.46875, "learning_rate": 9.830662505835476e-06, "loss": 0.9338522, "memory(GiB)": 117.26, "step": 9480, "train_speed(iter/s)": 0.207335 }, { "acc": 0.74765511, "epoch": 0.22140888610215956, "grad_norm": 4.875, "learning_rate": 9.830174682434872e-06, "loss": 0.89984465, "memory(GiB)": 117.26, "step": 9490, "train_speed(iter/s)": 0.207451 }, { "acc": 0.76869669, "epoch": 0.22164219367444846, "grad_norm": 5.53125, "learning_rate": 9.829686169528237e-06, "loss": 0.84101973, "memory(GiB)": 117.26, "step": 9500, "train_speed(iter/s)": 0.207562 }, { "epoch": 0.22164219367444846, "eval_acc": 0.7288260046014992, "eval_loss": 0.8586822748184204, "eval_runtime": 1262.9062, "eval_samples_per_second": 28.499, "eval_steps_per_second": 14.25, "step": 9500 }, { "acc": 0.7722209, "epoch": 0.22187550124673733, "grad_norm": 5.34375, "learning_rate": 9.829196967185302e-06, "loss": 0.81807728, "memory(GiB)": 117.26, "step": 9510, "train_speed(iter/s)": 0.201988 }, { "acc": 0.77087946, "epoch": 0.22210880881902623, "grad_norm": 7.65625, "learning_rate": 9.828707075475905e-06, "loss": 0.80396833, "memory(GiB)": 117.26, "step": 9520, "train_speed(iter/s)": 0.202105 }, { "acc": 0.77802572, "epoch": 0.22234211639131513, "grad_norm": 6.71875, "learning_rate": 9.828216494469975e-06, "loss": 0.79953718, "memory(GiB)": 117.26, "step": 9530, "train_speed(iter/s)": 0.202219 }, { "acc": 0.75859208, "epoch": 0.22257542396360402, "grad_norm": 5.875, "learning_rate": 9.827725224237542e-06, "loss": 0.88073997, "memory(GiB)": 117.26, "step": 9540, "train_speed(iter/s)": 0.202324 }, { "acc": 0.74761353, "epoch": 0.22280873153589292, "grad_norm": 9.125, "learning_rate": 9.827233264848737e-06, "loss": 0.91719131, "memory(GiB)": 117.26, "step": 9550, "train_speed(iter/s)": 0.202431 }, { "acc": 0.7573863, "epoch": 0.2230420391081818, "grad_norm": 6.46875, "learning_rate": 9.826740616373785e-06, "loss": 0.87212486, "memory(GiB)": 117.26, "step": 9560, "train_speed(iter/s)": 0.202544 }, { "acc": 0.74880018, "epoch": 0.2232753466804707, "grad_norm": 10.1875, "learning_rate": 9.826247278883012e-06, "loss": 0.9470892, "memory(GiB)": 117.26, "step": 9570, "train_speed(iter/s)": 0.202656 }, { "acc": 0.76526189, "epoch": 0.2235086542527596, "grad_norm": 5.1875, "learning_rate": 9.825753252446843e-06, "loss": 0.87419567, "memory(GiB)": 117.26, "step": 9580, "train_speed(iter/s)": 0.202765 }, { "acc": 0.7914381, "epoch": 0.2237419618250485, "grad_norm": 6.5, "learning_rate": 9.825258537135798e-06, "loss": 0.7480257, "memory(GiB)": 117.26, "step": 9590, "train_speed(iter/s)": 0.202878 }, { "acc": 0.77315226, "epoch": 0.2239752693973374, "grad_norm": 5.8125, "learning_rate": 9.8247631330205e-06, "loss": 0.81920538, "memory(GiB)": 117.26, "step": 9600, "train_speed(iter/s)": 0.202992 }, { "acc": 0.77518106, "epoch": 0.22420857696962626, "grad_norm": 21.5, "learning_rate": 9.824267040171666e-06, "loss": 0.83390617, "memory(GiB)": 117.26, "step": 9610, "train_speed(iter/s)": 0.203095 }, { "acc": 0.76186266, "epoch": 0.22444188454191516, "grad_norm": 5.46875, "learning_rate": 9.823770258660113e-06, "loss": 0.85461226, "memory(GiB)": 117.26, "step": 9620, "train_speed(iter/s)": 0.203203 }, { "acc": 0.77864699, "epoch": 0.22467519211420406, "grad_norm": 5.125, "learning_rate": 9.823272788556757e-06, "loss": 0.79788122, "memory(GiB)": 117.26, "step": 9630, "train_speed(iter/s)": 0.203309 }, { "acc": 0.75430193, "epoch": 0.22490849968649296, "grad_norm": 7.0, "learning_rate": 9.822774629932612e-06, "loss": 0.90059919, "memory(GiB)": 117.26, "step": 9640, "train_speed(iter/s)": 0.203418 }, { "acc": 0.75824127, "epoch": 0.22514180725878186, "grad_norm": 6.59375, "learning_rate": 9.822275782858788e-06, "loss": 0.87358227, "memory(GiB)": 117.26, "step": 9650, "train_speed(iter/s)": 0.203529 }, { "acc": 0.73930073, "epoch": 0.22537511483107073, "grad_norm": 5.5625, "learning_rate": 9.821776247406498e-06, "loss": 0.94385147, "memory(GiB)": 117.26, "step": 9660, "train_speed(iter/s)": 0.203634 }, { "acc": 0.76484847, "epoch": 0.22560842240335963, "grad_norm": 6.03125, "learning_rate": 9.821276023647049e-06, "loss": 0.85446606, "memory(GiB)": 117.26, "step": 9670, "train_speed(iter/s)": 0.203742 }, { "acc": 0.78049212, "epoch": 0.22584172997564853, "grad_norm": 6.46875, "learning_rate": 9.820775111651849e-06, "loss": 0.79773602, "memory(GiB)": 117.26, "step": 9680, "train_speed(iter/s)": 0.203856 }, { "acc": 0.75696478, "epoch": 0.22607503754793742, "grad_norm": 6.15625, "learning_rate": 9.820273511492401e-06, "loss": 0.97019539, "memory(GiB)": 117.26, "step": 9690, "train_speed(iter/s)": 0.203967 }, { "acc": 0.75319004, "epoch": 0.2263083451202263, "grad_norm": 4.6875, "learning_rate": 9.819771223240312e-06, "loss": 0.90383863, "memory(GiB)": 117.26, "step": 9700, "train_speed(iter/s)": 0.204081 }, { "acc": 0.76351118, "epoch": 0.2265416526925152, "grad_norm": 7.1875, "learning_rate": 9.819268246967279e-06, "loss": 0.85396194, "memory(GiB)": 117.26, "step": 9710, "train_speed(iter/s)": 0.204165 }, { "acc": 0.76212788, "epoch": 0.2267749602648041, "grad_norm": 10.0, "learning_rate": 9.818764582745103e-06, "loss": 0.86051655, "memory(GiB)": 117.26, "step": 9720, "train_speed(iter/s)": 0.204278 }, { "acc": 0.77602925, "epoch": 0.227008267837093, "grad_norm": 5.75, "learning_rate": 9.818260230645684e-06, "loss": 0.80839624, "memory(GiB)": 117.26, "step": 9730, "train_speed(iter/s)": 0.204388 }, { "acc": 0.76066914, "epoch": 0.2272415754093819, "grad_norm": 5.5625, "learning_rate": 9.817755190741018e-06, "loss": 0.86518803, "memory(GiB)": 117.26, "step": 9740, "train_speed(iter/s)": 0.204501 }, { "acc": 0.76696286, "epoch": 0.22747488298167076, "grad_norm": 8.1875, "learning_rate": 9.817249463103196e-06, "loss": 0.86744308, "memory(GiB)": 117.26, "step": 9750, "train_speed(iter/s)": 0.2046 }, { "acc": 0.77410145, "epoch": 0.22770819055395966, "grad_norm": 8.0625, "learning_rate": 9.816743047804413e-06, "loss": 0.80678148, "memory(GiB)": 117.26, "step": 9760, "train_speed(iter/s)": 0.204712 }, { "acc": 0.75208983, "epoch": 0.22794149812624856, "grad_norm": 4.1875, "learning_rate": 9.816235944916959e-06, "loss": 0.90964394, "memory(GiB)": 117.26, "step": 9770, "train_speed(iter/s)": 0.204809 }, { "acc": 0.76018462, "epoch": 0.22817480569853746, "grad_norm": 7.90625, "learning_rate": 9.815728154513224e-06, "loss": 0.87123966, "memory(GiB)": 117.26, "step": 9780, "train_speed(iter/s)": 0.204913 }, { "acc": 0.7624814, "epoch": 0.22840811327082636, "grad_norm": 6.90625, "learning_rate": 9.815219676665694e-06, "loss": 0.83920116, "memory(GiB)": 117.26, "step": 9790, "train_speed(iter/s)": 0.204999 }, { "acc": 0.74705095, "epoch": 0.22864142084311523, "grad_norm": 6.90625, "learning_rate": 9.814710511446954e-06, "loss": 0.92915916, "memory(GiB)": 117.26, "step": 9800, "train_speed(iter/s)": 0.205106 }, { "acc": 0.77933888, "epoch": 0.22887472841540413, "grad_norm": 5.6875, "learning_rate": 9.814200658929686e-06, "loss": 0.79282527, "memory(GiB)": 117.26, "step": 9810, "train_speed(iter/s)": 0.205207 }, { "acc": 0.76735115, "epoch": 0.22910803598769303, "grad_norm": 5.875, "learning_rate": 9.813690119186673e-06, "loss": 0.84209175, "memory(GiB)": 117.26, "step": 9820, "train_speed(iter/s)": 0.205314 }, { "acc": 0.7615449, "epoch": 0.22934134355998193, "grad_norm": 6.5625, "learning_rate": 9.813178892290793e-06, "loss": 0.87921515, "memory(GiB)": 117.26, "step": 9830, "train_speed(iter/s)": 0.205423 }, { "acc": 0.77768145, "epoch": 0.22957465113227082, "grad_norm": 6.28125, "learning_rate": 9.812666978315026e-06, "loss": 0.8066925, "memory(GiB)": 117.26, "step": 9840, "train_speed(iter/s)": 0.205538 }, { "acc": 0.74583302, "epoch": 0.2298079587045597, "grad_norm": 5.625, "learning_rate": 9.812154377332446e-06, "loss": 0.92973843, "memory(GiB)": 117.26, "step": 9850, "train_speed(iter/s)": 0.205653 }, { "acc": 0.76136565, "epoch": 0.2300412662768486, "grad_norm": 5.125, "learning_rate": 9.811641089416225e-06, "loss": 0.87200413, "memory(GiB)": 117.26, "step": 9860, "train_speed(iter/s)": 0.205758 }, { "acc": 0.75204549, "epoch": 0.2302745738491375, "grad_norm": 7.53125, "learning_rate": 9.811127114639637e-06, "loss": 0.89518375, "memory(GiB)": 117.26, "step": 9870, "train_speed(iter/s)": 0.205866 }, { "acc": 0.77171698, "epoch": 0.2305078814214264, "grad_norm": 5.28125, "learning_rate": 9.810612453076052e-06, "loss": 0.83219414, "memory(GiB)": 117.26, "step": 9880, "train_speed(iter/s)": 0.205978 }, { "acc": 0.77274904, "epoch": 0.2307411889937153, "grad_norm": 9.8125, "learning_rate": 9.810097104798934e-06, "loss": 0.83192282, "memory(GiB)": 117.26, "step": 9890, "train_speed(iter/s)": 0.206094 }, { "acc": 0.76826916, "epoch": 0.23097449656600416, "grad_norm": 6.65625, "learning_rate": 9.809581069881854e-06, "loss": 0.83542023, "memory(GiB)": 117.26, "step": 9900, "train_speed(iter/s)": 0.206206 }, { "acc": 0.76174197, "epoch": 0.23120780413829306, "grad_norm": 7.625, "learning_rate": 9.809064348398474e-06, "loss": 0.86499538, "memory(GiB)": 117.26, "step": 9910, "train_speed(iter/s)": 0.206314 }, { "acc": 0.76810961, "epoch": 0.23144111171058196, "grad_norm": 8.625, "learning_rate": 9.808546940422555e-06, "loss": 0.81102448, "memory(GiB)": 117.26, "step": 9920, "train_speed(iter/s)": 0.206422 }, { "acc": 0.773141, "epoch": 0.23167441928287086, "grad_norm": 5.28125, "learning_rate": 9.808028846027954e-06, "loss": 0.83236446, "memory(GiB)": 117.26, "step": 9930, "train_speed(iter/s)": 0.206517 }, { "acc": 0.76415548, "epoch": 0.23190772685515973, "grad_norm": 4.84375, "learning_rate": 9.807510065288635e-06, "loss": 0.87150354, "memory(GiB)": 117.26, "step": 9940, "train_speed(iter/s)": 0.206618 }, { "acc": 0.77097445, "epoch": 0.23214103442744863, "grad_norm": 4.78125, "learning_rate": 9.806990598278651e-06, "loss": 0.82122641, "memory(GiB)": 117.26, "step": 9950, "train_speed(iter/s)": 0.20672 }, { "acc": 0.76463575, "epoch": 0.23237434199973753, "grad_norm": 5.75, "learning_rate": 9.806470445072156e-06, "loss": 0.85506172, "memory(GiB)": 122.82, "step": 9960, "train_speed(iter/s)": 0.206815 }, { "acc": 0.75702257, "epoch": 0.23260764957202643, "grad_norm": 4.28125, "learning_rate": 9.8059496057434e-06, "loss": 0.86262856, "memory(GiB)": 122.82, "step": 9970, "train_speed(iter/s)": 0.206909 }, { "acc": 0.76322374, "epoch": 0.23284095714431532, "grad_norm": 8.0, "learning_rate": 9.805428080366733e-06, "loss": 0.84612598, "memory(GiB)": 122.82, "step": 9980, "train_speed(iter/s)": 0.20701 }, { "acc": 0.75973806, "epoch": 0.2330742647166042, "grad_norm": 10.0, "learning_rate": 9.804905869016603e-06, "loss": 0.87426147, "memory(GiB)": 122.82, "step": 9990, "train_speed(iter/s)": 0.207113 }, { "acc": 0.73618741, "epoch": 0.2333075722888931, "grad_norm": 6.1875, "learning_rate": 9.804382971767559e-06, "loss": 1.00271454, "memory(GiB)": 122.82, "step": 10000, "train_speed(iter/s)": 0.207219 }, { "epoch": 0.2333075722888931, "eval_acc": 0.7292821209490834, "eval_loss": 0.8574042916297913, "eval_runtime": 1263.5838, "eval_samples_per_second": 28.483, "eval_steps_per_second": 14.242, "step": 10000 }, { "acc": 0.75128431, "epoch": 0.233540879861182, "grad_norm": 7.125, "learning_rate": 9.803859388694238e-06, "loss": 0.91414642, "memory(GiB)": 122.82, "step": 10010, "train_speed(iter/s)": 0.201936 }, { "acc": 0.77839055, "epoch": 0.2337741874334709, "grad_norm": 9.1875, "learning_rate": 9.803335119871388e-06, "loss": 0.80446138, "memory(GiB)": 122.82, "step": 10020, "train_speed(iter/s)": 0.202031 }, { "acc": 0.74778352, "epoch": 0.2340074950057598, "grad_norm": 12.125, "learning_rate": 9.802810165373845e-06, "loss": 0.92328253, "memory(GiB)": 122.82, "step": 10030, "train_speed(iter/s)": 0.20214 }, { "acc": 0.73962555, "epoch": 0.23424080257804866, "grad_norm": 11.1875, "learning_rate": 9.802284525276544e-06, "loss": 0.96148548, "memory(GiB)": 122.82, "step": 10040, "train_speed(iter/s)": 0.202248 }, { "acc": 0.76071081, "epoch": 0.23447411015033756, "grad_norm": 7.625, "learning_rate": 9.801758199654522e-06, "loss": 0.86371326, "memory(GiB)": 122.82, "step": 10050, "train_speed(iter/s)": 0.202343 }, { "acc": 0.74303417, "epoch": 0.23470741772262646, "grad_norm": 6.0625, "learning_rate": 9.801231188582914e-06, "loss": 0.94352255, "memory(GiB)": 122.82, "step": 10060, "train_speed(iter/s)": 0.202449 }, { "acc": 0.76320934, "epoch": 0.23494072529491536, "grad_norm": 7.28125, "learning_rate": 9.800703492136948e-06, "loss": 0.85445366, "memory(GiB)": 122.82, "step": 10070, "train_speed(iter/s)": 0.202554 }, { "acc": 0.75372086, "epoch": 0.23517403286720426, "grad_norm": 5.8125, "learning_rate": 9.800175110391952e-06, "loss": 0.90433693, "memory(GiB)": 122.82, "step": 10080, "train_speed(iter/s)": 0.202664 }, { "acc": 0.73736844, "epoch": 0.23540734043949313, "grad_norm": 5.09375, "learning_rate": 9.799646043423353e-06, "loss": 0.95774345, "memory(GiB)": 122.82, "step": 10090, "train_speed(iter/s)": 0.202765 }, { "acc": 0.76232147, "epoch": 0.23564064801178203, "grad_norm": 5.9375, "learning_rate": 9.799116291306677e-06, "loss": 0.86406269, "memory(GiB)": 122.82, "step": 10100, "train_speed(iter/s)": 0.202865 }, { "acc": 0.77501373, "epoch": 0.23587395558407093, "grad_norm": 5.4375, "learning_rate": 9.798585854117543e-06, "loss": 0.81827021, "memory(GiB)": 122.82, "step": 10110, "train_speed(iter/s)": 0.202967 }, { "acc": 0.76076441, "epoch": 0.23610726315635983, "grad_norm": 7.59375, "learning_rate": 9.798054731931674e-06, "loss": 0.87671032, "memory(GiB)": 122.82, "step": 10120, "train_speed(iter/s)": 0.203063 }, { "acc": 0.74755526, "epoch": 0.23634057072864872, "grad_norm": 6.4375, "learning_rate": 9.797522924824886e-06, "loss": 0.91171093, "memory(GiB)": 122.82, "step": 10130, "train_speed(iter/s)": 0.203169 }, { "acc": 0.77530651, "epoch": 0.2365738783009376, "grad_norm": 7.6875, "learning_rate": 9.796990432873093e-06, "loss": 0.82149048, "memory(GiB)": 122.82, "step": 10140, "train_speed(iter/s)": 0.203273 }, { "acc": 0.76692486, "epoch": 0.2368071858732265, "grad_norm": 6.34375, "learning_rate": 9.79645725615231e-06, "loss": 0.85743542, "memory(GiB)": 122.82, "step": 10150, "train_speed(iter/s)": 0.203382 }, { "acc": 0.77266049, "epoch": 0.2370404934455154, "grad_norm": 5.09375, "learning_rate": 9.795923394738646e-06, "loss": 0.81590748, "memory(GiB)": 126.71, "step": 10160, "train_speed(iter/s)": 0.203471 }, { "acc": 0.7645483, "epoch": 0.2372738010178043, "grad_norm": 7.25, "learning_rate": 9.795388848708312e-06, "loss": 0.8682291, "memory(GiB)": 126.71, "step": 10170, "train_speed(iter/s)": 0.203572 }, { "acc": 0.74908571, "epoch": 0.23750710859009316, "grad_norm": 5.75, "learning_rate": 9.794853618137612e-06, "loss": 0.92586708, "memory(GiB)": 126.71, "step": 10180, "train_speed(iter/s)": 0.20368 }, { "acc": 0.76867323, "epoch": 0.23774041616238206, "grad_norm": 6.03125, "learning_rate": 9.794317703102951e-06, "loss": 0.84071312, "memory(GiB)": 126.71, "step": 10190, "train_speed(iter/s)": 0.203784 }, { "acc": 0.75858183, "epoch": 0.23797372373467096, "grad_norm": 6.3125, "learning_rate": 9.793781103680833e-06, "loss": 0.90144587, "memory(GiB)": 126.71, "step": 10200, "train_speed(iter/s)": 0.203887 }, { "acc": 0.77408137, "epoch": 0.23820703130695986, "grad_norm": 5.5625, "learning_rate": 9.793243819947851e-06, "loss": 0.82780151, "memory(GiB)": 126.71, "step": 10210, "train_speed(iter/s)": 0.203999 }, { "acc": 0.75728579, "epoch": 0.23844033887924876, "grad_norm": 7.46875, "learning_rate": 9.79270585198071e-06, "loss": 0.8856432, "memory(GiB)": 126.71, "step": 10220, "train_speed(iter/s)": 0.204102 }, { "acc": 0.76272054, "epoch": 0.23867364645153763, "grad_norm": 7.25, "learning_rate": 9.792167199856198e-06, "loss": 0.86025734, "memory(GiB)": 126.71, "step": 10230, "train_speed(iter/s)": 0.204193 }, { "acc": 0.78271184, "epoch": 0.23890695402382653, "grad_norm": 5.0, "learning_rate": 9.791627863651212e-06, "loss": 0.78712101, "memory(GiB)": 126.71, "step": 10240, "train_speed(iter/s)": 0.204298 }, { "acc": 0.73603754, "epoch": 0.23914026159611543, "grad_norm": 7.15625, "learning_rate": 9.791087843442738e-06, "loss": 0.97998352, "memory(GiB)": 126.71, "step": 10250, "train_speed(iter/s)": 0.204403 }, { "acc": 0.76171508, "epoch": 0.23937356916840433, "grad_norm": 6.40625, "learning_rate": 9.790547139307869e-06, "loss": 0.86703663, "memory(GiB)": 126.71, "step": 10260, "train_speed(iter/s)": 0.204508 }, { "acc": 0.76325674, "epoch": 0.23960687674069323, "grad_norm": 6.3125, "learning_rate": 9.790005751323787e-06, "loss": 0.8503664, "memory(GiB)": 126.71, "step": 10270, "train_speed(iter/s)": 0.204602 }, { "acc": 0.7679719, "epoch": 0.2398401843129821, "grad_norm": 10.0625, "learning_rate": 9.789463679567775e-06, "loss": 0.8453804, "memory(GiB)": 126.71, "step": 10280, "train_speed(iter/s)": 0.204692 }, { "acc": 0.76253929, "epoch": 0.240073491885271, "grad_norm": 6.125, "learning_rate": 9.788920924117213e-06, "loss": 0.88661613, "memory(GiB)": 126.71, "step": 10290, "train_speed(iter/s)": 0.204795 }, { "acc": 0.76086283, "epoch": 0.2403067994575599, "grad_norm": 5.6875, "learning_rate": 9.788377485049583e-06, "loss": 0.8614933, "memory(GiB)": 126.71, "step": 10300, "train_speed(iter/s)": 0.204894 }, { "acc": 0.76430244, "epoch": 0.2405401070298488, "grad_norm": 5.21875, "learning_rate": 9.787833362442456e-06, "loss": 0.85298872, "memory(GiB)": 126.71, "step": 10310, "train_speed(iter/s)": 0.204994 }, { "acc": 0.74295855, "epoch": 0.2407734146021377, "grad_norm": 5.5, "learning_rate": 9.78728855637351e-06, "loss": 0.94464645, "memory(GiB)": 126.71, "step": 10320, "train_speed(iter/s)": 0.205096 }, { "acc": 0.75788498, "epoch": 0.24100672217442656, "grad_norm": 4.5625, "learning_rate": 9.786743066920509e-06, "loss": 0.88022709, "memory(GiB)": 126.71, "step": 10330, "train_speed(iter/s)": 0.205191 }, { "acc": 0.75031452, "epoch": 0.24124002974671546, "grad_norm": 4.75, "learning_rate": 9.786196894161329e-06, "loss": 0.90013466, "memory(GiB)": 126.71, "step": 10340, "train_speed(iter/s)": 0.205289 }, { "acc": 0.77473364, "epoch": 0.24147333731900436, "grad_norm": 6.0, "learning_rate": 9.78565003817393e-06, "loss": 0.81478062, "memory(GiB)": 126.71, "step": 10350, "train_speed(iter/s)": 0.205394 }, { "acc": 0.7561985, "epoch": 0.24170664489129326, "grad_norm": 4.59375, "learning_rate": 9.78510249903638e-06, "loss": 0.86722546, "memory(GiB)": 126.71, "step": 10360, "train_speed(iter/s)": 0.205502 }, { "acc": 0.77503967, "epoch": 0.24193995246358216, "grad_norm": 5.8125, "learning_rate": 9.784554276826839e-06, "loss": 0.79824462, "memory(GiB)": 126.71, "step": 10370, "train_speed(iter/s)": 0.205606 }, { "acc": 0.76406875, "epoch": 0.24217326003587103, "grad_norm": 6.15625, "learning_rate": 9.784005371623564e-06, "loss": 0.8647294, "memory(GiB)": 126.71, "step": 10380, "train_speed(iter/s)": 0.205712 }, { "acc": 0.7633657, "epoch": 0.24240656760815993, "grad_norm": 5.03125, "learning_rate": 9.783455783504911e-06, "loss": 0.84617233, "memory(GiB)": 126.71, "step": 10390, "train_speed(iter/s)": 0.205816 }, { "acc": 0.76961269, "epoch": 0.24263987518044883, "grad_norm": 6.28125, "learning_rate": 9.782905512549336e-06, "loss": 0.83270855, "memory(GiB)": 126.71, "step": 10400, "train_speed(iter/s)": 0.205915 }, { "acc": 0.75926919, "epoch": 0.24287318275273773, "grad_norm": 5.4375, "learning_rate": 9.78235455883539e-06, "loss": 0.87408047, "memory(GiB)": 126.71, "step": 10410, "train_speed(iter/s)": 0.206005 }, { "acc": 0.76697769, "epoch": 0.24310649032502663, "grad_norm": 7.25, "learning_rate": 9.781802922441716e-06, "loss": 0.83203278, "memory(GiB)": 126.71, "step": 10420, "train_speed(iter/s)": 0.206105 }, { "acc": 0.76918077, "epoch": 0.2433397978973155, "grad_norm": 4.5625, "learning_rate": 9.781250603447069e-06, "loss": 0.83921432, "memory(GiB)": 126.71, "step": 10430, "train_speed(iter/s)": 0.206202 }, { "acc": 0.78314271, "epoch": 0.2435731054696044, "grad_norm": 8.0, "learning_rate": 9.780697601930282e-06, "loss": 0.78089485, "memory(GiB)": 126.71, "step": 10440, "train_speed(iter/s)": 0.206297 }, { "acc": 0.76764956, "epoch": 0.2438064130418933, "grad_norm": 6.15625, "learning_rate": 9.780143917970304e-06, "loss": 0.82548275, "memory(GiB)": 126.71, "step": 10450, "train_speed(iter/s)": 0.2064 }, { "acc": 0.77233295, "epoch": 0.2440397206141822, "grad_norm": 8.625, "learning_rate": 9.77958955164617e-06, "loss": 0.82057104, "memory(GiB)": 126.71, "step": 10460, "train_speed(iter/s)": 0.206499 }, { "acc": 0.75792322, "epoch": 0.24427302818647106, "grad_norm": 6.21875, "learning_rate": 9.779034503037016e-06, "loss": 0.88646393, "memory(GiB)": 126.71, "step": 10470, "train_speed(iter/s)": 0.206595 }, { "acc": 0.76004329, "epoch": 0.24450633575875996, "grad_norm": 6.6875, "learning_rate": 9.778478772222075e-06, "loss": 0.88139906, "memory(GiB)": 126.71, "step": 10480, "train_speed(iter/s)": 0.206692 }, { "acc": 0.78009186, "epoch": 0.24473964333104886, "grad_norm": 8.0625, "learning_rate": 9.777922359280677e-06, "loss": 0.80844212, "memory(GiB)": 126.71, "step": 10490, "train_speed(iter/s)": 0.206789 }, { "acc": 0.76328154, "epoch": 0.24497295090333776, "grad_norm": 7.78125, "learning_rate": 9.777365264292252e-06, "loss": 0.85358887, "memory(GiB)": 126.71, "step": 10500, "train_speed(iter/s)": 0.206889 }, { "epoch": 0.24497295090333776, "eval_acc": 0.7297711512460512, "eval_loss": 0.8557952046394348, "eval_runtime": 1263.3055, "eval_samples_per_second": 28.49, "eval_steps_per_second": 14.245, "step": 10500 }, { "acc": 0.78785858, "epoch": 0.24520625847562666, "grad_norm": 4.6875, "learning_rate": 9.77680748733632e-06, "loss": 0.75362582, "memory(GiB)": 126.71, "step": 10510, "train_speed(iter/s)": 0.201868 }, { "acc": 0.74264431, "epoch": 0.24543956604791553, "grad_norm": 4.65625, "learning_rate": 9.77624902849251e-06, "loss": 0.95847607, "memory(GiB)": 126.71, "step": 10520, "train_speed(iter/s)": 0.20197 }, { "acc": 0.76365957, "epoch": 0.24567287362020443, "grad_norm": 5.4375, "learning_rate": 9.775689887840537e-06, "loss": 0.86284952, "memory(GiB)": 126.71, "step": 10530, "train_speed(iter/s)": 0.202064 }, { "acc": 0.75689592, "epoch": 0.24590618119249333, "grad_norm": 7.1875, "learning_rate": 9.775130065460222e-06, "loss": 0.8889164, "memory(GiB)": 126.71, "step": 10540, "train_speed(iter/s)": 0.202159 }, { "acc": 0.76508446, "epoch": 0.24613948876478223, "grad_norm": 6.375, "learning_rate": 9.774569561431474e-06, "loss": 0.85591717, "memory(GiB)": 126.71, "step": 10550, "train_speed(iter/s)": 0.202258 }, { "acc": 0.75782385, "epoch": 0.24637279633707113, "grad_norm": 6.09375, "learning_rate": 9.77400837583431e-06, "loss": 0.89698887, "memory(GiB)": 126.71, "step": 10560, "train_speed(iter/s)": 0.202359 }, { "acc": 0.77994204, "epoch": 0.24660610390936, "grad_norm": 6.3125, "learning_rate": 9.773446508748836e-06, "loss": 0.78877945, "memory(GiB)": 126.71, "step": 10570, "train_speed(iter/s)": 0.202452 }, { "acc": 0.74667473, "epoch": 0.2468394114816489, "grad_norm": 6.3125, "learning_rate": 9.772883960255261e-06, "loss": 0.91857672, "memory(GiB)": 126.71, "step": 10580, "train_speed(iter/s)": 0.20255 }, { "acc": 0.76464491, "epoch": 0.2470727190539378, "grad_norm": 5.15625, "learning_rate": 9.772320730433886e-06, "loss": 0.85419235, "memory(GiB)": 126.71, "step": 10590, "train_speed(iter/s)": 0.202645 }, { "acc": 0.77226572, "epoch": 0.2473060266262267, "grad_norm": 5.8125, "learning_rate": 9.771756819365114e-06, "loss": 0.83151188, "memory(GiB)": 126.71, "step": 10600, "train_speed(iter/s)": 0.202744 }, { "acc": 0.76321602, "epoch": 0.2475393341985156, "grad_norm": 10.375, "learning_rate": 9.771192227129442e-06, "loss": 0.86600323, "memory(GiB)": 126.71, "step": 10610, "train_speed(iter/s)": 0.202836 }, { "acc": 0.78701701, "epoch": 0.24777264177080446, "grad_norm": 5.71875, "learning_rate": 9.770626953807468e-06, "loss": 0.7451704, "memory(GiB)": 126.71, "step": 10620, "train_speed(iter/s)": 0.202938 }, { "acc": 0.76000595, "epoch": 0.24800594934309336, "grad_norm": 6.84375, "learning_rate": 9.770060999479878e-06, "loss": 0.87831059, "memory(GiB)": 126.71, "step": 10630, "train_speed(iter/s)": 0.203043 }, { "acc": 0.75896807, "epoch": 0.24823925691538226, "grad_norm": 4.84375, "learning_rate": 9.769494364227468e-06, "loss": 0.89416523, "memory(GiB)": 126.71, "step": 10640, "train_speed(iter/s)": 0.203144 }, { "acc": 0.76193252, "epoch": 0.24847256448767116, "grad_norm": 5.875, "learning_rate": 9.768927048131122e-06, "loss": 0.85950737, "memory(GiB)": 126.71, "step": 10650, "train_speed(iter/s)": 0.203245 }, { "acc": 0.76312141, "epoch": 0.24870587205996006, "grad_norm": 5.25, "learning_rate": 9.768359051271827e-06, "loss": 0.84854403, "memory(GiB)": 126.71, "step": 10660, "train_speed(iter/s)": 0.203346 }, { "acc": 0.75137706, "epoch": 0.24893917963224893, "grad_norm": 8.3125, "learning_rate": 9.767790373730663e-06, "loss": 0.88639297, "memory(GiB)": 126.71, "step": 10670, "train_speed(iter/s)": 0.20345 }, { "acc": 0.75710773, "epoch": 0.24917248720453783, "grad_norm": 5.1875, "learning_rate": 9.767221015588807e-06, "loss": 0.8809803, "memory(GiB)": 126.71, "step": 10680, "train_speed(iter/s)": 0.203557 }, { "acc": 0.76463542, "epoch": 0.24940579477682673, "grad_norm": 5.25, "learning_rate": 9.766650976927536e-06, "loss": 0.85783825, "memory(GiB)": 126.71, "step": 10690, "train_speed(iter/s)": 0.203652 }, { "acc": 0.75808158, "epoch": 0.24963910234911563, "grad_norm": 5.46875, "learning_rate": 9.766080257828223e-06, "loss": 0.87666111, "memory(GiB)": 126.71, "step": 10700, "train_speed(iter/s)": 0.203751 }, { "acc": 0.75333176, "epoch": 0.2498724099214045, "grad_norm": 6.46875, "learning_rate": 9.765508858372337e-06, "loss": 0.88884382, "memory(GiB)": 126.71, "step": 10710, "train_speed(iter/s)": 0.203846 }, { "acc": 0.77092853, "epoch": 0.2501057174936934, "grad_norm": 5.1875, "learning_rate": 9.764936778641448e-06, "loss": 0.83336773, "memory(GiB)": 126.71, "step": 10720, "train_speed(iter/s)": 0.203944 }, { "acc": 0.76557169, "epoch": 0.2503390250659823, "grad_norm": 7.46875, "learning_rate": 9.764364018717215e-06, "loss": 0.82477913, "memory(GiB)": 126.71, "step": 10730, "train_speed(iter/s)": 0.204046 }, { "acc": 0.7622045, "epoch": 0.25057233263827117, "grad_norm": 5.84375, "learning_rate": 9.763790578681404e-06, "loss": 0.86981497, "memory(GiB)": 126.71, "step": 10740, "train_speed(iter/s)": 0.204147 }, { "acc": 0.76684313, "epoch": 0.25080564021056007, "grad_norm": 6.15625, "learning_rate": 9.763216458615871e-06, "loss": 0.8178812, "memory(GiB)": 126.71, "step": 10750, "train_speed(iter/s)": 0.204246 }, { "acc": 0.76217184, "epoch": 0.25103894778284896, "grad_norm": 5.9375, "learning_rate": 9.762641658602575e-06, "loss": 0.84431133, "memory(GiB)": 126.71, "step": 10760, "train_speed(iter/s)": 0.204341 }, { "acc": 0.75090051, "epoch": 0.25127225535513786, "grad_norm": 8.1875, "learning_rate": 9.762066178723562e-06, "loss": 0.9076766, "memory(GiB)": 126.71, "step": 10770, "train_speed(iter/s)": 0.204441 }, { "acc": 0.75777416, "epoch": 0.25150556292742676, "grad_norm": 6.8125, "learning_rate": 9.761490019060988e-06, "loss": 0.88850422, "memory(GiB)": 126.71, "step": 10780, "train_speed(iter/s)": 0.20454 }, { "acc": 0.74511137, "epoch": 0.25173887049971566, "grad_norm": 5.375, "learning_rate": 9.760913179697095e-06, "loss": 0.90051575, "memory(GiB)": 126.71, "step": 10790, "train_speed(iter/s)": 0.204638 }, { "acc": 0.736093, "epoch": 0.25197217807200456, "grad_norm": 6.75, "learning_rate": 9.76033566071423e-06, "loss": 0.9690033, "memory(GiB)": 126.71, "step": 10800, "train_speed(iter/s)": 0.20474 }, { "acc": 0.75845847, "epoch": 0.25220548564429346, "grad_norm": 5.09375, "learning_rate": 9.759757462194832e-06, "loss": 0.8807003, "memory(GiB)": 126.71, "step": 10810, "train_speed(iter/s)": 0.204837 }, { "acc": 0.76187773, "epoch": 0.25243879321658236, "grad_norm": 7.0, "learning_rate": 9.759178584221439e-06, "loss": 0.86267948, "memory(GiB)": 126.71, "step": 10820, "train_speed(iter/s)": 0.204935 }, { "acc": 0.7566637, "epoch": 0.2526721007888712, "grad_norm": 6.375, "learning_rate": 9.758599026876685e-06, "loss": 0.86557817, "memory(GiB)": 126.71, "step": 10830, "train_speed(iter/s)": 0.205032 }, { "acc": 0.77694554, "epoch": 0.2529054083611601, "grad_norm": 7.8125, "learning_rate": 9.758018790243304e-06, "loss": 0.80188293, "memory(GiB)": 126.71, "step": 10840, "train_speed(iter/s)": 0.205126 }, { "acc": 0.7637352, "epoch": 0.253138715933449, "grad_norm": 5.375, "learning_rate": 9.757437874404121e-06, "loss": 0.86346397, "memory(GiB)": 126.71, "step": 10850, "train_speed(iter/s)": 0.205217 }, { "acc": 0.77152028, "epoch": 0.2533720235057379, "grad_norm": 7.78125, "learning_rate": 9.756856279442064e-06, "loss": 0.82565212, "memory(GiB)": 126.71, "step": 10860, "train_speed(iter/s)": 0.205318 }, { "acc": 0.76573048, "epoch": 0.2536053310780268, "grad_norm": 5.125, "learning_rate": 9.756274005440156e-06, "loss": 0.86291542, "memory(GiB)": 126.71, "step": 10870, "train_speed(iter/s)": 0.20542 }, { "acc": 0.76043401, "epoch": 0.2538386386503157, "grad_norm": 5.28125, "learning_rate": 9.755691052481515e-06, "loss": 0.87564516, "memory(GiB)": 126.71, "step": 10880, "train_speed(iter/s)": 0.205515 }, { "acc": 0.76571369, "epoch": 0.2540719462226046, "grad_norm": 6.9375, "learning_rate": 9.755107420649357e-06, "loss": 0.84756193, "memory(GiB)": 126.71, "step": 10890, "train_speed(iter/s)": 0.205609 }, { "acc": 0.75509901, "epoch": 0.2543052537948935, "grad_norm": 6.375, "learning_rate": 9.754523110026997e-06, "loss": 0.89535923, "memory(GiB)": 126.71, "step": 10900, "train_speed(iter/s)": 0.205708 }, { "acc": 0.77214108, "epoch": 0.2545385613671824, "grad_norm": 10.1875, "learning_rate": 9.753938120697843e-06, "loss": 0.83769913, "memory(GiB)": 126.71, "step": 10910, "train_speed(iter/s)": 0.205799 }, { "acc": 0.76207438, "epoch": 0.2547718689394713, "grad_norm": 10.3125, "learning_rate": 9.753352452745406e-06, "loss": 0.87783222, "memory(GiB)": 126.71, "step": 10920, "train_speed(iter/s)": 0.205898 }, { "acc": 0.7913722, "epoch": 0.25500517651176013, "grad_norm": 7.9375, "learning_rate": 9.752766106253285e-06, "loss": 0.74069109, "memory(GiB)": 126.71, "step": 10930, "train_speed(iter/s)": 0.205983 }, { "acc": 0.77416754, "epoch": 0.25523848408404903, "grad_norm": 5.625, "learning_rate": 9.752179081305184e-06, "loss": 0.80955868, "memory(GiB)": 126.71, "step": 10940, "train_speed(iter/s)": 0.206073 }, { "acc": 0.774647, "epoch": 0.25547179165633793, "grad_norm": 6.21875, "learning_rate": 9.751591377984899e-06, "loss": 0.84093094, "memory(GiB)": 126.71, "step": 10950, "train_speed(iter/s)": 0.206167 }, { "acc": 0.76589441, "epoch": 0.25570509922862683, "grad_norm": 5.28125, "learning_rate": 9.751002996376324e-06, "loss": 0.86710567, "memory(GiB)": 126.71, "step": 10960, "train_speed(iter/s)": 0.206259 }, { "acc": 0.77508335, "epoch": 0.25593840680091573, "grad_norm": 4.84375, "learning_rate": 9.750413936563454e-06, "loss": 0.81113377, "memory(GiB)": 126.71, "step": 10970, "train_speed(iter/s)": 0.206355 }, { "acc": 0.76599746, "epoch": 0.25617171437320463, "grad_norm": 7.40625, "learning_rate": 9.749824198630371e-06, "loss": 0.84636421, "memory(GiB)": 126.71, "step": 10980, "train_speed(iter/s)": 0.206454 }, { "acc": 0.75386877, "epoch": 0.2564050219454935, "grad_norm": 5.3125, "learning_rate": 9.749233782661267e-06, "loss": 0.87643614, "memory(GiB)": 126.71, "step": 10990, "train_speed(iter/s)": 0.206554 }, { "acc": 0.783496, "epoch": 0.2566383295177824, "grad_norm": 4.09375, "learning_rate": 9.74864268874042e-06, "loss": 0.7795433, "memory(GiB)": 126.71, "step": 11000, "train_speed(iter/s)": 0.206644 }, { "epoch": 0.2566383295177824, "eval_acc": 0.7301328820034915, "eval_loss": 0.8549428582191467, "eval_runtime": 1264.1411, "eval_samples_per_second": 28.471, "eval_steps_per_second": 14.236, "step": 11000 }, { "acc": 0.76326046, "epoch": 0.2568716370900713, "grad_norm": 4.6875, "learning_rate": 9.748050916952206e-06, "loss": 0.83894501, "memory(GiB)": 126.71, "step": 11010, "train_speed(iter/s)": 0.201847 }, { "acc": 0.77668257, "epoch": 0.2571049446623602, "grad_norm": 5.25, "learning_rate": 9.747458467381104e-06, "loss": 0.80585499, "memory(GiB)": 126.71, "step": 11020, "train_speed(iter/s)": 0.201944 }, { "acc": 0.77455559, "epoch": 0.25733825223464907, "grad_norm": 5.75, "learning_rate": 9.746865340111686e-06, "loss": 0.80291653, "memory(GiB)": 126.71, "step": 11030, "train_speed(iter/s)": 0.202042 }, { "acc": 0.77919416, "epoch": 0.25757155980693797, "grad_norm": 5.125, "learning_rate": 9.74627153522862e-06, "loss": 0.79611454, "memory(GiB)": 126.71, "step": 11040, "train_speed(iter/s)": 0.202141 }, { "acc": 0.75045023, "epoch": 0.25780486737922687, "grad_norm": 14.875, "learning_rate": 9.74567705281667e-06, "loss": 0.89672918, "memory(GiB)": 126.71, "step": 11050, "train_speed(iter/s)": 0.20224 }, { "acc": 0.76361661, "epoch": 0.25803817495151576, "grad_norm": 9.0, "learning_rate": 9.745081892960699e-06, "loss": 0.8723484, "memory(GiB)": 126.71, "step": 11060, "train_speed(iter/s)": 0.202329 }, { "acc": 0.76879368, "epoch": 0.25827148252380466, "grad_norm": 5.125, "learning_rate": 9.744486055745667e-06, "loss": 0.8530344, "memory(GiB)": 126.71, "step": 11070, "train_speed(iter/s)": 0.202418 }, { "acc": 0.74410944, "epoch": 0.25850479009609356, "grad_norm": 5.25, "learning_rate": 9.743889541256628e-06, "loss": 0.92593765, "memory(GiB)": 126.71, "step": 11080, "train_speed(iter/s)": 0.202515 }, { "acc": 0.75798779, "epoch": 0.25873809766838246, "grad_norm": 8.875, "learning_rate": 9.743292349578737e-06, "loss": 0.89210424, "memory(GiB)": 126.71, "step": 11090, "train_speed(iter/s)": 0.202619 }, { "acc": 0.78643255, "epoch": 0.25897140524067136, "grad_norm": 6.21875, "learning_rate": 9.742694480797239e-06, "loss": 0.78887339, "memory(GiB)": 126.71, "step": 11100, "train_speed(iter/s)": 0.202708 }, { "acc": 0.7584465, "epoch": 0.25920471281296026, "grad_norm": 6.8125, "learning_rate": 9.742095934997482e-06, "loss": 0.88120708, "memory(GiB)": 126.71, "step": 11110, "train_speed(iter/s)": 0.202797 }, { "acc": 0.76665049, "epoch": 0.2594380203852491, "grad_norm": 6.0625, "learning_rate": 9.741496712264908e-06, "loss": 0.85755138, "memory(GiB)": 126.71, "step": 11120, "train_speed(iter/s)": 0.202889 }, { "acc": 0.79618187, "epoch": 0.259671327957538, "grad_norm": 11.5625, "learning_rate": 9.740896812685057e-06, "loss": 0.72397442, "memory(GiB)": 126.71, "step": 11130, "train_speed(iter/s)": 0.202981 }, { "acc": 0.76714506, "epoch": 0.2599046355298269, "grad_norm": 6.6875, "learning_rate": 9.740296236343561e-06, "loss": 0.8487793, "memory(GiB)": 126.71, "step": 11140, "train_speed(iter/s)": 0.203079 }, { "acc": 0.77154741, "epoch": 0.2601379431021158, "grad_norm": 6.15625, "learning_rate": 9.739694983326155e-06, "loss": 0.82732563, "memory(GiB)": 126.71, "step": 11150, "train_speed(iter/s)": 0.203175 }, { "acc": 0.76540327, "epoch": 0.2603712506744047, "grad_norm": 4.8125, "learning_rate": 9.739093053718669e-06, "loss": 0.85536747, "memory(GiB)": 126.71, "step": 11160, "train_speed(iter/s)": 0.203262 }, { "acc": 0.78566723, "epoch": 0.2606045582466936, "grad_norm": 6.4375, "learning_rate": 9.738490447607025e-06, "loss": 0.77910624, "memory(GiB)": 126.71, "step": 11170, "train_speed(iter/s)": 0.203363 }, { "acc": 0.74674087, "epoch": 0.2608378658189825, "grad_norm": 5.90625, "learning_rate": 9.737887165077246e-06, "loss": 0.90552397, "memory(GiB)": 126.71, "step": 11180, "train_speed(iter/s)": 0.203458 }, { "acc": 0.77218437, "epoch": 0.2610711733912714, "grad_norm": 6.21875, "learning_rate": 9.73728320621545e-06, "loss": 0.82151499, "memory(GiB)": 126.71, "step": 11190, "train_speed(iter/s)": 0.203548 }, { "acc": 0.7614913, "epoch": 0.2613044809635603, "grad_norm": 5.6875, "learning_rate": 9.736678571107854e-06, "loss": 0.87275715, "memory(GiB)": 126.71, "step": 11200, "train_speed(iter/s)": 0.203638 }, { "acc": 0.77179565, "epoch": 0.2615377885358492, "grad_norm": 4.84375, "learning_rate": 9.736073259840766e-06, "loss": 0.80611324, "memory(GiB)": 126.71, "step": 11210, "train_speed(iter/s)": 0.203733 }, { "acc": 0.7435523, "epoch": 0.26177109610813803, "grad_norm": 6.09375, "learning_rate": 9.735467272500597e-06, "loss": 0.93148308, "memory(GiB)": 126.71, "step": 11220, "train_speed(iter/s)": 0.203831 }, { "acc": 0.76413846, "epoch": 0.26200440368042693, "grad_norm": 5.6875, "learning_rate": 9.73486060917385e-06, "loss": 0.86887436, "memory(GiB)": 126.71, "step": 11230, "train_speed(iter/s)": 0.203924 }, { "acc": 0.76618338, "epoch": 0.26223771125271583, "grad_norm": 6.1875, "learning_rate": 9.734253269947128e-06, "loss": 0.8479351, "memory(GiB)": 126.71, "step": 11240, "train_speed(iter/s)": 0.204017 }, { "acc": 0.75405641, "epoch": 0.26247101882500473, "grad_norm": 4.9375, "learning_rate": 9.733645254907126e-06, "loss": 0.90059662, "memory(GiB)": 126.71, "step": 11250, "train_speed(iter/s)": 0.204113 }, { "acc": 0.7840107, "epoch": 0.26270432639729363, "grad_norm": 5.34375, "learning_rate": 9.73303656414064e-06, "loss": 0.7931426, "memory(GiB)": 126.71, "step": 11260, "train_speed(iter/s)": 0.204213 }, { "acc": 0.76576843, "epoch": 0.26293763396958253, "grad_norm": 6.46875, "learning_rate": 9.732427197734557e-06, "loss": 0.84696712, "memory(GiB)": 126.71, "step": 11270, "train_speed(iter/s)": 0.204306 }, { "acc": 0.7611186, "epoch": 0.26317094154187143, "grad_norm": 6.3125, "learning_rate": 9.73181715577587e-06, "loss": 0.85667114, "memory(GiB)": 126.71, "step": 11280, "train_speed(iter/s)": 0.204391 }, { "acc": 0.76847444, "epoch": 0.2634042491141603, "grad_norm": 7.59375, "learning_rate": 9.731206438351655e-06, "loss": 0.84641085, "memory(GiB)": 126.71, "step": 11290, "train_speed(iter/s)": 0.204483 }, { "acc": 0.76328211, "epoch": 0.2636375566864492, "grad_norm": 6.84375, "learning_rate": 9.730595045549096e-06, "loss": 0.84739742, "memory(GiB)": 126.71, "step": 11300, "train_speed(iter/s)": 0.204578 }, { "acc": 0.78063126, "epoch": 0.2638708642587381, "grad_norm": 4.84375, "learning_rate": 9.72998297745547e-06, "loss": 0.78272877, "memory(GiB)": 126.71, "step": 11310, "train_speed(iter/s)": 0.204668 }, { "acc": 0.77293272, "epoch": 0.26410417183102697, "grad_norm": 5.53125, "learning_rate": 9.729370234158147e-06, "loss": 0.83548946, "memory(GiB)": 126.71, "step": 11320, "train_speed(iter/s)": 0.204759 }, { "acc": 0.76551719, "epoch": 0.26433747940331587, "grad_norm": 5.4375, "learning_rate": 9.728756815744598e-06, "loss": 0.85563173, "memory(GiB)": 126.71, "step": 11330, "train_speed(iter/s)": 0.204854 }, { "acc": 0.73429289, "epoch": 0.26457078697560477, "grad_norm": 4.5625, "learning_rate": 9.728142722302385e-06, "loss": 0.96406136, "memory(GiB)": 126.71, "step": 11340, "train_speed(iter/s)": 0.204949 }, { "acc": 0.7488574, "epoch": 0.26480409454789366, "grad_norm": 10.5, "learning_rate": 9.727527953919174e-06, "loss": 0.9289957, "memory(GiB)": 126.71, "step": 11350, "train_speed(iter/s)": 0.205041 }, { "acc": 0.74727945, "epoch": 0.26503740212018256, "grad_norm": 6.09375, "learning_rate": 9.72691251068272e-06, "loss": 0.93669577, "memory(GiB)": 126.71, "step": 11360, "train_speed(iter/s)": 0.205129 }, { "acc": 0.76031828, "epoch": 0.26527070969247146, "grad_norm": 7.46875, "learning_rate": 9.726296392680879e-06, "loss": 0.86247702, "memory(GiB)": 126.71, "step": 11370, "train_speed(iter/s)": 0.20522 }, { "acc": 0.77017441, "epoch": 0.26550401726476036, "grad_norm": 34.25, "learning_rate": 9.7256796000016e-06, "loss": 0.8388257, "memory(GiB)": 126.71, "step": 11380, "train_speed(iter/s)": 0.205315 }, { "acc": 0.76522474, "epoch": 0.26573732483704926, "grad_norm": 7.28125, "learning_rate": 9.725062132732931e-06, "loss": 0.85769806, "memory(GiB)": 126.71, "step": 11390, "train_speed(iter/s)": 0.205408 }, { "acc": 0.7610724, "epoch": 0.26597063240933816, "grad_norm": 8.125, "learning_rate": 9.724443990963017e-06, "loss": 0.87329168, "memory(GiB)": 126.71, "step": 11400, "train_speed(iter/s)": 0.205503 }, { "acc": 0.75262628, "epoch": 0.266203939981627, "grad_norm": 7.3125, "learning_rate": 9.723825174780095e-06, "loss": 0.89846525, "memory(GiB)": 126.71, "step": 11410, "train_speed(iter/s)": 0.205595 }, { "acc": 0.7575634, "epoch": 0.2664372475539159, "grad_norm": 5.90625, "learning_rate": 9.723205684272501e-06, "loss": 0.87411928, "memory(GiB)": 126.71, "step": 11420, "train_speed(iter/s)": 0.205687 }, { "acc": 0.77559295, "epoch": 0.2666705551262048, "grad_norm": 5.3125, "learning_rate": 9.722585519528666e-06, "loss": 0.82729778, "memory(GiB)": 126.71, "step": 11430, "train_speed(iter/s)": 0.205777 }, { "acc": 0.76725402, "epoch": 0.2669038626984937, "grad_norm": 11.6875, "learning_rate": 9.721964680637124e-06, "loss": 0.83184071, "memory(GiB)": 126.71, "step": 11440, "train_speed(iter/s)": 0.205868 }, { "acc": 0.76837006, "epoch": 0.2671371702707826, "grad_norm": 5.53125, "learning_rate": 9.721343167686491e-06, "loss": 0.84206734, "memory(GiB)": 126.71, "step": 11450, "train_speed(iter/s)": 0.205963 }, { "acc": 0.76364198, "epoch": 0.2673704778430715, "grad_norm": 12.0625, "learning_rate": 9.720720980765495e-06, "loss": 0.87792025, "memory(GiB)": 126.71, "step": 11460, "train_speed(iter/s)": 0.206054 }, { "acc": 0.76016045, "epoch": 0.2676037854153604, "grad_norm": 5.09375, "learning_rate": 9.72009811996295e-06, "loss": 0.85616865, "memory(GiB)": 126.71, "step": 11470, "train_speed(iter/s)": 0.206147 }, { "acc": 0.77119327, "epoch": 0.2678370929876493, "grad_norm": 6.59375, "learning_rate": 9.719474585367771e-06, "loss": 0.83254633, "memory(GiB)": 126.71, "step": 11480, "train_speed(iter/s)": 0.206244 }, { "acc": 0.76275082, "epoch": 0.2680704005599382, "grad_norm": 6.4375, "learning_rate": 9.718850377068964e-06, "loss": 0.86225719, "memory(GiB)": 126.71, "step": 11490, "train_speed(iter/s)": 0.206336 }, { "acc": 0.75841961, "epoch": 0.2683037081322271, "grad_norm": 5.25, "learning_rate": 9.718225495155638e-06, "loss": 0.88602018, "memory(GiB)": 126.71, "step": 11500, "train_speed(iter/s)": 0.206424 }, { "epoch": 0.2683037081322271, "eval_acc": 0.7300978705965492, "eval_loss": 0.854084849357605, "eval_runtime": 1263.3124, "eval_samples_per_second": 28.489, "eval_steps_per_second": 14.245, "step": 11500 }, { "acc": 0.75562239, "epoch": 0.26853701570451594, "grad_norm": 5.53125, "learning_rate": 9.717599939716992e-06, "loss": 0.87457523, "memory(GiB)": 126.71, "step": 11510, "train_speed(iter/s)": 0.201851 }, { "acc": 0.77868471, "epoch": 0.26877032327680483, "grad_norm": 5.25, "learning_rate": 9.716973710842326e-06, "loss": 0.81087313, "memory(GiB)": 126.71, "step": 11520, "train_speed(iter/s)": 0.201942 }, { "acc": 0.77549467, "epoch": 0.26900363084909373, "grad_norm": 5.125, "learning_rate": 9.716346808621031e-06, "loss": 0.82643719, "memory(GiB)": 126.71, "step": 11530, "train_speed(iter/s)": 0.202037 }, { "acc": 0.76876841, "epoch": 0.26923693842138263, "grad_norm": 6.25, "learning_rate": 9.715719233142601e-06, "loss": 0.82000065, "memory(GiB)": 126.71, "step": 11540, "train_speed(iter/s)": 0.202128 }, { "acc": 0.7802412, "epoch": 0.26947024599367153, "grad_norm": 5.5, "learning_rate": 9.71509098449662e-06, "loss": 0.8260355, "memory(GiB)": 126.71, "step": 11550, "train_speed(iter/s)": 0.202219 }, { "acc": 0.75321536, "epoch": 0.26970355356596043, "grad_norm": 5.78125, "learning_rate": 9.71446206277277e-06, "loss": 0.89019775, "memory(GiB)": 126.71, "step": 11560, "train_speed(iter/s)": 0.202305 }, { "acc": 0.76092801, "epoch": 0.26993686113824933, "grad_norm": 5.90625, "learning_rate": 9.713832468060831e-06, "loss": 0.87599754, "memory(GiB)": 126.71, "step": 11570, "train_speed(iter/s)": 0.202392 }, { "acc": 0.75207872, "epoch": 0.2701701687105382, "grad_norm": 10.5, "learning_rate": 9.713202200450678e-06, "loss": 0.90203905, "memory(GiB)": 126.71, "step": 11580, "train_speed(iter/s)": 0.202484 }, { "acc": 0.77110724, "epoch": 0.2704034762828271, "grad_norm": 5.34375, "learning_rate": 9.712571260032277e-06, "loss": 0.83373137, "memory(GiB)": 126.71, "step": 11590, "train_speed(iter/s)": 0.202567 }, { "acc": 0.75453243, "epoch": 0.27063678385511597, "grad_norm": 5.3125, "learning_rate": 9.7119396468957e-06, "loss": 0.8978281, "memory(GiB)": 126.71, "step": 11600, "train_speed(iter/s)": 0.202663 }, { "acc": 0.76179743, "epoch": 0.27087009142740487, "grad_norm": 4.0, "learning_rate": 9.711307361131107e-06, "loss": 0.87683382, "memory(GiB)": 126.71, "step": 11610, "train_speed(iter/s)": 0.202742 }, { "acc": 0.77612991, "epoch": 0.27110339899969377, "grad_norm": 5.75, "learning_rate": 9.710674402828755e-06, "loss": 0.78767071, "memory(GiB)": 126.71, "step": 11620, "train_speed(iter/s)": 0.202838 }, { "acc": 0.75703506, "epoch": 0.27133670657198267, "grad_norm": 5.09375, "learning_rate": 9.710040772079001e-06, "loss": 0.87571831, "memory(GiB)": 126.71, "step": 11630, "train_speed(iter/s)": 0.202929 }, { "acc": 0.75049343, "epoch": 0.27157001414427157, "grad_norm": 7.5, "learning_rate": 9.709406468972295e-06, "loss": 0.90393038, "memory(GiB)": 126.71, "step": 11640, "train_speed(iter/s)": 0.203022 }, { "acc": 0.75180659, "epoch": 0.27180332171656046, "grad_norm": 5.3125, "learning_rate": 9.708771493599185e-06, "loss": 0.87692337, "memory(GiB)": 126.71, "step": 11650, "train_speed(iter/s)": 0.203109 }, { "acc": 0.76274881, "epoch": 0.27203662928884936, "grad_norm": 7.28125, "learning_rate": 9.708135846050313e-06, "loss": 0.85581036, "memory(GiB)": 126.71, "step": 11660, "train_speed(iter/s)": 0.203201 }, { "acc": 0.76783228, "epoch": 0.27226993686113826, "grad_norm": 4.21875, "learning_rate": 9.707499526416415e-06, "loss": 0.83571806, "memory(GiB)": 126.71, "step": 11670, "train_speed(iter/s)": 0.203287 }, { "acc": 0.75098848, "epoch": 0.27250324443342716, "grad_norm": 5.5625, "learning_rate": 9.706862534788327e-06, "loss": 0.90365248, "memory(GiB)": 126.71, "step": 11680, "train_speed(iter/s)": 0.203377 }, { "acc": 0.77068758, "epoch": 0.27273655200571606, "grad_norm": 5.5, "learning_rate": 9.70622487125698e-06, "loss": 0.82007561, "memory(GiB)": 126.71, "step": 11690, "train_speed(iter/s)": 0.203468 }, { "acc": 0.77245183, "epoch": 0.2729698595780049, "grad_norm": 6.28125, "learning_rate": 9.7055865359134e-06, "loss": 0.80792246, "memory(GiB)": 126.71, "step": 11700, "train_speed(iter/s)": 0.203554 }, { "acc": 0.76439018, "epoch": 0.2732031671502938, "grad_norm": 5.0625, "learning_rate": 9.704947528848706e-06, "loss": 0.87264099, "memory(GiB)": 126.71, "step": 11710, "train_speed(iter/s)": 0.203645 }, { "acc": 0.77553177, "epoch": 0.2734364747225827, "grad_norm": 4.96875, "learning_rate": 9.704307850154125e-06, "loss": 0.80482864, "memory(GiB)": 126.71, "step": 11720, "train_speed(iter/s)": 0.203736 }, { "acc": 0.76401672, "epoch": 0.2736697822948716, "grad_norm": 5.28125, "learning_rate": 9.70366749992096e-06, "loss": 0.85416298, "memory(GiB)": 126.71, "step": 11730, "train_speed(iter/s)": 0.203823 }, { "acc": 0.76824088, "epoch": 0.2739030898671605, "grad_norm": 4.71875, "learning_rate": 9.703026478240627e-06, "loss": 0.8355216, "memory(GiB)": 126.71, "step": 11740, "train_speed(iter/s)": 0.203913 }, { "acc": 0.76965742, "epoch": 0.2741363974394494, "grad_norm": 5.34375, "learning_rate": 9.702384785204631e-06, "loss": 0.80834923, "memory(GiB)": 126.71, "step": 11750, "train_speed(iter/s)": 0.204001 }, { "acc": 0.7614625, "epoch": 0.2743697050117383, "grad_norm": 5.25, "learning_rate": 9.701742420904574e-06, "loss": 0.85841808, "memory(GiB)": 126.71, "step": 11760, "train_speed(iter/s)": 0.204095 }, { "acc": 0.76526127, "epoch": 0.2746030125840272, "grad_norm": 10.1875, "learning_rate": 9.701099385432151e-06, "loss": 0.8782877, "memory(GiB)": 126.71, "step": 11770, "train_speed(iter/s)": 0.204182 }, { "acc": 0.75028667, "epoch": 0.2748363201563161, "grad_norm": 5.34375, "learning_rate": 9.700455678879157e-06, "loss": 0.91525326, "memory(GiB)": 126.71, "step": 11780, "train_speed(iter/s)": 0.204263 }, { "acc": 0.76402121, "epoch": 0.275069627728605, "grad_norm": 5.375, "learning_rate": 9.69981130133748e-06, "loss": 0.85571709, "memory(GiB)": 126.71, "step": 11790, "train_speed(iter/s)": 0.204349 }, { "acc": 0.76613884, "epoch": 0.27530293530089384, "grad_norm": 7.40625, "learning_rate": 9.699166252899104e-06, "loss": 0.85957689, "memory(GiB)": 126.71, "step": 11800, "train_speed(iter/s)": 0.20443 }, { "acc": 0.75364928, "epoch": 0.27553624287318274, "grad_norm": 5.28125, "learning_rate": 9.698520533656112e-06, "loss": 0.87422857, "memory(GiB)": 126.71, "step": 11810, "train_speed(iter/s)": 0.204516 }, { "acc": 0.77402854, "epoch": 0.27576955044547163, "grad_norm": 11.0, "learning_rate": 9.697874143700679e-06, "loss": 0.79661379, "memory(GiB)": 126.71, "step": 11820, "train_speed(iter/s)": 0.204607 }, { "acc": 0.77804384, "epoch": 0.27600285801776053, "grad_norm": 5.125, "learning_rate": 9.697227083125076e-06, "loss": 0.81095161, "memory(GiB)": 126.71, "step": 11830, "train_speed(iter/s)": 0.204699 }, { "acc": 0.7634634, "epoch": 0.27623616559004943, "grad_norm": 6.71875, "learning_rate": 9.69657935202167e-06, "loss": 0.84990921, "memory(GiB)": 126.71, "step": 11840, "train_speed(iter/s)": 0.204783 }, { "acc": 0.76870904, "epoch": 0.27646947316233833, "grad_norm": 5.5, "learning_rate": 9.695930950482928e-06, "loss": 0.85224094, "memory(GiB)": 126.71, "step": 11850, "train_speed(iter/s)": 0.204874 }, { "acc": 0.76560907, "epoch": 0.27670278073462723, "grad_norm": 7.03125, "learning_rate": 9.695281878601406e-06, "loss": 0.84133644, "memory(GiB)": 126.71, "step": 11860, "train_speed(iter/s)": 0.204969 }, { "acc": 0.78624902, "epoch": 0.27693608830691613, "grad_norm": 5.84375, "learning_rate": 9.69463213646976e-06, "loss": 0.77302933, "memory(GiB)": 126.71, "step": 11870, "train_speed(iter/s)": 0.205057 }, { "acc": 0.74388537, "epoch": 0.277169395879205, "grad_norm": 5.9375, "learning_rate": 9.69398172418074e-06, "loss": 0.92930565, "memory(GiB)": 126.71, "step": 11880, "train_speed(iter/s)": 0.205143 }, { "acc": 0.77030702, "epoch": 0.27740270345149387, "grad_norm": 4.875, "learning_rate": 9.693330641827194e-06, "loss": 0.82721748, "memory(GiB)": 126.71, "step": 11890, "train_speed(iter/s)": 0.20523 }, { "acc": 0.76433697, "epoch": 0.27763601102378277, "grad_norm": 5.34375, "learning_rate": 9.69267888950206e-06, "loss": 0.8619875, "memory(GiB)": 126.71, "step": 11900, "train_speed(iter/s)": 0.205311 }, { "acc": 0.76113305, "epoch": 0.27786931859607167, "grad_norm": 6.59375, "learning_rate": 9.69202646729838e-06, "loss": 0.85203724, "memory(GiB)": 126.71, "step": 11910, "train_speed(iter/s)": 0.205402 }, { "acc": 0.77061167, "epoch": 0.27810262616836057, "grad_norm": 7.5, "learning_rate": 9.691373375309284e-06, "loss": 0.84358406, "memory(GiB)": 126.71, "step": 11920, "train_speed(iter/s)": 0.205493 }, { "acc": 0.7573184, "epoch": 0.27833593374064947, "grad_norm": 6.0625, "learning_rate": 9.690719613628001e-06, "loss": 0.89419975, "memory(GiB)": 126.71, "step": 11930, "train_speed(iter/s)": 0.205574 }, { "acc": 0.76831846, "epoch": 0.27856924131293836, "grad_norm": 7.15625, "learning_rate": 9.690065182347857e-06, "loss": 0.85143213, "memory(GiB)": 126.71, "step": 11940, "train_speed(iter/s)": 0.205657 }, { "acc": 0.75728703, "epoch": 0.27880254888522726, "grad_norm": 6.15625, "learning_rate": 9.68941008156227e-06, "loss": 0.87258101, "memory(GiB)": 126.71, "step": 11950, "train_speed(iter/s)": 0.20575 }, { "acc": 0.76998615, "epoch": 0.27903585645751616, "grad_norm": 5.5, "learning_rate": 9.688754311364755e-06, "loss": 0.81026497, "memory(GiB)": 126.71, "step": 11960, "train_speed(iter/s)": 0.205833 }, { "acc": 0.76089764, "epoch": 0.27926916402980506, "grad_norm": 5.5625, "learning_rate": 9.688097871848925e-06, "loss": 0.85748444, "memory(GiB)": 126.71, "step": 11970, "train_speed(iter/s)": 0.205916 }, { "acc": 0.76951222, "epoch": 0.27950247160209396, "grad_norm": 7.34375, "learning_rate": 9.687440763108487e-06, "loss": 0.8230751, "memory(GiB)": 126.71, "step": 11980, "train_speed(iter/s)": 0.206001 }, { "acc": 0.75874929, "epoch": 0.2797357791743828, "grad_norm": 7.375, "learning_rate": 9.68678298523724e-06, "loss": 0.87013874, "memory(GiB)": 126.71, "step": 11990, "train_speed(iter/s)": 0.206094 }, { "acc": 0.76591287, "epoch": 0.2799690867466717, "grad_norm": 5.59375, "learning_rate": 9.686124538329083e-06, "loss": 0.85666504, "memory(GiB)": 126.71, "step": 12000, "train_speed(iter/s)": 0.20618 }, { "epoch": 0.2799690867466717, "eval_acc": 0.7304589559824329, "eval_loss": 0.8528935313224792, "eval_runtime": 1262.6831, "eval_samples_per_second": 28.504, "eval_steps_per_second": 14.252, "step": 12000 }, { "acc": 0.76213269, "epoch": 0.2802023943189606, "grad_norm": 6.0, "learning_rate": 9.685465422478011e-06, "loss": 0.87146873, "memory(GiB)": 126.71, "step": 12010, "train_speed(iter/s)": 0.201807 }, { "acc": 0.75742264, "epoch": 0.2804357018912495, "grad_norm": 5.84375, "learning_rate": 9.684805637778109e-06, "loss": 0.89420834, "memory(GiB)": 126.71, "step": 12020, "train_speed(iter/s)": 0.201896 }, { "acc": 0.76129012, "epoch": 0.2806690094635384, "grad_norm": 5.46875, "learning_rate": 9.684145184323565e-06, "loss": 0.86608372, "memory(GiB)": 126.71, "step": 12030, "train_speed(iter/s)": 0.201986 }, { "acc": 0.7592978, "epoch": 0.2809023170358273, "grad_norm": 4.9375, "learning_rate": 9.683484062208657e-06, "loss": 0.87132988, "memory(GiB)": 126.71, "step": 12040, "train_speed(iter/s)": 0.202073 }, { "acc": 0.75835218, "epoch": 0.2811356246081162, "grad_norm": 7.3125, "learning_rate": 9.682822271527758e-06, "loss": 0.88883543, "memory(GiB)": 126.71, "step": 12050, "train_speed(iter/s)": 0.202162 }, { "acc": 0.77570572, "epoch": 0.2813689321804051, "grad_norm": 6.71875, "learning_rate": 9.682159812375342e-06, "loss": 0.81515579, "memory(GiB)": 126.71, "step": 12060, "train_speed(iter/s)": 0.202246 }, { "acc": 0.77992678, "epoch": 0.281602239752694, "grad_norm": 5.6875, "learning_rate": 9.681496684845973e-06, "loss": 0.80841722, "memory(GiB)": 126.71, "step": 12070, "train_speed(iter/s)": 0.20233 }, { "acc": 0.75230103, "epoch": 0.2818355473249829, "grad_norm": 5.40625, "learning_rate": 9.68083288903431e-06, "loss": 0.91463375, "memory(GiB)": 126.71, "step": 12080, "train_speed(iter/s)": 0.202414 }, { "acc": 0.76985493, "epoch": 0.28206885489727174, "grad_norm": 8.4375, "learning_rate": 9.680168425035114e-06, "loss": 0.83439455, "memory(GiB)": 126.71, "step": 12090, "train_speed(iter/s)": 0.202502 }, { "acc": 0.75764389, "epoch": 0.28230216246956064, "grad_norm": 5.65625, "learning_rate": 9.679503292943234e-06, "loss": 0.88384857, "memory(GiB)": 126.71, "step": 12100, "train_speed(iter/s)": 0.202583 }, { "acc": 0.76889114, "epoch": 0.28253547004184953, "grad_norm": 4.84375, "learning_rate": 9.678837492853619e-06, "loss": 0.83500652, "memory(GiB)": 126.71, "step": 12110, "train_speed(iter/s)": 0.202664 }, { "acc": 0.76419258, "epoch": 0.28276877761413843, "grad_norm": 8.75, "learning_rate": 9.67817102486131e-06, "loss": 0.83876381, "memory(GiB)": 126.71, "step": 12120, "train_speed(iter/s)": 0.202754 }, { "acc": 0.76385984, "epoch": 0.28300208518642733, "grad_norm": 4.5, "learning_rate": 9.677503889061446e-06, "loss": 0.862673, "memory(GiB)": 126.71, "step": 12130, "train_speed(iter/s)": 0.202839 }, { "acc": 0.75134239, "epoch": 0.28323539275871623, "grad_norm": 6.5625, "learning_rate": 9.676836085549263e-06, "loss": 0.92557306, "memory(GiB)": 126.71, "step": 12140, "train_speed(iter/s)": 0.202926 }, { "acc": 0.75614481, "epoch": 0.28346870033100513, "grad_norm": 21.25, "learning_rate": 9.676167614420085e-06, "loss": 0.890693, "memory(GiB)": 126.71, "step": 12150, "train_speed(iter/s)": 0.203014 }, { "acc": 0.76238079, "epoch": 0.28370200790329403, "grad_norm": 6.34375, "learning_rate": 9.67549847576934e-06, "loss": 0.87673569, "memory(GiB)": 126.71, "step": 12160, "train_speed(iter/s)": 0.203099 }, { "acc": 0.76840477, "epoch": 0.28393531547558293, "grad_norm": 5.59375, "learning_rate": 9.674828669692545e-06, "loss": 0.8247673, "memory(GiB)": 126.71, "step": 12170, "train_speed(iter/s)": 0.203187 }, { "acc": 0.76925812, "epoch": 0.28416862304787177, "grad_norm": 6.03125, "learning_rate": 9.674158196285316e-06, "loss": 0.84895678, "memory(GiB)": 126.71, "step": 12180, "train_speed(iter/s)": 0.203274 }, { "acc": 0.79075804, "epoch": 0.28440193062016067, "grad_norm": 6.15625, "learning_rate": 9.673487055643362e-06, "loss": 0.75100679, "memory(GiB)": 126.71, "step": 12190, "train_speed(iter/s)": 0.203359 }, { "acc": 0.77747765, "epoch": 0.28463523819244957, "grad_norm": 5.5625, "learning_rate": 9.672815247862489e-06, "loss": 0.79033465, "memory(GiB)": 126.71, "step": 12200, "train_speed(iter/s)": 0.20344 }, { "acc": 0.7722918, "epoch": 0.28486854576473847, "grad_norm": 8.4375, "learning_rate": 9.672142773038595e-06, "loss": 0.83929825, "memory(GiB)": 126.71, "step": 12210, "train_speed(iter/s)": 0.203527 }, { "acc": 0.76563883, "epoch": 0.28510185333702737, "grad_norm": 5.6875, "learning_rate": 9.671469631267678e-06, "loss": 0.86306763, "memory(GiB)": 126.71, "step": 12220, "train_speed(iter/s)": 0.20361 }, { "acc": 0.74334183, "epoch": 0.28533516090931627, "grad_norm": 5.4375, "learning_rate": 9.67079582264583e-06, "loss": 0.91961136, "memory(GiB)": 126.71, "step": 12230, "train_speed(iter/s)": 0.203701 }, { "acc": 0.75210238, "epoch": 0.28556846848160516, "grad_norm": 5.40625, "learning_rate": 9.670121347269234e-06, "loss": 0.89075928, "memory(GiB)": 126.71, "step": 12240, "train_speed(iter/s)": 0.203785 }, { "acc": 0.74986658, "epoch": 0.28580177605389406, "grad_norm": 7.1875, "learning_rate": 9.669446205234172e-06, "loss": 0.89524193, "memory(GiB)": 126.71, "step": 12250, "train_speed(iter/s)": 0.203871 }, { "acc": 0.7721139, "epoch": 0.28603508362618296, "grad_norm": 5.4375, "learning_rate": 9.668770396637022e-06, "loss": 0.80716228, "memory(GiB)": 126.71, "step": 12260, "train_speed(iter/s)": 0.203959 }, { "acc": 0.78414478, "epoch": 0.28626839119847186, "grad_norm": 5.5625, "learning_rate": 9.668093921574253e-06, "loss": 0.78752213, "memory(GiB)": 126.71, "step": 12270, "train_speed(iter/s)": 0.20405 }, { "acc": 0.77421598, "epoch": 0.2865016987707607, "grad_norm": 6.28125, "learning_rate": 9.667416780142434e-06, "loss": 0.79807358, "memory(GiB)": 126.71, "step": 12280, "train_speed(iter/s)": 0.204142 }, { "acc": 0.7758357, "epoch": 0.2867350063430496, "grad_norm": 9.0625, "learning_rate": 9.666738972438224e-06, "loss": 0.80946007, "memory(GiB)": 126.71, "step": 12290, "train_speed(iter/s)": 0.204222 }, { "acc": 0.75358167, "epoch": 0.2869683139153385, "grad_norm": 4.625, "learning_rate": 9.666060498558381e-06, "loss": 0.88906536, "memory(GiB)": 126.71, "step": 12300, "train_speed(iter/s)": 0.20431 }, { "acc": 0.74659085, "epoch": 0.2872016214876274, "grad_norm": 5.65625, "learning_rate": 9.665381358599759e-06, "loss": 0.93836308, "memory(GiB)": 126.71, "step": 12310, "train_speed(iter/s)": 0.204393 }, { "acc": 0.77110682, "epoch": 0.2874349290599163, "grad_norm": 6.96875, "learning_rate": 9.664701552659303e-06, "loss": 0.83051643, "memory(GiB)": 126.71, "step": 12320, "train_speed(iter/s)": 0.204475 }, { "acc": 0.75053787, "epoch": 0.2876682366322052, "grad_norm": 7.03125, "learning_rate": 9.664021080834053e-06, "loss": 0.90708065, "memory(GiB)": 126.71, "step": 12330, "train_speed(iter/s)": 0.204557 }, { "acc": 0.76359439, "epoch": 0.2879015442044941, "grad_norm": 7.25, "learning_rate": 9.663339943221153e-06, "loss": 0.88234577, "memory(GiB)": 126.71, "step": 12340, "train_speed(iter/s)": 0.204633 }, { "acc": 0.75120668, "epoch": 0.288134851776783, "grad_norm": 5.28125, "learning_rate": 9.662658139917827e-06, "loss": 0.89714279, "memory(GiB)": 126.71, "step": 12350, "train_speed(iter/s)": 0.204714 }, { "acc": 0.75707302, "epoch": 0.2883681593490719, "grad_norm": 6.90625, "learning_rate": 9.661975671021408e-06, "loss": 0.8912262, "memory(GiB)": 126.71, "step": 12360, "train_speed(iter/s)": 0.204798 }, { "acc": 0.75848045, "epoch": 0.2886014669213608, "grad_norm": 7.65625, "learning_rate": 9.661292536629316e-06, "loss": 0.86687336, "memory(GiB)": 126.71, "step": 12370, "train_speed(iter/s)": 0.204885 }, { "acc": 0.78685079, "epoch": 0.28883477449364964, "grad_norm": 4.65625, "learning_rate": 9.660608736839067e-06, "loss": 0.77061977, "memory(GiB)": 126.71, "step": 12380, "train_speed(iter/s)": 0.20497 }, { "acc": 0.76425805, "epoch": 0.28906808206593854, "grad_norm": 7.59375, "learning_rate": 9.659924271748277e-06, "loss": 0.85489254, "memory(GiB)": 126.71, "step": 12390, "train_speed(iter/s)": 0.205062 }, { "acc": 0.76545777, "epoch": 0.28930138963822744, "grad_norm": 5.28125, "learning_rate": 9.65923914145465e-06, "loss": 0.82574024, "memory(GiB)": 126.71, "step": 12400, "train_speed(iter/s)": 0.205146 }, { "acc": 0.762534, "epoch": 0.28953469721051633, "grad_norm": 8.875, "learning_rate": 9.65855334605599e-06, "loss": 0.83932781, "memory(GiB)": 126.71, "step": 12410, "train_speed(iter/s)": 0.205233 }, { "acc": 0.77640924, "epoch": 0.28976800478280523, "grad_norm": 5.375, "learning_rate": 9.65786688565019e-06, "loss": 0.82643003, "memory(GiB)": 126.71, "step": 12420, "train_speed(iter/s)": 0.205318 }, { "acc": 0.77272964, "epoch": 0.29000131235509413, "grad_norm": 5.03125, "learning_rate": 9.65717976033525e-06, "loss": 0.81472321, "memory(GiB)": 126.71, "step": 12430, "train_speed(iter/s)": 0.205396 }, { "acc": 0.76347284, "epoch": 0.29023461992738303, "grad_norm": 5.84375, "learning_rate": 9.656491970209248e-06, "loss": 0.86536961, "memory(GiB)": 126.71, "step": 12440, "train_speed(iter/s)": 0.205482 }, { "acc": 0.7746274, "epoch": 0.29046792749967193, "grad_norm": 6.625, "learning_rate": 9.655803515370373e-06, "loss": 0.81599464, "memory(GiB)": 126.71, "step": 12450, "train_speed(iter/s)": 0.205569 }, { "acc": 0.77300863, "epoch": 0.29070123507196083, "grad_norm": 5.28125, "learning_rate": 9.655114395916896e-06, "loss": 0.8248374, "memory(GiB)": 126.71, "step": 12460, "train_speed(iter/s)": 0.205656 }, { "acc": 0.76894598, "epoch": 0.29093454264424967, "grad_norm": 5.84375, "learning_rate": 9.654424611947194e-06, "loss": 0.85667, "memory(GiB)": 126.71, "step": 12470, "train_speed(iter/s)": 0.205736 }, { "acc": 0.77182522, "epoch": 0.29116785021653857, "grad_norm": 6.75, "learning_rate": 9.65373416355973e-06, "loss": 0.83298454, "memory(GiB)": 126.71, "step": 12480, "train_speed(iter/s)": 0.205818 }, { "acc": 0.77251606, "epoch": 0.29140115778882747, "grad_norm": 4.8125, "learning_rate": 9.653043050853065e-06, "loss": 0.83165321, "memory(GiB)": 126.71, "step": 12490, "train_speed(iter/s)": 0.205904 }, { "acc": 0.78174143, "epoch": 0.29163446536111637, "grad_norm": 6.78125, "learning_rate": 9.652351273925854e-06, "loss": 0.80072803, "memory(GiB)": 126.71, "step": 12500, "train_speed(iter/s)": 0.205991 }, { "epoch": 0.29163446536111637, "eval_acc": 0.7307442102104234, "eval_loss": 0.8518584370613098, "eval_runtime": 1262.8812, "eval_samples_per_second": 28.499, "eval_steps_per_second": 14.25, "step": 12500 }, { "acc": 0.76996889, "epoch": 0.29186777293340527, "grad_norm": 9.0, "learning_rate": 9.651658832876853e-06, "loss": 0.8436965, "memory(GiB)": 126.71, "step": 12510, "train_speed(iter/s)": 0.201793 }, { "acc": 0.76791477, "epoch": 0.29210108050569417, "grad_norm": 5.46875, "learning_rate": 9.650965727804907e-06, "loss": 0.85333385, "memory(GiB)": 126.71, "step": 12520, "train_speed(iter/s)": 0.201879 }, { "acc": 0.76511497, "epoch": 0.29233438807798307, "grad_norm": 5.375, "learning_rate": 9.65027195880895e-06, "loss": 0.85215321, "memory(GiB)": 126.71, "step": 12530, "train_speed(iter/s)": 0.201962 }, { "acc": 0.73556137, "epoch": 0.29256769565027196, "grad_norm": 9.9375, "learning_rate": 9.649577525988025e-06, "loss": 0.97543221, "memory(GiB)": 126.71, "step": 12540, "train_speed(iter/s)": 0.202047 }, { "acc": 0.74816146, "epoch": 0.29280100322256086, "grad_norm": 7.1875, "learning_rate": 9.648882429441258e-06, "loss": 0.94485703, "memory(GiB)": 126.71, "step": 12550, "train_speed(iter/s)": 0.20213 }, { "acc": 0.75887346, "epoch": 0.29303431079484976, "grad_norm": 8.3125, "learning_rate": 9.648186669267874e-06, "loss": 0.87317448, "memory(GiB)": 126.71, "step": 12560, "train_speed(iter/s)": 0.202211 }, { "acc": 0.77297869, "epoch": 0.2932676183671386, "grad_norm": 10.625, "learning_rate": 9.647490245567194e-06, "loss": 0.8273922, "memory(GiB)": 126.71, "step": 12570, "train_speed(iter/s)": 0.2023 }, { "acc": 0.77770405, "epoch": 0.2935009259394275, "grad_norm": 9.9375, "learning_rate": 9.646793158438632e-06, "loss": 0.78639069, "memory(GiB)": 126.71, "step": 12580, "train_speed(iter/s)": 0.20238 }, { "acc": 0.78534679, "epoch": 0.2937342335117164, "grad_norm": 5.28125, "learning_rate": 9.646095407981695e-06, "loss": 0.77929201, "memory(GiB)": 126.71, "step": 12590, "train_speed(iter/s)": 0.20246 }, { "acc": 0.75119905, "epoch": 0.2939675410840053, "grad_norm": 10.9375, "learning_rate": 9.64539699429599e-06, "loss": 0.8930438, "memory(GiB)": 126.71, "step": 12600, "train_speed(iter/s)": 0.202542 }, { "acc": 0.76973743, "epoch": 0.2942008486562942, "grad_norm": 7.1875, "learning_rate": 9.644697917481212e-06, "loss": 0.83177662, "memory(GiB)": 126.71, "step": 12610, "train_speed(iter/s)": 0.202626 }, { "acc": 0.76759176, "epoch": 0.2944341562285831, "grad_norm": 5.6875, "learning_rate": 9.643998177637157e-06, "loss": 0.82939339, "memory(GiB)": 126.71, "step": 12620, "train_speed(iter/s)": 0.202713 }, { "acc": 0.76640143, "epoch": 0.294667463800872, "grad_norm": 9.4375, "learning_rate": 9.643297774863709e-06, "loss": 0.8526329, "memory(GiB)": 126.71, "step": 12630, "train_speed(iter/s)": 0.202801 }, { "acc": 0.74112091, "epoch": 0.2949007713731609, "grad_norm": 5.84375, "learning_rate": 9.642596709260854e-06, "loss": 0.94894772, "memory(GiB)": 126.71, "step": 12640, "train_speed(iter/s)": 0.202889 }, { "acc": 0.75334239, "epoch": 0.2951340789454498, "grad_norm": 9.0, "learning_rate": 9.641894980928668e-06, "loss": 0.90535126, "memory(GiB)": 126.71, "step": 12650, "train_speed(iter/s)": 0.202974 }, { "acc": 0.78220768, "epoch": 0.29536738651773864, "grad_norm": 6.0625, "learning_rate": 9.641192589967321e-06, "loss": 0.79351797, "memory(GiB)": 126.71, "step": 12660, "train_speed(iter/s)": 0.203059 }, { "acc": 0.76604443, "epoch": 0.29560069409002754, "grad_norm": 5.15625, "learning_rate": 9.64048953647708e-06, "loss": 0.83979473, "memory(GiB)": 126.71, "step": 12670, "train_speed(iter/s)": 0.203142 }, { "acc": 0.74809709, "epoch": 0.29583400166231644, "grad_norm": 5.71875, "learning_rate": 9.639785820558307e-06, "loss": 0.93830051, "memory(GiB)": 126.71, "step": 12680, "train_speed(iter/s)": 0.203228 }, { "acc": 0.76836653, "epoch": 0.29606730923460534, "grad_norm": 5.46875, "learning_rate": 9.639081442311456e-06, "loss": 0.83578634, "memory(GiB)": 126.71, "step": 12690, "train_speed(iter/s)": 0.20331 }, { "acc": 0.76343279, "epoch": 0.29630061680689423, "grad_norm": 4.3125, "learning_rate": 9.638376401837075e-06, "loss": 0.87286949, "memory(GiB)": 126.71, "step": 12700, "train_speed(iter/s)": 0.203392 }, { "acc": 0.76687584, "epoch": 0.29653392437918313, "grad_norm": 5.21875, "learning_rate": 9.63767069923581e-06, "loss": 0.83260441, "memory(GiB)": 126.71, "step": 12710, "train_speed(iter/s)": 0.203474 }, { "acc": 0.74487181, "epoch": 0.29676723195147203, "grad_norm": 6.15625, "learning_rate": 9.636964334608402e-06, "loss": 0.93563623, "memory(GiB)": 126.71, "step": 12720, "train_speed(iter/s)": 0.203556 }, { "acc": 0.75783634, "epoch": 0.29700053952376093, "grad_norm": 4.3125, "learning_rate": 9.636257308055682e-06, "loss": 0.87265339, "memory(GiB)": 126.71, "step": 12730, "train_speed(iter/s)": 0.203639 }, { "acc": 0.75335588, "epoch": 0.29723384709604983, "grad_norm": 4.71875, "learning_rate": 9.635549619678578e-06, "loss": 0.90690899, "memory(GiB)": 126.71, "step": 12740, "train_speed(iter/s)": 0.203717 }, { "acc": 0.76490326, "epoch": 0.29746715466833873, "grad_norm": 6.46875, "learning_rate": 9.63484126957811e-06, "loss": 0.86026697, "memory(GiB)": 126.71, "step": 12750, "train_speed(iter/s)": 0.2038 }, { "acc": 0.74564862, "epoch": 0.2977004622406276, "grad_norm": 6.46875, "learning_rate": 9.6341322578554e-06, "loss": 0.92997026, "memory(GiB)": 126.71, "step": 12760, "train_speed(iter/s)": 0.203882 }, { "acc": 0.77166858, "epoch": 0.29793376981291647, "grad_norm": 5.375, "learning_rate": 9.633422584611654e-06, "loss": 0.84390697, "memory(GiB)": 126.71, "step": 12770, "train_speed(iter/s)": 0.203966 }, { "acc": 0.76024094, "epoch": 0.29816707738520537, "grad_norm": 6.28125, "learning_rate": 9.632712249948182e-06, "loss": 0.88499451, "memory(GiB)": 126.71, "step": 12780, "train_speed(iter/s)": 0.204045 }, { "acc": 0.77673635, "epoch": 0.29840038495749427, "grad_norm": 6.09375, "learning_rate": 9.632001253966381e-06, "loss": 0.79480124, "memory(GiB)": 126.71, "step": 12790, "train_speed(iter/s)": 0.20413 }, { "acc": 0.77070308, "epoch": 0.29863369252978317, "grad_norm": 6.0, "learning_rate": 9.631289596767748e-06, "loss": 0.82058554, "memory(GiB)": 126.71, "step": 12800, "train_speed(iter/s)": 0.204206 }, { "acc": 0.79163589, "epoch": 0.29886700010207207, "grad_norm": 5.0625, "learning_rate": 9.63057727845387e-06, "loss": 0.74121714, "memory(GiB)": 126.71, "step": 12810, "train_speed(iter/s)": 0.204287 }, { "acc": 0.78729353, "epoch": 0.29910030767436097, "grad_norm": 8.3125, "learning_rate": 9.62986429912643e-06, "loss": 0.75445652, "memory(GiB)": 126.71, "step": 12820, "train_speed(iter/s)": 0.204369 }, { "acc": 0.76602516, "epoch": 0.29933361524664986, "grad_norm": 6.625, "learning_rate": 9.629150658887206e-06, "loss": 0.85484571, "memory(GiB)": 126.71, "step": 12830, "train_speed(iter/s)": 0.204442 }, { "acc": 0.7663785, "epoch": 0.29956692281893876, "grad_norm": 4.875, "learning_rate": 9.628436357838072e-06, "loss": 0.85716782, "memory(GiB)": 126.71, "step": 12840, "train_speed(iter/s)": 0.204517 }, { "acc": 0.77487249, "epoch": 0.29980023039122766, "grad_norm": 4.875, "learning_rate": 9.627721396080992e-06, "loss": 0.81936855, "memory(GiB)": 126.71, "step": 12850, "train_speed(iter/s)": 0.204602 }, { "acc": 0.76845889, "epoch": 0.3000335379635165, "grad_norm": 5.90625, "learning_rate": 9.627005773718026e-06, "loss": 0.82405758, "memory(GiB)": 126.71, "step": 12860, "train_speed(iter/s)": 0.204682 }, { "acc": 0.76396217, "epoch": 0.3002668455358054, "grad_norm": 6.1875, "learning_rate": 9.626289490851329e-06, "loss": 0.87437134, "memory(GiB)": 126.71, "step": 12870, "train_speed(iter/s)": 0.204763 }, { "acc": 0.77304287, "epoch": 0.3005001531080943, "grad_norm": 5.6875, "learning_rate": 9.625572547583153e-06, "loss": 0.81252775, "memory(GiB)": 126.71, "step": 12880, "train_speed(iter/s)": 0.204846 }, { "acc": 0.7688489, "epoch": 0.3007334606803832, "grad_norm": 18.625, "learning_rate": 9.624854944015839e-06, "loss": 0.83976612, "memory(GiB)": 126.71, "step": 12890, "train_speed(iter/s)": 0.204933 }, { "acc": 0.77317305, "epoch": 0.3009667682526721, "grad_norm": 6.8125, "learning_rate": 9.624136680251826e-06, "loss": 0.81203804, "memory(GiB)": 126.71, "step": 12900, "train_speed(iter/s)": 0.205015 }, { "acc": 0.76317711, "epoch": 0.301200075824961, "grad_norm": 7.0, "learning_rate": 9.623417756393644e-06, "loss": 0.87424259, "memory(GiB)": 126.71, "step": 12910, "train_speed(iter/s)": 0.2051 }, { "acc": 0.75433722, "epoch": 0.3014333833972499, "grad_norm": 28.875, "learning_rate": 9.622698172543921e-06, "loss": 0.89263592, "memory(GiB)": 126.71, "step": 12920, "train_speed(iter/s)": 0.205185 }, { "acc": 0.76284971, "epoch": 0.3016666909695388, "grad_norm": 6.65625, "learning_rate": 9.621977928805377e-06, "loss": 0.84346781, "memory(GiB)": 126.71, "step": 12930, "train_speed(iter/s)": 0.205273 }, { "acc": 0.777003, "epoch": 0.3018999985418277, "grad_norm": 5.53125, "learning_rate": 9.621257025280826e-06, "loss": 0.81495667, "memory(GiB)": 126.71, "step": 12940, "train_speed(iter/s)": 0.205353 }, { "acc": 0.76023765, "epoch": 0.30213330611411654, "grad_norm": 14.375, "learning_rate": 9.620535462073177e-06, "loss": 0.85683413, "memory(GiB)": 126.71, "step": 12950, "train_speed(iter/s)": 0.205434 }, { "acc": 0.75732594, "epoch": 0.30236661368640544, "grad_norm": 6.15625, "learning_rate": 9.619813239285433e-06, "loss": 0.88258152, "memory(GiB)": 126.71, "step": 12960, "train_speed(iter/s)": 0.205517 }, { "acc": 0.77402887, "epoch": 0.30259992125869434, "grad_norm": 5.4375, "learning_rate": 9.619090357020691e-06, "loss": 0.81312466, "memory(GiB)": 126.71, "step": 12970, "train_speed(iter/s)": 0.205595 }, { "acc": 0.79348712, "epoch": 0.30283322883098324, "grad_norm": 6.28125, "learning_rate": 9.618366815382143e-06, "loss": 0.74646282, "memory(GiB)": 126.71, "step": 12980, "train_speed(iter/s)": 0.205672 }, { "acc": 0.75259399, "epoch": 0.30306653640327214, "grad_norm": 5.71875, "learning_rate": 9.617642614473073e-06, "loss": 0.91070881, "memory(GiB)": 126.71, "step": 12990, "train_speed(iter/s)": 0.205755 }, { "acc": 0.75765018, "epoch": 0.30329984397556103, "grad_norm": 5.34375, "learning_rate": 9.616917754396861e-06, "loss": 0.85043106, "memory(GiB)": 126.71, "step": 13000, "train_speed(iter/s)": 0.205837 }, { "epoch": 0.30329984397556103, "eval_acc": 0.730756956298665, "eval_loss": 0.8516232371330261, "eval_runtime": 1262.1008, "eval_samples_per_second": 28.517, "eval_steps_per_second": 14.259, "step": 13000 }, { "acc": 0.7704545, "epoch": 0.30353315154784993, "grad_norm": 5.1875, "learning_rate": 9.616192235256983e-06, "loss": 0.82055855, "memory(GiB)": 126.71, "step": 13010, "train_speed(iter/s)": 0.201803 }, { "acc": 0.74478865, "epoch": 0.30376645912013883, "grad_norm": 6.25, "learning_rate": 9.615466057157002e-06, "loss": 0.93407211, "memory(GiB)": 126.71, "step": 13020, "train_speed(iter/s)": 0.201882 }, { "acc": 0.76167407, "epoch": 0.30399976669242773, "grad_norm": 6.1875, "learning_rate": 9.614739220200583e-06, "loss": 0.87905254, "memory(GiB)": 126.71, "step": 13030, "train_speed(iter/s)": 0.201957 }, { "acc": 0.74936733, "epoch": 0.30423307426471663, "grad_norm": 15.1875, "learning_rate": 9.614011724491481e-06, "loss": 0.91672173, "memory(GiB)": 126.71, "step": 13040, "train_speed(iter/s)": 0.202036 }, { "acc": 0.74608107, "epoch": 0.3044663818370055, "grad_norm": 6.6875, "learning_rate": 9.613283570133547e-06, "loss": 0.90242882, "memory(GiB)": 126.71, "step": 13050, "train_speed(iter/s)": 0.202116 }, { "acc": 0.77755613, "epoch": 0.30469968940929437, "grad_norm": 7.3125, "learning_rate": 9.612554757230722e-06, "loss": 0.81360474, "memory(GiB)": 126.71, "step": 13060, "train_speed(iter/s)": 0.202198 }, { "acc": 0.77748594, "epoch": 0.30493299698158327, "grad_norm": 5.03125, "learning_rate": 9.611825285887045e-06, "loss": 0.79001107, "memory(GiB)": 126.71, "step": 13070, "train_speed(iter/s)": 0.202269 }, { "acc": 0.75951447, "epoch": 0.30516630455387217, "grad_norm": 5.84375, "learning_rate": 9.61109515620665e-06, "loss": 0.86944599, "memory(GiB)": 126.71, "step": 13080, "train_speed(iter/s)": 0.202352 }, { "acc": 0.78131723, "epoch": 0.30539961212616107, "grad_norm": 5.28125, "learning_rate": 9.61036436829376e-06, "loss": 0.78312073, "memory(GiB)": 126.71, "step": 13090, "train_speed(iter/s)": 0.202429 }, { "acc": 0.76216078, "epoch": 0.30563291969844997, "grad_norm": 9.3125, "learning_rate": 9.609632922252695e-06, "loss": 0.86766243, "memory(GiB)": 126.71, "step": 13100, "train_speed(iter/s)": 0.202507 }, { "acc": 0.76239228, "epoch": 0.30586622727073887, "grad_norm": 6.5, "learning_rate": 9.60890081818787e-06, "loss": 0.85454035, "memory(GiB)": 126.71, "step": 13110, "train_speed(iter/s)": 0.20259 }, { "acc": 0.77233567, "epoch": 0.30609953484302777, "grad_norm": 5.0625, "learning_rate": 9.608168056203792e-06, "loss": 0.79431753, "memory(GiB)": 126.71, "step": 13120, "train_speed(iter/s)": 0.202669 }, { "acc": 0.7572166, "epoch": 0.30633284241531666, "grad_norm": 5.84375, "learning_rate": 9.607434636405063e-06, "loss": 0.90028782, "memory(GiB)": 126.71, "step": 13130, "train_speed(iter/s)": 0.20275 }, { "acc": 0.79457684, "epoch": 0.30656614998760556, "grad_norm": 5.5625, "learning_rate": 9.606700558896376e-06, "loss": 0.72691998, "memory(GiB)": 126.71, "step": 13140, "train_speed(iter/s)": 0.202831 }, { "acc": 0.76088514, "epoch": 0.3067994575598944, "grad_norm": 18.0, "learning_rate": 9.605965823782525e-06, "loss": 0.88303051, "memory(GiB)": 126.71, "step": 13150, "train_speed(iter/s)": 0.202907 }, { "acc": 0.76800742, "epoch": 0.3070327651321833, "grad_norm": 30.125, "learning_rate": 9.605230431168391e-06, "loss": 0.83991909, "memory(GiB)": 126.71, "step": 13160, "train_speed(iter/s)": 0.202984 }, { "acc": 0.77626777, "epoch": 0.3072660727044722, "grad_norm": 17.625, "learning_rate": 9.604494381158949e-06, "loss": 0.81324158, "memory(GiB)": 126.71, "step": 13170, "train_speed(iter/s)": 0.203062 }, { "acc": 0.76364522, "epoch": 0.3074993802767611, "grad_norm": 6.34375, "learning_rate": 9.603757673859274e-06, "loss": 0.86209192, "memory(GiB)": 126.71, "step": 13180, "train_speed(iter/s)": 0.20314 }, { "acc": 0.75070577, "epoch": 0.30773268784905, "grad_norm": 11.0, "learning_rate": 9.603020309374526e-06, "loss": 0.92564697, "memory(GiB)": 126.71, "step": 13190, "train_speed(iter/s)": 0.203223 }, { "acc": 0.76647429, "epoch": 0.3079659954213389, "grad_norm": 5.96875, "learning_rate": 9.602282287809966e-06, "loss": 0.90197659, "memory(GiB)": 126.71, "step": 13200, "train_speed(iter/s)": 0.203304 }, { "acc": 0.76064634, "epoch": 0.3081993029936278, "grad_norm": 6.3125, "learning_rate": 9.601543609270947e-06, "loss": 0.88957014, "memory(GiB)": 126.71, "step": 13210, "train_speed(iter/s)": 0.203381 }, { "acc": 0.75101113, "epoch": 0.3084326105659167, "grad_norm": 8.0, "learning_rate": 9.600804273862917e-06, "loss": 0.91185532, "memory(GiB)": 126.71, "step": 13220, "train_speed(iter/s)": 0.203464 }, { "acc": 0.75770144, "epoch": 0.3086659181382056, "grad_norm": 11.25, "learning_rate": 9.60006428169141e-06, "loss": 0.87609825, "memory(GiB)": 126.71, "step": 13230, "train_speed(iter/s)": 0.203538 }, { "acc": 0.78372154, "epoch": 0.30889922571049444, "grad_norm": 5.90625, "learning_rate": 9.599323632862063e-06, "loss": 0.77824244, "memory(GiB)": 126.71, "step": 13240, "train_speed(iter/s)": 0.203618 }, { "acc": 0.76620092, "epoch": 0.30913253328278334, "grad_norm": 7.34375, "learning_rate": 9.598582327480605e-06, "loss": 0.84992428, "memory(GiB)": 126.71, "step": 13250, "train_speed(iter/s)": 0.203698 }, { "acc": 0.76315026, "epoch": 0.30936584085507224, "grad_norm": 6.09375, "learning_rate": 9.597840365652857e-06, "loss": 0.855336, "memory(GiB)": 126.71, "step": 13260, "train_speed(iter/s)": 0.203775 }, { "acc": 0.76535864, "epoch": 0.30959914842736114, "grad_norm": 6.21875, "learning_rate": 9.597097747484731e-06, "loss": 0.84769878, "memory(GiB)": 126.71, "step": 13270, "train_speed(iter/s)": 0.203854 }, { "acc": 0.77613759, "epoch": 0.30983245599965004, "grad_norm": 5.53125, "learning_rate": 9.596354473082237e-06, "loss": 0.82380571, "memory(GiB)": 126.71, "step": 13280, "train_speed(iter/s)": 0.203933 }, { "acc": 0.77198167, "epoch": 0.31006576357193893, "grad_norm": 6.15625, "learning_rate": 9.595610542551476e-06, "loss": 0.82092457, "memory(GiB)": 126.71, "step": 13290, "train_speed(iter/s)": 0.204012 }, { "acc": 0.76140842, "epoch": 0.31029907114422783, "grad_norm": 6.34375, "learning_rate": 9.594865955998648e-06, "loss": 0.86993189, "memory(GiB)": 126.71, "step": 13300, "train_speed(iter/s)": 0.204094 }, { "acc": 0.76021256, "epoch": 0.31053237871651673, "grad_norm": 5.6875, "learning_rate": 9.594120713530038e-06, "loss": 0.89229202, "memory(GiB)": 126.71, "step": 13310, "train_speed(iter/s)": 0.204176 }, { "acc": 0.75621815, "epoch": 0.31076568628880563, "grad_norm": 4.59375, "learning_rate": 9.59337481525203e-06, "loss": 0.86857967, "memory(GiB)": 126.71, "step": 13320, "train_speed(iter/s)": 0.204251 }, { "acc": 0.76059723, "epoch": 0.31099899386109453, "grad_norm": 5.6875, "learning_rate": 9.592628261271102e-06, "loss": 0.89437885, "memory(GiB)": 126.71, "step": 13330, "train_speed(iter/s)": 0.204333 }, { "acc": 0.76669917, "epoch": 0.3112323014333834, "grad_norm": 7.03125, "learning_rate": 9.591881051693826e-06, "loss": 0.85553665, "memory(GiB)": 126.71, "step": 13340, "train_speed(iter/s)": 0.204405 }, { "acc": 0.7753427, "epoch": 0.3114656090056723, "grad_norm": 7.9375, "learning_rate": 9.591133186626861e-06, "loss": 0.83228683, "memory(GiB)": 126.71, "step": 13350, "train_speed(iter/s)": 0.204485 }, { "acc": 0.75235758, "epoch": 0.31169891657796117, "grad_norm": 6.03125, "learning_rate": 9.590384666176968e-06, "loss": 0.90382633, "memory(GiB)": 126.71, "step": 13360, "train_speed(iter/s)": 0.204563 }, { "acc": 0.75613799, "epoch": 0.31193222415025007, "grad_norm": 4.375, "learning_rate": 9.589635490450999e-06, "loss": 0.86439152, "memory(GiB)": 126.71, "step": 13370, "train_speed(iter/s)": 0.204636 }, { "acc": 0.7420558, "epoch": 0.31216553172253897, "grad_norm": 5.1875, "learning_rate": 9.588885659555895e-06, "loss": 0.95892324, "memory(GiB)": 126.71, "step": 13380, "train_speed(iter/s)": 0.204715 }, { "acc": 0.75670547, "epoch": 0.31239883929482787, "grad_norm": 7.53125, "learning_rate": 9.588135173598696e-06, "loss": 0.86891193, "memory(GiB)": 126.71, "step": 13390, "train_speed(iter/s)": 0.204793 }, { "acc": 0.76449828, "epoch": 0.31263214686711677, "grad_norm": 6.0625, "learning_rate": 9.587384032686536e-06, "loss": 0.83516788, "memory(GiB)": 126.71, "step": 13400, "train_speed(iter/s)": 0.204871 }, { "acc": 0.78461428, "epoch": 0.31286545443940567, "grad_norm": 9.125, "learning_rate": 9.586632236926637e-06, "loss": 0.78171921, "memory(GiB)": 126.71, "step": 13410, "train_speed(iter/s)": 0.204944 }, { "acc": 0.75920763, "epoch": 0.31309876201169456, "grad_norm": 6.125, "learning_rate": 9.585879786426317e-06, "loss": 0.86515121, "memory(GiB)": 126.71, "step": 13420, "train_speed(iter/s)": 0.205027 }, { "acc": 0.77879372, "epoch": 0.3133320695839834, "grad_norm": 3.90625, "learning_rate": 9.585126681292991e-06, "loss": 0.80493803, "memory(GiB)": 126.71, "step": 13430, "train_speed(iter/s)": 0.205108 }, { "acc": 0.77412901, "epoch": 0.3135653771562723, "grad_norm": 5.65625, "learning_rate": 9.584372921634164e-06, "loss": 0.81880493, "memory(GiB)": 126.71, "step": 13440, "train_speed(iter/s)": 0.205193 }, { "acc": 0.76328559, "epoch": 0.3137986847285612, "grad_norm": 6.375, "learning_rate": 9.583618507557433e-06, "loss": 0.86565266, "memory(GiB)": 126.71, "step": 13450, "train_speed(iter/s)": 0.205276 }, { "acc": 0.76049929, "epoch": 0.3140319923008501, "grad_norm": 5.59375, "learning_rate": 9.582863439170493e-06, "loss": 0.88368587, "memory(GiB)": 126.71, "step": 13460, "train_speed(iter/s)": 0.205352 }, { "acc": 0.77567883, "epoch": 0.314265299873139, "grad_norm": 5.46875, "learning_rate": 9.582107716581125e-06, "loss": 0.82262173, "memory(GiB)": 126.71, "step": 13470, "train_speed(iter/s)": 0.205428 }, { "acc": 0.77586155, "epoch": 0.3144986074454279, "grad_norm": 6.0, "learning_rate": 9.581351339897215e-06, "loss": 0.82963734, "memory(GiB)": 126.71, "step": 13480, "train_speed(iter/s)": 0.205503 }, { "acc": 0.77103205, "epoch": 0.3147319150177168, "grad_norm": 5.375, "learning_rate": 9.580594309226731e-06, "loss": 0.82151461, "memory(GiB)": 126.71, "step": 13490, "train_speed(iter/s)": 0.205582 }, { "acc": 0.74852571, "epoch": 0.3149652225900057, "grad_norm": 7.3125, "learning_rate": 9.579836624677742e-06, "loss": 0.91936827, "memory(GiB)": 126.71, "step": 13500, "train_speed(iter/s)": 0.205665 }, { "epoch": 0.3149652225900057, "eval_acc": 0.7313189135815217, "eval_loss": 0.8501865863800049, "eval_runtime": 1264.147, "eval_samples_per_second": 28.471, "eval_steps_per_second": 14.236, "step": 13500 }, { "acc": 0.7609498, "epoch": 0.3151985301622946, "grad_norm": 4.0625, "learning_rate": 9.579078286358403e-06, "loss": 0.86315899, "memory(GiB)": 126.71, "step": 13510, "train_speed(iter/s)": 0.201778 }, { "acc": 0.76488066, "epoch": 0.3154318377345835, "grad_norm": 5.75, "learning_rate": 9.578319294376968e-06, "loss": 0.84903994, "memory(GiB)": 126.71, "step": 13520, "train_speed(iter/s)": 0.201855 }, { "acc": 0.77556419, "epoch": 0.31566514530687234, "grad_norm": 5.28125, "learning_rate": 9.577559648841785e-06, "loss": 0.80430641, "memory(GiB)": 126.71, "step": 13530, "train_speed(iter/s)": 0.201938 }, { "acc": 0.75571804, "epoch": 0.31589845287916124, "grad_norm": 6.59375, "learning_rate": 9.576799349861292e-06, "loss": 0.88906021, "memory(GiB)": 126.71, "step": 13540, "train_speed(iter/s)": 0.202015 }, { "acc": 0.76867266, "epoch": 0.31613176045145014, "grad_norm": 5.5, "learning_rate": 9.576038397544021e-06, "loss": 0.84826012, "memory(GiB)": 126.71, "step": 13550, "train_speed(iter/s)": 0.202095 }, { "acc": 0.77901011, "epoch": 0.31636506802373904, "grad_norm": 5.40625, "learning_rate": 9.5752767919986e-06, "loss": 0.81336555, "memory(GiB)": 126.71, "step": 13560, "train_speed(iter/s)": 0.202173 }, { "acc": 0.76742849, "epoch": 0.31659837559602794, "grad_norm": 5.375, "learning_rate": 9.574514533333744e-06, "loss": 0.86053085, "memory(GiB)": 126.71, "step": 13570, "train_speed(iter/s)": 0.202251 }, { "acc": 0.7592463, "epoch": 0.31683168316831684, "grad_norm": 5.15625, "learning_rate": 9.573751621658267e-06, "loss": 0.88185768, "memory(GiB)": 126.71, "step": 13580, "train_speed(iter/s)": 0.202334 }, { "acc": 0.78105249, "epoch": 0.31706499074060573, "grad_norm": 7.5625, "learning_rate": 9.572988057081076e-06, "loss": 0.78568778, "memory(GiB)": 126.71, "step": 13590, "train_speed(iter/s)": 0.202413 }, { "acc": 0.7570931, "epoch": 0.31729829831289463, "grad_norm": 5.875, "learning_rate": 9.572223839711168e-06, "loss": 0.89775696, "memory(GiB)": 126.71, "step": 13600, "train_speed(iter/s)": 0.202493 }, { "acc": 0.76521273, "epoch": 0.31753160588518353, "grad_norm": 5.15625, "learning_rate": 9.571458969657634e-06, "loss": 0.84838772, "memory(GiB)": 126.71, "step": 13610, "train_speed(iter/s)": 0.202574 }, { "acc": 0.79602318, "epoch": 0.31776491345747243, "grad_norm": 7.09375, "learning_rate": 9.570693447029662e-06, "loss": 0.72170115, "memory(GiB)": 126.71, "step": 13620, "train_speed(iter/s)": 0.202649 }, { "acc": 0.78130589, "epoch": 0.3179982210297613, "grad_norm": 5.15625, "learning_rate": 9.569927271936528e-06, "loss": 0.77269769, "memory(GiB)": 126.71, "step": 13630, "train_speed(iter/s)": 0.202728 }, { "acc": 0.75481219, "epoch": 0.3182315286020502, "grad_norm": 4.84375, "learning_rate": 9.569160444487602e-06, "loss": 0.87458229, "memory(GiB)": 126.71, "step": 13640, "train_speed(iter/s)": 0.202807 }, { "acc": 0.7716711, "epoch": 0.31846483617433907, "grad_norm": 6.15625, "learning_rate": 9.56839296479235e-06, "loss": 0.80424881, "memory(GiB)": 126.71, "step": 13650, "train_speed(iter/s)": 0.202886 }, { "acc": 0.78376093, "epoch": 0.31869814374662797, "grad_norm": 19.25, "learning_rate": 9.56762483296033e-06, "loss": 0.75964251, "memory(GiB)": 126.71, "step": 13660, "train_speed(iter/s)": 0.202966 }, { "acc": 0.76077061, "epoch": 0.31893145131891687, "grad_norm": 10.0625, "learning_rate": 9.566856049101192e-06, "loss": 0.8783843, "memory(GiB)": 126.71, "step": 13670, "train_speed(iter/s)": 0.203039 }, { "acc": 0.78786325, "epoch": 0.31916475889120577, "grad_norm": 6.59375, "learning_rate": 9.56608661332468e-06, "loss": 0.7712101, "memory(GiB)": 126.71, "step": 13680, "train_speed(iter/s)": 0.20311 }, { "acc": 0.78044405, "epoch": 0.31939806646349467, "grad_norm": 5.0, "learning_rate": 9.56531652574063e-06, "loss": 0.80272665, "memory(GiB)": 126.71, "step": 13690, "train_speed(iter/s)": 0.203186 }, { "acc": 0.77443938, "epoch": 0.31963137403578357, "grad_norm": 8.9375, "learning_rate": 9.564545786458971e-06, "loss": 0.80842047, "memory(GiB)": 126.71, "step": 13700, "train_speed(iter/s)": 0.203263 }, { "acc": 0.75848961, "epoch": 0.31986468160807247, "grad_norm": 6.34375, "learning_rate": 9.563774395589728e-06, "loss": 0.87172365, "memory(GiB)": 126.71, "step": 13710, "train_speed(iter/s)": 0.203337 }, { "acc": 0.77432227, "epoch": 0.3200979891803613, "grad_norm": 6.1875, "learning_rate": 9.563002353243019e-06, "loss": 0.81702223, "memory(GiB)": 126.71, "step": 13720, "train_speed(iter/s)": 0.203412 }, { "acc": 0.74872255, "epoch": 0.3203312967526502, "grad_norm": 5.46875, "learning_rate": 9.562229659529046e-06, "loss": 0.93476353, "memory(GiB)": 126.71, "step": 13730, "train_speed(iter/s)": 0.203494 }, { "acc": 0.77448835, "epoch": 0.3205646043249391, "grad_norm": 6.21875, "learning_rate": 9.561456314558116e-06, "loss": 0.82855978, "memory(GiB)": 126.71, "step": 13740, "train_speed(iter/s)": 0.203569 }, { "acc": 0.74801579, "epoch": 0.320797911897228, "grad_norm": 5.9375, "learning_rate": 9.560682318440619e-06, "loss": 0.88658791, "memory(GiB)": 126.71, "step": 13750, "train_speed(iter/s)": 0.203647 }, { "acc": 0.76101542, "epoch": 0.3210312194695169, "grad_norm": 8.1875, "learning_rate": 9.55990767128705e-06, "loss": 0.86994371, "memory(GiB)": 126.71, "step": 13760, "train_speed(iter/s)": 0.203728 }, { "acc": 0.76376743, "epoch": 0.3212645270418058, "grad_norm": 5.4375, "learning_rate": 9.559132373207984e-06, "loss": 0.84964695, "memory(GiB)": 126.71, "step": 13770, "train_speed(iter/s)": 0.203802 }, { "acc": 0.77500238, "epoch": 0.3214978346140947, "grad_norm": 7.15625, "learning_rate": 9.558356424314095e-06, "loss": 0.81947775, "memory(GiB)": 126.71, "step": 13780, "train_speed(iter/s)": 0.20388 }, { "acc": 0.76300344, "epoch": 0.3217311421863836, "grad_norm": 5.375, "learning_rate": 9.557579824716152e-06, "loss": 0.85338078, "memory(GiB)": 126.71, "step": 13790, "train_speed(iter/s)": 0.203959 }, { "acc": 0.77518468, "epoch": 0.3219644497586725, "grad_norm": 6.28125, "learning_rate": 9.556802574525013e-06, "loss": 0.81727552, "memory(GiB)": 126.71, "step": 13800, "train_speed(iter/s)": 0.204036 }, { "acc": 0.75615144, "epoch": 0.3221977573309614, "grad_norm": 4.34375, "learning_rate": 9.556024673851629e-06, "loss": 0.88319979, "memory(GiB)": 126.71, "step": 13810, "train_speed(iter/s)": 0.204113 }, { "acc": 0.76182871, "epoch": 0.32243106490325024, "grad_norm": 8.3125, "learning_rate": 9.555246122807047e-06, "loss": 0.85009804, "memory(GiB)": 126.71, "step": 13820, "train_speed(iter/s)": 0.204185 }, { "acc": 0.78341169, "epoch": 0.32266437247553914, "grad_norm": 6.15625, "learning_rate": 9.554466921502405e-06, "loss": 0.77107029, "memory(GiB)": 126.71, "step": 13830, "train_speed(iter/s)": 0.204264 }, { "acc": 0.7734736, "epoch": 0.32289768004782804, "grad_norm": 6.90625, "learning_rate": 9.553687070048934e-06, "loss": 0.81421814, "memory(GiB)": 126.71, "step": 13840, "train_speed(iter/s)": 0.204342 }, { "acc": 0.75123091, "epoch": 0.32313098762011694, "grad_norm": 8.6875, "learning_rate": 9.552906568557953e-06, "loss": 0.91024504, "memory(GiB)": 126.71, "step": 13850, "train_speed(iter/s)": 0.204416 }, { "acc": 0.77448788, "epoch": 0.32336429519240584, "grad_norm": 6.09375, "learning_rate": 9.552125417140885e-06, "loss": 0.8000905, "memory(GiB)": 126.71, "step": 13860, "train_speed(iter/s)": 0.204495 }, { "acc": 0.76540542, "epoch": 0.32359760276469474, "grad_norm": 8.0, "learning_rate": 9.551343615909236e-06, "loss": 0.82403202, "memory(GiB)": 126.71, "step": 13870, "train_speed(iter/s)": 0.204566 }, { "acc": 0.76381636, "epoch": 0.32383091033698364, "grad_norm": 4.46875, "learning_rate": 9.550561164974606e-06, "loss": 0.84734306, "memory(GiB)": 126.71, "step": 13880, "train_speed(iter/s)": 0.204641 }, { "acc": 0.75473967, "epoch": 0.32406421790927253, "grad_norm": 7.90625, "learning_rate": 9.549778064448693e-06, "loss": 0.88311577, "memory(GiB)": 126.71, "step": 13890, "train_speed(iter/s)": 0.204721 }, { "acc": 0.77544069, "epoch": 0.32429752548156143, "grad_norm": 7.03125, "learning_rate": 9.548994314443284e-06, "loss": 0.82116642, "memory(GiB)": 126.71, "step": 13900, "train_speed(iter/s)": 0.2048 }, { "acc": 0.75847282, "epoch": 0.32453083305385033, "grad_norm": 7.21875, "learning_rate": 9.548209915070256e-06, "loss": 0.88838072, "memory(GiB)": 126.71, "step": 13910, "train_speed(iter/s)": 0.204872 }, { "acc": 0.76282072, "epoch": 0.3247641406261392, "grad_norm": 5.90625, "learning_rate": 9.547424866441586e-06, "loss": 0.85284786, "memory(GiB)": 126.71, "step": 13920, "train_speed(iter/s)": 0.204947 }, { "acc": 0.77322469, "epoch": 0.3249974481984281, "grad_norm": 6.03125, "learning_rate": 9.546639168669336e-06, "loss": 0.81482582, "memory(GiB)": 126.71, "step": 13930, "train_speed(iter/s)": 0.205024 }, { "acc": 0.7548861, "epoch": 0.325230755770717, "grad_norm": 6.4375, "learning_rate": 9.545852821865667e-06, "loss": 0.88894806, "memory(GiB)": 126.71, "step": 13940, "train_speed(iter/s)": 0.2051 }, { "acc": 0.74424677, "epoch": 0.32546406334300587, "grad_norm": 7.65625, "learning_rate": 9.545065826142825e-06, "loss": 0.93987856, "memory(GiB)": 126.71, "step": 13950, "train_speed(iter/s)": 0.205176 }, { "acc": 0.7565937, "epoch": 0.32569737091529477, "grad_norm": 6.75, "learning_rate": 9.544278181613158e-06, "loss": 0.87104702, "memory(GiB)": 126.71, "step": 13960, "train_speed(iter/s)": 0.205252 }, { "acc": 0.75203457, "epoch": 0.32593067848758367, "grad_norm": 4.375, "learning_rate": 9.543489888389103e-06, "loss": 0.91072731, "memory(GiB)": 126.71, "step": 13970, "train_speed(iter/s)": 0.205328 }, { "acc": 0.77056246, "epoch": 0.32616398605987257, "grad_norm": 5.0, "learning_rate": 9.542700946583184e-06, "loss": 0.82184029, "memory(GiB)": 126.71, "step": 13980, "train_speed(iter/s)": 0.205406 }, { "acc": 0.7491714, "epoch": 0.32639729363216147, "grad_norm": 5.71875, "learning_rate": 9.541911356308025e-06, "loss": 0.90672054, "memory(GiB)": 126.71, "step": 13990, "train_speed(iter/s)": 0.205486 }, { "acc": 0.75829577, "epoch": 0.32663060120445037, "grad_norm": 5.34375, "learning_rate": 9.541121117676339e-06, "loss": 0.89405813, "memory(GiB)": 126.71, "step": 14000, "train_speed(iter/s)": 0.205562 }, { "epoch": 0.32663060120445037, "eval_acc": 0.7314003917405348, "eval_loss": 0.8493373990058899, "eval_runtime": 1263.2629, "eval_samples_per_second": 28.491, "eval_steps_per_second": 14.246, "step": 14000 }, { "acc": 0.77907901, "epoch": 0.3268639087767392, "grad_norm": 4.875, "learning_rate": 9.540330230800935e-06, "loss": 0.80598106, "memory(GiB)": 126.71, "step": 14010, "train_speed(iter/s)": 0.201819 }, { "acc": 0.77135057, "epoch": 0.3270972163490281, "grad_norm": 6.0625, "learning_rate": 9.539538695794708e-06, "loss": 0.8170599, "memory(GiB)": 126.71, "step": 14020, "train_speed(iter/s)": 0.201898 }, { "acc": 0.7746273, "epoch": 0.327330523921317, "grad_norm": 4.28125, "learning_rate": 9.53874651277065e-06, "loss": 0.81331491, "memory(GiB)": 126.71, "step": 14030, "train_speed(iter/s)": 0.201975 }, { "acc": 0.79704523, "epoch": 0.3275638314936059, "grad_norm": 6.875, "learning_rate": 9.537953681841847e-06, "loss": 0.73362274, "memory(GiB)": 126.71, "step": 14040, "train_speed(iter/s)": 0.202046 }, { "acc": 0.76621151, "epoch": 0.3277971390658948, "grad_norm": 5.28125, "learning_rate": 9.537160203121474e-06, "loss": 0.85258369, "memory(GiB)": 126.71, "step": 14050, "train_speed(iter/s)": 0.20212 }, { "acc": 0.78894291, "epoch": 0.3280304466381837, "grad_norm": 7.84375, "learning_rate": 9.536366076722799e-06, "loss": 0.7563345, "memory(GiB)": 126.71, "step": 14060, "train_speed(iter/s)": 0.20219 }, { "acc": 0.76801329, "epoch": 0.3282637542104726, "grad_norm": 5.4375, "learning_rate": 9.535571302759184e-06, "loss": 0.83501291, "memory(GiB)": 126.71, "step": 14070, "train_speed(iter/s)": 0.202261 }, { "acc": 0.76370468, "epoch": 0.3284970617827615, "grad_norm": 7.375, "learning_rate": 9.534775881344086e-06, "loss": 0.87775526, "memory(GiB)": 126.71, "step": 14080, "train_speed(iter/s)": 0.202331 }, { "acc": 0.76569576, "epoch": 0.3287303693550504, "grad_norm": 6.40625, "learning_rate": 9.533979812591046e-06, "loss": 0.82789497, "memory(GiB)": 126.71, "step": 14090, "train_speed(iter/s)": 0.202408 }, { "acc": 0.75989351, "epoch": 0.3289636769273393, "grad_norm": 7.375, "learning_rate": 9.533183096613705e-06, "loss": 0.87083883, "memory(GiB)": 126.71, "step": 14100, "train_speed(iter/s)": 0.202485 }, { "acc": 0.76449294, "epoch": 0.32919698449962814, "grad_norm": 4.46875, "learning_rate": 9.532385733525793e-06, "loss": 0.85408449, "memory(GiB)": 126.71, "step": 14110, "train_speed(iter/s)": 0.202564 }, { "acc": 0.78696404, "epoch": 0.32943029207191704, "grad_norm": 4.34375, "learning_rate": 9.531587723441136e-06, "loss": 0.7695261, "memory(GiB)": 126.71, "step": 14120, "train_speed(iter/s)": 0.202632 }, { "acc": 0.76718712, "epoch": 0.32966359964420594, "grad_norm": 6.46875, "learning_rate": 9.530789066473648e-06, "loss": 0.84387932, "memory(GiB)": 126.71, "step": 14130, "train_speed(iter/s)": 0.202706 }, { "acc": 0.74890509, "epoch": 0.32989690721649484, "grad_norm": 6.40625, "learning_rate": 9.529989762737336e-06, "loss": 0.90712261, "memory(GiB)": 126.71, "step": 14140, "train_speed(iter/s)": 0.202786 }, { "acc": 0.74578581, "epoch": 0.33013021478878374, "grad_norm": 6.875, "learning_rate": 9.529189812346303e-06, "loss": 0.92681608, "memory(GiB)": 126.71, "step": 14150, "train_speed(iter/s)": 0.202858 }, { "acc": 0.77251573, "epoch": 0.33036352236107264, "grad_norm": 5.75, "learning_rate": 9.528389215414737e-06, "loss": 0.83133421, "memory(GiB)": 126.71, "step": 14160, "train_speed(iter/s)": 0.202934 }, { "acc": 0.78011246, "epoch": 0.33059682993336154, "grad_norm": 8.0625, "learning_rate": 9.527587972056929e-06, "loss": 0.78037252, "memory(GiB)": 126.71, "step": 14170, "train_speed(iter/s)": 0.203011 }, { "acc": 0.76995535, "epoch": 0.33083013750565043, "grad_norm": 7.40625, "learning_rate": 9.526786082387251e-06, "loss": 0.79973135, "memory(GiB)": 126.71, "step": 14180, "train_speed(iter/s)": 0.203084 }, { "acc": 0.76955757, "epoch": 0.33106344507793933, "grad_norm": 5.6875, "learning_rate": 9.525983546520176e-06, "loss": 0.84809074, "memory(GiB)": 126.71, "step": 14190, "train_speed(iter/s)": 0.203158 }, { "acc": 0.75482817, "epoch": 0.3312967526502282, "grad_norm": 8.1875, "learning_rate": 9.525180364570265e-06, "loss": 0.90241508, "memory(GiB)": 126.71, "step": 14200, "train_speed(iter/s)": 0.203234 }, { "acc": 0.76044388, "epoch": 0.3315300602225171, "grad_norm": 4.96875, "learning_rate": 9.52437653665217e-06, "loss": 0.88495045, "memory(GiB)": 126.71, "step": 14210, "train_speed(iter/s)": 0.203312 }, { "acc": 0.77102056, "epoch": 0.331763367794806, "grad_norm": 5.1875, "learning_rate": 9.52357206288064e-06, "loss": 0.8409708, "memory(GiB)": 126.71, "step": 14220, "train_speed(iter/s)": 0.203389 }, { "acc": 0.76042128, "epoch": 0.3319966753670949, "grad_norm": 8.125, "learning_rate": 9.522766943370512e-06, "loss": 0.8511735, "memory(GiB)": 126.71, "step": 14230, "train_speed(iter/s)": 0.203461 }, { "acc": 0.77194614, "epoch": 0.33222998293938377, "grad_norm": 4.875, "learning_rate": 9.521961178236716e-06, "loss": 0.83084736, "memory(GiB)": 126.71, "step": 14240, "train_speed(iter/s)": 0.203532 }, { "acc": 0.76267915, "epoch": 0.33246329051167267, "grad_norm": 4.875, "learning_rate": 9.521154767594276e-06, "loss": 0.86722221, "memory(GiB)": 126.71, "step": 14250, "train_speed(iter/s)": 0.203604 }, { "acc": 0.75128598, "epoch": 0.33269659808396157, "grad_norm": 4.96875, "learning_rate": 9.520347711558306e-06, "loss": 0.91769524, "memory(GiB)": 126.71, "step": 14260, "train_speed(iter/s)": 0.203679 }, { "acc": 0.76070714, "epoch": 0.33292990565625047, "grad_norm": 7.59375, "learning_rate": 9.519540010244013e-06, "loss": 0.85982342, "memory(GiB)": 126.71, "step": 14270, "train_speed(iter/s)": 0.203757 }, { "acc": 0.75433779, "epoch": 0.33316321322853937, "grad_norm": 6.46875, "learning_rate": 9.518731663766697e-06, "loss": 0.88026438, "memory(GiB)": 126.71, "step": 14280, "train_speed(iter/s)": 0.203832 }, { "acc": 0.76897736, "epoch": 0.33339652080082827, "grad_norm": 6.96875, "learning_rate": 9.517922672241748e-06, "loss": 0.85092363, "memory(GiB)": 126.71, "step": 14290, "train_speed(iter/s)": 0.203904 }, { "acc": 0.77723322, "epoch": 0.3336298283731171, "grad_norm": 4.90625, "learning_rate": 9.517113035784651e-06, "loss": 0.80185032, "memory(GiB)": 126.71, "step": 14300, "train_speed(iter/s)": 0.203978 }, { "acc": 0.77630749, "epoch": 0.333863135945406, "grad_norm": 4.46875, "learning_rate": 9.51630275451098e-06, "loss": 0.81215239, "memory(GiB)": 126.71, "step": 14310, "train_speed(iter/s)": 0.204058 }, { "acc": 0.76114473, "epoch": 0.3340964435176949, "grad_norm": 6.71875, "learning_rate": 9.515491828536403e-06, "loss": 0.86198978, "memory(GiB)": 126.71, "step": 14320, "train_speed(iter/s)": 0.204138 }, { "acc": 0.75422287, "epoch": 0.3343297510899838, "grad_norm": 4.8125, "learning_rate": 9.51468025797668e-06, "loss": 0.90628757, "memory(GiB)": 126.71, "step": 14330, "train_speed(iter/s)": 0.204215 }, { "acc": 0.77322135, "epoch": 0.3345630586622727, "grad_norm": 6.03125, "learning_rate": 9.51386804294766e-06, "loss": 0.80394497, "memory(GiB)": 126.71, "step": 14340, "train_speed(iter/s)": 0.20429 }, { "acc": 0.76911297, "epoch": 0.3347963662345616, "grad_norm": 5.8125, "learning_rate": 9.51305518356529e-06, "loss": 0.81683111, "memory(GiB)": 126.71, "step": 14350, "train_speed(iter/s)": 0.204365 }, { "acc": 0.74746904, "epoch": 0.3350296738068505, "grad_norm": 6.15625, "learning_rate": 9.512241679945602e-06, "loss": 0.93869228, "memory(GiB)": 126.71, "step": 14360, "train_speed(iter/s)": 0.20444 }, { "acc": 0.77847366, "epoch": 0.3352629813791394, "grad_norm": 5.53125, "learning_rate": 9.511427532204725e-06, "loss": 0.78958197, "memory(GiB)": 126.71, "step": 14370, "train_speed(iter/s)": 0.204513 }, { "acc": 0.76106529, "epoch": 0.3354962889514283, "grad_norm": 12.0, "learning_rate": 9.51061274045888e-06, "loss": 0.85716648, "memory(GiB)": 126.71, "step": 14380, "train_speed(iter/s)": 0.204585 }, { "acc": 0.75618491, "epoch": 0.3357295965237172, "grad_norm": 6.0, "learning_rate": 9.509797304824376e-06, "loss": 0.91164742, "memory(GiB)": 126.71, "step": 14390, "train_speed(iter/s)": 0.204659 }, { "acc": 0.78182993, "epoch": 0.33596290409600604, "grad_norm": 6.03125, "learning_rate": 9.508981225417615e-06, "loss": 0.78372808, "memory(GiB)": 126.71, "step": 14400, "train_speed(iter/s)": 0.204726 }, { "acc": 0.77309303, "epoch": 0.33619621166829494, "grad_norm": 4.90625, "learning_rate": 9.508164502355095e-06, "loss": 0.81637077, "memory(GiB)": 126.71, "step": 14410, "train_speed(iter/s)": 0.204796 }, { "acc": 0.76972952, "epoch": 0.33642951924058384, "grad_norm": 5.4375, "learning_rate": 9.507347135753403e-06, "loss": 0.86627216, "memory(GiB)": 126.71, "step": 14420, "train_speed(iter/s)": 0.20487 }, { "acc": 0.78301582, "epoch": 0.33666282681287274, "grad_norm": 8.6875, "learning_rate": 9.506529125729216e-06, "loss": 0.79099298, "memory(GiB)": 126.71, "step": 14430, "train_speed(iter/s)": 0.204945 }, { "acc": 0.75892696, "epoch": 0.33689613438516164, "grad_norm": 5.875, "learning_rate": 9.505710472399306e-06, "loss": 0.90608978, "memory(GiB)": 126.71, "step": 14440, "train_speed(iter/s)": 0.205019 }, { "acc": 0.75648518, "epoch": 0.33712944195745054, "grad_norm": 4.8125, "learning_rate": 9.504891175880533e-06, "loss": 0.87193394, "memory(GiB)": 126.71, "step": 14450, "train_speed(iter/s)": 0.205093 }, { "acc": 0.76104174, "epoch": 0.33736274952973944, "grad_norm": 5.78125, "learning_rate": 9.504071236289856e-06, "loss": 0.87683582, "memory(GiB)": 126.71, "step": 14460, "train_speed(iter/s)": 0.205167 }, { "acc": 0.76633177, "epoch": 0.33759605710202834, "grad_norm": 5.25, "learning_rate": 9.503250653744316e-06, "loss": 0.84216108, "memory(GiB)": 126.71, "step": 14470, "train_speed(iter/s)": 0.205233 }, { "acc": 0.75750256, "epoch": 0.33782936467431723, "grad_norm": 6.09375, "learning_rate": 9.502429428361055e-06, "loss": 0.87788067, "memory(GiB)": 126.71, "step": 14480, "train_speed(iter/s)": 0.205304 }, { "acc": 0.75579863, "epoch": 0.3380626722466061, "grad_norm": 7.25, "learning_rate": 9.5016075602573e-06, "loss": 0.89555349, "memory(GiB)": 126.71, "step": 14490, "train_speed(iter/s)": 0.205372 }, { "acc": 0.76029468, "epoch": 0.338295979818895, "grad_norm": 9.25, "learning_rate": 9.500785049550373e-06, "loss": 0.86188812, "memory(GiB)": 126.71, "step": 14500, "train_speed(iter/s)": 0.20545 }, { "epoch": 0.338295979818895, "eval_acc": 0.7317122675452324, "eval_loss": 0.8488510251045227, "eval_runtime": 1264.5209, "eval_samples_per_second": 28.462, "eval_steps_per_second": 14.231, "step": 14500 }, { "acc": 0.76895518, "epoch": 0.3385292873911839, "grad_norm": 4.21875, "learning_rate": 9.49996189635769e-06, "loss": 0.82767677, "memory(GiB)": 126.71, "step": 14510, "train_speed(iter/s)": 0.201832 }, { "acc": 0.7662178, "epoch": 0.3387625949634728, "grad_norm": 5.84375, "learning_rate": 9.499138100796752e-06, "loss": 0.84406948, "memory(GiB)": 126.71, "step": 14520, "train_speed(iter/s)": 0.201906 }, { "acc": 0.75791979, "epoch": 0.3389959025357617, "grad_norm": 5.59375, "learning_rate": 9.498313662985159e-06, "loss": 0.88665104, "memory(GiB)": 126.71, "step": 14530, "train_speed(iter/s)": 0.201976 }, { "acc": 0.75994248, "epoch": 0.33922921010805057, "grad_norm": 5.40625, "learning_rate": 9.497488583040595e-06, "loss": 0.88990154, "memory(GiB)": 126.71, "step": 14540, "train_speed(iter/s)": 0.202046 }, { "acc": 0.77198806, "epoch": 0.33946251768033947, "grad_norm": 5.90625, "learning_rate": 9.496662861080842e-06, "loss": 0.83943577, "memory(GiB)": 126.71, "step": 14550, "train_speed(iter/s)": 0.202119 }, { "acc": 0.78443012, "epoch": 0.33969582525262837, "grad_norm": 6.0, "learning_rate": 9.495836497223775e-06, "loss": 0.76263351, "memory(GiB)": 126.71, "step": 14560, "train_speed(iter/s)": 0.202197 }, { "acc": 0.75301332, "epoch": 0.33992913282491727, "grad_norm": 6.34375, "learning_rate": 9.49500949158735e-06, "loss": 0.89327507, "memory(GiB)": 126.71, "step": 14570, "train_speed(iter/s)": 0.202272 }, { "acc": 0.76159992, "epoch": 0.34016244039720617, "grad_norm": 6.03125, "learning_rate": 9.494181844289629e-06, "loss": 0.85369072, "memory(GiB)": 126.71, "step": 14580, "train_speed(iter/s)": 0.202344 }, { "acc": 0.78562536, "epoch": 0.340395747969495, "grad_norm": 6.0625, "learning_rate": 9.493353555448754e-06, "loss": 0.75794916, "memory(GiB)": 135.49, "step": 14590, "train_speed(iter/s)": 0.202412 }, { "acc": 0.77323399, "epoch": 0.3406290555417839, "grad_norm": 6.28125, "learning_rate": 9.492524625182965e-06, "loss": 0.80891438, "memory(GiB)": 135.49, "step": 14600, "train_speed(iter/s)": 0.202487 }, { "acc": 0.76167402, "epoch": 0.3408623631140728, "grad_norm": 5.25, "learning_rate": 9.49169505361059e-06, "loss": 0.88717384, "memory(GiB)": 135.49, "step": 14610, "train_speed(iter/s)": 0.202561 }, { "acc": 0.74396906, "epoch": 0.3410956706863617, "grad_norm": 8.0625, "learning_rate": 9.490864840850051e-06, "loss": 0.94423609, "memory(GiB)": 135.49, "step": 14620, "train_speed(iter/s)": 0.202634 }, { "acc": 0.7612874, "epoch": 0.3413289782586506, "grad_norm": 5.5625, "learning_rate": 9.490033987019862e-06, "loss": 0.87682304, "memory(GiB)": 135.49, "step": 14630, "train_speed(iter/s)": 0.202708 }, { "acc": 0.77509408, "epoch": 0.3415622858309395, "grad_norm": 5.0, "learning_rate": 9.489202492238624e-06, "loss": 0.79678822, "memory(GiB)": 135.49, "step": 14640, "train_speed(iter/s)": 0.20278 }, { "acc": 0.76598597, "epoch": 0.3417955934032284, "grad_norm": 5.53125, "learning_rate": 9.488370356625035e-06, "loss": 0.83838024, "memory(GiB)": 135.49, "step": 14650, "train_speed(iter/s)": 0.202854 }, { "acc": 0.75458593, "epoch": 0.3420289009755173, "grad_norm": 5.03125, "learning_rate": 9.487537580297881e-06, "loss": 0.90455761, "memory(GiB)": 135.49, "step": 14660, "train_speed(iter/s)": 0.202928 }, { "acc": 0.76012125, "epoch": 0.3422622085478062, "grad_norm": 6.125, "learning_rate": 9.486704163376041e-06, "loss": 0.86546555, "memory(GiB)": 135.49, "step": 14670, "train_speed(iter/s)": 0.203001 }, { "acc": 0.76281099, "epoch": 0.3424955161200951, "grad_norm": 7.59375, "learning_rate": 9.485870105978487e-06, "loss": 0.8342617, "memory(GiB)": 135.49, "step": 14680, "train_speed(iter/s)": 0.20307 }, { "acc": 0.76606445, "epoch": 0.34272882369238394, "grad_norm": 4.59375, "learning_rate": 9.485035408224277e-06, "loss": 0.8462326, "memory(GiB)": 135.49, "step": 14690, "train_speed(iter/s)": 0.203145 }, { "acc": 0.76596289, "epoch": 0.34296213126467284, "grad_norm": 8.5, "learning_rate": 9.484200070232565e-06, "loss": 0.85628052, "memory(GiB)": 135.49, "step": 14700, "train_speed(iter/s)": 0.203217 }, { "acc": 0.76501932, "epoch": 0.34319543883696174, "grad_norm": 5.96875, "learning_rate": 9.483364092122595e-06, "loss": 0.84716663, "memory(GiB)": 135.49, "step": 14710, "train_speed(iter/s)": 0.203292 }, { "acc": 0.75231066, "epoch": 0.34342874640925064, "grad_norm": 6.90625, "learning_rate": 9.482527474013705e-06, "loss": 0.91312513, "memory(GiB)": 135.49, "step": 14720, "train_speed(iter/s)": 0.203364 }, { "acc": 0.7673707, "epoch": 0.34366205398153954, "grad_norm": 6.4375, "learning_rate": 9.481690216025321e-06, "loss": 0.85323429, "memory(GiB)": 135.49, "step": 14730, "train_speed(iter/s)": 0.203439 }, { "acc": 0.76607733, "epoch": 0.34389536155382844, "grad_norm": 6.6875, "learning_rate": 9.480852318276958e-06, "loss": 0.86791954, "memory(GiB)": 135.49, "step": 14740, "train_speed(iter/s)": 0.203511 }, { "acc": 0.77544117, "epoch": 0.34412866912611734, "grad_norm": 6.59375, "learning_rate": 9.48001378088823e-06, "loss": 0.81143951, "memory(GiB)": 135.49, "step": 14750, "train_speed(iter/s)": 0.20358 }, { "acc": 0.76676502, "epoch": 0.34436197669840624, "grad_norm": 5.96875, "learning_rate": 9.479174603978836e-06, "loss": 0.84043026, "memory(GiB)": 135.49, "step": 14760, "train_speed(iter/s)": 0.203652 }, { "acc": 0.76243086, "epoch": 0.34459528427069513, "grad_norm": 4.4375, "learning_rate": 9.478334787668569e-06, "loss": 0.86151228, "memory(GiB)": 135.49, "step": 14770, "train_speed(iter/s)": 0.203722 }, { "acc": 0.77660151, "epoch": 0.344828591842984, "grad_norm": 6.5625, "learning_rate": 9.477494332077311e-06, "loss": 0.82240782, "memory(GiB)": 135.49, "step": 14780, "train_speed(iter/s)": 0.203791 }, { "acc": 0.77354336, "epoch": 0.3450618994152729, "grad_norm": 7.9375, "learning_rate": 9.476653237325037e-06, "loss": 0.82591248, "memory(GiB)": 135.49, "step": 14790, "train_speed(iter/s)": 0.203863 }, { "acc": 0.77084007, "epoch": 0.3452952069875618, "grad_norm": 11.25, "learning_rate": 9.475811503531815e-06, "loss": 0.83070307, "memory(GiB)": 135.49, "step": 14800, "train_speed(iter/s)": 0.203934 }, { "acc": 0.75399141, "epoch": 0.3455285145598507, "grad_norm": 6.625, "learning_rate": 9.474969130817801e-06, "loss": 0.89532146, "memory(GiB)": 135.49, "step": 14810, "train_speed(iter/s)": 0.204009 }, { "acc": 0.75840931, "epoch": 0.3457618221321396, "grad_norm": 10.125, "learning_rate": 9.474126119303245e-06, "loss": 0.87002563, "memory(GiB)": 135.49, "step": 14820, "train_speed(iter/s)": 0.204082 }, { "acc": 0.77159939, "epoch": 0.3459951297044285, "grad_norm": 5.71875, "learning_rate": 9.473282469108483e-06, "loss": 0.81620827, "memory(GiB)": 135.49, "step": 14830, "train_speed(iter/s)": 0.204152 }, { "acc": 0.76163678, "epoch": 0.34622843727671737, "grad_norm": 4.71875, "learning_rate": 9.472438180353948e-06, "loss": 0.85409708, "memory(GiB)": 135.49, "step": 14840, "train_speed(iter/s)": 0.204222 }, { "acc": 0.77583766, "epoch": 0.34646174484900627, "grad_norm": 6.15625, "learning_rate": 9.471593253160162e-06, "loss": 0.7942975, "memory(GiB)": 135.49, "step": 14850, "train_speed(iter/s)": 0.204295 }, { "acc": 0.7665369, "epoch": 0.34669505242129517, "grad_norm": 5.4375, "learning_rate": 9.470747687647741e-06, "loss": 0.8501502, "memory(GiB)": 135.49, "step": 14860, "train_speed(iter/s)": 0.20437 }, { "acc": 0.77335835, "epoch": 0.34692835999358407, "grad_norm": 6.375, "learning_rate": 9.469901483937384e-06, "loss": 0.79345427, "memory(GiB)": 135.49, "step": 14870, "train_speed(iter/s)": 0.204439 }, { "acc": 0.78377228, "epoch": 0.3471616675658729, "grad_norm": 5.3125, "learning_rate": 9.469054642149889e-06, "loss": 0.79584489, "memory(GiB)": 135.49, "step": 14880, "train_speed(iter/s)": 0.204503 }, { "acc": 0.76357613, "epoch": 0.3473949751381618, "grad_norm": 4.9375, "learning_rate": 9.468207162406143e-06, "loss": 0.86906633, "memory(GiB)": 135.49, "step": 14890, "train_speed(iter/s)": 0.204577 }, { "acc": 0.75619082, "epoch": 0.3476282827104507, "grad_norm": 5.71875, "learning_rate": 9.46735904482712e-06, "loss": 0.89972754, "memory(GiB)": 135.49, "step": 14900, "train_speed(iter/s)": 0.204643 }, { "acc": 0.76629677, "epoch": 0.3478615902827396, "grad_norm": 6.21875, "learning_rate": 9.466510289533894e-06, "loss": 0.84127073, "memory(GiB)": 135.49, "step": 14910, "train_speed(iter/s)": 0.20471 }, { "acc": 0.76790628, "epoch": 0.3480948978550285, "grad_norm": 5.375, "learning_rate": 9.46566089664762e-06, "loss": 0.8376955, "memory(GiB)": 135.49, "step": 14920, "train_speed(iter/s)": 0.20478 }, { "acc": 0.77358766, "epoch": 0.3483282054273174, "grad_norm": 6.1875, "learning_rate": 9.46481086628955e-06, "loss": 0.82665291, "memory(GiB)": 135.49, "step": 14930, "train_speed(iter/s)": 0.20485 }, { "acc": 0.76839547, "epoch": 0.3485615129996063, "grad_norm": 6.15625, "learning_rate": 9.463960198581028e-06, "loss": 0.82461739, "memory(GiB)": 135.49, "step": 14940, "train_speed(iter/s)": 0.204917 }, { "acc": 0.76211786, "epoch": 0.3487948205718952, "grad_norm": 7.53125, "learning_rate": 9.463108893643483e-06, "loss": 0.85009499, "memory(GiB)": 135.49, "step": 14950, "train_speed(iter/s)": 0.204985 }, { "acc": 0.76830888, "epoch": 0.3490281281441841, "grad_norm": 6.125, "learning_rate": 9.46225695159844e-06, "loss": 0.86592674, "memory(GiB)": 135.49, "step": 14960, "train_speed(iter/s)": 0.205055 }, { "acc": 0.77029295, "epoch": 0.34926143571647295, "grad_norm": 5.25, "learning_rate": 9.461404372567513e-06, "loss": 0.83972254, "memory(GiB)": 135.49, "step": 14970, "train_speed(iter/s)": 0.205128 }, { "acc": 0.76614647, "epoch": 0.34949474328876184, "grad_norm": 8.8125, "learning_rate": 9.460551156672408e-06, "loss": 0.84479351, "memory(GiB)": 135.49, "step": 14980, "train_speed(iter/s)": 0.205201 }, { "acc": 0.75613046, "epoch": 0.34972805086105074, "grad_norm": 5.0625, "learning_rate": 9.459697304034923e-06, "loss": 0.87976704, "memory(GiB)": 135.49, "step": 14990, "train_speed(iter/s)": 0.20527 }, { "acc": 0.74437275, "epoch": 0.34996135843333964, "grad_norm": 8.25, "learning_rate": 9.458842814776941e-06, "loss": 0.93307667, "memory(GiB)": 135.49, "step": 15000, "train_speed(iter/s)": 0.205338 }, { "epoch": 0.34996135843333964, "eval_acc": 0.7316482144182459, "eval_loss": 0.8479017019271851, "eval_runtime": 1262.7376, "eval_samples_per_second": 28.502, "eval_steps_per_second": 14.252, "step": 15000 }, { "acc": 0.78761168, "epoch": 0.35019466600562854, "grad_norm": 5.5625, "learning_rate": 9.457987689020444e-06, "loss": 0.74955397, "memory(GiB)": 135.49, "step": 15010, "train_speed(iter/s)": 0.201846 }, { "acc": 0.75734243, "epoch": 0.35042797357791744, "grad_norm": 5.96875, "learning_rate": 9.457131926887498e-06, "loss": 0.89660664, "memory(GiB)": 135.49, "step": 15020, "train_speed(iter/s)": 0.201916 }, { "acc": 0.77396789, "epoch": 0.35066128115020634, "grad_norm": 7.9375, "learning_rate": 9.456275528500264e-06, "loss": 0.82700577, "memory(GiB)": 135.49, "step": 15030, "train_speed(iter/s)": 0.201991 }, { "acc": 0.77548437, "epoch": 0.35089458872249524, "grad_norm": 6.375, "learning_rate": 9.455418493980996e-06, "loss": 0.82948589, "memory(GiB)": 135.49, "step": 15040, "train_speed(iter/s)": 0.202062 }, { "acc": 0.76138692, "epoch": 0.35112789629478414, "grad_norm": 12.0, "learning_rate": 9.454560823452031e-06, "loss": 0.86358471, "memory(GiB)": 135.49, "step": 15050, "train_speed(iter/s)": 0.202134 }, { "acc": 0.76198502, "epoch": 0.35136120386707304, "grad_norm": 9.625, "learning_rate": 9.4537025170358e-06, "loss": 0.86715317, "memory(GiB)": 135.49, "step": 15060, "train_speed(iter/s)": 0.202201 }, { "acc": 0.78820162, "epoch": 0.3515945114393619, "grad_norm": 8.125, "learning_rate": 9.45284357485483e-06, "loss": 0.77080069, "memory(GiB)": 135.49, "step": 15070, "train_speed(iter/s)": 0.202266 }, { "acc": 0.77067018, "epoch": 0.3518278190116508, "grad_norm": 4.96875, "learning_rate": 9.451983997031736e-06, "loss": 0.82290688, "memory(GiB)": 135.49, "step": 15080, "train_speed(iter/s)": 0.20233 }, { "acc": 0.76627722, "epoch": 0.3520611265839397, "grad_norm": 4.5, "learning_rate": 9.451123783689216e-06, "loss": 0.81932392, "memory(GiB)": 135.49, "step": 15090, "train_speed(iter/s)": 0.202401 }, { "acc": 0.75522623, "epoch": 0.3522944341562286, "grad_norm": 6.8125, "learning_rate": 9.450262934950069e-06, "loss": 0.89056635, "memory(GiB)": 135.49, "step": 15100, "train_speed(iter/s)": 0.202469 }, { "acc": 0.74450626, "epoch": 0.3525277417285175, "grad_norm": 5.21875, "learning_rate": 9.449401450937184e-06, "loss": 0.92881222, "memory(GiB)": 135.49, "step": 15110, "train_speed(iter/s)": 0.202535 }, { "acc": 0.74678998, "epoch": 0.3527610493008064, "grad_norm": 7.5625, "learning_rate": 9.448539331773532e-06, "loss": 0.926754, "memory(GiB)": 135.49, "step": 15120, "train_speed(iter/s)": 0.202603 }, { "acc": 0.78006783, "epoch": 0.35299435687309527, "grad_norm": 7.625, "learning_rate": 9.447676577582184e-06, "loss": 0.77964802, "memory(GiB)": 135.49, "step": 15130, "train_speed(iter/s)": 0.202674 }, { "acc": 0.76157007, "epoch": 0.35322766444538417, "grad_norm": 5.375, "learning_rate": 9.446813188486294e-06, "loss": 0.84762974, "memory(GiB)": 135.49, "step": 15140, "train_speed(iter/s)": 0.202735 }, { "acc": 0.7675025, "epoch": 0.35346097201767307, "grad_norm": 8.625, "learning_rate": 9.445949164609116e-06, "loss": 0.83632679, "memory(GiB)": 135.49, "step": 15150, "train_speed(iter/s)": 0.202805 }, { "acc": 0.77329473, "epoch": 0.35369427958996197, "grad_norm": 5.1875, "learning_rate": 9.445084506073985e-06, "loss": 0.83477125, "memory(GiB)": 135.49, "step": 15160, "train_speed(iter/s)": 0.20288 }, { "acc": 0.76967669, "epoch": 0.3539275871622508, "grad_norm": 7.3125, "learning_rate": 9.444219213004333e-06, "loss": 0.8329854, "memory(GiB)": 135.49, "step": 15170, "train_speed(iter/s)": 0.202948 }, { "acc": 0.75726976, "epoch": 0.3541608947345397, "grad_norm": 7.09375, "learning_rate": 9.443353285523678e-06, "loss": 0.86442919, "memory(GiB)": 135.49, "step": 15180, "train_speed(iter/s)": 0.203016 }, { "acc": 0.76505413, "epoch": 0.3543942023068286, "grad_norm": 7.6875, "learning_rate": 9.442486723755633e-06, "loss": 0.85064774, "memory(GiB)": 135.49, "step": 15190, "train_speed(iter/s)": 0.203086 }, { "acc": 0.78053946, "epoch": 0.3546275098791175, "grad_norm": 5.90625, "learning_rate": 9.4416195278239e-06, "loss": 0.78198586, "memory(GiB)": 135.49, "step": 15200, "train_speed(iter/s)": 0.203154 }, { "acc": 0.76178522, "epoch": 0.3548608174514064, "grad_norm": 5.46875, "learning_rate": 9.440751697852268e-06, "loss": 0.85862417, "memory(GiB)": 135.49, "step": 15210, "train_speed(iter/s)": 0.203224 }, { "acc": 0.77801356, "epoch": 0.3550941250236953, "grad_norm": 6.5625, "learning_rate": 9.439883233964621e-06, "loss": 0.82830906, "memory(GiB)": 135.49, "step": 15220, "train_speed(iter/s)": 0.203298 }, { "acc": 0.77208996, "epoch": 0.3553274325959842, "grad_norm": 4.5, "learning_rate": 9.439014136284934e-06, "loss": 0.82279739, "memory(GiB)": 135.49, "step": 15230, "train_speed(iter/s)": 0.203363 }, { "acc": 0.78998656, "epoch": 0.3555607401682731, "grad_norm": 11.125, "learning_rate": 9.438144404937266e-06, "loss": 0.74346466, "memory(GiB)": 135.49, "step": 15240, "train_speed(iter/s)": 0.203436 }, { "acc": 0.77427793, "epoch": 0.355794047740562, "grad_norm": 6.9375, "learning_rate": 9.437274040045775e-06, "loss": 0.82020922, "memory(GiB)": 135.49, "step": 15250, "train_speed(iter/s)": 0.203503 }, { "acc": 0.77190905, "epoch": 0.35602735531285085, "grad_norm": 5.1875, "learning_rate": 9.436403041734704e-06, "loss": 0.8475811, "memory(GiB)": 135.49, "step": 15260, "train_speed(iter/s)": 0.203575 }, { "acc": 0.76008153, "epoch": 0.35626066288513975, "grad_norm": 5.21875, "learning_rate": 9.435531410128387e-06, "loss": 0.85581741, "memory(GiB)": 135.49, "step": 15270, "train_speed(iter/s)": 0.203642 }, { "acc": 0.74575024, "epoch": 0.35649397045742864, "grad_norm": 5.4375, "learning_rate": 9.434659145351251e-06, "loss": 0.92495975, "memory(GiB)": 135.49, "step": 15280, "train_speed(iter/s)": 0.203714 }, { "acc": 0.76778212, "epoch": 0.35672727802971754, "grad_norm": 6.25, "learning_rate": 9.433786247527809e-06, "loss": 0.81332493, "memory(GiB)": 135.49, "step": 15290, "train_speed(iter/s)": 0.203783 }, { "acc": 0.78525763, "epoch": 0.35696058560200644, "grad_norm": 5.5625, "learning_rate": 9.432912716782667e-06, "loss": 0.76846333, "memory(GiB)": 135.49, "step": 15300, "train_speed(iter/s)": 0.203853 }, { "acc": 0.78436394, "epoch": 0.35719389317429534, "grad_norm": 6.40625, "learning_rate": 9.432038553240526e-06, "loss": 0.76780996, "memory(GiB)": 135.49, "step": 15310, "train_speed(iter/s)": 0.203915 }, { "acc": 0.77124443, "epoch": 0.35742720074658424, "grad_norm": 7.625, "learning_rate": 9.431163757026167e-06, "loss": 0.82404099, "memory(GiB)": 135.49, "step": 15320, "train_speed(iter/s)": 0.203984 }, { "acc": 0.7600667, "epoch": 0.35766050831887314, "grad_norm": 6.78125, "learning_rate": 9.430288328264467e-06, "loss": 0.88547487, "memory(GiB)": 135.49, "step": 15330, "train_speed(iter/s)": 0.204053 }, { "acc": 0.7749239, "epoch": 0.35789381589116204, "grad_norm": 5.75, "learning_rate": 9.429412267080397e-06, "loss": 0.79843283, "memory(GiB)": 135.49, "step": 15340, "train_speed(iter/s)": 0.204117 }, { "acc": 0.77203197, "epoch": 0.35812712346345094, "grad_norm": 5.875, "learning_rate": 9.428535573599013e-06, "loss": 0.81417103, "memory(GiB)": 135.49, "step": 15350, "train_speed(iter/s)": 0.204191 }, { "acc": 0.77609692, "epoch": 0.3583604310357398, "grad_norm": 8.9375, "learning_rate": 9.427658247945463e-06, "loss": 0.83447571, "memory(GiB)": 135.49, "step": 15360, "train_speed(iter/s)": 0.20425 }, { "acc": 0.780509, "epoch": 0.3585937386080287, "grad_norm": 5.4375, "learning_rate": 9.426780290244983e-06, "loss": 0.80116711, "memory(GiB)": 135.49, "step": 15370, "train_speed(iter/s)": 0.204322 }, { "acc": 0.76043262, "epoch": 0.3588270461803176, "grad_norm": 6.09375, "learning_rate": 9.425901700622904e-06, "loss": 0.84358797, "memory(GiB)": 135.49, "step": 15380, "train_speed(iter/s)": 0.204396 }, { "acc": 0.78023806, "epoch": 0.3590603537526065, "grad_norm": 6.8125, "learning_rate": 9.42502247920464e-06, "loss": 0.79185619, "memory(GiB)": 135.49, "step": 15390, "train_speed(iter/s)": 0.204469 }, { "acc": 0.76109076, "epoch": 0.3592936613248954, "grad_norm": 6.84375, "learning_rate": 9.424142626115706e-06, "loss": 0.85470438, "memory(GiB)": 135.49, "step": 15400, "train_speed(iter/s)": 0.204538 }, { "acc": 0.75989237, "epoch": 0.3595269688971843, "grad_norm": 4.6875, "learning_rate": 9.423262141481695e-06, "loss": 0.86965046, "memory(GiB)": 135.49, "step": 15410, "train_speed(iter/s)": 0.204609 }, { "acc": 0.78078728, "epoch": 0.3597602764694732, "grad_norm": 9.4375, "learning_rate": 9.4223810254283e-06, "loss": 0.77249737, "memory(GiB)": 135.49, "step": 15420, "train_speed(iter/s)": 0.204685 }, { "acc": 0.77038364, "epoch": 0.35999358404176207, "grad_norm": 8.375, "learning_rate": 9.421499278081296e-06, "loss": 0.85602398, "memory(GiB)": 135.49, "step": 15430, "train_speed(iter/s)": 0.204754 }, { "acc": 0.77583313, "epoch": 0.36022689161405097, "grad_norm": 6.46875, "learning_rate": 9.420616899566557e-06, "loss": 0.8040802, "memory(GiB)": 135.49, "step": 15440, "train_speed(iter/s)": 0.204821 }, { "acc": 0.75921769, "epoch": 0.36046019918633987, "grad_norm": 6.28125, "learning_rate": 9.41973389001004e-06, "loss": 0.86511822, "memory(GiB)": 135.49, "step": 15450, "train_speed(iter/s)": 0.204895 }, { "acc": 0.75401816, "epoch": 0.3606935067586287, "grad_norm": 5.5625, "learning_rate": 9.418850249537792e-06, "loss": 0.88008451, "memory(GiB)": 135.49, "step": 15460, "train_speed(iter/s)": 0.204967 }, { "acc": 0.75438418, "epoch": 0.3609268143309176, "grad_norm": 4.59375, "learning_rate": 9.417965978275955e-06, "loss": 0.90597763, "memory(GiB)": 135.49, "step": 15470, "train_speed(iter/s)": 0.205036 }, { "acc": 0.73901019, "epoch": 0.3611601219032065, "grad_norm": 5.0625, "learning_rate": 9.417081076350758e-06, "loss": 0.92722607, "memory(GiB)": 135.49, "step": 15480, "train_speed(iter/s)": 0.205103 }, { "acc": 0.77526464, "epoch": 0.3613934294754954, "grad_norm": 5.125, "learning_rate": 9.416195543888522e-06, "loss": 0.81520596, "memory(GiB)": 135.49, "step": 15490, "train_speed(iter/s)": 0.205174 }, { "acc": 0.75166454, "epoch": 0.3616267370477843, "grad_norm": 6.34375, "learning_rate": 9.415309381015654e-06, "loss": 0.91375999, "memory(GiB)": 135.49, "step": 15500, "train_speed(iter/s)": 0.205245 }, { "epoch": 0.3616267370477843, "eval_acc": 0.7318071371640438, "eval_loss": 0.847244918346405, "eval_runtime": 1263.6117, "eval_samples_per_second": 28.483, "eval_steps_per_second": 14.242, "step": 15500 }, { "acc": 0.76878414, "epoch": 0.3618600446200732, "grad_norm": 5.59375, "learning_rate": 9.414422587858654e-06, "loss": 0.83947744, "memory(GiB)": 135.49, "step": 15510, "train_speed(iter/s)": 0.20187 }, { "acc": 0.76726446, "epoch": 0.3620933521923621, "grad_norm": 5.8125, "learning_rate": 9.413535164544112e-06, "loss": 0.82590847, "memory(GiB)": 135.49, "step": 15520, "train_speed(iter/s)": 0.201941 }, { "acc": 0.76219702, "epoch": 0.362326659764651, "grad_norm": 6.5625, "learning_rate": 9.412647111198708e-06, "loss": 0.85966187, "memory(GiB)": 135.49, "step": 15530, "train_speed(iter/s)": 0.202011 }, { "acc": 0.76141219, "epoch": 0.3625599673369399, "grad_norm": 4.25, "learning_rate": 9.411758427949211e-06, "loss": 0.86665916, "memory(GiB)": 135.49, "step": 15540, "train_speed(iter/s)": 0.202075 }, { "acc": 0.79316893, "epoch": 0.36279327490922875, "grad_norm": 5.03125, "learning_rate": 9.410869114922478e-06, "loss": 0.75011024, "memory(GiB)": 135.49, "step": 15550, "train_speed(iter/s)": 0.202142 }, { "acc": 0.78719525, "epoch": 0.36302658248151765, "grad_norm": 7.0, "learning_rate": 9.409979172245463e-06, "loss": 0.76994224, "memory(GiB)": 135.49, "step": 15560, "train_speed(iter/s)": 0.202207 }, { "acc": 0.7809689, "epoch": 0.36325989005380654, "grad_norm": 5.3125, "learning_rate": 9.409088600045202e-06, "loss": 0.79060721, "memory(GiB)": 135.49, "step": 15570, "train_speed(iter/s)": 0.202272 }, { "acc": 0.77929516, "epoch": 0.36349319762609544, "grad_norm": 6.5, "learning_rate": 9.408197398448822e-06, "loss": 0.78506293, "memory(GiB)": 135.49, "step": 15580, "train_speed(iter/s)": 0.202339 }, { "acc": 0.77357693, "epoch": 0.36372650519838434, "grad_norm": 6.5, "learning_rate": 9.407305567583547e-06, "loss": 0.83555279, "memory(GiB)": 135.49, "step": 15590, "train_speed(iter/s)": 0.202403 }, { "acc": 0.76378279, "epoch": 0.36395981277067324, "grad_norm": 5.875, "learning_rate": 9.40641310757668e-06, "loss": 0.86744881, "memory(GiB)": 135.49, "step": 15600, "train_speed(iter/s)": 0.202467 }, { "acc": 0.7713541, "epoch": 0.36419312034296214, "grad_norm": 6.15625, "learning_rate": 9.405520018555624e-06, "loss": 0.83571186, "memory(GiB)": 135.49, "step": 15610, "train_speed(iter/s)": 0.202533 }, { "acc": 0.76760268, "epoch": 0.36442642791525104, "grad_norm": 5.5625, "learning_rate": 9.404626300647864e-06, "loss": 0.84797783, "memory(GiB)": 135.49, "step": 15620, "train_speed(iter/s)": 0.2026 }, { "acc": 0.77534556, "epoch": 0.36465973548753994, "grad_norm": 4.34375, "learning_rate": 9.403731953980978e-06, "loss": 0.80592613, "memory(GiB)": 135.49, "step": 15630, "train_speed(iter/s)": 0.202665 }, { "acc": 0.7665029, "epoch": 0.36489304305982884, "grad_norm": 9.4375, "learning_rate": 9.402836978682636e-06, "loss": 0.84102955, "memory(GiB)": 135.49, "step": 15640, "train_speed(iter/s)": 0.202731 }, { "acc": 0.75593719, "epoch": 0.3651263506321177, "grad_norm": 9.9375, "learning_rate": 9.401941374880595e-06, "loss": 0.87333546, "memory(GiB)": 135.49, "step": 15650, "train_speed(iter/s)": 0.202789 }, { "acc": 0.76403747, "epoch": 0.3653596582044066, "grad_norm": 11.4375, "learning_rate": 9.4010451427027e-06, "loss": 0.86393843, "memory(GiB)": 135.49, "step": 15660, "train_speed(iter/s)": 0.202856 }, { "acc": 0.78011436, "epoch": 0.3655929657766955, "grad_norm": 6.71875, "learning_rate": 9.40014828227689e-06, "loss": 0.78493629, "memory(GiB)": 135.49, "step": 15670, "train_speed(iter/s)": 0.202923 }, { "acc": 0.75369167, "epoch": 0.3658262733489844, "grad_norm": 12.125, "learning_rate": 9.399250793731192e-06, "loss": 0.89451866, "memory(GiB)": 135.49, "step": 15680, "train_speed(iter/s)": 0.202987 }, { "acc": 0.77842922, "epoch": 0.3660595809212733, "grad_norm": 4.8125, "learning_rate": 9.398352677193719e-06, "loss": 0.78451118, "memory(GiB)": 135.49, "step": 15690, "train_speed(iter/s)": 0.203055 }, { "acc": 0.75245914, "epoch": 0.3662928884935622, "grad_norm": 10.1875, "learning_rate": 9.397453932792681e-06, "loss": 0.87892847, "memory(GiB)": 135.49, "step": 15700, "train_speed(iter/s)": 0.203122 }, { "acc": 0.76076908, "epoch": 0.3665261960658511, "grad_norm": 45.5, "learning_rate": 9.396554560656371e-06, "loss": 0.90075703, "memory(GiB)": 135.49, "step": 15710, "train_speed(iter/s)": 0.203187 }, { "acc": 0.76947803, "epoch": 0.36675950363813997, "grad_norm": 5.75, "learning_rate": 9.395654560913174e-06, "loss": 0.83032551, "memory(GiB)": 135.49, "step": 15720, "train_speed(iter/s)": 0.203253 }, { "acc": 0.76151266, "epoch": 0.36699281121042887, "grad_norm": 4.5, "learning_rate": 9.394753933691567e-06, "loss": 0.88484745, "memory(GiB)": 135.49, "step": 15730, "train_speed(iter/s)": 0.203318 }, { "acc": 0.76293488, "epoch": 0.3672261187827177, "grad_norm": 5.0, "learning_rate": 9.393852679120113e-06, "loss": 0.84653797, "memory(GiB)": 135.49, "step": 15740, "train_speed(iter/s)": 0.203387 }, { "acc": 0.77204428, "epoch": 0.3674594263550066, "grad_norm": 4.03125, "learning_rate": 9.392950797327463e-06, "loss": 0.850737, "memory(GiB)": 135.49, "step": 15750, "train_speed(iter/s)": 0.203449 }, { "acc": 0.76659756, "epoch": 0.3676927339272955, "grad_norm": 5.96875, "learning_rate": 9.392048288442363e-06, "loss": 0.85192022, "memory(GiB)": 135.49, "step": 15760, "train_speed(iter/s)": 0.20351 }, { "acc": 0.7597445, "epoch": 0.3679260414995844, "grad_norm": 7.4375, "learning_rate": 9.391145152593646e-06, "loss": 0.89180889, "memory(GiB)": 135.49, "step": 15770, "train_speed(iter/s)": 0.203577 }, { "acc": 0.77264318, "epoch": 0.3681593490718733, "grad_norm": 20.875, "learning_rate": 9.390241389910236e-06, "loss": 0.8230444, "memory(GiB)": 135.49, "step": 15780, "train_speed(iter/s)": 0.203645 }, { "acc": 0.77065697, "epoch": 0.3683926566441622, "grad_norm": 8.0625, "learning_rate": 9.389337000521142e-06, "loss": 0.84154835, "memory(GiB)": 135.49, "step": 15790, "train_speed(iter/s)": 0.203715 }, { "acc": 0.77509365, "epoch": 0.3686259642164511, "grad_norm": 5.3125, "learning_rate": 9.388431984555466e-06, "loss": 0.80728531, "memory(GiB)": 135.49, "step": 15800, "train_speed(iter/s)": 0.20378 }, { "acc": 0.76742454, "epoch": 0.36885927178874, "grad_norm": 4.6875, "learning_rate": 9.387526342142398e-06, "loss": 0.85584698, "memory(GiB)": 135.49, "step": 15810, "train_speed(iter/s)": 0.203848 }, { "acc": 0.76800575, "epoch": 0.3690925793610289, "grad_norm": 5.71875, "learning_rate": 9.386620073411221e-06, "loss": 0.82967463, "memory(GiB)": 135.49, "step": 15820, "train_speed(iter/s)": 0.203909 }, { "acc": 0.7414227, "epoch": 0.3693258869333178, "grad_norm": 5.84375, "learning_rate": 9.385713178491302e-06, "loss": 0.9614665, "memory(GiB)": 135.49, "step": 15830, "train_speed(iter/s)": 0.203975 }, { "acc": 0.75951004, "epoch": 0.36955919450560665, "grad_norm": 8.5625, "learning_rate": 9.384805657512101e-06, "loss": 0.87870388, "memory(GiB)": 135.49, "step": 15840, "train_speed(iter/s)": 0.204039 }, { "acc": 0.77176194, "epoch": 0.36979250207789555, "grad_norm": 6.5, "learning_rate": 9.383897510603167e-06, "loss": 0.82479382, "memory(GiB)": 135.49, "step": 15850, "train_speed(iter/s)": 0.204105 }, { "acc": 0.76164064, "epoch": 0.37002580965018445, "grad_norm": 7.875, "learning_rate": 9.382988737894136e-06, "loss": 0.85617466, "memory(GiB)": 135.49, "step": 15860, "train_speed(iter/s)": 0.204173 }, { "acc": 0.77236791, "epoch": 0.37025911722247334, "grad_norm": 9.6875, "learning_rate": 9.382079339514736e-06, "loss": 0.81436691, "memory(GiB)": 135.49, "step": 15870, "train_speed(iter/s)": 0.204242 }, { "acc": 0.77181535, "epoch": 0.37049242479476224, "grad_norm": 6.0, "learning_rate": 9.381169315594782e-06, "loss": 0.81661558, "memory(GiB)": 135.49, "step": 15880, "train_speed(iter/s)": 0.204309 }, { "acc": 0.77608862, "epoch": 0.37072573236705114, "grad_norm": 5.21875, "learning_rate": 9.380258666264184e-06, "loss": 0.81581669, "memory(GiB)": 135.49, "step": 15890, "train_speed(iter/s)": 0.204375 }, { "acc": 0.76634541, "epoch": 0.37095903993934004, "grad_norm": 7.59375, "learning_rate": 9.379347391652931e-06, "loss": 0.85019417, "memory(GiB)": 135.49, "step": 15900, "train_speed(iter/s)": 0.20444 }, { "acc": 0.76326947, "epoch": 0.37119234751162894, "grad_norm": 5.875, "learning_rate": 9.378435491891112e-06, "loss": 0.84614334, "memory(GiB)": 135.49, "step": 15910, "train_speed(iter/s)": 0.204504 }, { "acc": 0.75156894, "epoch": 0.37142565508391784, "grad_norm": 3.9375, "learning_rate": 9.377522967108897e-06, "loss": 0.9149332, "memory(GiB)": 135.49, "step": 15920, "train_speed(iter/s)": 0.204562 }, { "acc": 0.76230459, "epoch": 0.37165896265620674, "grad_norm": 8.5, "learning_rate": 9.376609817436551e-06, "loss": 0.87423477, "memory(GiB)": 135.49, "step": 15930, "train_speed(iter/s)": 0.204624 }, { "acc": 0.75298328, "epoch": 0.3718922702284956, "grad_norm": 12.0625, "learning_rate": 9.375696043004425e-06, "loss": 0.90856066, "memory(GiB)": 135.49, "step": 15940, "train_speed(iter/s)": 0.204689 }, { "acc": 0.75305901, "epoch": 0.3721255778007845, "grad_norm": 4.4375, "learning_rate": 9.374781643942961e-06, "loss": 0.87506657, "memory(GiB)": 135.49, "step": 15950, "train_speed(iter/s)": 0.204757 }, { "acc": 0.77595949, "epoch": 0.3723588853730734, "grad_norm": 5.5, "learning_rate": 9.373866620382686e-06, "loss": 0.80878391, "memory(GiB)": 135.49, "step": 15960, "train_speed(iter/s)": 0.204823 }, { "acc": 0.77158127, "epoch": 0.3725921929453623, "grad_norm": 5.90625, "learning_rate": 9.372950972454222e-06, "loss": 0.80023937, "memory(GiB)": 135.49, "step": 15970, "train_speed(iter/s)": 0.204887 }, { "acc": 0.76376519, "epoch": 0.3728255005176512, "grad_norm": 18.0, "learning_rate": 9.372034700288278e-06, "loss": 0.83646612, "memory(GiB)": 135.49, "step": 15980, "train_speed(iter/s)": 0.204949 }, { "acc": 0.76122923, "epoch": 0.3730588080899401, "grad_norm": 6.34375, "learning_rate": 9.37111780401565e-06, "loss": 0.86859093, "memory(GiB)": 135.49, "step": 15990, "train_speed(iter/s)": 0.205014 }, { "acc": 0.76710596, "epoch": 0.373292115662229, "grad_norm": 5.875, "learning_rate": 9.370200283767225e-06, "loss": 0.83317699, "memory(GiB)": 135.49, "step": 16000, "train_speed(iter/s)": 0.205084 }, { "epoch": 0.373292115662229, "eval_acc": 0.7319084604984204, "eval_loss": 0.8471704721450806, "eval_runtime": 1264.0462, "eval_samples_per_second": 28.473, "eval_steps_per_second": 14.237, "step": 16000 }, { "acc": 0.76815434, "epoch": 0.3735254232345179, "grad_norm": 6.09375, "learning_rate": 9.369282139673979e-06, "loss": 0.84705839, "memory(GiB)": 135.49, "step": 16010, "train_speed(iter/s)": 0.201816 }, { "acc": 0.75830274, "epoch": 0.37375873080680677, "grad_norm": 5.25, "learning_rate": 9.368363371866978e-06, "loss": 0.89208012, "memory(GiB)": 135.49, "step": 16020, "train_speed(iter/s)": 0.201883 }, { "acc": 0.7730679, "epoch": 0.3739920383790956, "grad_norm": 8.5625, "learning_rate": 9.367443980477374e-06, "loss": 0.81629734, "memory(GiB)": 135.49, "step": 16030, "train_speed(iter/s)": 0.20195 }, { "acc": 0.78336744, "epoch": 0.3742253459513845, "grad_norm": 11.875, "learning_rate": 9.366523965636412e-06, "loss": 0.76085105, "memory(GiB)": 135.49, "step": 16040, "train_speed(iter/s)": 0.202016 }, { "acc": 0.75604334, "epoch": 0.3744586535236734, "grad_norm": 7.5, "learning_rate": 9.36560332747542e-06, "loss": 0.89697542, "memory(GiB)": 135.49, "step": 16050, "train_speed(iter/s)": 0.202081 }, { "acc": 0.77814646, "epoch": 0.3746919610959623, "grad_norm": 7.65625, "learning_rate": 9.364682066125822e-06, "loss": 0.80443506, "memory(GiB)": 135.49, "step": 16060, "train_speed(iter/s)": 0.202146 }, { "acc": 0.77922311, "epoch": 0.3749252686682512, "grad_norm": 8.625, "learning_rate": 9.363760181719127e-06, "loss": 0.8103054, "memory(GiB)": 135.49, "step": 16070, "train_speed(iter/s)": 0.202212 }, { "acc": 0.76417923, "epoch": 0.3751585762405401, "grad_norm": 7.65625, "learning_rate": 9.362837674386934e-06, "loss": 0.85765219, "memory(GiB)": 135.49, "step": 16080, "train_speed(iter/s)": 0.202274 }, { "acc": 0.79342618, "epoch": 0.375391883812829, "grad_norm": 5.34375, "learning_rate": 9.36191454426093e-06, "loss": 0.72560234, "memory(GiB)": 135.49, "step": 16090, "train_speed(iter/s)": 0.20234 }, { "acc": 0.74945507, "epoch": 0.3756251913851179, "grad_norm": 7.25, "learning_rate": 9.360990791472893e-06, "loss": 0.90031233, "memory(GiB)": 135.49, "step": 16100, "train_speed(iter/s)": 0.202404 }, { "acc": 0.76527863, "epoch": 0.3758584989574068, "grad_norm": 8.875, "learning_rate": 9.360066416154687e-06, "loss": 0.8498003, "memory(GiB)": 135.49, "step": 16110, "train_speed(iter/s)": 0.20247 }, { "acc": 0.76937752, "epoch": 0.3760918065296957, "grad_norm": 6.90625, "learning_rate": 9.359141418438266e-06, "loss": 0.84569511, "memory(GiB)": 135.49, "step": 16120, "train_speed(iter/s)": 0.202537 }, { "acc": 0.76679182, "epoch": 0.37632511410198455, "grad_norm": 5.1875, "learning_rate": 9.358215798455674e-06, "loss": 0.84798098, "memory(GiB)": 135.49, "step": 16130, "train_speed(iter/s)": 0.202602 }, { "acc": 0.7567564, "epoch": 0.37655842167427345, "grad_norm": 5.6875, "learning_rate": 9.357289556339044e-06, "loss": 0.89561882, "memory(GiB)": 135.49, "step": 16140, "train_speed(iter/s)": 0.202668 }, { "acc": 0.74548564, "epoch": 0.37679172924656235, "grad_norm": 5.8125, "learning_rate": 9.356362692220593e-06, "loss": 0.93068428, "memory(GiB)": 135.49, "step": 16150, "train_speed(iter/s)": 0.202734 }, { "acc": 0.76299548, "epoch": 0.37702503681885124, "grad_norm": 8.25, "learning_rate": 9.355435206232635e-06, "loss": 0.82308149, "memory(GiB)": 135.49, "step": 16160, "train_speed(iter/s)": 0.202797 }, { "acc": 0.76219506, "epoch": 0.37725834439114014, "grad_norm": 7.59375, "learning_rate": 9.354507098507568e-06, "loss": 0.86612186, "memory(GiB)": 135.49, "step": 16170, "train_speed(iter/s)": 0.202867 }, { "acc": 0.76659021, "epoch": 0.37749165196342904, "grad_norm": 4.9375, "learning_rate": 9.353578369177876e-06, "loss": 0.84450588, "memory(GiB)": 135.49, "step": 16180, "train_speed(iter/s)": 0.202929 }, { "acc": 0.7636869, "epoch": 0.37772495953571794, "grad_norm": 6.4375, "learning_rate": 9.352649018376136e-06, "loss": 0.85545311, "memory(GiB)": 135.49, "step": 16190, "train_speed(iter/s)": 0.202996 }, { "acc": 0.74878197, "epoch": 0.37795826710800684, "grad_norm": 5.84375, "learning_rate": 9.351719046235013e-06, "loss": 0.90061226, "memory(GiB)": 135.49, "step": 16200, "train_speed(iter/s)": 0.203062 }, { "acc": 0.77190561, "epoch": 0.37819157468029574, "grad_norm": 6.1875, "learning_rate": 9.350788452887262e-06, "loss": 0.81603327, "memory(GiB)": 135.49, "step": 16210, "train_speed(iter/s)": 0.203127 }, { "acc": 0.7751091, "epoch": 0.37842488225258464, "grad_norm": 6.6875, "learning_rate": 9.349857238465723e-06, "loss": 0.82127943, "memory(GiB)": 135.49, "step": 16220, "train_speed(iter/s)": 0.203189 }, { "acc": 0.76403146, "epoch": 0.3786581898248735, "grad_norm": 5.3125, "learning_rate": 9.348925403103326e-06, "loss": 0.86137657, "memory(GiB)": 135.49, "step": 16230, "train_speed(iter/s)": 0.203258 }, { "acc": 0.7609869, "epoch": 0.3788914973971624, "grad_norm": 6.0625, "learning_rate": 9.347992946933091e-06, "loss": 0.87185478, "memory(GiB)": 135.49, "step": 16240, "train_speed(iter/s)": 0.203322 }, { "acc": 0.76916203, "epoch": 0.3791248049694513, "grad_norm": 9.0625, "learning_rate": 9.347059870088127e-06, "loss": 0.82950039, "memory(GiB)": 135.49, "step": 16250, "train_speed(iter/s)": 0.203383 }, { "acc": 0.7702538, "epoch": 0.3793581125417402, "grad_norm": 6.15625, "learning_rate": 9.346126172701629e-06, "loss": 0.8304122, "memory(GiB)": 135.49, "step": 16260, "train_speed(iter/s)": 0.203451 }, { "acc": 0.74205942, "epoch": 0.3795914201140291, "grad_norm": 5.5, "learning_rate": 9.345191854906881e-06, "loss": 0.96710758, "memory(GiB)": 135.49, "step": 16270, "train_speed(iter/s)": 0.20352 }, { "acc": 0.78031797, "epoch": 0.379824727686318, "grad_norm": 6.84375, "learning_rate": 9.344256916837259e-06, "loss": 0.8113637, "memory(GiB)": 135.49, "step": 16280, "train_speed(iter/s)": 0.203583 }, { "acc": 0.77685061, "epoch": 0.3800580352586069, "grad_norm": 9.0, "learning_rate": 9.343321358626225e-06, "loss": 0.80179777, "memory(GiB)": 135.49, "step": 16290, "train_speed(iter/s)": 0.203649 }, { "acc": 0.77213812, "epoch": 0.3802913428308958, "grad_norm": 8.625, "learning_rate": 9.342385180407328e-06, "loss": 0.8319025, "memory(GiB)": 135.49, "step": 16300, "train_speed(iter/s)": 0.203716 }, { "acc": 0.76010103, "epoch": 0.38052465040318467, "grad_norm": 4.09375, "learning_rate": 9.341448382314207e-06, "loss": 0.87642326, "memory(GiB)": 135.49, "step": 16310, "train_speed(iter/s)": 0.203777 }, { "acc": 0.78292322, "epoch": 0.3807579579754735, "grad_norm": 4.625, "learning_rate": 9.340510964480591e-06, "loss": 0.76597886, "memory(GiB)": 135.49, "step": 16320, "train_speed(iter/s)": 0.203838 }, { "acc": 0.76660757, "epoch": 0.3809912655477624, "grad_norm": 4.84375, "learning_rate": 9.339572927040298e-06, "loss": 0.86160831, "memory(GiB)": 135.49, "step": 16330, "train_speed(iter/s)": 0.2039 }, { "acc": 0.77685137, "epoch": 0.3812245731200513, "grad_norm": 4.4375, "learning_rate": 9.338634270127227e-06, "loss": 0.8105443, "memory(GiB)": 135.49, "step": 16340, "train_speed(iter/s)": 0.203963 }, { "acc": 0.7585494, "epoch": 0.3814578806923402, "grad_norm": 4.34375, "learning_rate": 9.337694993875376e-06, "loss": 0.86978207, "memory(GiB)": 135.49, "step": 16350, "train_speed(iter/s)": 0.204021 }, { "acc": 0.75688591, "epoch": 0.3816911882646291, "grad_norm": 5.03125, "learning_rate": 9.336755098418824e-06, "loss": 0.88510151, "memory(GiB)": 135.49, "step": 16360, "train_speed(iter/s)": 0.204085 }, { "acc": 0.75924568, "epoch": 0.381924495836918, "grad_norm": 5.75, "learning_rate": 9.335814583891743e-06, "loss": 0.87862453, "memory(GiB)": 135.49, "step": 16370, "train_speed(iter/s)": 0.204144 }, { "acc": 0.75283499, "epoch": 0.3821578034092069, "grad_norm": 6.40625, "learning_rate": 9.33487345042839e-06, "loss": 0.88805857, "memory(GiB)": 135.49, "step": 16380, "train_speed(iter/s)": 0.204206 }, { "acc": 0.75164461, "epoch": 0.3823911109814958, "grad_norm": 7.21875, "learning_rate": 9.333931698163107e-06, "loss": 0.88734884, "memory(GiB)": 135.49, "step": 16390, "train_speed(iter/s)": 0.204269 }, { "acc": 0.75930529, "epoch": 0.3826244185537847, "grad_norm": 5.4375, "learning_rate": 9.332989327230337e-06, "loss": 0.88693256, "memory(GiB)": 135.49, "step": 16400, "train_speed(iter/s)": 0.204331 }, { "acc": 0.74749475, "epoch": 0.3828577261260736, "grad_norm": 6.0, "learning_rate": 9.3320463377646e-06, "loss": 0.90007601, "memory(GiB)": 135.49, "step": 16410, "train_speed(iter/s)": 0.204397 }, { "acc": 0.74689522, "epoch": 0.38309103369836245, "grad_norm": 6.1875, "learning_rate": 9.331102729900505e-06, "loss": 0.92962952, "memory(GiB)": 135.49, "step": 16420, "train_speed(iter/s)": 0.20446 }, { "acc": 0.75736136, "epoch": 0.38332434127065135, "grad_norm": 5.09375, "learning_rate": 9.330158503772753e-06, "loss": 0.86928062, "memory(GiB)": 135.49, "step": 16430, "train_speed(iter/s)": 0.204523 }, { "acc": 0.77536964, "epoch": 0.38355764884294025, "grad_norm": 5.8125, "learning_rate": 9.329213659516134e-06, "loss": 0.80574074, "memory(GiB)": 135.49, "step": 16440, "train_speed(iter/s)": 0.204586 }, { "acc": 0.76693258, "epoch": 0.38379095641522915, "grad_norm": 5.6875, "learning_rate": 9.328268197265523e-06, "loss": 0.84828167, "memory(GiB)": 135.49, "step": 16450, "train_speed(iter/s)": 0.204642 }, { "acc": 0.77771158, "epoch": 0.38402426398751804, "grad_norm": 5.8125, "learning_rate": 9.327322117155881e-06, "loss": 0.79359813, "memory(GiB)": 135.49, "step": 16460, "train_speed(iter/s)": 0.204706 }, { "acc": 0.76446362, "epoch": 0.38425757155980694, "grad_norm": 5.6875, "learning_rate": 9.326375419322267e-06, "loss": 0.88130713, "memory(GiB)": 135.49, "step": 16470, "train_speed(iter/s)": 0.204771 }, { "acc": 0.77127538, "epoch": 0.38449087913209584, "grad_norm": 4.9375, "learning_rate": 9.325428103899818e-06, "loss": 0.8347065, "memory(GiB)": 135.49, "step": 16480, "train_speed(iter/s)": 0.204833 }, { "acc": 0.76558614, "epoch": 0.38472418670438474, "grad_norm": 6.3125, "learning_rate": 9.324480171023764e-06, "loss": 0.83865719, "memory(GiB)": 135.49, "step": 16490, "train_speed(iter/s)": 0.204892 }, { "acc": 0.76412458, "epoch": 0.38495749427667364, "grad_norm": 5.0625, "learning_rate": 9.32353162082942e-06, "loss": 0.87485847, "memory(GiB)": 135.49, "step": 16500, "train_speed(iter/s)": 0.20496 }, { "epoch": 0.38495749427667364, "eval_acc": 0.7320946501924821, "eval_loss": 0.8463056683540344, "eval_runtime": 1264.2934, "eval_samples_per_second": 28.467, "eval_steps_per_second": 14.234, "step": 16500 }, { "acc": 0.77325726, "epoch": 0.3851908018489625, "grad_norm": 5.0625, "learning_rate": 9.322582453452195e-06, "loss": 0.81723528, "memory(GiB)": 135.49, "step": 16510, "train_speed(iter/s)": 0.201792 }, { "acc": 0.77748785, "epoch": 0.3854241094212514, "grad_norm": 6.03125, "learning_rate": 9.32163266902758e-06, "loss": 0.79948807, "memory(GiB)": 135.49, "step": 16520, "train_speed(iter/s)": 0.201853 }, { "acc": 0.75513978, "epoch": 0.3856574169935403, "grad_norm": 5.71875, "learning_rate": 9.320682267691157e-06, "loss": 0.88127022, "memory(GiB)": 135.49, "step": 16530, "train_speed(iter/s)": 0.201917 }, { "acc": 0.77438269, "epoch": 0.3858907245658292, "grad_norm": 9.4375, "learning_rate": 9.319731249578595e-06, "loss": 0.81643505, "memory(GiB)": 135.49, "step": 16540, "train_speed(iter/s)": 0.201983 }, { "acc": 0.77376451, "epoch": 0.3861240321381181, "grad_norm": 5.6875, "learning_rate": 9.318779614825653e-06, "loss": 0.8183198, "memory(GiB)": 135.49, "step": 16550, "train_speed(iter/s)": 0.202044 }, { "acc": 0.77265062, "epoch": 0.386357339710407, "grad_norm": 6.46875, "learning_rate": 9.317827363568176e-06, "loss": 0.82489986, "memory(GiB)": 135.49, "step": 16560, "train_speed(iter/s)": 0.202104 }, { "acc": 0.7697372, "epoch": 0.3865906472826959, "grad_norm": 5.34375, "learning_rate": 9.316874495942095e-06, "loss": 0.80334549, "memory(GiB)": 135.49, "step": 16570, "train_speed(iter/s)": 0.202167 }, { "acc": 0.77908349, "epoch": 0.3868239548549848, "grad_norm": 5.5, "learning_rate": 9.315921012083436e-06, "loss": 0.78984747, "memory(GiB)": 135.49, "step": 16580, "train_speed(iter/s)": 0.20223 }, { "acc": 0.77425575, "epoch": 0.3870572624272737, "grad_norm": 6.40625, "learning_rate": 9.314966912128305e-06, "loss": 0.81702795, "memory(GiB)": 135.49, "step": 16590, "train_speed(iter/s)": 0.202287 }, { "acc": 0.77814913, "epoch": 0.3872905699995626, "grad_norm": 9.875, "learning_rate": 9.3140121962129e-06, "loss": 0.79715223, "memory(GiB)": 135.49, "step": 16600, "train_speed(iter/s)": 0.202351 }, { "acc": 0.76284785, "epoch": 0.3875238775718514, "grad_norm": 5.40625, "learning_rate": 9.313056864473508e-06, "loss": 0.86330223, "memory(GiB)": 135.49, "step": 16610, "train_speed(iter/s)": 0.202412 }, { "acc": 0.76654396, "epoch": 0.3877571851441403, "grad_norm": 8.1875, "learning_rate": 9.312100917046502e-06, "loss": 0.83713474, "memory(GiB)": 135.49, "step": 16620, "train_speed(iter/s)": 0.202472 }, { "acc": 0.76262555, "epoch": 0.3879904927164292, "grad_norm": 6.34375, "learning_rate": 9.311144354068342e-06, "loss": 0.86117086, "memory(GiB)": 135.49, "step": 16630, "train_speed(iter/s)": 0.202533 }, { "acc": 0.77964373, "epoch": 0.3882238002887181, "grad_norm": 6.9375, "learning_rate": 9.310187175675579e-06, "loss": 0.82894449, "memory(GiB)": 135.49, "step": 16640, "train_speed(iter/s)": 0.202596 }, { "acc": 0.75743222, "epoch": 0.388457107861007, "grad_norm": 7.40625, "learning_rate": 9.309229382004847e-06, "loss": 0.8971364, "memory(GiB)": 135.49, "step": 16650, "train_speed(iter/s)": 0.202657 }, { "acc": 0.75978603, "epoch": 0.3886904154332959, "grad_norm": 5.59375, "learning_rate": 9.308270973192875e-06, "loss": 0.87167559, "memory(GiB)": 135.49, "step": 16660, "train_speed(iter/s)": 0.202722 }, { "acc": 0.74200101, "epoch": 0.3889237230055848, "grad_norm": 6.03125, "learning_rate": 9.307311949376472e-06, "loss": 0.93526611, "memory(GiB)": 135.49, "step": 16670, "train_speed(iter/s)": 0.202788 }, { "acc": 0.75631256, "epoch": 0.3891570305778737, "grad_norm": 5.96875, "learning_rate": 9.306352310692539e-06, "loss": 0.89708118, "memory(GiB)": 135.49, "step": 16680, "train_speed(iter/s)": 0.202852 }, { "acc": 0.75519581, "epoch": 0.3893903381501626, "grad_norm": 5.15625, "learning_rate": 9.305392057278066e-06, "loss": 0.86111088, "memory(GiB)": 135.49, "step": 16690, "train_speed(iter/s)": 0.202916 }, { "acc": 0.76217475, "epoch": 0.3896236457224515, "grad_norm": 5.6875, "learning_rate": 9.304431189270127e-06, "loss": 0.87444897, "memory(GiB)": 135.49, "step": 16700, "train_speed(iter/s)": 0.202979 }, { "acc": 0.75610485, "epoch": 0.38985695329474035, "grad_norm": 5.4375, "learning_rate": 9.303469706805886e-06, "loss": 0.87402859, "memory(GiB)": 135.49, "step": 16710, "train_speed(iter/s)": 0.203039 }, { "acc": 0.74796257, "epoch": 0.39009026086702925, "grad_norm": 7.6875, "learning_rate": 9.302507610022593e-06, "loss": 0.94966383, "memory(GiB)": 135.49, "step": 16720, "train_speed(iter/s)": 0.203102 }, { "acc": 0.78919706, "epoch": 0.39032356843931815, "grad_norm": 5.1875, "learning_rate": 9.30154489905759e-06, "loss": 0.7338151, "memory(GiB)": 135.49, "step": 16730, "train_speed(iter/s)": 0.203163 }, { "acc": 0.74758711, "epoch": 0.39055687601160705, "grad_norm": 5.59375, "learning_rate": 9.300581574048303e-06, "loss": 0.92656174, "memory(GiB)": 135.49, "step": 16740, "train_speed(iter/s)": 0.203226 }, { "acc": 0.76868439, "epoch": 0.39079018358389594, "grad_norm": 5.1875, "learning_rate": 9.299617635132243e-06, "loss": 0.8442338, "memory(GiB)": 135.49, "step": 16750, "train_speed(iter/s)": 0.203289 }, { "acc": 0.77853212, "epoch": 0.39102349115618484, "grad_norm": 6.71875, "learning_rate": 9.298653082447019e-06, "loss": 0.79831042, "memory(GiB)": 135.49, "step": 16760, "train_speed(iter/s)": 0.203354 }, { "acc": 0.77360802, "epoch": 0.39125679872847374, "grad_norm": 6.4375, "learning_rate": 9.29768791613031e-06, "loss": 0.81051655, "memory(GiB)": 135.49, "step": 16770, "train_speed(iter/s)": 0.203417 }, { "acc": 0.75892401, "epoch": 0.39149010630076264, "grad_norm": 5.8125, "learning_rate": 9.296722136319904e-06, "loss": 0.87418032, "memory(GiB)": 135.49, "step": 16780, "train_speed(iter/s)": 0.203477 }, { "acc": 0.77487507, "epoch": 0.39172341387305154, "grad_norm": 6.40625, "learning_rate": 9.29575574315366e-06, "loss": 0.82870646, "memory(GiB)": 135.49, "step": 16790, "train_speed(iter/s)": 0.203541 }, { "acc": 0.75115614, "epoch": 0.3919567214453404, "grad_norm": 6.3125, "learning_rate": 9.294788736769534e-06, "loss": 0.93668175, "memory(GiB)": 135.49, "step": 16800, "train_speed(iter/s)": 0.203606 }, { "acc": 0.75833755, "epoch": 0.3921900290176293, "grad_norm": 9.0625, "learning_rate": 9.293821117305562e-06, "loss": 0.86502571, "memory(GiB)": 135.49, "step": 16810, "train_speed(iter/s)": 0.203668 }, { "acc": 0.76663318, "epoch": 0.3924233365899182, "grad_norm": 4.84375, "learning_rate": 9.29285288489987e-06, "loss": 0.84134769, "memory(GiB)": 135.49, "step": 16820, "train_speed(iter/s)": 0.203727 }, { "acc": 0.76594772, "epoch": 0.3926566441622071, "grad_norm": 6.125, "learning_rate": 9.29188403969068e-06, "loss": 0.82861652, "memory(GiB)": 135.49, "step": 16830, "train_speed(iter/s)": 0.20379 }, { "acc": 0.78921766, "epoch": 0.392889951734496, "grad_norm": 8.375, "learning_rate": 9.290914581816287e-06, "loss": 0.73534389, "memory(GiB)": 135.49, "step": 16840, "train_speed(iter/s)": 0.20385 }, { "acc": 0.75628996, "epoch": 0.3931232593067849, "grad_norm": 7.3125, "learning_rate": 9.289944511415086e-06, "loss": 0.87888222, "memory(GiB)": 135.49, "step": 16850, "train_speed(iter/s)": 0.203908 }, { "acc": 0.76801815, "epoch": 0.3933565668790738, "grad_norm": 5.75, "learning_rate": 9.28897382862555e-06, "loss": 0.82630911, "memory(GiB)": 135.49, "step": 16860, "train_speed(iter/s)": 0.203969 }, { "acc": 0.74511909, "epoch": 0.3935898744513627, "grad_norm": 4.875, "learning_rate": 9.288002533586247e-06, "loss": 0.93416595, "memory(GiB)": 135.49, "step": 16870, "train_speed(iter/s)": 0.20403 }, { "acc": 0.77060823, "epoch": 0.3938231820236516, "grad_norm": 6.875, "learning_rate": 9.287030626435828e-06, "loss": 0.82613735, "memory(GiB)": 135.49, "step": 16880, "train_speed(iter/s)": 0.204094 }, { "acc": 0.77240801, "epoch": 0.3940564895959405, "grad_norm": 4.96875, "learning_rate": 9.286058107313034e-06, "loss": 0.80466051, "memory(GiB)": 135.49, "step": 16890, "train_speed(iter/s)": 0.204155 }, { "acc": 0.76605263, "epoch": 0.3942897971682293, "grad_norm": 7.125, "learning_rate": 9.285084976356689e-06, "loss": 0.84355488, "memory(GiB)": 135.49, "step": 16900, "train_speed(iter/s)": 0.204216 }, { "acc": 0.75563893, "epoch": 0.3945231047405182, "grad_norm": 5.6875, "learning_rate": 9.284111233705709e-06, "loss": 0.86302519, "memory(GiB)": 135.49, "step": 16910, "train_speed(iter/s)": 0.204278 }, { "acc": 0.76990614, "epoch": 0.3947564123128071, "grad_norm": 5.03125, "learning_rate": 9.283136879499094e-06, "loss": 0.85423346, "memory(GiB)": 135.49, "step": 16920, "train_speed(iter/s)": 0.204339 }, { "acc": 0.76550145, "epoch": 0.394989719885096, "grad_norm": 5.53125, "learning_rate": 9.282161913875933e-06, "loss": 0.850457, "memory(GiB)": 135.49, "step": 16930, "train_speed(iter/s)": 0.204399 }, { "acc": 0.76220298, "epoch": 0.3952230274573849, "grad_norm": 13.0625, "learning_rate": 9.281186336975406e-06, "loss": 0.87397709, "memory(GiB)": 135.49, "step": 16940, "train_speed(iter/s)": 0.204461 }, { "acc": 0.76191254, "epoch": 0.3954563350296738, "grad_norm": 3.59375, "learning_rate": 9.28021014893677e-06, "loss": 0.88575726, "memory(GiB)": 135.49, "step": 16950, "train_speed(iter/s)": 0.204522 }, { "acc": 0.75411501, "epoch": 0.3956896426019627, "grad_norm": 5.625, "learning_rate": 9.27923334989938e-06, "loss": 0.88283949, "memory(GiB)": 135.49, "step": 16960, "train_speed(iter/s)": 0.204583 }, { "acc": 0.75425615, "epoch": 0.3959229501742516, "grad_norm": 6.8125, "learning_rate": 9.278255940002671e-06, "loss": 0.88334904, "memory(GiB)": 135.49, "step": 16970, "train_speed(iter/s)": 0.204643 }, { "acc": 0.76360617, "epoch": 0.3961562577465405, "grad_norm": 9.125, "learning_rate": 9.27727791938617e-06, "loss": 0.84500113, "memory(GiB)": 135.49, "step": 16980, "train_speed(iter/s)": 0.204706 }, { "acc": 0.77362547, "epoch": 0.3963895653188294, "grad_norm": 8.0625, "learning_rate": 9.27629928818949e-06, "loss": 0.82942829, "memory(GiB)": 135.49, "step": 16990, "train_speed(iter/s)": 0.204767 }, { "acc": 0.77591038, "epoch": 0.39662287289111825, "grad_norm": 8.375, "learning_rate": 9.275320046552328e-06, "loss": 0.79333506, "memory(GiB)": 135.49, "step": 17000, "train_speed(iter/s)": 0.204828 }, { "epoch": 0.39662287289111825, "eval_acc": 0.7325247903349156, "eval_loss": 0.845748245716095, "eval_runtime": 1262.4537, "eval_samples_per_second": 28.509, "eval_steps_per_second": 14.255, "step": 17000 }, { "acc": 0.74947848, "epoch": 0.39685618046340715, "grad_norm": 5.34375, "learning_rate": 9.274340194614471e-06, "loss": 0.89042664, "memory(GiB)": 135.49, "step": 17010, "train_speed(iter/s)": 0.201765 }, { "acc": 0.76690302, "epoch": 0.39708948803569605, "grad_norm": 4.84375, "learning_rate": 9.273359732515793e-06, "loss": 0.84172173, "memory(GiB)": 135.49, "step": 17020, "train_speed(iter/s)": 0.201829 }, { "acc": 0.79353013, "epoch": 0.39732279560798495, "grad_norm": 5.3125, "learning_rate": 9.272378660396255e-06, "loss": 0.75729952, "memory(GiB)": 135.49, "step": 17030, "train_speed(iter/s)": 0.201888 }, { "acc": 0.76955838, "epoch": 0.39755610318027385, "grad_norm": 4.9375, "learning_rate": 9.271396978395904e-06, "loss": 0.85735836, "memory(GiB)": 135.49, "step": 17040, "train_speed(iter/s)": 0.201945 }, { "acc": 0.77185063, "epoch": 0.39778941075256274, "grad_norm": 5.9375, "learning_rate": 9.270414686654875e-06, "loss": 0.82398415, "memory(GiB)": 135.49, "step": 17050, "train_speed(iter/s)": 0.202003 }, { "acc": 0.77393732, "epoch": 0.39802271832485164, "grad_norm": 4.75, "learning_rate": 9.269431785313391e-06, "loss": 0.81761398, "memory(GiB)": 135.49, "step": 17060, "train_speed(iter/s)": 0.202065 }, { "acc": 0.76502919, "epoch": 0.39825602589714054, "grad_norm": 8.625, "learning_rate": 9.268448274511759e-06, "loss": 0.85615883, "memory(GiB)": 135.49, "step": 17070, "train_speed(iter/s)": 0.202129 }, { "acc": 0.77096701, "epoch": 0.39848933346942944, "grad_norm": 6.375, "learning_rate": 9.267464154390375e-06, "loss": 0.83012753, "memory(GiB)": 135.49, "step": 17080, "train_speed(iter/s)": 0.202191 }, { "acc": 0.77365546, "epoch": 0.3987226410417183, "grad_norm": 5.15625, "learning_rate": 9.266479425089725e-06, "loss": 0.82434464, "memory(GiB)": 135.49, "step": 17090, "train_speed(iter/s)": 0.202253 }, { "acc": 0.75208321, "epoch": 0.3989559486140072, "grad_norm": 6.375, "learning_rate": 9.265494086750375e-06, "loss": 0.90136223, "memory(GiB)": 135.49, "step": 17100, "train_speed(iter/s)": 0.202319 }, { "acc": 0.77011585, "epoch": 0.3991892561862961, "grad_norm": 6.75, "learning_rate": 9.264508139512985e-06, "loss": 0.85929527, "memory(GiB)": 135.49, "step": 17110, "train_speed(iter/s)": 0.202382 }, { "acc": 0.76871109, "epoch": 0.399422563758585, "grad_norm": 5.375, "learning_rate": 9.263521583518293e-06, "loss": 0.81891861, "memory(GiB)": 135.49, "step": 17120, "train_speed(iter/s)": 0.202447 }, { "acc": 0.77389183, "epoch": 0.3996558713308739, "grad_norm": 6.125, "learning_rate": 9.262534418907137e-06, "loss": 0.83904629, "memory(GiB)": 135.49, "step": 17130, "train_speed(iter/s)": 0.202506 }, { "acc": 0.76543975, "epoch": 0.3998891789031628, "grad_norm": 6.5, "learning_rate": 9.26154664582043e-06, "loss": 0.84889507, "memory(GiB)": 135.49, "step": 17140, "train_speed(iter/s)": 0.202565 }, { "acc": 0.76832066, "epoch": 0.4001224864754517, "grad_norm": 6.53125, "learning_rate": 9.260558264399177e-06, "loss": 0.82870312, "memory(GiB)": 135.49, "step": 17150, "train_speed(iter/s)": 0.202622 }, { "acc": 0.78240795, "epoch": 0.4003557940477406, "grad_norm": 5.34375, "learning_rate": 9.25956927478447e-06, "loss": 0.78121634, "memory(GiB)": 135.49, "step": 17160, "train_speed(iter/s)": 0.202686 }, { "acc": 0.764293, "epoch": 0.4005891016200295, "grad_norm": 5.625, "learning_rate": 9.258579677117486e-06, "loss": 0.85281124, "memory(GiB)": 135.49, "step": 17170, "train_speed(iter/s)": 0.202745 }, { "acc": 0.76559782, "epoch": 0.4008224091923184, "grad_norm": 5.5, "learning_rate": 9.25758947153949e-06, "loss": 0.84153214, "memory(GiB)": 135.49, "step": 17180, "train_speed(iter/s)": 0.202805 }, { "acc": 0.76584148, "epoch": 0.4010557167646072, "grad_norm": 4.3125, "learning_rate": 9.256598658191834e-06, "loss": 0.84278431, "memory(GiB)": 135.49, "step": 17190, "train_speed(iter/s)": 0.202871 }, { "acc": 0.78349104, "epoch": 0.4012890243368961, "grad_norm": 4.34375, "learning_rate": 9.255607237215957e-06, "loss": 0.7672349, "memory(GiB)": 135.49, "step": 17200, "train_speed(iter/s)": 0.202932 }, { "acc": 0.76623554, "epoch": 0.401522331909185, "grad_norm": 8.5, "learning_rate": 9.254615208753381e-06, "loss": 0.86694813, "memory(GiB)": 135.49, "step": 17210, "train_speed(iter/s)": 0.202995 }, { "acc": 0.77897387, "epoch": 0.4017556394814739, "grad_norm": 5.78125, "learning_rate": 9.253622572945722e-06, "loss": 0.80715046, "memory(GiB)": 135.49, "step": 17220, "train_speed(iter/s)": 0.203058 }, { "acc": 0.75289669, "epoch": 0.4019889470537628, "grad_norm": 6.25, "learning_rate": 9.252629329934676e-06, "loss": 0.91459742, "memory(GiB)": 135.49, "step": 17230, "train_speed(iter/s)": 0.203118 }, { "acc": 0.76796312, "epoch": 0.4022222546260517, "grad_norm": 5.03125, "learning_rate": 9.251635479862029e-06, "loss": 0.82623663, "memory(GiB)": 135.49, "step": 17240, "train_speed(iter/s)": 0.203177 }, { "acc": 0.77224779, "epoch": 0.4024555621983406, "grad_norm": 7.4375, "learning_rate": 9.25064102286965e-06, "loss": 0.83090477, "memory(GiB)": 135.49, "step": 17250, "train_speed(iter/s)": 0.203239 }, { "acc": 0.76042395, "epoch": 0.4026888697706295, "grad_norm": 8.5, "learning_rate": 9.249645959099503e-06, "loss": 0.85248222, "memory(GiB)": 135.49, "step": 17260, "train_speed(iter/s)": 0.203299 }, { "acc": 0.7576129, "epoch": 0.4029221773429184, "grad_norm": 6.4375, "learning_rate": 9.248650288693628e-06, "loss": 0.86457863, "memory(GiB)": 135.49, "step": 17270, "train_speed(iter/s)": 0.203357 }, { "acc": 0.77796803, "epoch": 0.40315548491520725, "grad_norm": 9.375, "learning_rate": 9.247654011794158e-06, "loss": 0.80165262, "memory(GiB)": 135.49, "step": 17280, "train_speed(iter/s)": 0.203413 }, { "acc": 0.75933051, "epoch": 0.40338879248749615, "grad_norm": 6.75, "learning_rate": 9.246657128543313e-06, "loss": 0.88424025, "memory(GiB)": 135.49, "step": 17290, "train_speed(iter/s)": 0.203474 }, { "acc": 0.76133337, "epoch": 0.40362210005978505, "grad_norm": 5.6875, "learning_rate": 9.245659639083396e-06, "loss": 0.85147972, "memory(GiB)": 135.49, "step": 17300, "train_speed(iter/s)": 0.203535 }, { "acc": 0.77055612, "epoch": 0.40385540763207395, "grad_norm": 4.5625, "learning_rate": 9.244661543556799e-06, "loss": 0.82461004, "memory(GiB)": 135.49, "step": 17310, "train_speed(iter/s)": 0.203598 }, { "acc": 0.76077471, "epoch": 0.40408871520436285, "grad_norm": 7.0, "learning_rate": 9.243662842106e-06, "loss": 0.86861343, "memory(GiB)": 135.49, "step": 17320, "train_speed(iter/s)": 0.203658 }, { "acc": 0.76802335, "epoch": 0.40432202277665175, "grad_norm": 6.59375, "learning_rate": 9.242663534873562e-06, "loss": 0.8439394, "memory(GiB)": 135.49, "step": 17330, "train_speed(iter/s)": 0.203715 }, { "acc": 0.77333298, "epoch": 0.40455533034894064, "grad_norm": 6.6875, "learning_rate": 9.241663622002137e-06, "loss": 0.80306969, "memory(GiB)": 135.49, "step": 17340, "train_speed(iter/s)": 0.203771 }, { "acc": 0.75323973, "epoch": 0.40478863792122954, "grad_norm": 6.3125, "learning_rate": 9.240663103634464e-06, "loss": 0.89140587, "memory(GiB)": 135.49, "step": 17350, "train_speed(iter/s)": 0.203833 }, { "acc": 0.7675849, "epoch": 0.40502194549351844, "grad_norm": 10.4375, "learning_rate": 9.239661979913364e-06, "loss": 0.81915207, "memory(GiB)": 135.49, "step": 17360, "train_speed(iter/s)": 0.203898 }, { "acc": 0.75557604, "epoch": 0.40525525306580734, "grad_norm": 6.28125, "learning_rate": 9.238660250981748e-06, "loss": 0.89859982, "memory(GiB)": 135.49, "step": 17370, "train_speed(iter/s)": 0.20396 }, { "acc": 0.76874819, "epoch": 0.4054885606380962, "grad_norm": 4.8125, "learning_rate": 9.237657916982612e-06, "loss": 0.85572834, "memory(GiB)": 135.49, "step": 17380, "train_speed(iter/s)": 0.204018 }, { "acc": 0.78015852, "epoch": 0.4057218682103851, "grad_norm": 11.75, "learning_rate": 9.236654978059039e-06, "loss": 0.8042676, "memory(GiB)": 135.49, "step": 17390, "train_speed(iter/s)": 0.204078 }, { "acc": 0.78704329, "epoch": 0.405955175782674, "grad_norm": 5.53125, "learning_rate": 9.2356514343542e-06, "loss": 0.76533899, "memory(GiB)": 135.49, "step": 17400, "train_speed(iter/s)": 0.204135 }, { "acc": 0.77815037, "epoch": 0.4061884833549629, "grad_norm": 4.125, "learning_rate": 9.234647286011347e-06, "loss": 0.80306835, "memory(GiB)": 135.49, "step": 17410, "train_speed(iter/s)": 0.204195 }, { "acc": 0.77561851, "epoch": 0.4064217909272518, "grad_norm": 13.0625, "learning_rate": 9.233642533173827e-06, "loss": 0.80966911, "memory(GiB)": 135.49, "step": 17420, "train_speed(iter/s)": 0.204252 }, { "acc": 0.76481681, "epoch": 0.4066550984995407, "grad_norm": 5.125, "learning_rate": 9.232637175985064e-06, "loss": 0.8474431, "memory(GiB)": 135.49, "step": 17430, "train_speed(iter/s)": 0.204313 }, { "acc": 0.77325039, "epoch": 0.4068884060718296, "grad_norm": 6.90625, "learning_rate": 9.231631214588572e-06, "loss": 0.80566959, "memory(GiB)": 135.49, "step": 17440, "train_speed(iter/s)": 0.204375 }, { "acc": 0.74929028, "epoch": 0.4071217136441185, "grad_norm": 6.21875, "learning_rate": 9.230624649127956e-06, "loss": 0.91206379, "memory(GiB)": 135.49, "step": 17450, "train_speed(iter/s)": 0.204435 }, { "acc": 0.78061523, "epoch": 0.4073550212164074, "grad_norm": 6.3125, "learning_rate": 9.2296174797469e-06, "loss": 0.80147476, "memory(GiB)": 135.49, "step": 17460, "train_speed(iter/s)": 0.204492 }, { "acc": 0.76616907, "epoch": 0.4075883287886963, "grad_norm": 5.71875, "learning_rate": 9.228609706589175e-06, "loss": 0.8465477, "memory(GiB)": 135.49, "step": 17470, "train_speed(iter/s)": 0.204553 }, { "acc": 0.76730428, "epoch": 0.4078216363609851, "grad_norm": 4.625, "learning_rate": 9.227601329798645e-06, "loss": 0.84671545, "memory(GiB)": 135.49, "step": 17480, "train_speed(iter/s)": 0.204615 }, { "acc": 0.74804673, "epoch": 0.408054943933274, "grad_norm": 8.875, "learning_rate": 9.226592349519254e-06, "loss": 0.91790619, "memory(GiB)": 135.49, "step": 17490, "train_speed(iter/s)": 0.204675 }, { "acc": 0.74212055, "epoch": 0.4082882515055629, "grad_norm": 4.46875, "learning_rate": 9.225582765895032e-06, "loss": 0.95211372, "memory(GiB)": 135.49, "step": 17500, "train_speed(iter/s)": 0.204732 }, { "epoch": 0.4082882515055629, "eval_acc": 0.7325538320549598, "eval_loss": 0.8449062705039978, "eval_runtime": 1261.9238, "eval_samples_per_second": 28.521, "eval_steps_per_second": 14.261, "step": 17500 }, { "acc": 0.75356655, "epoch": 0.4085215590778518, "grad_norm": 7.40625, "learning_rate": 9.224572579070097e-06, "loss": 0.87009401, "memory(GiB)": 135.49, "step": 17510, "train_speed(iter/s)": 0.201754 }, { "acc": 0.76488061, "epoch": 0.4087548666501407, "grad_norm": 6.96875, "learning_rate": 9.223561789188655e-06, "loss": 0.82865801, "memory(GiB)": 135.49, "step": 17520, "train_speed(iter/s)": 0.201813 }, { "acc": 0.75713787, "epoch": 0.4089881742224296, "grad_norm": 5.125, "learning_rate": 9.222550396394994e-06, "loss": 0.88632164, "memory(GiB)": 135.49, "step": 17530, "train_speed(iter/s)": 0.201874 }, { "acc": 0.79052835, "epoch": 0.4092214817947185, "grad_norm": 8.1875, "learning_rate": 9.221538400833489e-06, "loss": 0.75105734, "memory(GiB)": 135.49, "step": 17540, "train_speed(iter/s)": 0.201931 }, { "acc": 0.7863265, "epoch": 0.4094547893670074, "grad_norm": 7.4375, "learning_rate": 9.220525802648605e-06, "loss": 0.77613316, "memory(GiB)": 135.49, "step": 17550, "train_speed(iter/s)": 0.201988 }, { "acc": 0.76330614, "epoch": 0.4096880969392963, "grad_norm": 5.5, "learning_rate": 9.219512601984889e-06, "loss": 0.8626112, "memory(GiB)": 135.49, "step": 17560, "train_speed(iter/s)": 0.202044 }, { "acc": 0.75495834, "epoch": 0.40992140451158515, "grad_norm": 5.78125, "learning_rate": 9.218498798986975e-06, "loss": 0.88509159, "memory(GiB)": 135.49, "step": 17570, "train_speed(iter/s)": 0.202106 }, { "acc": 0.74304113, "epoch": 0.41015471208387405, "grad_norm": 6.625, "learning_rate": 9.217484393799582e-06, "loss": 0.93451405, "memory(GiB)": 135.49, "step": 17580, "train_speed(iter/s)": 0.202165 }, { "acc": 0.76792164, "epoch": 0.41038801965616295, "grad_norm": 4.34375, "learning_rate": 9.216469386567517e-06, "loss": 0.83587046, "memory(GiB)": 135.49, "step": 17590, "train_speed(iter/s)": 0.202224 }, { "acc": 0.80448208, "epoch": 0.41062132722845185, "grad_norm": 5.6875, "learning_rate": 9.215453777435672e-06, "loss": 0.69679956, "memory(GiB)": 135.49, "step": 17600, "train_speed(iter/s)": 0.20228 }, { "acc": 0.79012194, "epoch": 0.41085463480074075, "grad_norm": 10.0625, "learning_rate": 9.214437566549026e-06, "loss": 0.75730219, "memory(GiB)": 135.49, "step": 17610, "train_speed(iter/s)": 0.202338 }, { "acc": 0.76763463, "epoch": 0.41108794237302965, "grad_norm": 6.40625, "learning_rate": 9.21342075405264e-06, "loss": 0.84231091, "memory(GiB)": 135.49, "step": 17620, "train_speed(iter/s)": 0.202398 }, { "acc": 0.75752311, "epoch": 0.41132124994531855, "grad_norm": 5.21875, "learning_rate": 9.212403340091667e-06, "loss": 0.8858263, "memory(GiB)": 135.49, "step": 17630, "train_speed(iter/s)": 0.202456 }, { "acc": 0.77754812, "epoch": 0.41155455751760744, "grad_norm": 5.8125, "learning_rate": 9.21138532481134e-06, "loss": 0.79679494, "memory(GiB)": 135.49, "step": 17640, "train_speed(iter/s)": 0.202517 }, { "acc": 0.77353892, "epoch": 0.41178786508989634, "grad_norm": 9.375, "learning_rate": 9.210366708356982e-06, "loss": 0.82993412, "memory(GiB)": 135.49, "step": 17650, "train_speed(iter/s)": 0.202575 }, { "acc": 0.7597785, "epoch": 0.41202117266218524, "grad_norm": 7.125, "learning_rate": 9.209347490874e-06, "loss": 0.87251024, "memory(GiB)": 135.49, "step": 17660, "train_speed(iter/s)": 0.202633 }, { "acc": 0.75218859, "epoch": 0.4122544802344741, "grad_norm": 6.34375, "learning_rate": 9.208327672507883e-06, "loss": 0.91110229, "memory(GiB)": 135.49, "step": 17670, "train_speed(iter/s)": 0.202695 }, { "acc": 0.76597853, "epoch": 0.412487787806763, "grad_norm": 6.125, "learning_rate": 9.207307253404216e-06, "loss": 0.8424612, "memory(GiB)": 135.49, "step": 17680, "train_speed(iter/s)": 0.202751 }, { "acc": 0.79544787, "epoch": 0.4127210953790519, "grad_norm": 7.65625, "learning_rate": 9.20628623370866e-06, "loss": 0.73689513, "memory(GiB)": 135.49, "step": 17690, "train_speed(iter/s)": 0.202816 }, { "acc": 0.7810009, "epoch": 0.4129544029513408, "grad_norm": 4.4375, "learning_rate": 9.205264613566968e-06, "loss": 0.79979076, "memory(GiB)": 135.49, "step": 17700, "train_speed(iter/s)": 0.202878 }, { "acc": 0.7622191, "epoch": 0.4131877105236297, "grad_norm": 7.5, "learning_rate": 9.204242393124973e-06, "loss": 0.85631981, "memory(GiB)": 135.49, "step": 17710, "train_speed(iter/s)": 0.202937 }, { "acc": 0.76991205, "epoch": 0.4134210180959186, "grad_norm": 7.21875, "learning_rate": 9.203219572528597e-06, "loss": 0.83328342, "memory(GiB)": 135.49, "step": 17720, "train_speed(iter/s)": 0.202995 }, { "acc": 0.76035299, "epoch": 0.4136543256682075, "grad_norm": 6.1875, "learning_rate": 9.202196151923849e-06, "loss": 0.86019039, "memory(GiB)": 135.49, "step": 17730, "train_speed(iter/s)": 0.203054 }, { "acc": 0.76514359, "epoch": 0.4138876332404964, "grad_norm": 5.875, "learning_rate": 9.201172131456821e-06, "loss": 0.84882641, "memory(GiB)": 135.49, "step": 17740, "train_speed(iter/s)": 0.203115 }, { "acc": 0.73318586, "epoch": 0.4141209408127853, "grad_norm": 6.0, "learning_rate": 9.20014751127369e-06, "loss": 0.97825623, "memory(GiB)": 135.49, "step": 17750, "train_speed(iter/s)": 0.203175 }, { "acc": 0.78198056, "epoch": 0.4143542483850742, "grad_norm": 5.21875, "learning_rate": 9.199122291520724e-06, "loss": 0.76726551, "memory(GiB)": 135.49, "step": 17760, "train_speed(iter/s)": 0.203235 }, { "acc": 0.76246471, "epoch": 0.414587555957363, "grad_norm": 4.21875, "learning_rate": 9.198096472344269e-06, "loss": 0.85785522, "memory(GiB)": 135.49, "step": 17770, "train_speed(iter/s)": 0.20329 }, { "acc": 0.77032504, "epoch": 0.4148208635296519, "grad_norm": 5.9375, "learning_rate": 9.197070053890764e-06, "loss": 0.83010693, "memory(GiB)": 135.49, "step": 17780, "train_speed(iter/s)": 0.203347 }, { "acc": 0.76793013, "epoch": 0.4150541711019408, "grad_norm": 4.4375, "learning_rate": 9.196043036306726e-06, "loss": 0.83905163, "memory(GiB)": 135.49, "step": 17790, "train_speed(iter/s)": 0.203408 }, { "acc": 0.77398186, "epoch": 0.4152874786742297, "grad_norm": 4.6875, "learning_rate": 9.195015419738765e-06, "loss": 0.8084425, "memory(GiB)": 135.49, "step": 17800, "train_speed(iter/s)": 0.203461 }, { "acc": 0.75578899, "epoch": 0.4155207862465186, "grad_norm": 6.4375, "learning_rate": 9.193987204333573e-06, "loss": 0.88183308, "memory(GiB)": 135.49, "step": 17810, "train_speed(iter/s)": 0.203523 }, { "acc": 0.7783679, "epoch": 0.4157540938188075, "grad_norm": 5.59375, "learning_rate": 9.192958390237923e-06, "loss": 0.80244465, "memory(GiB)": 135.49, "step": 17820, "train_speed(iter/s)": 0.203579 }, { "acc": 0.7559031, "epoch": 0.4159874013910964, "grad_norm": 7.78125, "learning_rate": 9.19192897759868e-06, "loss": 0.88284931, "memory(GiB)": 135.49, "step": 17830, "train_speed(iter/s)": 0.203636 }, { "acc": 0.76817145, "epoch": 0.4162207089633853, "grad_norm": 8.875, "learning_rate": 9.190898966562796e-06, "loss": 0.88224182, "memory(GiB)": 135.49, "step": 17840, "train_speed(iter/s)": 0.203694 }, { "acc": 0.77200594, "epoch": 0.4164540165356742, "grad_norm": 7.5, "learning_rate": 9.1898683572773e-06, "loss": 0.84379759, "memory(GiB)": 135.49, "step": 17850, "train_speed(iter/s)": 0.203755 }, { "acc": 0.74234772, "epoch": 0.41668732410796305, "grad_norm": 4.625, "learning_rate": 9.188837149889316e-06, "loss": 0.9400465, "memory(GiB)": 135.49, "step": 17860, "train_speed(iter/s)": 0.203816 }, { "acc": 0.75065131, "epoch": 0.41692063168025195, "grad_norm": 5.375, "learning_rate": 9.187805344546044e-06, "loss": 0.9082263, "memory(GiB)": 135.49, "step": 17870, "train_speed(iter/s)": 0.203877 }, { "acc": 0.7836134, "epoch": 0.41715393925254085, "grad_norm": 5.28125, "learning_rate": 9.186772941394776e-06, "loss": 0.79228997, "memory(GiB)": 135.49, "step": 17880, "train_speed(iter/s)": 0.203937 }, { "acc": 0.77082481, "epoch": 0.41738724682482975, "grad_norm": 7.875, "learning_rate": 9.185739940582885e-06, "loss": 0.8218071, "memory(GiB)": 135.49, "step": 17890, "train_speed(iter/s)": 0.203998 }, { "acc": 0.77255001, "epoch": 0.41762055439711865, "grad_norm": 5.5625, "learning_rate": 9.184706342257835e-06, "loss": 0.81804848, "memory(GiB)": 135.49, "step": 17900, "train_speed(iter/s)": 0.20406 }, { "acc": 0.7813489, "epoch": 0.41785386196940755, "grad_norm": 8.375, "learning_rate": 9.183672146567171e-06, "loss": 0.77789054, "memory(GiB)": 135.49, "step": 17910, "train_speed(iter/s)": 0.204116 }, { "acc": 0.76709623, "epoch": 0.41808716954169645, "grad_norm": 5.96875, "learning_rate": 9.182637353658523e-06, "loss": 0.832267, "memory(GiB)": 135.49, "step": 17920, "train_speed(iter/s)": 0.204174 }, { "acc": 0.76321764, "epoch": 0.41832047711398535, "grad_norm": 7.84375, "learning_rate": 9.181601963679607e-06, "loss": 0.85502853, "memory(GiB)": 135.49, "step": 17930, "train_speed(iter/s)": 0.204233 }, { "acc": 0.77298155, "epoch": 0.41855378468627424, "grad_norm": 5.46875, "learning_rate": 9.180565976778226e-06, "loss": 0.81713638, "memory(GiB)": 135.49, "step": 17940, "train_speed(iter/s)": 0.204293 }, { "acc": 0.76654882, "epoch": 0.41878709225856314, "grad_norm": 7.15625, "learning_rate": 9.179529393102265e-06, "loss": 0.84552536, "memory(GiB)": 135.49, "step": 17950, "train_speed(iter/s)": 0.204351 }, { "acc": 0.78289018, "epoch": 0.419020399830852, "grad_norm": 4.375, "learning_rate": 9.1784922127997e-06, "loss": 0.78011537, "memory(GiB)": 135.49, "step": 17960, "train_speed(iter/s)": 0.204411 }, { "acc": 0.7618351, "epoch": 0.4192537074031409, "grad_norm": 5.78125, "learning_rate": 9.177454436018584e-06, "loss": 0.8528636, "memory(GiB)": 135.49, "step": 17970, "train_speed(iter/s)": 0.204463 }, { "acc": 0.770154, "epoch": 0.4194870149754298, "grad_norm": 5.40625, "learning_rate": 9.17641606290706e-06, "loss": 0.88279791, "memory(GiB)": 135.49, "step": 17980, "train_speed(iter/s)": 0.204524 }, { "acc": 0.76044216, "epoch": 0.4197203225477187, "grad_norm": 5.875, "learning_rate": 9.175377093613359e-06, "loss": 0.84369335, "memory(GiB)": 135.49, "step": 17990, "train_speed(iter/s)": 0.204581 }, { "acc": 0.77539001, "epoch": 0.4199536301200076, "grad_norm": 5.34375, "learning_rate": 9.174337528285787e-06, "loss": 0.8212739, "memory(GiB)": 135.49, "step": 18000, "train_speed(iter/s)": 0.20464 }, { "epoch": 0.4199536301200076, "eval_acc": 0.7325130123040087, "eval_loss": 0.844586968421936, "eval_runtime": 1263.7093, "eval_samples_per_second": 28.48, "eval_steps_per_second": 14.241, "step": 18000 }, { "acc": 0.76155825, "epoch": 0.4201869376922965, "grad_norm": 6.125, "learning_rate": 9.173297367072748e-06, "loss": 0.86091423, "memory(GiB)": 135.49, "step": 18010, "train_speed(iter/s)": 0.201743 }, { "acc": 0.75298977, "epoch": 0.4204202452645854, "grad_norm": 5.40625, "learning_rate": 9.172256610122721e-06, "loss": 0.91733246, "memory(GiB)": 135.49, "step": 18020, "train_speed(iter/s)": 0.201804 }, { "acc": 0.77927694, "epoch": 0.4206535528368743, "grad_norm": 4.34375, "learning_rate": 9.171215257584277e-06, "loss": 0.78146801, "memory(GiB)": 135.49, "step": 18030, "train_speed(iter/s)": 0.201865 }, { "acc": 0.75168047, "epoch": 0.4208868604091632, "grad_norm": 9.3125, "learning_rate": 9.170173309606063e-06, "loss": 0.88810215, "memory(GiB)": 135.49, "step": 18040, "train_speed(iter/s)": 0.201925 }, { "acc": 0.75904484, "epoch": 0.421120167981452, "grad_norm": 8.9375, "learning_rate": 9.169130766336824e-06, "loss": 0.88898306, "memory(GiB)": 135.49, "step": 18050, "train_speed(iter/s)": 0.201981 }, { "acc": 0.75873871, "epoch": 0.4213534755537409, "grad_norm": 5.5625, "learning_rate": 9.168087627925377e-06, "loss": 0.87225218, "memory(GiB)": 135.49, "step": 18060, "train_speed(iter/s)": 0.202036 }, { "acc": 0.76567698, "epoch": 0.4215867831260298, "grad_norm": 7.15625, "learning_rate": 9.167043894520633e-06, "loss": 0.83327141, "memory(GiB)": 135.49, "step": 18070, "train_speed(iter/s)": 0.202093 }, { "acc": 0.7680975, "epoch": 0.4218200906983187, "grad_norm": 12.1875, "learning_rate": 9.165999566271584e-06, "loss": 0.83456993, "memory(GiB)": 135.49, "step": 18080, "train_speed(iter/s)": 0.202153 }, { "acc": 0.76747642, "epoch": 0.4220533982706076, "grad_norm": 6.9375, "learning_rate": 9.164954643327306e-06, "loss": 0.83742466, "memory(GiB)": 135.49, "step": 18090, "train_speed(iter/s)": 0.202209 }, { "acc": 0.76605101, "epoch": 0.4222867058428965, "grad_norm": 4.9375, "learning_rate": 9.163909125836965e-06, "loss": 0.85010223, "memory(GiB)": 135.49, "step": 18100, "train_speed(iter/s)": 0.202266 }, { "acc": 0.77108631, "epoch": 0.4225200134151854, "grad_norm": 5.75, "learning_rate": 9.162863013949803e-06, "loss": 0.81097784, "memory(GiB)": 135.49, "step": 18110, "train_speed(iter/s)": 0.202317 }, { "acc": 0.75301914, "epoch": 0.4227533209874743, "grad_norm": 8.75, "learning_rate": 9.161816307815157e-06, "loss": 0.9052969, "memory(GiB)": 135.49, "step": 18120, "train_speed(iter/s)": 0.202375 }, { "acc": 0.75322227, "epoch": 0.4229866285597632, "grad_norm": 6.34375, "learning_rate": 9.160769007582441e-06, "loss": 0.91607609, "memory(GiB)": 135.49, "step": 18130, "train_speed(iter/s)": 0.202433 }, { "acc": 0.75556955, "epoch": 0.4232199361320521, "grad_norm": 7.53125, "learning_rate": 9.15972111340116e-06, "loss": 0.90478497, "memory(GiB)": 135.49, "step": 18140, "train_speed(iter/s)": 0.202495 }, { "acc": 0.75605116, "epoch": 0.42345324370434095, "grad_norm": 5.5625, "learning_rate": 9.158672625420894e-06, "loss": 0.87340937, "memory(GiB)": 135.49, "step": 18150, "train_speed(iter/s)": 0.20255 }, { "acc": 0.78639116, "epoch": 0.42368655127662985, "grad_norm": 5.15625, "learning_rate": 9.157623543791323e-06, "loss": 0.76453834, "memory(GiB)": 135.49, "step": 18160, "train_speed(iter/s)": 0.202608 }, { "acc": 0.76822233, "epoch": 0.42391985884891875, "grad_norm": 17.125, "learning_rate": 9.156573868662197e-06, "loss": 0.84116039, "memory(GiB)": 135.49, "step": 18170, "train_speed(iter/s)": 0.202662 }, { "acc": 0.76789417, "epoch": 0.42415316642120765, "grad_norm": 5.40625, "learning_rate": 9.155523600183359e-06, "loss": 0.83258123, "memory(GiB)": 135.49, "step": 18180, "train_speed(iter/s)": 0.202718 }, { "acc": 0.75960493, "epoch": 0.42438647399349655, "grad_norm": 6.8125, "learning_rate": 9.154472738504735e-06, "loss": 0.86712103, "memory(GiB)": 135.49, "step": 18190, "train_speed(iter/s)": 0.202774 }, { "acc": 0.75802436, "epoch": 0.42461978156578545, "grad_norm": 6.5625, "learning_rate": 9.153421283776334e-06, "loss": 0.87824554, "memory(GiB)": 135.49, "step": 18200, "train_speed(iter/s)": 0.202833 }, { "acc": 0.78091025, "epoch": 0.42485308913807435, "grad_norm": 4.71875, "learning_rate": 9.152369236148252e-06, "loss": 0.79941692, "memory(GiB)": 135.49, "step": 18210, "train_speed(iter/s)": 0.20289 }, { "acc": 0.75900726, "epoch": 0.42508639671036325, "grad_norm": 6.40625, "learning_rate": 9.151316595770665e-06, "loss": 0.87553501, "memory(GiB)": 135.49, "step": 18220, "train_speed(iter/s)": 0.202947 }, { "acc": 0.75160074, "epoch": 0.42531970428265214, "grad_norm": 6.28125, "learning_rate": 9.150263362793844e-06, "loss": 0.91056519, "memory(GiB)": 135.49, "step": 18230, "train_speed(iter/s)": 0.203003 }, { "acc": 0.77717433, "epoch": 0.42555301185494104, "grad_norm": 6.75, "learning_rate": 9.14920953736813e-06, "loss": 0.8116457, "memory(GiB)": 135.49, "step": 18240, "train_speed(iter/s)": 0.203061 }, { "acc": 0.76820488, "epoch": 0.4257863194272299, "grad_norm": 10.3125, "learning_rate": 9.148155119643963e-06, "loss": 0.84591522, "memory(GiB)": 135.49, "step": 18250, "train_speed(iter/s)": 0.203114 }, { "acc": 0.7510025, "epoch": 0.4260196269995188, "grad_norm": 6.5625, "learning_rate": 9.147100109771856e-06, "loss": 0.91643486, "memory(GiB)": 135.49, "step": 18260, "train_speed(iter/s)": 0.203171 }, { "acc": 0.77496357, "epoch": 0.4262529345718077, "grad_norm": 6.3125, "learning_rate": 9.146044507902411e-06, "loss": 0.83246574, "memory(GiB)": 135.49, "step": 18270, "train_speed(iter/s)": 0.203227 }, { "acc": 0.7691823, "epoch": 0.4264862421440966, "grad_norm": 5.5625, "learning_rate": 9.144988314186321e-06, "loss": 0.85047874, "memory(GiB)": 135.49, "step": 18280, "train_speed(iter/s)": 0.203278 }, { "acc": 0.76436348, "epoch": 0.4267195497163855, "grad_norm": 5.5625, "learning_rate": 9.143931528774351e-06, "loss": 0.81677866, "memory(GiB)": 135.49, "step": 18290, "train_speed(iter/s)": 0.203337 }, { "acc": 0.74931517, "epoch": 0.4269528572886744, "grad_norm": 10.3125, "learning_rate": 9.14287415181736e-06, "loss": 0.91142197, "memory(GiB)": 135.49, "step": 18300, "train_speed(iter/s)": 0.203391 }, { "acc": 0.78245277, "epoch": 0.4271861648609633, "grad_norm": 6.65625, "learning_rate": 9.141816183466286e-06, "loss": 0.78849602, "memory(GiB)": 135.49, "step": 18310, "train_speed(iter/s)": 0.203449 }, { "acc": 0.77685156, "epoch": 0.4274194724332522, "grad_norm": 5.84375, "learning_rate": 9.140757623872156e-06, "loss": 0.81227131, "memory(GiB)": 135.49, "step": 18320, "train_speed(iter/s)": 0.203504 }, { "acc": 0.74109993, "epoch": 0.4276527800055411, "grad_norm": 5.25, "learning_rate": 9.139698473186079e-06, "loss": 0.93030472, "memory(GiB)": 135.49, "step": 18330, "train_speed(iter/s)": 0.203562 }, { "acc": 0.77592306, "epoch": 0.4278860875778299, "grad_norm": 6.0, "learning_rate": 9.138638731559246e-06, "loss": 0.82012711, "memory(GiB)": 135.49, "step": 18340, "train_speed(iter/s)": 0.203623 }, { "acc": 0.78633242, "epoch": 0.4281193951501188, "grad_norm": 5.96875, "learning_rate": 9.137578399142936e-06, "loss": 0.76158667, "memory(GiB)": 135.49, "step": 18350, "train_speed(iter/s)": 0.203682 }, { "acc": 0.77769513, "epoch": 0.4283527027224077, "grad_norm": 7.3125, "learning_rate": 9.136517476088513e-06, "loss": 0.81201172, "memory(GiB)": 135.49, "step": 18360, "train_speed(iter/s)": 0.20374 }, { "acc": 0.75548358, "epoch": 0.4285860102946966, "grad_norm": 6.0625, "learning_rate": 9.135455962547422e-06, "loss": 0.90527601, "memory(GiB)": 135.49, "step": 18370, "train_speed(iter/s)": 0.203798 }, { "acc": 0.76435776, "epoch": 0.4288193178669855, "grad_norm": 6.59375, "learning_rate": 9.134393858671193e-06, "loss": 0.8422596, "memory(GiB)": 135.49, "step": 18380, "train_speed(iter/s)": 0.203855 }, { "acc": 0.75541048, "epoch": 0.4290526254392744, "grad_norm": 6.375, "learning_rate": 9.13333116461144e-06, "loss": 0.89545631, "memory(GiB)": 135.49, "step": 18390, "train_speed(iter/s)": 0.203914 }, { "acc": 0.77395415, "epoch": 0.4292859330115633, "grad_norm": 5.59375, "learning_rate": 9.132267880519867e-06, "loss": 0.80533733, "memory(GiB)": 135.49, "step": 18400, "train_speed(iter/s)": 0.203973 }, { "acc": 0.76954994, "epoch": 0.4295192405838522, "grad_norm": 10.3125, "learning_rate": 9.131204006548253e-06, "loss": 0.83953362, "memory(GiB)": 135.49, "step": 18410, "train_speed(iter/s)": 0.204031 }, { "acc": 0.7732923, "epoch": 0.4297525481561411, "grad_norm": 13.25, "learning_rate": 9.130139542848468e-06, "loss": 0.83221512, "memory(GiB)": 135.49, "step": 18420, "train_speed(iter/s)": 0.204094 }, { "acc": 0.77046781, "epoch": 0.42998585572843, "grad_norm": 6.34375, "learning_rate": 9.129074489572463e-06, "loss": 0.83659897, "memory(GiB)": 135.49, "step": 18430, "train_speed(iter/s)": 0.204154 }, { "acc": 0.76340399, "epoch": 0.43021916330071885, "grad_norm": 4.28125, "learning_rate": 9.128008846872273e-06, "loss": 0.86037827, "memory(GiB)": 135.49, "step": 18440, "train_speed(iter/s)": 0.20421 }, { "acc": 0.76125965, "epoch": 0.43045247087300775, "grad_norm": 8.4375, "learning_rate": 9.126942614900021e-06, "loss": 0.8702076, "memory(GiB)": 135.49, "step": 18450, "train_speed(iter/s)": 0.204266 }, { "acc": 0.75469112, "epoch": 0.43068577844529665, "grad_norm": 10.9375, "learning_rate": 9.125875793807908e-06, "loss": 0.86662664, "memory(GiB)": 135.49, "step": 18460, "train_speed(iter/s)": 0.204323 }, { "acc": 0.75385294, "epoch": 0.43091908601758555, "grad_norm": 6.4375, "learning_rate": 9.124808383748226e-06, "loss": 0.90611935, "memory(GiB)": 135.49, "step": 18470, "train_speed(iter/s)": 0.204377 }, { "acc": 0.76246724, "epoch": 0.43115239358987445, "grad_norm": 5.3125, "learning_rate": 9.123740384873343e-06, "loss": 0.87608356, "memory(GiB)": 135.49, "step": 18480, "train_speed(iter/s)": 0.204431 }, { "acc": 0.77380362, "epoch": 0.43138570116216335, "grad_norm": 4.9375, "learning_rate": 9.122671797335719e-06, "loss": 0.82838287, "memory(GiB)": 135.49, "step": 18490, "train_speed(iter/s)": 0.204491 }, { "acc": 0.75823278, "epoch": 0.43161900873445225, "grad_norm": 5.9375, "learning_rate": 9.121602621287892e-06, "loss": 0.86983461, "memory(GiB)": 135.49, "step": 18500, "train_speed(iter/s)": 0.204547 }, { "epoch": 0.43161900873445225, "eval_acc": 0.7327330840047886, "eval_loss": 0.8439441323280334, "eval_runtime": 1263.5344, "eval_samples_per_second": 28.484, "eval_steps_per_second": 14.243, "step": 18500 }, { "acc": 0.7828023, "epoch": 0.43185231630674115, "grad_norm": 6.34375, "learning_rate": 9.120532856882491e-06, "loss": 0.80101767, "memory(GiB)": 135.49, "step": 18510, "train_speed(iter/s)": 0.201728 }, { "acc": 0.79635506, "epoch": 0.43208562387903005, "grad_norm": 5.3125, "learning_rate": 9.119462504272221e-06, "loss": 0.73111959, "memory(GiB)": 135.49, "step": 18520, "train_speed(iter/s)": 0.201782 }, { "acc": 0.74502821, "epoch": 0.43231893145131894, "grad_norm": 21.375, "learning_rate": 9.118391563609875e-06, "loss": 0.91871452, "memory(GiB)": 135.49, "step": 18530, "train_speed(iter/s)": 0.20184 }, { "acc": 0.74859357, "epoch": 0.4325522390236078, "grad_norm": 5.625, "learning_rate": 9.117320035048329e-06, "loss": 0.91543465, "memory(GiB)": 135.49, "step": 18540, "train_speed(iter/s)": 0.201895 }, { "acc": 0.78973494, "epoch": 0.4327855465958967, "grad_norm": 5.25, "learning_rate": 9.116247918740544e-06, "loss": 0.74495215, "memory(GiB)": 135.49, "step": 18550, "train_speed(iter/s)": 0.201949 }, { "acc": 0.75570974, "epoch": 0.4330188541681856, "grad_norm": 4.96875, "learning_rate": 9.115175214839565e-06, "loss": 0.90236988, "memory(GiB)": 135.49, "step": 18560, "train_speed(iter/s)": 0.202003 }, { "acc": 0.77477503, "epoch": 0.4332521617404745, "grad_norm": 6.125, "learning_rate": 9.114101923498519e-06, "loss": 0.81295376, "memory(GiB)": 135.49, "step": 18570, "train_speed(iter/s)": 0.20206 }, { "acc": 0.75810862, "epoch": 0.4334854693127634, "grad_norm": 9.3125, "learning_rate": 9.113028044870619e-06, "loss": 0.90285559, "memory(GiB)": 135.49, "step": 18580, "train_speed(iter/s)": 0.202114 }, { "acc": 0.76575303, "epoch": 0.4337187768850523, "grad_norm": 5.78125, "learning_rate": 9.11195357910916e-06, "loss": 0.84893608, "memory(GiB)": 135.49, "step": 18590, "train_speed(iter/s)": 0.202166 }, { "acc": 0.75782862, "epoch": 0.4339520844573412, "grad_norm": 5.46875, "learning_rate": 9.110878526367523e-06, "loss": 0.88154144, "memory(GiB)": 135.49, "step": 18600, "train_speed(iter/s)": 0.20222 }, { "acc": 0.7674058, "epoch": 0.4341853920296301, "grad_norm": 5.28125, "learning_rate": 9.10980288679917e-06, "loss": 0.8381402, "memory(GiB)": 135.49, "step": 18610, "train_speed(iter/s)": 0.202276 }, { "acc": 0.77663894, "epoch": 0.434418699601919, "grad_norm": 10.0625, "learning_rate": 9.10872666055765e-06, "loss": 0.8133131, "memory(GiB)": 135.49, "step": 18620, "train_speed(iter/s)": 0.202334 }, { "acc": 0.76342402, "epoch": 0.4346520071742078, "grad_norm": 7.625, "learning_rate": 9.107649847796591e-06, "loss": 0.86334209, "memory(GiB)": 135.49, "step": 18630, "train_speed(iter/s)": 0.202385 }, { "acc": 0.7635994, "epoch": 0.4348853147464967, "grad_norm": 8.125, "learning_rate": 9.10657244866971e-06, "loss": 0.86357384, "memory(GiB)": 135.49, "step": 18640, "train_speed(iter/s)": 0.202439 }, { "acc": 0.76394281, "epoch": 0.4351186223187856, "grad_norm": 5.65625, "learning_rate": 9.105494463330805e-06, "loss": 0.85699816, "memory(GiB)": 135.49, "step": 18650, "train_speed(iter/s)": 0.202495 }, { "acc": 0.76372728, "epoch": 0.4353519298910745, "grad_norm": 4.8125, "learning_rate": 9.104415891933757e-06, "loss": 0.86218567, "memory(GiB)": 135.49, "step": 18660, "train_speed(iter/s)": 0.202549 }, { "acc": 0.76563225, "epoch": 0.4355852374633634, "grad_norm": 6.0, "learning_rate": 9.103336734632536e-06, "loss": 0.84135628, "memory(GiB)": 135.49, "step": 18670, "train_speed(iter/s)": 0.202601 }, { "acc": 0.77296343, "epoch": 0.4358185450356523, "grad_norm": 6.09375, "learning_rate": 9.102256991581185e-06, "loss": 0.80417957, "memory(GiB)": 135.49, "step": 18680, "train_speed(iter/s)": 0.202658 }, { "acc": 0.75366073, "epoch": 0.4360518526079412, "grad_norm": 18.125, "learning_rate": 9.101176662933842e-06, "loss": 0.87598476, "memory(GiB)": 135.49, "step": 18690, "train_speed(iter/s)": 0.202715 }, { "acc": 0.75029669, "epoch": 0.4362851601802301, "grad_norm": 7.53125, "learning_rate": 9.10009574884472e-06, "loss": 0.90971479, "memory(GiB)": 135.49, "step": 18700, "train_speed(iter/s)": 0.20277 }, { "acc": 0.76485691, "epoch": 0.436518467752519, "grad_norm": 7.09375, "learning_rate": 9.099014249468124e-06, "loss": 0.86283178, "memory(GiB)": 135.49, "step": 18710, "train_speed(iter/s)": 0.202823 }, { "acc": 0.76640668, "epoch": 0.4367517753248079, "grad_norm": 5.15625, "learning_rate": 9.097932164958432e-06, "loss": 0.84142838, "memory(GiB)": 135.49, "step": 18720, "train_speed(iter/s)": 0.202882 }, { "acc": 0.77220273, "epoch": 0.43698508289709675, "grad_norm": 6.46875, "learning_rate": 9.096849495470113e-06, "loss": 0.84196415, "memory(GiB)": 135.49, "step": 18730, "train_speed(iter/s)": 0.202941 }, { "acc": 0.75912733, "epoch": 0.43721839046938565, "grad_norm": 5.59375, "learning_rate": 9.095766241157721e-06, "loss": 0.86189699, "memory(GiB)": 135.49, "step": 18740, "train_speed(iter/s)": 0.202998 }, { "acc": 0.77064624, "epoch": 0.43745169804167455, "grad_norm": 6.125, "learning_rate": 9.094682402175887e-06, "loss": 0.81884069, "memory(GiB)": 135.49, "step": 18750, "train_speed(iter/s)": 0.203052 }, { "acc": 0.76591458, "epoch": 0.43768500561396345, "grad_norm": 8.4375, "learning_rate": 9.093597978679329e-06, "loss": 0.85175819, "memory(GiB)": 135.49, "step": 18760, "train_speed(iter/s)": 0.20311 }, { "acc": 0.76572514, "epoch": 0.43791831318625235, "grad_norm": 7.96875, "learning_rate": 9.09251297082285e-06, "loss": 0.85351419, "memory(GiB)": 135.49, "step": 18770, "train_speed(iter/s)": 0.203167 }, { "acc": 0.75096698, "epoch": 0.43815162075854125, "grad_norm": 5.1875, "learning_rate": 9.091427378761333e-06, "loss": 0.91916218, "memory(GiB)": 135.49, "step": 18780, "train_speed(iter/s)": 0.203225 }, { "acc": 0.77040977, "epoch": 0.43838492833083015, "grad_norm": 6.125, "learning_rate": 9.090341202649746e-06, "loss": 0.84167595, "memory(GiB)": 135.49, "step": 18790, "train_speed(iter/s)": 0.203281 }, { "acc": 0.77248788, "epoch": 0.43861823590311905, "grad_norm": 5.8125, "learning_rate": 9.08925444264314e-06, "loss": 0.8239686, "memory(GiB)": 135.49, "step": 18800, "train_speed(iter/s)": 0.203334 }, { "acc": 0.76245975, "epoch": 0.43885154347540795, "grad_norm": 6.4375, "learning_rate": 9.088167098896652e-06, "loss": 0.8688695, "memory(GiB)": 135.49, "step": 18810, "train_speed(iter/s)": 0.20339 }, { "acc": 0.76797724, "epoch": 0.4390848510476968, "grad_norm": 11.1875, "learning_rate": 9.087079171565496e-06, "loss": 0.85294209, "memory(GiB)": 135.49, "step": 18820, "train_speed(iter/s)": 0.203449 }, { "acc": 0.7746295, "epoch": 0.4393181586199857, "grad_norm": 4.9375, "learning_rate": 9.085990660804976e-06, "loss": 0.81339827, "memory(GiB)": 135.49, "step": 18830, "train_speed(iter/s)": 0.203503 }, { "acc": 0.75047617, "epoch": 0.4395514661922746, "grad_norm": 8.6875, "learning_rate": 9.084901566770476e-06, "loss": 0.89474802, "memory(GiB)": 135.49, "step": 18840, "train_speed(iter/s)": 0.203557 }, { "acc": 0.78892126, "epoch": 0.4397847737645635, "grad_norm": 4.90625, "learning_rate": 9.083811889617467e-06, "loss": 0.73405313, "memory(GiB)": 135.49, "step": 18850, "train_speed(iter/s)": 0.203611 }, { "acc": 0.75391726, "epoch": 0.4400180813368524, "grad_norm": 7.09375, "learning_rate": 9.082721629501494e-06, "loss": 0.8882412, "memory(GiB)": 135.49, "step": 18860, "train_speed(iter/s)": 0.203668 }, { "acc": 0.76355538, "epoch": 0.4402513889091413, "grad_norm": 8.4375, "learning_rate": 9.081630786578195e-06, "loss": 0.85170126, "memory(GiB)": 135.49, "step": 18870, "train_speed(iter/s)": 0.203723 }, { "acc": 0.7691967, "epoch": 0.4404846964814302, "grad_norm": 7.375, "learning_rate": 9.080539361003288e-06, "loss": 0.82847366, "memory(GiB)": 135.49, "step": 18880, "train_speed(iter/s)": 0.20378 }, { "acc": 0.74100342, "epoch": 0.4407180040537191, "grad_norm": 5.0, "learning_rate": 9.079447352932571e-06, "loss": 0.96188803, "memory(GiB)": 135.49, "step": 18890, "train_speed(iter/s)": 0.203835 }, { "acc": 0.76387348, "epoch": 0.440951311626008, "grad_norm": 6.25, "learning_rate": 9.078354762521931e-06, "loss": 0.87446871, "memory(GiB)": 135.49, "step": 18900, "train_speed(iter/s)": 0.203891 }, { "acc": 0.76383829, "epoch": 0.4411846191982969, "grad_norm": 5.6875, "learning_rate": 9.077261589927333e-06, "loss": 0.8664711, "memory(GiB)": 135.49, "step": 18910, "train_speed(iter/s)": 0.203947 }, { "acc": 0.7568862, "epoch": 0.4414179267705857, "grad_norm": 7.125, "learning_rate": 9.076167835304828e-06, "loss": 0.88744144, "memory(GiB)": 135.49, "step": 18920, "train_speed(iter/s)": 0.204001 }, { "acc": 0.79882202, "epoch": 0.4416512343428746, "grad_norm": 5.8125, "learning_rate": 9.075073498810547e-06, "loss": 0.73312902, "memory(GiB)": 135.49, "step": 18930, "train_speed(iter/s)": 0.204056 }, { "acc": 0.78004942, "epoch": 0.4418845419151635, "grad_norm": 5.0625, "learning_rate": 9.073978580600709e-06, "loss": 0.79119539, "memory(GiB)": 135.49, "step": 18940, "train_speed(iter/s)": 0.204106 }, { "acc": 0.75729127, "epoch": 0.4421178494874524, "grad_norm": 5.9375, "learning_rate": 9.072883080831611e-06, "loss": 0.87996006, "memory(GiB)": 135.49, "step": 18950, "train_speed(iter/s)": 0.204162 }, { "acc": 0.75236416, "epoch": 0.4423511570597413, "grad_norm": 5.75, "learning_rate": 9.071786999659638e-06, "loss": 0.89187069, "memory(GiB)": 135.49, "step": 18960, "train_speed(iter/s)": 0.20422 }, { "acc": 0.75854836, "epoch": 0.4425844646320302, "grad_norm": 4.78125, "learning_rate": 9.070690337241252e-06, "loss": 0.87891855, "memory(GiB)": 135.49, "step": 18970, "train_speed(iter/s)": 0.204276 }, { "acc": 0.77216811, "epoch": 0.4428177722043191, "grad_norm": 11.875, "learning_rate": 9.069593093733004e-06, "loss": 0.82213058, "memory(GiB)": 135.49, "step": 18980, "train_speed(iter/s)": 0.20433 }, { "acc": 0.75923371, "epoch": 0.443051079776608, "grad_norm": 7.71875, "learning_rate": 9.068495269291524e-06, "loss": 0.8817112, "memory(GiB)": 135.49, "step": 18990, "train_speed(iter/s)": 0.204389 }, { "acc": 0.74839468, "epoch": 0.4432843873488969, "grad_norm": 5.09375, "learning_rate": 9.067396864073527e-06, "loss": 0.9162468, "memory(GiB)": 135.49, "step": 19000, "train_speed(iter/s)": 0.204446 }, { "epoch": 0.4432843873488969, "eval_acc": 0.7328047202475645, "eval_loss": 0.8435400724411011, "eval_runtime": 1264.1235, "eval_samples_per_second": 28.471, "eval_steps_per_second": 14.236, "step": 19000 }, { "acc": 0.7712163, "epoch": 0.4435176949211858, "grad_norm": 5.59375, "learning_rate": 9.066297878235808e-06, "loss": 0.81478872, "memory(GiB)": 135.49, "step": 19010, "train_speed(iter/s)": 0.201708 }, { "acc": 0.78462982, "epoch": 0.44375100249347466, "grad_norm": 4.53125, "learning_rate": 9.065198311935248e-06, "loss": 0.76670833, "memory(GiB)": 135.49, "step": 19020, "train_speed(iter/s)": 0.20176 }, { "acc": 0.76604686, "epoch": 0.44398431006576355, "grad_norm": 8.9375, "learning_rate": 9.06409816532881e-06, "loss": 0.85616417, "memory(GiB)": 135.49, "step": 19030, "train_speed(iter/s)": 0.201818 }, { "acc": 0.76752548, "epoch": 0.44421761763805245, "grad_norm": 5.8125, "learning_rate": 9.06299743857354e-06, "loss": 0.84741793, "memory(GiB)": 135.49, "step": 19040, "train_speed(iter/s)": 0.201873 }, { "acc": 0.7702745, "epoch": 0.44445092521034135, "grad_norm": 5.1875, "learning_rate": 9.061896131826566e-06, "loss": 0.84795694, "memory(GiB)": 135.49, "step": 19050, "train_speed(iter/s)": 0.201926 }, { "acc": 0.75156121, "epoch": 0.44468423278263025, "grad_norm": 7.59375, "learning_rate": 9.0607942452451e-06, "loss": 0.91436415, "memory(GiB)": 135.49, "step": 19060, "train_speed(iter/s)": 0.201981 }, { "acc": 0.75817838, "epoch": 0.44491754035491915, "grad_norm": 6.09375, "learning_rate": 9.059691778986433e-06, "loss": 0.87397652, "memory(GiB)": 135.49, "step": 19070, "train_speed(iter/s)": 0.202039 }, { "acc": 0.76608419, "epoch": 0.44515084792720805, "grad_norm": 10.75, "learning_rate": 9.058588733207945e-06, "loss": 0.83239708, "memory(GiB)": 135.49, "step": 19080, "train_speed(iter/s)": 0.202096 }, { "acc": 0.74931364, "epoch": 0.44538415549949695, "grad_norm": 5.9375, "learning_rate": 9.057485108067094e-06, "loss": 0.90924816, "memory(GiB)": 135.49, "step": 19090, "train_speed(iter/s)": 0.202151 }, { "acc": 0.76216726, "epoch": 0.44561746307178585, "grad_norm": 8.75, "learning_rate": 9.056380903721424e-06, "loss": 0.87511959, "memory(GiB)": 135.49, "step": 19100, "train_speed(iter/s)": 0.202205 }, { "acc": 0.75460882, "epoch": 0.4458507706440747, "grad_norm": 6.40625, "learning_rate": 9.055276120328557e-06, "loss": 0.89376383, "memory(GiB)": 135.49, "step": 19110, "train_speed(iter/s)": 0.202259 }, { "acc": 0.77587285, "epoch": 0.4460840782163636, "grad_norm": 5.3125, "learning_rate": 9.054170758046204e-06, "loss": 0.79983253, "memory(GiB)": 135.49, "step": 19120, "train_speed(iter/s)": 0.202315 }, { "acc": 0.75755081, "epoch": 0.4463173857886525, "grad_norm": 4.875, "learning_rate": 9.05306481703215e-06, "loss": 0.88695526, "memory(GiB)": 135.49, "step": 19130, "train_speed(iter/s)": 0.20237 }, { "acc": 0.76732864, "epoch": 0.4465506933609414, "grad_norm": 6.34375, "learning_rate": 9.051958297444272e-06, "loss": 0.85905571, "memory(GiB)": 135.49, "step": 19140, "train_speed(iter/s)": 0.202426 }, { "acc": 0.76415334, "epoch": 0.4467840009332303, "grad_norm": 7.71875, "learning_rate": 9.050851199440524e-06, "loss": 0.84318409, "memory(GiB)": 135.49, "step": 19150, "train_speed(iter/s)": 0.202483 }, { "acc": 0.77990479, "epoch": 0.4470173085055192, "grad_norm": 5.34375, "learning_rate": 9.049743523178945e-06, "loss": 0.80101013, "memory(GiB)": 135.49, "step": 19160, "train_speed(iter/s)": 0.202538 }, { "acc": 0.77943311, "epoch": 0.4472506160778081, "grad_norm": 6.53125, "learning_rate": 9.048635268817653e-06, "loss": 0.80418644, "memory(GiB)": 135.49, "step": 19170, "train_speed(iter/s)": 0.202596 }, { "acc": 0.75144119, "epoch": 0.447483923650097, "grad_norm": 24.125, "learning_rate": 9.047526436514854e-06, "loss": 0.8890274, "memory(GiB)": 135.49, "step": 19180, "train_speed(iter/s)": 0.202652 }, { "acc": 0.77117901, "epoch": 0.4477172312223859, "grad_norm": 6.59375, "learning_rate": 9.04641702642883e-06, "loss": 0.83829861, "memory(GiB)": 135.49, "step": 19190, "train_speed(iter/s)": 0.202711 }, { "acc": 0.7628943, "epoch": 0.4479505387946748, "grad_norm": 5.40625, "learning_rate": 9.045307038717954e-06, "loss": 0.86710186, "memory(GiB)": 135.49, "step": 19200, "train_speed(iter/s)": 0.202767 }, { "acc": 0.77364559, "epoch": 0.4481838463669636, "grad_norm": 6.9375, "learning_rate": 9.044196473540672e-06, "loss": 0.82908278, "memory(GiB)": 135.49, "step": 19210, "train_speed(iter/s)": 0.202818 }, { "acc": 0.7579155, "epoch": 0.4484171539392525, "grad_norm": 5.34375, "learning_rate": 9.043085331055516e-06, "loss": 0.89595413, "memory(GiB)": 135.49, "step": 19220, "train_speed(iter/s)": 0.202874 }, { "acc": 0.77179279, "epoch": 0.4486504615115414, "grad_norm": 4.71875, "learning_rate": 9.041973611421106e-06, "loss": 0.83074512, "memory(GiB)": 135.49, "step": 19230, "train_speed(iter/s)": 0.20293 }, { "acc": 0.76394291, "epoch": 0.4488837690838303, "grad_norm": 6.5625, "learning_rate": 9.040861314796137e-06, "loss": 0.8574007, "memory(GiB)": 135.49, "step": 19240, "train_speed(iter/s)": 0.202986 }, { "acc": 0.78378825, "epoch": 0.4491170766561192, "grad_norm": 5.25, "learning_rate": 9.039748441339389e-06, "loss": 0.78306212, "memory(GiB)": 135.49, "step": 19250, "train_speed(iter/s)": 0.20304 }, { "acc": 0.75171242, "epoch": 0.4493503842284081, "grad_norm": 6.375, "learning_rate": 9.038634991209725e-06, "loss": 0.91204195, "memory(GiB)": 135.49, "step": 19260, "train_speed(iter/s)": 0.203093 }, { "acc": 0.76793242, "epoch": 0.449583691800697, "grad_norm": 8.1875, "learning_rate": 9.03752096456609e-06, "loss": 0.85409517, "memory(GiB)": 135.49, "step": 19270, "train_speed(iter/s)": 0.203148 }, { "acc": 0.76957102, "epoch": 0.4498169993729859, "grad_norm": 5.4375, "learning_rate": 9.036406361567506e-06, "loss": 0.81625576, "memory(GiB)": 135.49, "step": 19280, "train_speed(iter/s)": 0.203203 }, { "acc": 0.7609108, "epoch": 0.4500503069452748, "grad_norm": 8.6875, "learning_rate": 9.035291182373092e-06, "loss": 0.86476421, "memory(GiB)": 135.49, "step": 19290, "train_speed(iter/s)": 0.203257 }, { "acc": 0.76366005, "epoch": 0.4502836145175637, "grad_norm": 7.3125, "learning_rate": 9.03417542714203e-06, "loss": 0.85522423, "memory(GiB)": 135.49, "step": 19300, "train_speed(iter/s)": 0.20331 }, { "acc": 0.75666056, "epoch": 0.45051692208985256, "grad_norm": 5.09375, "learning_rate": 9.033059096033598e-06, "loss": 0.87731857, "memory(GiB)": 135.49, "step": 19310, "train_speed(iter/s)": 0.203365 }, { "acc": 0.7757453, "epoch": 0.45075022966214146, "grad_norm": 4.4375, "learning_rate": 9.031942189207154e-06, "loss": 0.83691645, "memory(GiB)": 135.49, "step": 19320, "train_speed(iter/s)": 0.203414 }, { "acc": 0.74749355, "epoch": 0.45098353723443035, "grad_norm": 9.125, "learning_rate": 9.030824706822132e-06, "loss": 0.91036758, "memory(GiB)": 135.49, "step": 19330, "train_speed(iter/s)": 0.203465 }, { "acc": 0.75945101, "epoch": 0.45121684480671925, "grad_norm": 5.65625, "learning_rate": 9.029706649038055e-06, "loss": 0.87830038, "memory(GiB)": 135.49, "step": 19340, "train_speed(iter/s)": 0.203518 }, { "acc": 0.77404127, "epoch": 0.45145015237900815, "grad_norm": 8.125, "learning_rate": 9.028588016014524e-06, "loss": 0.81739044, "memory(GiB)": 135.49, "step": 19350, "train_speed(iter/s)": 0.203574 }, { "acc": 0.78547907, "epoch": 0.45168345995129705, "grad_norm": 7.625, "learning_rate": 9.027468807911223e-06, "loss": 0.78383422, "memory(GiB)": 135.49, "step": 19360, "train_speed(iter/s)": 0.203625 }, { "acc": 0.7835288, "epoch": 0.45191676752358595, "grad_norm": 6.28125, "learning_rate": 9.026349024887921e-06, "loss": 0.78586035, "memory(GiB)": 135.49, "step": 19370, "train_speed(iter/s)": 0.203677 }, { "acc": 0.78034372, "epoch": 0.45215007509587485, "grad_norm": 5.96875, "learning_rate": 9.025228667104465e-06, "loss": 0.79239817, "memory(GiB)": 135.49, "step": 19380, "train_speed(iter/s)": 0.203732 }, { "acc": 0.75985041, "epoch": 0.45238338266816375, "grad_norm": 5.84375, "learning_rate": 9.024107734720786e-06, "loss": 0.8920948, "memory(GiB)": 135.49, "step": 19390, "train_speed(iter/s)": 0.203781 }, { "acc": 0.76610346, "epoch": 0.4526166902404526, "grad_norm": 9.4375, "learning_rate": 9.022986227896898e-06, "loss": 0.83769817, "memory(GiB)": 135.49, "step": 19400, "train_speed(iter/s)": 0.203835 }, { "acc": 0.77150993, "epoch": 0.4528499978127415, "grad_norm": 5.15625, "learning_rate": 9.021864146792894e-06, "loss": 0.81651821, "memory(GiB)": 135.49, "step": 19410, "train_speed(iter/s)": 0.203893 }, { "acc": 0.75417376, "epoch": 0.4530833053850304, "grad_norm": 3.953125, "learning_rate": 9.02074149156895e-06, "loss": 0.89292383, "memory(GiB)": 135.49, "step": 19420, "train_speed(iter/s)": 0.203948 }, { "acc": 0.75323753, "epoch": 0.4533166129573193, "grad_norm": 5.78125, "learning_rate": 9.019618262385328e-06, "loss": 0.87614441, "memory(GiB)": 135.49, "step": 19430, "train_speed(iter/s)": 0.204003 }, { "acc": 0.78156538, "epoch": 0.4535499205296082, "grad_norm": 5.25, "learning_rate": 9.018494459402365e-06, "loss": 0.76866255, "memory(GiB)": 135.49, "step": 19440, "train_speed(iter/s)": 0.204057 }, { "acc": 0.75708208, "epoch": 0.4537832281018971, "grad_norm": 8.5, "learning_rate": 9.017370082780485e-06, "loss": 0.87950706, "memory(GiB)": 135.49, "step": 19450, "train_speed(iter/s)": 0.204111 }, { "acc": 0.78062558, "epoch": 0.454016535674186, "grad_norm": 6.8125, "learning_rate": 9.016245132680195e-06, "loss": 0.78062654, "memory(GiB)": 135.49, "step": 19460, "train_speed(iter/s)": 0.204163 }, { "acc": 0.78381987, "epoch": 0.4542498432464749, "grad_norm": 5.46875, "learning_rate": 9.015119609262078e-06, "loss": 0.77107534, "memory(GiB)": 135.49, "step": 19470, "train_speed(iter/s)": 0.204217 }, { "acc": 0.7501627, "epoch": 0.4544831508187638, "grad_norm": 5.71875, "learning_rate": 9.013993512686803e-06, "loss": 0.91510344, "memory(GiB)": 135.49, "step": 19480, "train_speed(iter/s)": 0.20427 }, { "acc": 0.78457613, "epoch": 0.4547164583910527, "grad_norm": 8.3125, "learning_rate": 9.01286684311512e-06, "loss": 0.76613955, "memory(GiB)": 135.49, "step": 19490, "train_speed(iter/s)": 0.204314 }, { "acc": 0.75763526, "epoch": 0.4549497659633415, "grad_norm": 5.84375, "learning_rate": 9.011739600707862e-06, "loss": 0.87946453, "memory(GiB)": 135.49, "step": 19500, "train_speed(iter/s)": 0.20437 }, { "epoch": 0.4549497659633415, "eval_acc": 0.7329923620276284, "eval_loss": 0.8430914878845215, "eval_runtime": 1263.1507, "eval_samples_per_second": 28.493, "eval_steps_per_second": 14.247, "step": 19500 }, { "acc": 0.78182864, "epoch": 0.4551830735356304, "grad_norm": 6.53125, "learning_rate": 9.01061178562594e-06, "loss": 0.78330822, "memory(GiB)": 135.49, "step": 19510, "train_speed(iter/s)": 0.2017 }, { "acc": 0.77719784, "epoch": 0.4554163811079193, "grad_norm": 7.9375, "learning_rate": 9.009483398030353e-06, "loss": 0.79287238, "memory(GiB)": 135.49, "step": 19520, "train_speed(iter/s)": 0.201751 }, { "acc": 0.77880406, "epoch": 0.4556496886802082, "grad_norm": 5.78125, "learning_rate": 9.008354438082173e-06, "loss": 0.80105104, "memory(GiB)": 135.49, "step": 19530, "train_speed(iter/s)": 0.201806 }, { "acc": 0.75334883, "epoch": 0.4558829962524971, "grad_norm": 8.3125, "learning_rate": 9.007224905942562e-06, "loss": 0.88375731, "memory(GiB)": 135.49, "step": 19540, "train_speed(iter/s)": 0.20186 }, { "acc": 0.77029333, "epoch": 0.456116303824786, "grad_norm": 4.9375, "learning_rate": 9.00609480177276e-06, "loss": 0.8369318, "memory(GiB)": 135.49, "step": 19550, "train_speed(iter/s)": 0.201916 }, { "acc": 0.76441755, "epoch": 0.4563496113970749, "grad_norm": 5.6875, "learning_rate": 9.00496412573409e-06, "loss": 0.85878811, "memory(GiB)": 135.49, "step": 19560, "train_speed(iter/s)": 0.201966 }, { "acc": 0.76622739, "epoch": 0.4565829189693638, "grad_norm": 5.75, "learning_rate": 9.003832877987952e-06, "loss": 0.86100483, "memory(GiB)": 135.49, "step": 19570, "train_speed(iter/s)": 0.20202 }, { "acc": 0.77262421, "epoch": 0.4568162265416527, "grad_norm": 4.03125, "learning_rate": 9.002701058695836e-06, "loss": 0.81861858, "memory(GiB)": 135.49, "step": 19580, "train_speed(iter/s)": 0.202077 }, { "acc": 0.76407728, "epoch": 0.45704953411394156, "grad_norm": 5.5625, "learning_rate": 9.001568668019306e-06, "loss": 0.84632664, "memory(GiB)": 135.49, "step": 19590, "train_speed(iter/s)": 0.20213 }, { "acc": 0.77121639, "epoch": 0.45728284168623046, "grad_norm": 5.125, "learning_rate": 9.000435706120011e-06, "loss": 0.81092157, "memory(GiB)": 135.49, "step": 19600, "train_speed(iter/s)": 0.202187 }, { "acc": 0.76499758, "epoch": 0.45751614925851936, "grad_norm": 5.21875, "learning_rate": 8.999302173159681e-06, "loss": 0.8503273, "memory(GiB)": 135.49, "step": 19610, "train_speed(iter/s)": 0.202243 }, { "acc": 0.75560789, "epoch": 0.45774945683080825, "grad_norm": 6.53125, "learning_rate": 8.998168069300128e-06, "loss": 0.88104801, "memory(GiB)": 135.49, "step": 19620, "train_speed(iter/s)": 0.202295 }, { "acc": 0.76388116, "epoch": 0.45798276440309715, "grad_norm": 5.75, "learning_rate": 8.997033394703246e-06, "loss": 0.85261612, "memory(GiB)": 135.49, "step": 19630, "train_speed(iter/s)": 0.202348 }, { "acc": 0.75891514, "epoch": 0.45821607197538605, "grad_norm": 6.5625, "learning_rate": 8.995898149531005e-06, "loss": 0.85911217, "memory(GiB)": 135.49, "step": 19640, "train_speed(iter/s)": 0.2024 }, { "acc": 0.74233913, "epoch": 0.45844937954767495, "grad_norm": 6.25, "learning_rate": 8.994762333945465e-06, "loss": 0.9233675, "memory(GiB)": 135.49, "step": 19650, "train_speed(iter/s)": 0.202456 }, { "acc": 0.7639596, "epoch": 0.45868268711996385, "grad_norm": 5.65625, "learning_rate": 8.993625948108764e-06, "loss": 0.86031227, "memory(GiB)": 135.49, "step": 19660, "train_speed(iter/s)": 0.202514 }, { "acc": 0.76620531, "epoch": 0.45891599469225275, "grad_norm": 4.96875, "learning_rate": 8.992488992183116e-06, "loss": 0.84733181, "memory(GiB)": 135.49, "step": 19670, "train_speed(iter/s)": 0.202568 }, { "acc": 0.77153149, "epoch": 0.45914930226454165, "grad_norm": 4.9375, "learning_rate": 8.991351466330827e-06, "loss": 0.82978382, "memory(GiB)": 135.49, "step": 19680, "train_speed(iter/s)": 0.202623 }, { "acc": 0.77499914, "epoch": 0.4593826098368305, "grad_norm": 7.8125, "learning_rate": 8.990213370714274e-06, "loss": 0.82785435, "memory(GiB)": 135.49, "step": 19690, "train_speed(iter/s)": 0.202676 }, { "acc": 0.74658461, "epoch": 0.4596159174091194, "grad_norm": 8.0625, "learning_rate": 8.989074705495921e-06, "loss": 0.93300705, "memory(GiB)": 135.49, "step": 19700, "train_speed(iter/s)": 0.202729 }, { "acc": 0.7492507, "epoch": 0.4598492249814083, "grad_norm": 6.0625, "learning_rate": 8.987935470838315e-06, "loss": 0.90153751, "memory(GiB)": 135.49, "step": 19710, "train_speed(iter/s)": 0.202769 }, { "acc": 0.77228775, "epoch": 0.4600825325536972, "grad_norm": 6.8125, "learning_rate": 8.986795666904077e-06, "loss": 0.84622583, "memory(GiB)": 135.49, "step": 19720, "train_speed(iter/s)": 0.202825 }, { "acc": 0.76305456, "epoch": 0.4603158401259861, "grad_norm": 5.53125, "learning_rate": 8.985655293855917e-06, "loss": 0.87022467, "memory(GiB)": 135.49, "step": 19730, "train_speed(iter/s)": 0.20288 }, { "acc": 0.75789738, "epoch": 0.460549147698275, "grad_norm": 4.96875, "learning_rate": 8.98451435185662e-06, "loss": 0.90890198, "memory(GiB)": 135.49, "step": 19740, "train_speed(iter/s)": 0.20293 }, { "acc": 0.77202063, "epoch": 0.4607824552705639, "grad_norm": 8.5, "learning_rate": 8.983372841069059e-06, "loss": 0.81152611, "memory(GiB)": 135.49, "step": 19750, "train_speed(iter/s)": 0.202986 }, { "acc": 0.75363541, "epoch": 0.4610157628428528, "grad_norm": 8.3125, "learning_rate": 8.98223076165618e-06, "loss": 0.90963354, "memory(GiB)": 135.49, "step": 19760, "train_speed(iter/s)": 0.203042 }, { "acc": 0.77265716, "epoch": 0.4612490704151417, "grad_norm": 9.0, "learning_rate": 8.981088113781018e-06, "loss": 0.83267965, "memory(GiB)": 135.49, "step": 19770, "train_speed(iter/s)": 0.203099 }, { "acc": 0.76287947, "epoch": 0.4614823779874306, "grad_norm": 8.0625, "learning_rate": 8.979944897606685e-06, "loss": 0.85755424, "memory(GiB)": 135.49, "step": 19780, "train_speed(iter/s)": 0.203154 }, { "acc": 0.77385197, "epoch": 0.4617156855597194, "grad_norm": 5.90625, "learning_rate": 8.978801113296371e-06, "loss": 0.82074375, "memory(GiB)": 135.49, "step": 19790, "train_speed(iter/s)": 0.203206 }, { "acc": 0.78009782, "epoch": 0.4619489931320083, "grad_norm": 4.6875, "learning_rate": 8.977656761013357e-06, "loss": 0.79236083, "memory(GiB)": 135.49, "step": 19800, "train_speed(iter/s)": 0.203259 }, { "acc": 0.7615097, "epoch": 0.4621823007042972, "grad_norm": 25.875, "learning_rate": 8.976511840920994e-06, "loss": 0.85614433, "memory(GiB)": 135.49, "step": 19810, "train_speed(iter/s)": 0.203309 }, { "acc": 0.75433359, "epoch": 0.4624156082765861, "grad_norm": 6.0, "learning_rate": 8.975366353182721e-06, "loss": 0.90441046, "memory(GiB)": 135.49, "step": 19820, "train_speed(iter/s)": 0.203365 }, { "acc": 0.77280827, "epoch": 0.462648915848875, "grad_norm": 6.0625, "learning_rate": 8.974220297962058e-06, "loss": 0.80867844, "memory(GiB)": 135.49, "step": 19830, "train_speed(iter/s)": 0.203422 }, { "acc": 0.77420235, "epoch": 0.4628822234211639, "grad_norm": 8.0, "learning_rate": 8.973073675422602e-06, "loss": 0.81556702, "memory(GiB)": 135.49, "step": 19840, "train_speed(iter/s)": 0.203478 }, { "acc": 0.79745798, "epoch": 0.4631155309934528, "grad_norm": 5.8125, "learning_rate": 8.97192648572803e-06, "loss": 0.73720493, "memory(GiB)": 135.49, "step": 19850, "train_speed(iter/s)": 0.20353 }, { "acc": 0.75087461, "epoch": 0.4633488385657417, "grad_norm": 5.71875, "learning_rate": 8.970778729042109e-06, "loss": 0.90252209, "memory(GiB)": 135.49, "step": 19860, "train_speed(iter/s)": 0.203582 }, { "acc": 0.7491725, "epoch": 0.4635821461380306, "grad_norm": 6.59375, "learning_rate": 8.969630405528675e-06, "loss": 0.93051071, "memory(GiB)": 135.49, "step": 19870, "train_speed(iter/s)": 0.203636 }, { "acc": 0.77626762, "epoch": 0.46381545371031946, "grad_norm": 7.625, "learning_rate": 8.968481515351656e-06, "loss": 0.81003628, "memory(GiB)": 135.49, "step": 19880, "train_speed(iter/s)": 0.203685 }, { "acc": 0.77402501, "epoch": 0.46404876128260836, "grad_norm": 6.1875, "learning_rate": 8.967332058675054e-06, "loss": 0.79340954, "memory(GiB)": 135.49, "step": 19890, "train_speed(iter/s)": 0.203739 }, { "acc": 0.75556808, "epoch": 0.46428206885489726, "grad_norm": 6.0625, "learning_rate": 8.96618203566295e-06, "loss": 0.87985764, "memory(GiB)": 135.49, "step": 19900, "train_speed(iter/s)": 0.203793 }, { "acc": 0.78584976, "epoch": 0.46451537642718616, "grad_norm": 4.5, "learning_rate": 8.965031446479516e-06, "loss": 0.77245536, "memory(GiB)": 135.49, "step": 19910, "train_speed(iter/s)": 0.203848 }, { "acc": 0.74681711, "epoch": 0.46474868399947505, "grad_norm": 7.28125, "learning_rate": 8.963880291288992e-06, "loss": 0.92305603, "memory(GiB)": 135.49, "step": 19920, "train_speed(iter/s)": 0.2039 }, { "acc": 0.75608311, "epoch": 0.46498199157176395, "grad_norm": 5.25, "learning_rate": 8.96272857025571e-06, "loss": 0.8719203, "memory(GiB)": 135.49, "step": 19930, "train_speed(iter/s)": 0.203952 }, { "acc": 0.76271238, "epoch": 0.46521529914405285, "grad_norm": 6.96875, "learning_rate": 8.961576283544076e-06, "loss": 0.87346325, "memory(GiB)": 135.49, "step": 19940, "train_speed(iter/s)": 0.204007 }, { "acc": 0.77282944, "epoch": 0.46544860671634175, "grad_norm": 5.25, "learning_rate": 8.960423431318576e-06, "loss": 0.80906467, "memory(GiB)": 135.49, "step": 19950, "train_speed(iter/s)": 0.204062 }, { "acc": 0.76668358, "epoch": 0.46568191428863065, "grad_norm": 7.53125, "learning_rate": 8.959270013743784e-06, "loss": 0.85535049, "memory(GiB)": 135.49, "step": 19960, "train_speed(iter/s)": 0.204114 }, { "acc": 0.75437479, "epoch": 0.46591522186091955, "grad_norm": 6.34375, "learning_rate": 8.958116030984347e-06, "loss": 0.90233879, "memory(GiB)": 135.49, "step": 19970, "train_speed(iter/s)": 0.204167 }, { "acc": 0.7746707, "epoch": 0.4661485294332084, "grad_norm": 6.71875, "learning_rate": 8.956961483204996e-06, "loss": 0.80184937, "memory(GiB)": 135.49, "step": 19980, "train_speed(iter/s)": 0.204222 }, { "acc": 0.75822773, "epoch": 0.4663818370054973, "grad_norm": 8.6875, "learning_rate": 8.955806370570543e-06, "loss": 0.89195271, "memory(GiB)": 135.49, "step": 19990, "train_speed(iter/s)": 0.204274 }, { "acc": 0.75893321, "epoch": 0.4666151445777862, "grad_norm": 6.34375, "learning_rate": 8.954650693245882e-06, "loss": 0.86402254, "memory(GiB)": 135.49, "step": 20000, "train_speed(iter/s)": 0.204326 }, { "epoch": 0.4666151445777862, "eval_acc": 0.732964449707808, "eval_loss": 0.8425295948982239, "eval_runtime": 1263.4329, "eval_samples_per_second": 28.487, "eval_steps_per_second": 14.244, "step": 20000 }, { "acc": 0.75909166, "epoch": 0.4668484521500751, "grad_norm": 7.0, "learning_rate": 8.953494451395979e-06, "loss": 0.85393572, "memory(GiB)": 135.49, "step": 20010, "train_speed(iter/s)": 0.201722 }, { "acc": 0.78142672, "epoch": 0.467081759722364, "grad_norm": 4.4375, "learning_rate": 8.952337645185894e-06, "loss": 0.79034615, "memory(GiB)": 135.49, "step": 20020, "train_speed(iter/s)": 0.201774 }, { "acc": 0.76636105, "epoch": 0.4673150672946529, "grad_norm": 6.0625, "learning_rate": 8.951180274780758e-06, "loss": 0.82435207, "memory(GiB)": 135.49, "step": 20030, "train_speed(iter/s)": 0.201827 }, { "acc": 0.76763597, "epoch": 0.4675483748669418, "grad_norm": 6.3125, "learning_rate": 8.950022340345786e-06, "loss": 0.82885942, "memory(GiB)": 135.49, "step": 20040, "train_speed(iter/s)": 0.201881 }, { "acc": 0.76106687, "epoch": 0.4677816824392307, "grad_norm": 8.125, "learning_rate": 8.948863842046272e-06, "loss": 0.85098438, "memory(GiB)": 135.49, "step": 20050, "train_speed(iter/s)": 0.201934 }, { "acc": 0.76142969, "epoch": 0.4680149900115196, "grad_norm": 4.5625, "learning_rate": 8.947704780047593e-06, "loss": 0.8804718, "memory(GiB)": 135.49, "step": 20060, "train_speed(iter/s)": 0.201985 }, { "acc": 0.79294791, "epoch": 0.4682482975838085, "grad_norm": 3.984375, "learning_rate": 8.946545154515201e-06, "loss": 0.74809256, "memory(GiB)": 135.49, "step": 20070, "train_speed(iter/s)": 0.20204 }, { "acc": 0.76444292, "epoch": 0.4684816051560973, "grad_norm": 9.0, "learning_rate": 8.945384965614636e-06, "loss": 0.85360737, "memory(GiB)": 135.49, "step": 20080, "train_speed(iter/s)": 0.202092 }, { "acc": 0.7802866, "epoch": 0.4687149127283862, "grad_norm": 4.84375, "learning_rate": 8.944224213511514e-06, "loss": 0.80507545, "memory(GiB)": 135.49, "step": 20090, "train_speed(iter/s)": 0.202145 }, { "acc": 0.75602846, "epoch": 0.4689482203006751, "grad_norm": 6.96875, "learning_rate": 8.943062898371531e-06, "loss": 0.88123989, "memory(GiB)": 135.49, "step": 20100, "train_speed(iter/s)": 0.202196 }, { "acc": 0.77043495, "epoch": 0.469181527872964, "grad_norm": 5.65625, "learning_rate": 8.941901020360464e-06, "loss": 0.84115839, "memory(GiB)": 135.49, "step": 20110, "train_speed(iter/s)": 0.202245 }, { "acc": 0.76367178, "epoch": 0.4694148354452529, "grad_norm": 5.03125, "learning_rate": 8.940738579644171e-06, "loss": 0.85250988, "memory(GiB)": 135.49, "step": 20120, "train_speed(iter/s)": 0.2023 }, { "acc": 0.78558803, "epoch": 0.4696481430175418, "grad_norm": 6.96875, "learning_rate": 8.939575576388592e-06, "loss": 0.7812016, "memory(GiB)": 135.49, "step": 20130, "train_speed(iter/s)": 0.20235 }, { "acc": 0.75273132, "epoch": 0.4698814505898307, "grad_norm": 4.8125, "learning_rate": 8.938412010759743e-06, "loss": 0.9199749, "memory(GiB)": 135.49, "step": 20140, "train_speed(iter/s)": 0.202403 }, { "acc": 0.76458349, "epoch": 0.4701147581621196, "grad_norm": 5.09375, "learning_rate": 8.937247882923724e-06, "loss": 0.85190334, "memory(GiB)": 135.49, "step": 20150, "train_speed(iter/s)": 0.202455 }, { "acc": 0.77117901, "epoch": 0.4703480657344085, "grad_norm": 20.125, "learning_rate": 8.936083193046712e-06, "loss": 0.81763039, "memory(GiB)": 135.49, "step": 20160, "train_speed(iter/s)": 0.202502 }, { "acc": 0.75423031, "epoch": 0.47058137330669736, "grad_norm": 6.78125, "learning_rate": 8.93491794129497e-06, "loss": 0.87253876, "memory(GiB)": 135.49, "step": 20170, "train_speed(iter/s)": 0.202556 }, { "acc": 0.78193688, "epoch": 0.47081468087898626, "grad_norm": 4.9375, "learning_rate": 8.933752127834834e-06, "loss": 0.79043837, "memory(GiB)": 135.49, "step": 20180, "train_speed(iter/s)": 0.20261 }, { "acc": 0.75785842, "epoch": 0.47104798845127516, "grad_norm": 9.1875, "learning_rate": 8.932585752832725e-06, "loss": 0.87198391, "memory(GiB)": 135.49, "step": 20190, "train_speed(iter/s)": 0.202664 }, { "acc": 0.7615407, "epoch": 0.47128129602356406, "grad_norm": 9.6875, "learning_rate": 8.931418816455142e-06, "loss": 0.86387424, "memory(GiB)": 135.49, "step": 20200, "train_speed(iter/s)": 0.202715 }, { "acc": 0.74754152, "epoch": 0.47151460359585295, "grad_norm": 5.53125, "learning_rate": 8.930251318868664e-06, "loss": 0.92508488, "memory(GiB)": 135.49, "step": 20210, "train_speed(iter/s)": 0.202765 }, { "acc": 0.76428967, "epoch": 0.47174791116814185, "grad_norm": 5.0, "learning_rate": 8.929083260239952e-06, "loss": 0.84251356, "memory(GiB)": 135.49, "step": 20220, "train_speed(iter/s)": 0.202814 }, { "acc": 0.77040644, "epoch": 0.47198121874043075, "grad_norm": 11.375, "learning_rate": 8.927914640735748e-06, "loss": 0.82272902, "memory(GiB)": 135.49, "step": 20230, "train_speed(iter/s)": 0.202867 }, { "acc": 0.7714467, "epoch": 0.47221452631271965, "grad_norm": 5.9375, "learning_rate": 8.926745460522867e-06, "loss": 0.83695984, "memory(GiB)": 135.49, "step": 20240, "train_speed(iter/s)": 0.202922 }, { "acc": 0.77017117, "epoch": 0.47244783388500855, "grad_norm": 6.84375, "learning_rate": 8.925575719768215e-06, "loss": 0.83387146, "memory(GiB)": 135.49, "step": 20250, "train_speed(iter/s)": 0.202976 }, { "acc": 0.7718565, "epoch": 0.47268114145729745, "grad_norm": 7.09375, "learning_rate": 8.92440541863877e-06, "loss": 0.82963228, "memory(GiB)": 135.49, "step": 20260, "train_speed(iter/s)": 0.203029 }, { "acc": 0.78405137, "epoch": 0.4729144490295863, "grad_norm": 6.5, "learning_rate": 8.923234557301588e-06, "loss": 0.78165607, "memory(GiB)": 135.49, "step": 20270, "train_speed(iter/s)": 0.20308 }, { "acc": 0.77332935, "epoch": 0.4731477566018752, "grad_norm": 4.5625, "learning_rate": 8.922063135923815e-06, "loss": 0.81680212, "memory(GiB)": 135.49, "step": 20280, "train_speed(iter/s)": 0.203132 }, { "acc": 0.77463045, "epoch": 0.4733810641741641, "grad_norm": 5.125, "learning_rate": 8.920891154672668e-06, "loss": 0.80539761, "memory(GiB)": 135.49, "step": 20290, "train_speed(iter/s)": 0.203184 }, { "acc": 0.77089329, "epoch": 0.473614371746453, "grad_norm": 6.0, "learning_rate": 8.91971861371545e-06, "loss": 0.84483061, "memory(GiB)": 135.49, "step": 20300, "train_speed(iter/s)": 0.203237 }, { "acc": 0.76182222, "epoch": 0.4738476793187419, "grad_norm": 4.25, "learning_rate": 8.918545513219535e-06, "loss": 0.84890423, "memory(GiB)": 135.49, "step": 20310, "train_speed(iter/s)": 0.203291 }, { "acc": 0.76259904, "epoch": 0.4740809868910308, "grad_norm": 6.25, "learning_rate": 8.917371853352388e-06, "loss": 0.85839376, "memory(GiB)": 135.49, "step": 20320, "train_speed(iter/s)": 0.203344 }, { "acc": 0.75932689, "epoch": 0.4743142944633197, "grad_norm": 5.84375, "learning_rate": 8.916197634281547e-06, "loss": 0.86830311, "memory(GiB)": 135.49, "step": 20330, "train_speed(iter/s)": 0.203394 }, { "acc": 0.77788315, "epoch": 0.4745476020356086, "grad_norm": 4.03125, "learning_rate": 8.91502285617463e-06, "loss": 0.79365373, "memory(GiB)": 135.49, "step": 20340, "train_speed(iter/s)": 0.20345 }, { "acc": 0.76521893, "epoch": 0.4747809096078975, "grad_norm": 6.40625, "learning_rate": 8.913847519199341e-06, "loss": 0.85026054, "memory(GiB)": 135.49, "step": 20350, "train_speed(iter/s)": 0.203501 }, { "acc": 0.76550341, "epoch": 0.4750142171801863, "grad_norm": 15.5625, "learning_rate": 8.912671623523452e-06, "loss": 0.8434165, "memory(GiB)": 135.49, "step": 20360, "train_speed(iter/s)": 0.20355 }, { "acc": 0.7689528, "epoch": 0.4752475247524752, "grad_norm": 7.71875, "learning_rate": 8.911495169314828e-06, "loss": 0.82899466, "memory(GiB)": 135.49, "step": 20370, "train_speed(iter/s)": 0.203599 }, { "acc": 0.78005404, "epoch": 0.4754808323247641, "grad_norm": 7.40625, "learning_rate": 8.910318156741401e-06, "loss": 0.77696877, "memory(GiB)": 135.49, "step": 20380, "train_speed(iter/s)": 0.203649 }, { "acc": 0.77310462, "epoch": 0.475714139897053, "grad_norm": 4.46875, "learning_rate": 8.909140585971198e-06, "loss": 0.82851744, "memory(GiB)": 135.49, "step": 20390, "train_speed(iter/s)": 0.2037 }, { "acc": 0.77888346, "epoch": 0.4759474474693419, "grad_norm": 6.625, "learning_rate": 8.90796245717231e-06, "loss": 0.80008869, "memory(GiB)": 135.49, "step": 20400, "train_speed(iter/s)": 0.203754 }, { "acc": 0.76855583, "epoch": 0.4761807550416308, "grad_norm": 6.8125, "learning_rate": 8.906783770512915e-06, "loss": 0.8538516, "memory(GiB)": 135.49, "step": 20410, "train_speed(iter/s)": 0.203802 }, { "acc": 0.76310425, "epoch": 0.4764140626139197, "grad_norm": 7.0, "learning_rate": 8.905604526161274e-06, "loss": 0.85504866, "memory(GiB)": 135.49, "step": 20420, "train_speed(iter/s)": 0.203855 }, { "acc": 0.76938725, "epoch": 0.4766473701862086, "grad_norm": 5.8125, "learning_rate": 8.904424724285721e-06, "loss": 0.80504074, "memory(GiB)": 135.49, "step": 20430, "train_speed(iter/s)": 0.203908 }, { "acc": 0.78762207, "epoch": 0.4768806777584975, "grad_norm": 7.8125, "learning_rate": 8.903244365054671e-06, "loss": 0.76677418, "memory(GiB)": 135.49, "step": 20440, "train_speed(iter/s)": 0.203956 }, { "acc": 0.75194197, "epoch": 0.4771139853307864, "grad_norm": 7.78125, "learning_rate": 8.902063448636624e-06, "loss": 0.92183247, "memory(GiB)": 135.49, "step": 20450, "train_speed(iter/s)": 0.204008 }, { "acc": 0.77170496, "epoch": 0.47734729290307526, "grad_norm": 5.75, "learning_rate": 8.900881975200151e-06, "loss": 0.80942726, "memory(GiB)": 135.49, "step": 20460, "train_speed(iter/s)": 0.204058 }, { "acc": 0.76416454, "epoch": 0.47758060047536416, "grad_norm": 7.3125, "learning_rate": 8.89969994491391e-06, "loss": 0.83932962, "memory(GiB)": 135.49, "step": 20470, "train_speed(iter/s)": 0.204105 }, { "acc": 0.76069613, "epoch": 0.47781390804765306, "grad_norm": 5.625, "learning_rate": 8.898517357946636e-06, "loss": 0.86261673, "memory(GiB)": 135.49, "step": 20480, "train_speed(iter/s)": 0.204159 }, { "acc": 0.77294588, "epoch": 0.47804721561994196, "grad_norm": 7.40625, "learning_rate": 8.897334214467141e-06, "loss": 0.83312826, "memory(GiB)": 135.49, "step": 20490, "train_speed(iter/s)": 0.20421 }, { "acc": 0.74003129, "epoch": 0.47828052319223086, "grad_norm": 6.28125, "learning_rate": 8.89615051464432e-06, "loss": 0.95336885, "memory(GiB)": 135.49, "step": 20500, "train_speed(iter/s)": 0.204261 }, { "epoch": 0.47828052319223086, "eval_acc": 0.7330786804733155, "eval_loss": 0.8421733379364014, "eval_runtime": 1262.688, "eval_samples_per_second": 28.503, "eval_steps_per_second": 14.252, "step": 20500 }, { "acc": 0.77263689, "epoch": 0.47851383076451975, "grad_norm": 7.625, "learning_rate": 8.894966258647144e-06, "loss": 0.81624718, "memory(GiB)": 135.49, "step": 20510, "train_speed(iter/s)": 0.201725 }, { "acc": 0.76528654, "epoch": 0.47874713833680865, "grad_norm": 6.09375, "learning_rate": 8.893781446644667e-06, "loss": 0.83885841, "memory(GiB)": 135.49, "step": 20520, "train_speed(iter/s)": 0.201771 }, { "acc": 0.77409019, "epoch": 0.47898044590909755, "grad_norm": 3.9375, "learning_rate": 8.892596078806017e-06, "loss": 0.82466755, "memory(GiB)": 135.49, "step": 20530, "train_speed(iter/s)": 0.201822 }, { "acc": 0.76239758, "epoch": 0.47921375348138645, "grad_norm": 6.0625, "learning_rate": 8.89141015530041e-06, "loss": 0.87065639, "memory(GiB)": 135.49, "step": 20540, "train_speed(iter/s)": 0.201875 }, { "acc": 0.77535877, "epoch": 0.47944706105367535, "grad_norm": 5.15625, "learning_rate": 8.890223676297132e-06, "loss": 0.81849718, "memory(GiB)": 135.49, "step": 20550, "train_speed(iter/s)": 0.20193 }, { "acc": 0.78476429, "epoch": 0.4796803686259642, "grad_norm": 7.21875, "learning_rate": 8.889036641965557e-06, "loss": 0.7692729, "memory(GiB)": 135.49, "step": 20560, "train_speed(iter/s)": 0.201977 }, { "acc": 0.74051456, "epoch": 0.4799136761982531, "grad_norm": 7.125, "learning_rate": 8.887849052475128e-06, "loss": 0.95967026, "memory(GiB)": 135.49, "step": 20570, "train_speed(iter/s)": 0.202029 }, { "acc": 0.77752175, "epoch": 0.480146983770542, "grad_norm": 5.21875, "learning_rate": 8.886660907995379e-06, "loss": 0.80754261, "memory(GiB)": 135.49, "step": 20580, "train_speed(iter/s)": 0.202079 }, { "acc": 0.76207485, "epoch": 0.4803802913428309, "grad_norm": 6.84375, "learning_rate": 8.885472208695911e-06, "loss": 0.86362915, "memory(GiB)": 135.49, "step": 20590, "train_speed(iter/s)": 0.202131 }, { "acc": 0.77339172, "epoch": 0.4806135989151198, "grad_norm": 6.34375, "learning_rate": 8.884282954746417e-06, "loss": 0.80336676, "memory(GiB)": 135.49, "step": 20600, "train_speed(iter/s)": 0.202182 }, { "acc": 0.77904496, "epoch": 0.4808469064874087, "grad_norm": 5.375, "learning_rate": 8.88309314631666e-06, "loss": 0.78758621, "memory(GiB)": 135.49, "step": 20610, "train_speed(iter/s)": 0.202234 }, { "acc": 0.75069017, "epoch": 0.4810802140596976, "grad_norm": 4.625, "learning_rate": 8.881902783576482e-06, "loss": 0.92695675, "memory(GiB)": 135.49, "step": 20620, "train_speed(iter/s)": 0.202285 }, { "acc": 0.75703592, "epoch": 0.4813135216319865, "grad_norm": 5.03125, "learning_rate": 8.88071186669581e-06, "loss": 0.87981892, "memory(GiB)": 135.49, "step": 20630, "train_speed(iter/s)": 0.202333 }, { "acc": 0.75604973, "epoch": 0.4815468292042754, "grad_norm": 5.21875, "learning_rate": 8.879520395844648e-06, "loss": 0.88485985, "memory(GiB)": 135.49, "step": 20640, "train_speed(iter/s)": 0.202383 }, { "acc": 0.75102739, "epoch": 0.4817801367765642, "grad_norm": 6.34375, "learning_rate": 8.878328371193074e-06, "loss": 0.8895853, "memory(GiB)": 135.49, "step": 20650, "train_speed(iter/s)": 0.202437 }, { "acc": 0.76097965, "epoch": 0.4820134443488531, "grad_norm": 5.84375, "learning_rate": 8.877135792911253e-06, "loss": 0.88332043, "memory(GiB)": 135.49, "step": 20660, "train_speed(iter/s)": 0.202489 }, { "acc": 0.76883326, "epoch": 0.482246751921142, "grad_norm": 6.21875, "learning_rate": 8.875942661169423e-06, "loss": 0.84487581, "memory(GiB)": 135.49, "step": 20670, "train_speed(iter/s)": 0.202542 }, { "acc": 0.75137548, "epoch": 0.4824800594934309, "grad_norm": 5.09375, "learning_rate": 8.874748976137905e-06, "loss": 0.92328606, "memory(GiB)": 135.49, "step": 20680, "train_speed(iter/s)": 0.202593 }, { "acc": 0.75061989, "epoch": 0.4827133670657198, "grad_norm": 5.4375, "learning_rate": 8.873554737987098e-06, "loss": 0.90699272, "memory(GiB)": 135.49, "step": 20690, "train_speed(iter/s)": 0.202644 }, { "acc": 0.77780123, "epoch": 0.4829466746380087, "grad_norm": 7.75, "learning_rate": 8.872359946887474e-06, "loss": 0.79770331, "memory(GiB)": 135.49, "step": 20700, "train_speed(iter/s)": 0.202693 }, { "acc": 0.75120778, "epoch": 0.4831799822102976, "grad_norm": 6.375, "learning_rate": 8.871164603009595e-06, "loss": 0.94331627, "memory(GiB)": 135.49, "step": 20710, "train_speed(iter/s)": 0.202746 }, { "acc": 0.77770004, "epoch": 0.4834132897825865, "grad_norm": 7.875, "learning_rate": 8.869968706524092e-06, "loss": 0.818116, "memory(GiB)": 135.49, "step": 20720, "train_speed(iter/s)": 0.202797 }, { "acc": 0.7625134, "epoch": 0.4836465973548754, "grad_norm": 4.28125, "learning_rate": 8.868772257601682e-06, "loss": 0.87479038, "memory(GiB)": 135.49, "step": 20730, "train_speed(iter/s)": 0.202849 }, { "acc": 0.75120974, "epoch": 0.4838799049271643, "grad_norm": 10.0625, "learning_rate": 8.867575256413154e-06, "loss": 0.88485413, "memory(GiB)": 135.49, "step": 20740, "train_speed(iter/s)": 0.202901 }, { "acc": 0.76831837, "epoch": 0.48411321249945316, "grad_norm": 4.8125, "learning_rate": 8.866377703129382e-06, "loss": 0.81743221, "memory(GiB)": 135.49, "step": 20750, "train_speed(iter/s)": 0.202954 }, { "acc": 0.77847271, "epoch": 0.48434652007174206, "grad_norm": 5.46875, "learning_rate": 8.865179597921318e-06, "loss": 0.7904202, "memory(GiB)": 135.49, "step": 20760, "train_speed(iter/s)": 0.203006 }, { "acc": 0.75291114, "epoch": 0.48457982764403096, "grad_norm": 6.71875, "learning_rate": 8.863980940959989e-06, "loss": 0.92126551, "memory(GiB)": 135.49, "step": 20770, "train_speed(iter/s)": 0.203059 }, { "acc": 0.7616416, "epoch": 0.48481313521631986, "grad_norm": 5.09375, "learning_rate": 8.862781732416502e-06, "loss": 0.84408379, "memory(GiB)": 135.49, "step": 20780, "train_speed(iter/s)": 0.203111 }, { "acc": 0.76874342, "epoch": 0.48504644278860876, "grad_norm": 5.28125, "learning_rate": 8.861581972462045e-06, "loss": 0.82992325, "memory(GiB)": 135.49, "step": 20790, "train_speed(iter/s)": 0.203163 }, { "acc": 0.77002258, "epoch": 0.48527975036089765, "grad_norm": 8.5, "learning_rate": 8.860381661267882e-06, "loss": 0.81080923, "memory(GiB)": 135.49, "step": 20800, "train_speed(iter/s)": 0.20321 }, { "acc": 0.78717718, "epoch": 0.48551305793318655, "grad_norm": 9.25, "learning_rate": 8.859180799005361e-06, "loss": 0.76166849, "memory(GiB)": 135.49, "step": 20810, "train_speed(iter/s)": 0.203261 }, { "acc": 0.7669065, "epoch": 0.48574636550547545, "grad_norm": 7.6875, "learning_rate": 8.857979385845901e-06, "loss": 0.83650799, "memory(GiB)": 135.49, "step": 20820, "train_speed(iter/s)": 0.203314 }, { "acc": 0.75469131, "epoch": 0.48597967307776435, "grad_norm": 5.9375, "learning_rate": 8.856777421961004e-06, "loss": 0.89841661, "memory(GiB)": 135.49, "step": 20830, "train_speed(iter/s)": 0.203365 }, { "acc": 0.77101765, "epoch": 0.48621298065005325, "grad_norm": 6.375, "learning_rate": 8.855574907522251e-06, "loss": 0.82555962, "memory(GiB)": 135.49, "step": 20840, "train_speed(iter/s)": 0.203415 }, { "acc": 0.77501698, "epoch": 0.4864462882223421, "grad_norm": 6.8125, "learning_rate": 8.854371842701299e-06, "loss": 0.80563259, "memory(GiB)": 135.49, "step": 20850, "train_speed(iter/s)": 0.203465 }, { "acc": 0.77780895, "epoch": 0.486679595794631, "grad_norm": 6.34375, "learning_rate": 8.853168227669886e-06, "loss": 0.80877914, "memory(GiB)": 135.49, "step": 20860, "train_speed(iter/s)": 0.203516 }, { "acc": 0.77402878, "epoch": 0.4869129033669199, "grad_norm": 7.0, "learning_rate": 8.851964062599828e-06, "loss": 0.80902538, "memory(GiB)": 135.49, "step": 20870, "train_speed(iter/s)": 0.203567 }, { "acc": 0.76007385, "epoch": 0.4871462109392088, "grad_norm": 5.40625, "learning_rate": 8.850759347663021e-06, "loss": 0.84757347, "memory(GiB)": 135.49, "step": 20880, "train_speed(iter/s)": 0.203617 }, { "acc": 0.75199103, "epoch": 0.4873795185114977, "grad_norm": 7.34375, "learning_rate": 8.849554083031435e-06, "loss": 0.91048346, "memory(GiB)": 135.49, "step": 20890, "train_speed(iter/s)": 0.203667 }, { "acc": 0.77853518, "epoch": 0.4876128260837866, "grad_norm": 5.25, "learning_rate": 8.84834826887712e-06, "loss": 0.81313887, "memory(GiB)": 135.49, "step": 20900, "train_speed(iter/s)": 0.203717 }, { "acc": 0.77355547, "epoch": 0.4878461336560755, "grad_norm": 7.21875, "learning_rate": 8.84714190537221e-06, "loss": 0.8153019, "memory(GiB)": 135.49, "step": 20910, "train_speed(iter/s)": 0.203765 }, { "acc": 0.77204885, "epoch": 0.4880794412283644, "grad_norm": 6.125, "learning_rate": 8.84593499268891e-06, "loss": 0.82173595, "memory(GiB)": 135.49, "step": 20920, "train_speed(iter/s)": 0.203815 }, { "acc": 0.78257728, "epoch": 0.4883127488006533, "grad_norm": 8.4375, "learning_rate": 8.844727530999506e-06, "loss": 0.78891296, "memory(GiB)": 135.49, "step": 20930, "train_speed(iter/s)": 0.203863 }, { "acc": 0.77013216, "epoch": 0.48854605637294213, "grad_norm": 7.9375, "learning_rate": 8.843519520476365e-06, "loss": 0.84159336, "memory(GiB)": 135.49, "step": 20940, "train_speed(iter/s)": 0.203912 }, { "acc": 0.76106863, "epoch": 0.488779363945231, "grad_norm": 6.46875, "learning_rate": 8.842310961291926e-06, "loss": 0.87484951, "memory(GiB)": 135.49, "step": 20950, "train_speed(iter/s)": 0.20396 }, { "acc": 0.78280768, "epoch": 0.4890126715175199, "grad_norm": 8.25, "learning_rate": 8.841101853618717e-06, "loss": 0.78613443, "memory(GiB)": 135.49, "step": 20960, "train_speed(iter/s)": 0.20401 }, { "acc": 0.7599493, "epoch": 0.4892459790898088, "grad_norm": 6.53125, "learning_rate": 8.839892197629334e-06, "loss": 0.86100092, "memory(GiB)": 135.49, "step": 20970, "train_speed(iter/s)": 0.20406 }, { "acc": 0.76969032, "epoch": 0.4894792866620977, "grad_norm": 6.9375, "learning_rate": 8.838681993496454e-06, "loss": 0.80889006, "memory(GiB)": 135.49, "step": 20980, "train_speed(iter/s)": 0.204111 }, { "acc": 0.74287944, "epoch": 0.4897125942343866, "grad_norm": 8.3125, "learning_rate": 8.837471241392835e-06, "loss": 0.95559177, "memory(GiB)": 135.49, "step": 20990, "train_speed(iter/s)": 0.204163 }, { "acc": 0.7772768, "epoch": 0.4899459018066755, "grad_norm": 6.09375, "learning_rate": 8.83625994149131e-06, "loss": 0.8026907, "memory(GiB)": 135.49, "step": 21000, "train_speed(iter/s)": 0.204213 }, { "epoch": 0.4899459018066755, "eval_acc": 0.7333171452634568, "eval_loss": 0.8420661091804504, "eval_runtime": 1262.7458, "eval_samples_per_second": 28.502, "eval_steps_per_second": 14.251, "step": 21000 }, { "acc": 0.76543007, "epoch": 0.4901792093789644, "grad_norm": 7.09375, "learning_rate": 8.835048093964796e-06, "loss": 0.85180492, "memory(GiB)": 135.49, "step": 21010, "train_speed(iter/s)": 0.201737 }, { "acc": 0.78669505, "epoch": 0.4904125169512533, "grad_norm": 5.75, "learning_rate": 8.833835698986276e-06, "loss": 0.76402421, "memory(GiB)": 135.49, "step": 21020, "train_speed(iter/s)": 0.201786 }, { "acc": 0.75375919, "epoch": 0.4906458245235422, "grad_norm": 6.71875, "learning_rate": 8.832622756728828e-06, "loss": 0.89295988, "memory(GiB)": 135.49, "step": 21030, "train_speed(iter/s)": 0.201835 }, { "acc": 0.75579996, "epoch": 0.49087913209583106, "grad_norm": 15.4375, "learning_rate": 8.831409267365594e-06, "loss": 0.88147535, "memory(GiB)": 135.49, "step": 21040, "train_speed(iter/s)": 0.201885 }, { "acc": 0.77996082, "epoch": 0.49111243966811996, "grad_norm": 9.3125, "learning_rate": 8.830195231069799e-06, "loss": 0.79135208, "memory(GiB)": 135.49, "step": 21050, "train_speed(iter/s)": 0.201935 }, { "acc": 0.76506524, "epoch": 0.49134574724040886, "grad_norm": 6.90625, "learning_rate": 8.828980648014747e-06, "loss": 0.85189142, "memory(GiB)": 135.49, "step": 21060, "train_speed(iter/s)": 0.20199 }, { "acc": 0.77192574, "epoch": 0.49157905481269776, "grad_norm": 4.78125, "learning_rate": 8.82776551837382e-06, "loss": 0.82044754, "memory(GiB)": 135.49, "step": 21070, "train_speed(iter/s)": 0.20204 }, { "acc": 0.75849233, "epoch": 0.49181236238498666, "grad_norm": 5.78125, "learning_rate": 8.826549842320478e-06, "loss": 0.89614201, "memory(GiB)": 135.49, "step": 21080, "train_speed(iter/s)": 0.202089 }, { "acc": 0.78498607, "epoch": 0.49204566995727556, "grad_norm": 5.28125, "learning_rate": 8.825333620028257e-06, "loss": 0.78414803, "memory(GiB)": 135.49, "step": 21090, "train_speed(iter/s)": 0.202141 }, { "acc": 0.75879197, "epoch": 0.49227897752956445, "grad_norm": 4.1875, "learning_rate": 8.824116851670772e-06, "loss": 0.88919201, "memory(GiB)": 135.49, "step": 21100, "train_speed(iter/s)": 0.202189 }, { "acc": 0.75270882, "epoch": 0.49251228510185335, "grad_norm": 5.8125, "learning_rate": 8.822899537421721e-06, "loss": 0.91976757, "memory(GiB)": 135.49, "step": 21110, "train_speed(iter/s)": 0.202236 }, { "acc": 0.75330138, "epoch": 0.49274559267414225, "grad_norm": 6.375, "learning_rate": 8.821681677454868e-06, "loss": 0.89845581, "memory(GiB)": 135.49, "step": 21120, "train_speed(iter/s)": 0.202286 }, { "acc": 0.76949892, "epoch": 0.49297890024643115, "grad_norm": 5.625, "learning_rate": 8.820463271944066e-06, "loss": 0.80845222, "memory(GiB)": 135.49, "step": 21130, "train_speed(iter/s)": 0.202339 }, { "acc": 0.76322274, "epoch": 0.49321220781872, "grad_norm": 5.71875, "learning_rate": 8.819244321063243e-06, "loss": 0.85682096, "memory(GiB)": 135.49, "step": 21140, "train_speed(iter/s)": 0.202388 }, { "acc": 0.76420536, "epoch": 0.4934455153910089, "grad_norm": 4.125, "learning_rate": 8.818024824986404e-06, "loss": 0.86049805, "memory(GiB)": 135.49, "step": 21150, "train_speed(iter/s)": 0.202436 }, { "acc": 0.76582661, "epoch": 0.4936788229632978, "grad_norm": 10.0625, "learning_rate": 8.816804783887628e-06, "loss": 0.83312578, "memory(GiB)": 135.49, "step": 21160, "train_speed(iter/s)": 0.202487 }, { "acc": 0.76767054, "epoch": 0.4939121305355867, "grad_norm": 5.4375, "learning_rate": 8.815584197941078e-06, "loss": 0.8309309, "memory(GiB)": 135.49, "step": 21170, "train_speed(iter/s)": 0.202539 }, { "acc": 0.7804575, "epoch": 0.4941454381078756, "grad_norm": 6.34375, "learning_rate": 8.814363067320995e-06, "loss": 0.78816233, "memory(GiB)": 135.49, "step": 21180, "train_speed(iter/s)": 0.202588 }, { "acc": 0.76247029, "epoch": 0.4943787456801645, "grad_norm": 5.96875, "learning_rate": 8.81314139220169e-06, "loss": 0.85659065, "memory(GiB)": 135.49, "step": 21190, "train_speed(iter/s)": 0.202642 }, { "acc": 0.76189499, "epoch": 0.4946120532524534, "grad_norm": 4.78125, "learning_rate": 8.811919172757558e-06, "loss": 0.84974251, "memory(GiB)": 135.49, "step": 21200, "train_speed(iter/s)": 0.202693 }, { "acc": 0.7832365, "epoch": 0.4948453608247423, "grad_norm": 3.671875, "learning_rate": 8.810696409163073e-06, "loss": 0.81887817, "memory(GiB)": 135.49, "step": 21210, "train_speed(iter/s)": 0.202742 }, { "acc": 0.7959774, "epoch": 0.4950786683970312, "grad_norm": 7.1875, "learning_rate": 8.809473101592783e-06, "loss": 0.76960783, "memory(GiB)": 135.49, "step": 21220, "train_speed(iter/s)": 0.202793 }, { "acc": 0.78107009, "epoch": 0.49531197596932003, "grad_norm": 8.4375, "learning_rate": 8.808249250221312e-06, "loss": 0.76504307, "memory(GiB)": 135.49, "step": 21230, "train_speed(iter/s)": 0.202844 }, { "acc": 0.75715971, "epoch": 0.4955452835416089, "grad_norm": 6.5, "learning_rate": 8.807024855223369e-06, "loss": 0.87529287, "memory(GiB)": 135.49, "step": 21240, "train_speed(iter/s)": 0.202894 }, { "acc": 0.78597803, "epoch": 0.4957785911138978, "grad_norm": 8.25, "learning_rate": 8.805799916773734e-06, "loss": 0.74819412, "memory(GiB)": 135.49, "step": 21250, "train_speed(iter/s)": 0.202942 }, { "acc": 0.75391788, "epoch": 0.4960118986861867, "grad_norm": 7.78125, "learning_rate": 8.804574435047265e-06, "loss": 0.90406952, "memory(GiB)": 135.49, "step": 21260, "train_speed(iter/s)": 0.202993 }, { "acc": 0.77339497, "epoch": 0.4962452062584756, "grad_norm": 7.125, "learning_rate": 8.803348410218902e-06, "loss": 0.81936436, "memory(GiB)": 135.49, "step": 21270, "train_speed(iter/s)": 0.203041 }, { "acc": 0.7636672, "epoch": 0.4964785138307645, "grad_norm": 6.3125, "learning_rate": 8.802121842463658e-06, "loss": 0.8531002, "memory(GiB)": 135.49, "step": 21280, "train_speed(iter/s)": 0.203093 }, { "acc": 0.76530962, "epoch": 0.4967118214030534, "grad_norm": 5.9375, "learning_rate": 8.800894731956624e-06, "loss": 0.83576422, "memory(GiB)": 135.49, "step": 21290, "train_speed(iter/s)": 0.203143 }, { "acc": 0.77334862, "epoch": 0.4969451289753423, "grad_norm": 5.71875, "learning_rate": 8.799667078872973e-06, "loss": 0.81178007, "memory(GiB)": 135.49, "step": 21300, "train_speed(iter/s)": 0.203192 }, { "acc": 0.77475805, "epoch": 0.4971784365476312, "grad_norm": 5.78125, "learning_rate": 8.79843888338795e-06, "loss": 0.82427921, "memory(GiB)": 135.49, "step": 21310, "train_speed(iter/s)": 0.203242 }, { "acc": 0.77716513, "epoch": 0.4974117441199201, "grad_norm": 6.0625, "learning_rate": 8.797210145676879e-06, "loss": 0.7881115, "memory(GiB)": 135.49, "step": 21320, "train_speed(iter/s)": 0.203289 }, { "acc": 0.78227696, "epoch": 0.49764505169220896, "grad_norm": 4.96875, "learning_rate": 8.795980865915164e-06, "loss": 0.80004463, "memory(GiB)": 135.49, "step": 21330, "train_speed(iter/s)": 0.203338 }, { "acc": 0.77496672, "epoch": 0.49787835926449786, "grad_norm": 6.125, "learning_rate": 8.794751044278282e-06, "loss": 0.81308975, "memory(GiB)": 135.49, "step": 21340, "train_speed(iter/s)": 0.203388 }, { "acc": 0.77686691, "epoch": 0.49811166683678676, "grad_norm": 7.59375, "learning_rate": 8.793520680941792e-06, "loss": 0.81556616, "memory(GiB)": 135.49, "step": 21350, "train_speed(iter/s)": 0.203438 }, { "acc": 0.77071648, "epoch": 0.49834497440907566, "grad_norm": 6.5, "learning_rate": 8.792289776081326e-06, "loss": 0.82695045, "memory(GiB)": 135.49, "step": 21360, "train_speed(iter/s)": 0.203489 }, { "acc": 0.75886583, "epoch": 0.49857828198136456, "grad_norm": 4.53125, "learning_rate": 8.791058329872595e-06, "loss": 0.87699385, "memory(GiB)": 135.49, "step": 21370, "train_speed(iter/s)": 0.203538 }, { "acc": 0.7730752, "epoch": 0.49881158955365346, "grad_norm": 5.0625, "learning_rate": 8.78982634249139e-06, "loss": 0.84314651, "memory(GiB)": 135.49, "step": 21380, "train_speed(iter/s)": 0.203588 }, { "acc": 0.76900158, "epoch": 0.49904489712594235, "grad_norm": 4.53125, "learning_rate": 8.788593814113576e-06, "loss": 0.84465599, "memory(GiB)": 135.49, "step": 21390, "train_speed(iter/s)": 0.203637 }, { "acc": 0.77295122, "epoch": 0.49927820469823125, "grad_norm": 6.78125, "learning_rate": 8.787360744915096e-06, "loss": 0.81024275, "memory(GiB)": 135.49, "step": 21400, "train_speed(iter/s)": 0.203685 }, { "acc": 0.76989708, "epoch": 0.49951151227052015, "grad_norm": 5.96875, "learning_rate": 8.786127135071968e-06, "loss": 0.83313713, "memory(GiB)": 135.49, "step": 21410, "train_speed(iter/s)": 0.203734 }, { "acc": 0.78257427, "epoch": 0.499744819842809, "grad_norm": 4.75, "learning_rate": 8.784892984760292e-06, "loss": 0.78192072, "memory(GiB)": 135.49, "step": 21420, "train_speed(iter/s)": 0.203782 }, { "acc": 0.76356907, "epoch": 0.4999781274150979, "grad_norm": 5.375, "learning_rate": 8.783658294156244e-06, "loss": 0.87735319, "memory(GiB)": 135.49, "step": 21430, "train_speed(iter/s)": 0.20383 }, { "acc": 0.77717724, "epoch": 0.5002114349873868, "grad_norm": 5.96875, "learning_rate": 8.782423063436072e-06, "loss": 0.80252514, "memory(GiB)": 135.49, "step": 21440, "train_speed(iter/s)": 0.20388 }, { "acc": 0.76985121, "epoch": 0.5004447425596757, "grad_norm": 6.09375, "learning_rate": 8.781187292776106e-06, "loss": 0.83680143, "memory(GiB)": 135.49, "step": 21450, "train_speed(iter/s)": 0.203928 }, { "acc": 0.77315006, "epoch": 0.5006780501319646, "grad_norm": 4.5, "learning_rate": 8.779950982352751e-06, "loss": 0.82066984, "memory(GiB)": 135.49, "step": 21460, "train_speed(iter/s)": 0.203979 }, { "acc": 0.76795526, "epoch": 0.5009113577042534, "grad_norm": 5.6875, "learning_rate": 8.778714132342494e-06, "loss": 0.83698311, "memory(GiB)": 135.49, "step": 21470, "train_speed(iter/s)": 0.204028 }, { "acc": 0.77210326, "epoch": 0.5011446652765423, "grad_norm": 6.25, "learning_rate": 8.777476742921893e-06, "loss": 0.83129911, "memory(GiB)": 135.49, "step": 21480, "train_speed(iter/s)": 0.204076 }, { "acc": 0.76538448, "epoch": 0.5013779728488312, "grad_norm": 5.90625, "learning_rate": 8.776238814267581e-06, "loss": 0.86117649, "memory(GiB)": 135.49, "step": 21490, "train_speed(iter/s)": 0.204129 }, { "acc": 0.7572938, "epoch": 0.5016112804211201, "grad_norm": 5.59375, "learning_rate": 8.775000346556278e-06, "loss": 0.91106577, "memory(GiB)": 135.49, "step": 21500, "train_speed(iter/s)": 0.204177 }, { "epoch": 0.5016112804211201, "eval_acc": 0.7334097560818202, "eval_loss": 0.8414279818534851, "eval_runtime": 1263.0588, "eval_samples_per_second": 28.495, "eval_steps_per_second": 14.248, "step": 21500 }, { "acc": 0.74601631, "epoch": 0.501844587993409, "grad_norm": 5.375, "learning_rate": 8.773761339964773e-06, "loss": 0.9213376, "memory(GiB)": 135.49, "step": 21510, "train_speed(iter/s)": 0.201758 }, { "acc": 0.76014342, "epoch": 0.5020778955656979, "grad_norm": 5.625, "learning_rate": 8.77252179466993e-06, "loss": 0.89738178, "memory(GiB)": 135.49, "step": 21520, "train_speed(iter/s)": 0.201808 }, { "acc": 0.77419243, "epoch": 0.5023112031379868, "grad_norm": 7.125, "learning_rate": 8.771281710848697e-06, "loss": 0.80626812, "memory(GiB)": 135.49, "step": 21530, "train_speed(iter/s)": 0.201857 }, { "acc": 0.76942387, "epoch": 0.5025445107102757, "grad_norm": 9.75, "learning_rate": 8.770041088678098e-06, "loss": 0.84295654, "memory(GiB)": 135.49, "step": 21540, "train_speed(iter/s)": 0.201909 }, { "acc": 0.77183619, "epoch": 0.5027778182825646, "grad_norm": 5.625, "learning_rate": 8.768799928335227e-06, "loss": 0.82391186, "memory(GiB)": 135.49, "step": 21550, "train_speed(iter/s)": 0.201954 }, { "acc": 0.79042406, "epoch": 0.5030111258548535, "grad_norm": 5.78125, "learning_rate": 8.76755822999726e-06, "loss": 0.75349379, "memory(GiB)": 135.49, "step": 21560, "train_speed(iter/s)": 0.202005 }, { "acc": 0.76405602, "epoch": 0.5032444334271424, "grad_norm": 7.625, "learning_rate": 8.766315993841452e-06, "loss": 0.85499592, "memory(GiB)": 135.49, "step": 21570, "train_speed(iter/s)": 0.202055 }, { "acc": 0.75095263, "epoch": 0.5034777409994313, "grad_norm": 4.0625, "learning_rate": 8.76507322004513e-06, "loss": 0.90326538, "memory(GiB)": 135.49, "step": 21580, "train_speed(iter/s)": 0.202103 }, { "acc": 0.76625805, "epoch": 0.5037110485717202, "grad_norm": 5.84375, "learning_rate": 8.7638299087857e-06, "loss": 0.83351212, "memory(GiB)": 135.49, "step": 21590, "train_speed(iter/s)": 0.202155 }, { "acc": 0.76977835, "epoch": 0.5039443561440091, "grad_norm": 7.84375, "learning_rate": 8.762586060240642e-06, "loss": 0.83348522, "memory(GiB)": 135.49, "step": 21600, "train_speed(iter/s)": 0.202203 }, { "acc": 0.76524391, "epoch": 0.504177663716298, "grad_norm": 54.0, "learning_rate": 8.761341674587518e-06, "loss": 0.83271847, "memory(GiB)": 135.49, "step": 21610, "train_speed(iter/s)": 0.202254 }, { "acc": 0.78429174, "epoch": 0.5044109712885869, "grad_norm": 5.625, "learning_rate": 8.760096752003962e-06, "loss": 0.79329572, "memory(GiB)": 135.49, "step": 21620, "train_speed(iter/s)": 0.202302 }, { "acc": 0.7748992, "epoch": 0.5046442788608758, "grad_norm": 6.15625, "learning_rate": 8.758851292667687e-06, "loss": 0.8141819, "memory(GiB)": 135.49, "step": 21630, "train_speed(iter/s)": 0.202353 }, { "acc": 0.76074615, "epoch": 0.5048775864331647, "grad_norm": 9.0625, "learning_rate": 8.757605296756483e-06, "loss": 0.8939064, "memory(GiB)": 135.49, "step": 21640, "train_speed(iter/s)": 0.202404 }, { "acc": 0.770714, "epoch": 0.5051108940054536, "grad_norm": 5.25, "learning_rate": 8.756358764448214e-06, "loss": 0.82770824, "memory(GiB)": 135.49, "step": 21650, "train_speed(iter/s)": 0.202456 }, { "acc": 0.77188749, "epoch": 0.5053442015777424, "grad_norm": 6.03125, "learning_rate": 8.755111695920823e-06, "loss": 0.84946203, "memory(GiB)": 135.49, "step": 21660, "train_speed(iter/s)": 0.202504 }, { "acc": 0.77735333, "epoch": 0.5055775091500313, "grad_norm": 4.5, "learning_rate": 8.753864091352326e-06, "loss": 0.7878243, "memory(GiB)": 135.49, "step": 21670, "train_speed(iter/s)": 0.202553 }, { "acc": 0.75859852, "epoch": 0.5058108167223202, "grad_norm": 5.75, "learning_rate": 8.752615950920824e-06, "loss": 0.8829195, "memory(GiB)": 135.49, "step": 21680, "train_speed(iter/s)": 0.2026 }, { "acc": 0.76332989, "epoch": 0.5060441242946091, "grad_norm": 5.78125, "learning_rate": 8.751367274804483e-06, "loss": 0.86273403, "memory(GiB)": 135.49, "step": 21690, "train_speed(iter/s)": 0.202647 }, { "acc": 0.75839734, "epoch": 0.506277431866898, "grad_norm": 5.03125, "learning_rate": 8.750118063181553e-06, "loss": 0.86387119, "memory(GiB)": 135.49, "step": 21700, "train_speed(iter/s)": 0.2027 }, { "acc": 0.76202135, "epoch": 0.5065107394391869, "grad_norm": 6.96875, "learning_rate": 8.74886831623036e-06, "loss": 0.8503706, "memory(GiB)": 135.49, "step": 21710, "train_speed(iter/s)": 0.202742 }, { "acc": 0.76597586, "epoch": 0.5067440470114758, "grad_norm": 4.84375, "learning_rate": 8.747618034129304e-06, "loss": 0.85730705, "memory(GiB)": 135.49, "step": 21720, "train_speed(iter/s)": 0.202792 }, { "acc": 0.77525783, "epoch": 0.5069773545837647, "grad_norm": 6.0625, "learning_rate": 8.746367217056861e-06, "loss": 0.81018257, "memory(GiB)": 135.49, "step": 21730, "train_speed(iter/s)": 0.202835 }, { "acc": 0.75620117, "epoch": 0.5072106621560536, "grad_norm": 5.875, "learning_rate": 8.745115865191587e-06, "loss": 0.8785862, "memory(GiB)": 135.49, "step": 21740, "train_speed(iter/s)": 0.202881 }, { "acc": 0.78099604, "epoch": 0.5074439697283425, "grad_norm": 6.90625, "learning_rate": 8.743863978712111e-06, "loss": 0.78154316, "memory(GiB)": 135.49, "step": 21750, "train_speed(iter/s)": 0.202927 }, { "acc": 0.76001511, "epoch": 0.5076772773006314, "grad_norm": 5.875, "learning_rate": 8.74261155779714e-06, "loss": 0.84587021, "memory(GiB)": 135.49, "step": 21760, "train_speed(iter/s)": 0.202977 }, { "acc": 0.75557723, "epoch": 0.5079105848729203, "grad_norm": 6.28125, "learning_rate": 8.741358602625455e-06, "loss": 0.89103594, "memory(GiB)": 135.49, "step": 21770, "train_speed(iter/s)": 0.203022 }, { "acc": 0.77349072, "epoch": 0.5081438924452092, "grad_norm": 8.5, "learning_rate": 8.740105113375919e-06, "loss": 0.83380623, "memory(GiB)": 135.49, "step": 21780, "train_speed(iter/s)": 0.20307 }, { "acc": 0.75671043, "epoch": 0.5083772000174981, "grad_norm": 6.28125, "learning_rate": 8.738851090227462e-06, "loss": 0.87419128, "memory(GiB)": 135.49, "step": 21790, "train_speed(iter/s)": 0.203118 }, { "acc": 0.77447839, "epoch": 0.508610507589787, "grad_norm": 6.5, "learning_rate": 8.737596533359101e-06, "loss": 0.82175837, "memory(GiB)": 135.49, "step": 21800, "train_speed(iter/s)": 0.203169 }, { "acc": 0.75856404, "epoch": 0.5088438151620759, "grad_norm": 6.71875, "learning_rate": 8.736341442949919e-06, "loss": 0.8789731, "memory(GiB)": 135.49, "step": 21810, "train_speed(iter/s)": 0.203221 }, { "acc": 0.77915144, "epoch": 0.5090771227343648, "grad_norm": 7.8125, "learning_rate": 8.73508581917908e-06, "loss": 0.77205162, "memory(GiB)": 135.49, "step": 21820, "train_speed(iter/s)": 0.203272 }, { "acc": 0.73904734, "epoch": 0.5093104303066537, "grad_norm": 10.6875, "learning_rate": 8.733829662225825e-06, "loss": 0.95421906, "memory(GiB)": 135.49, "step": 21830, "train_speed(iter/s)": 0.203321 }, { "acc": 0.76364794, "epoch": 0.5095437378789426, "grad_norm": 8.25, "learning_rate": 8.732572972269472e-06, "loss": 0.856429, "memory(GiB)": 135.49, "step": 21840, "train_speed(iter/s)": 0.203369 }, { "acc": 0.77039509, "epoch": 0.5097770454512315, "grad_norm": 6.21875, "learning_rate": 8.731315749489412e-06, "loss": 0.80742626, "memory(GiB)": 135.49, "step": 21850, "train_speed(iter/s)": 0.203417 }, { "acc": 0.77626538, "epoch": 0.5100103530235203, "grad_norm": 5.59375, "learning_rate": 8.730057994065113e-06, "loss": 0.8053051, "memory(GiB)": 135.49, "step": 21860, "train_speed(iter/s)": 0.203465 }, { "acc": 0.73854513, "epoch": 0.5102436605958092, "grad_norm": 7.34375, "learning_rate": 8.728799706176117e-06, "loss": 0.95123453, "memory(GiB)": 135.49, "step": 21870, "train_speed(iter/s)": 0.203513 }, { "acc": 0.78395672, "epoch": 0.5104769681680981, "grad_norm": 4.96875, "learning_rate": 8.727540886002048e-06, "loss": 0.76409702, "memory(GiB)": 135.49, "step": 21880, "train_speed(iter/s)": 0.203565 }, { "acc": 0.78190427, "epoch": 0.510710275740387, "grad_norm": 4.96875, "learning_rate": 8.7262815337226e-06, "loss": 0.77849307, "memory(GiB)": 135.49, "step": 21890, "train_speed(iter/s)": 0.203614 }, { "acc": 0.7679678, "epoch": 0.5109435833126759, "grad_norm": 8.1875, "learning_rate": 8.725021649517545e-06, "loss": 0.84533405, "memory(GiB)": 135.49, "step": 21900, "train_speed(iter/s)": 0.203663 }, { "acc": 0.76385984, "epoch": 0.5111768908849648, "grad_norm": 9.0625, "learning_rate": 8.723761233566732e-06, "loss": 0.85051146, "memory(GiB)": 135.49, "step": 21910, "train_speed(iter/s)": 0.203714 }, { "acc": 0.76122065, "epoch": 0.5114101984572537, "grad_norm": 7.375, "learning_rate": 8.722500286050084e-06, "loss": 0.86356144, "memory(GiB)": 135.49, "step": 21920, "train_speed(iter/s)": 0.20376 }, { "acc": 0.75424414, "epoch": 0.5116435060295426, "grad_norm": 7.125, "learning_rate": 8.721238807147602e-06, "loss": 0.88767586, "memory(GiB)": 135.49, "step": 21930, "train_speed(iter/s)": 0.203807 }, { "acc": 0.74846973, "epoch": 0.5118768136018315, "grad_norm": 5.8125, "learning_rate": 8.71997679703936e-06, "loss": 0.9065074, "memory(GiB)": 135.49, "step": 21940, "train_speed(iter/s)": 0.203857 }, { "acc": 0.75657244, "epoch": 0.5121101211741204, "grad_norm": 8.625, "learning_rate": 8.718714255905514e-06, "loss": 0.89676313, "memory(GiB)": 135.49, "step": 21950, "train_speed(iter/s)": 0.203903 }, { "acc": 0.77625394, "epoch": 0.5123434287464093, "grad_norm": 6.625, "learning_rate": 8.717451183926286e-06, "loss": 0.79575191, "memory(GiB)": 135.49, "step": 21960, "train_speed(iter/s)": 0.203952 }, { "acc": 0.76489639, "epoch": 0.5125767363186982, "grad_norm": 5.5625, "learning_rate": 8.716187581281982e-06, "loss": 0.83274612, "memory(GiB)": 135.49, "step": 21970, "train_speed(iter/s)": 0.204 }, { "acc": 0.76640348, "epoch": 0.512810043890987, "grad_norm": 6.15625, "learning_rate": 8.71492344815298e-06, "loss": 0.85010786, "memory(GiB)": 135.49, "step": 21980, "train_speed(iter/s)": 0.20405 }, { "acc": 0.76524248, "epoch": 0.513043351463276, "grad_norm": 6.8125, "learning_rate": 8.713658784719735e-06, "loss": 0.84684811, "memory(GiB)": 135.49, "step": 21990, "train_speed(iter/s)": 0.204096 }, { "acc": 0.77506523, "epoch": 0.5132766590355649, "grad_norm": 5.125, "learning_rate": 8.712393591162779e-06, "loss": 0.81104555, "memory(GiB)": 135.49, "step": 22000, "train_speed(iter/s)": 0.204146 }, { "epoch": 0.5132766590355649, "eval_acc": 0.7333862000200065, "eval_loss": 0.8411638140678406, "eval_runtime": 1263.3286, "eval_samples_per_second": 28.489, "eval_steps_per_second": 14.245, "step": 22000 }, { "acc": 0.75282259, "epoch": 0.5135099666078538, "grad_norm": 7.3125, "learning_rate": 8.711127867662715e-06, "loss": 0.90065165, "memory(GiB)": 135.49, "step": 22010, "train_speed(iter/s)": 0.201781 }, { "acc": 0.76085701, "epoch": 0.5137432741801427, "grad_norm": 7.0625, "learning_rate": 8.709861614400223e-06, "loss": 0.8657465, "memory(GiB)": 135.49, "step": 22020, "train_speed(iter/s)": 0.201831 }, { "acc": 0.76477222, "epoch": 0.5139765817524315, "grad_norm": 5.3125, "learning_rate": 8.708594831556068e-06, "loss": 0.85149727, "memory(GiB)": 135.49, "step": 22030, "train_speed(iter/s)": 0.201877 }, { "acc": 0.77038813, "epoch": 0.5142098893247204, "grad_norm": 6.34375, "learning_rate": 8.707327519311075e-06, "loss": 0.82421589, "memory(GiB)": 135.49, "step": 22040, "train_speed(iter/s)": 0.201922 }, { "acc": 0.75943213, "epoch": 0.5144431968970092, "grad_norm": 8.8125, "learning_rate": 8.706059677846157e-06, "loss": 0.87790775, "memory(GiB)": 135.49, "step": 22050, "train_speed(iter/s)": 0.20197 }, { "acc": 0.77695327, "epoch": 0.5146765044692981, "grad_norm": 4.875, "learning_rate": 8.704791307342297e-06, "loss": 0.78269749, "memory(GiB)": 135.49, "step": 22060, "train_speed(iter/s)": 0.202016 }, { "acc": 0.76895256, "epoch": 0.514909812041587, "grad_norm": 6.21875, "learning_rate": 8.703522407980554e-06, "loss": 0.8444231, "memory(GiB)": 135.49, "step": 22070, "train_speed(iter/s)": 0.202067 }, { "acc": 0.77682228, "epoch": 0.5151431196138759, "grad_norm": 7.34375, "learning_rate": 8.702252979942063e-06, "loss": 0.80677395, "memory(GiB)": 135.49, "step": 22080, "train_speed(iter/s)": 0.202119 }, { "acc": 0.76869836, "epoch": 0.5153764271861648, "grad_norm": 6.21875, "learning_rate": 8.700983023408034e-06, "loss": 0.84169931, "memory(GiB)": 135.49, "step": 22090, "train_speed(iter/s)": 0.202166 }, { "acc": 0.74050241, "epoch": 0.5156097347584537, "grad_norm": 7.53125, "learning_rate": 8.699712538559752e-06, "loss": 0.9410841, "memory(GiB)": 135.49, "step": 22100, "train_speed(iter/s)": 0.202215 }, { "acc": 0.75615292, "epoch": 0.5158430423307426, "grad_norm": 6.03125, "learning_rate": 8.698441525578582e-06, "loss": 0.86833105, "memory(GiB)": 135.49, "step": 22110, "train_speed(iter/s)": 0.202263 }, { "acc": 0.78462915, "epoch": 0.5160763499030315, "grad_norm": 6.3125, "learning_rate": 8.697169984645959e-06, "loss": 0.76147332, "memory(GiB)": 135.49, "step": 22120, "train_speed(iter/s)": 0.202313 }, { "acc": 0.76850891, "epoch": 0.5163096574753204, "grad_norm": 5.125, "learning_rate": 8.695897915943395e-06, "loss": 0.82608023, "memory(GiB)": 135.49, "step": 22130, "train_speed(iter/s)": 0.202362 }, { "acc": 0.77396784, "epoch": 0.5165429650476093, "grad_norm": 7.5625, "learning_rate": 8.694625319652477e-06, "loss": 0.8396596, "memory(GiB)": 135.49, "step": 22140, "train_speed(iter/s)": 0.202409 }, { "acc": 0.78052754, "epoch": 0.5167762726198982, "grad_norm": 5.75, "learning_rate": 8.693352195954866e-06, "loss": 0.82580967, "memory(GiB)": 135.49, "step": 22150, "train_speed(iter/s)": 0.202457 }, { "acc": 0.74678736, "epoch": 0.5170095801921871, "grad_norm": 4.5625, "learning_rate": 8.692078545032304e-06, "loss": 0.91916122, "memory(GiB)": 135.49, "step": 22160, "train_speed(iter/s)": 0.202504 }, { "acc": 0.77834253, "epoch": 0.517242887764476, "grad_norm": 6.40625, "learning_rate": 8.6908043670666e-06, "loss": 0.82092743, "memory(GiB)": 135.49, "step": 22170, "train_speed(iter/s)": 0.202552 }, { "acc": 0.75645924, "epoch": 0.5174761953367649, "grad_norm": 8.1875, "learning_rate": 8.689529662239647e-06, "loss": 0.90903053, "memory(GiB)": 135.49, "step": 22180, "train_speed(iter/s)": 0.202596 }, { "acc": 0.75742769, "epoch": 0.5177095029090538, "grad_norm": 4.8125, "learning_rate": 8.688254430733405e-06, "loss": 0.8678875, "memory(GiB)": 135.49, "step": 22190, "train_speed(iter/s)": 0.202645 }, { "acc": 0.77037096, "epoch": 0.5179428104813427, "grad_norm": 5.0625, "learning_rate": 8.686978672729916e-06, "loss": 0.85545216, "memory(GiB)": 135.49, "step": 22200, "train_speed(iter/s)": 0.202691 }, { "acc": 0.74468794, "epoch": 0.5181761180536316, "grad_norm": 7.21875, "learning_rate": 8.68570238841129e-06, "loss": 0.92368145, "memory(GiB)": 135.49, "step": 22210, "train_speed(iter/s)": 0.202739 }, { "acc": 0.75999327, "epoch": 0.5184094256259205, "grad_norm": 6.25, "learning_rate": 8.684425577959722e-06, "loss": 0.85787907, "memory(GiB)": 135.49, "step": 22220, "train_speed(iter/s)": 0.202786 }, { "acc": 0.77888823, "epoch": 0.5186427331982094, "grad_norm": 5.59375, "learning_rate": 8.683148241557472e-06, "loss": 0.81330433, "memory(GiB)": 135.49, "step": 22230, "train_speed(iter/s)": 0.202833 }, { "acc": 0.75407305, "epoch": 0.5188760407704982, "grad_norm": 6.21875, "learning_rate": 8.681870379386879e-06, "loss": 0.8887085, "memory(GiB)": 135.49, "step": 22240, "train_speed(iter/s)": 0.202884 }, { "acc": 0.76019168, "epoch": 0.5191093483427871, "grad_norm": 5.84375, "learning_rate": 8.68059199163036e-06, "loss": 0.8646513, "memory(GiB)": 135.49, "step": 22250, "train_speed(iter/s)": 0.20293 }, { "acc": 0.77266188, "epoch": 0.519342655915076, "grad_norm": 4.6875, "learning_rate": 8.679313078470403e-06, "loss": 0.80042543, "memory(GiB)": 135.49, "step": 22260, "train_speed(iter/s)": 0.202974 }, { "acc": 0.77338095, "epoch": 0.5195759634873649, "grad_norm": 4.5, "learning_rate": 8.678033640089574e-06, "loss": 0.81481285, "memory(GiB)": 135.49, "step": 22270, "train_speed(iter/s)": 0.203021 }, { "acc": 0.76883183, "epoch": 0.5198092710596538, "grad_norm": 6.96875, "learning_rate": 8.676753676670511e-06, "loss": 0.8299674, "memory(GiB)": 135.49, "step": 22280, "train_speed(iter/s)": 0.203068 }, { "acc": 0.7796629, "epoch": 0.5200425786319427, "grad_norm": 6.21875, "learning_rate": 8.67547318839593e-06, "loss": 0.8038166, "memory(GiB)": 135.49, "step": 22290, "train_speed(iter/s)": 0.203119 }, { "acc": 0.76623964, "epoch": 0.5202758862042316, "grad_norm": 8.25, "learning_rate": 8.674192175448617e-06, "loss": 0.81277609, "memory(GiB)": 135.49, "step": 22300, "train_speed(iter/s)": 0.203166 }, { "acc": 0.77275934, "epoch": 0.5205091937765205, "grad_norm": 7.9375, "learning_rate": 8.672910638011439e-06, "loss": 0.80928802, "memory(GiB)": 135.49, "step": 22310, "train_speed(iter/s)": 0.203213 }, { "acc": 0.74733486, "epoch": 0.5207425013488094, "grad_norm": 5.625, "learning_rate": 8.671628576267333e-06, "loss": 0.90864058, "memory(GiB)": 135.49, "step": 22320, "train_speed(iter/s)": 0.203263 }, { "acc": 0.7622673, "epoch": 0.5209758089210983, "grad_norm": 6.0625, "learning_rate": 8.670345990399317e-06, "loss": 0.86865234, "memory(GiB)": 135.49, "step": 22330, "train_speed(iter/s)": 0.203311 }, { "acc": 0.76430178, "epoch": 0.5212091164933872, "grad_norm": 7.0625, "learning_rate": 8.669062880590474e-06, "loss": 0.87208309, "memory(GiB)": 135.49, "step": 22340, "train_speed(iter/s)": 0.203358 }, { "acc": 0.7700243, "epoch": 0.5214424240656761, "grad_norm": 5.875, "learning_rate": 8.667779247023974e-06, "loss": 0.82719364, "memory(GiB)": 135.49, "step": 22350, "train_speed(iter/s)": 0.203403 }, { "acc": 0.77398543, "epoch": 0.521675731637965, "grad_norm": 5.0625, "learning_rate": 8.666495089883049e-06, "loss": 0.80392952, "memory(GiB)": 135.49, "step": 22360, "train_speed(iter/s)": 0.203451 }, { "acc": 0.78728781, "epoch": 0.5219090392102539, "grad_norm": 6.3125, "learning_rate": 8.665210409351015e-06, "loss": 0.76222181, "memory(GiB)": 135.49, "step": 22370, "train_speed(iter/s)": 0.203494 }, { "acc": 0.757302, "epoch": 0.5221423467825428, "grad_norm": 5.59375, "learning_rate": 8.663925205611261e-06, "loss": 0.91073914, "memory(GiB)": 135.49, "step": 22380, "train_speed(iter/s)": 0.203544 }, { "acc": 0.78042789, "epoch": 0.5223756543548317, "grad_norm": 4.9375, "learning_rate": 8.66263947884725e-06, "loss": 0.80484838, "memory(GiB)": 135.49, "step": 22390, "train_speed(iter/s)": 0.20359 }, { "acc": 0.76949029, "epoch": 0.5226089619271206, "grad_norm": 5.125, "learning_rate": 8.661353229242514e-06, "loss": 0.81789513, "memory(GiB)": 135.49, "step": 22400, "train_speed(iter/s)": 0.203638 }, { "acc": 0.77950153, "epoch": 0.5228422694994095, "grad_norm": 4.78125, "learning_rate": 8.66006645698067e-06, "loss": 0.7946104, "memory(GiB)": 135.49, "step": 22410, "train_speed(iter/s)": 0.203683 }, { "acc": 0.76224537, "epoch": 0.5230755770716984, "grad_norm": 7.625, "learning_rate": 8.658779162245404e-06, "loss": 0.85305929, "memory(GiB)": 135.49, "step": 22420, "train_speed(iter/s)": 0.20373 }, { "acc": 0.78252764, "epoch": 0.5233088846439872, "grad_norm": 6.25, "learning_rate": 8.657491345220475e-06, "loss": 0.77794399, "memory(GiB)": 135.49, "step": 22430, "train_speed(iter/s)": 0.203777 }, { "acc": 0.78178725, "epoch": 0.5235421922162761, "grad_norm": 4.78125, "learning_rate": 8.656203006089716e-06, "loss": 0.7675909, "memory(GiB)": 135.49, "step": 22440, "train_speed(iter/s)": 0.203825 }, { "acc": 0.76836543, "epoch": 0.523775499788565, "grad_norm": 6.5, "learning_rate": 8.654914145037044e-06, "loss": 0.84373474, "memory(GiB)": 135.49, "step": 22450, "train_speed(iter/s)": 0.203875 }, { "acc": 0.7784379, "epoch": 0.5240088073608539, "grad_norm": 4.6875, "learning_rate": 8.653624762246437e-06, "loss": 0.79119368, "memory(GiB)": 135.49, "step": 22460, "train_speed(iter/s)": 0.203922 }, { "acc": 0.75871043, "epoch": 0.5242421149331428, "grad_norm": 4.625, "learning_rate": 8.652334857901957e-06, "loss": 0.86542797, "memory(GiB)": 135.49, "step": 22470, "train_speed(iter/s)": 0.203968 }, { "acc": 0.77279181, "epoch": 0.5244754225054317, "grad_norm": 5.8125, "learning_rate": 8.651044432187736e-06, "loss": 0.82461414, "memory(GiB)": 135.49, "step": 22480, "train_speed(iter/s)": 0.204016 }, { "acc": 0.77023087, "epoch": 0.5247087300777206, "grad_norm": 7.65625, "learning_rate": 8.649753485287986e-06, "loss": 0.83507442, "memory(GiB)": 135.49, "step": 22490, "train_speed(iter/s)": 0.204063 }, { "acc": 0.7425971, "epoch": 0.5249420376500095, "grad_norm": 10.625, "learning_rate": 8.648462017386982e-06, "loss": 0.97700481, "memory(GiB)": 135.49, "step": 22500, "train_speed(iter/s)": 0.204108 }, { "epoch": 0.5249420376500095, "eval_acc": 0.7334762293521437, "eval_loss": 0.8410158157348633, "eval_runtime": 1263.3385, "eval_samples_per_second": 28.489, "eval_steps_per_second": 14.245, "step": 22500 }, { "acc": 0.7463769, "epoch": 0.5251753452222984, "grad_norm": 5.125, "learning_rate": 8.64717002866909e-06, "loss": 0.90748281, "memory(GiB)": 135.49, "step": 22510, "train_speed(iter/s)": 0.201794 }, { "acc": 0.7711338, "epoch": 0.5254086527945873, "grad_norm": 6.21875, "learning_rate": 8.64587751931873e-06, "loss": 0.84214458, "memory(GiB)": 135.49, "step": 22520, "train_speed(iter/s)": 0.201841 }, { "acc": 0.7714242, "epoch": 0.5256419603668762, "grad_norm": 7.0, "learning_rate": 8.644584489520418e-06, "loss": 0.82742691, "memory(GiB)": 135.49, "step": 22530, "train_speed(iter/s)": 0.201885 }, { "acc": 0.77334108, "epoch": 0.5258752679391651, "grad_norm": 6.5625, "learning_rate": 8.643290939458728e-06, "loss": 0.80096493, "memory(GiB)": 135.49, "step": 22540, "train_speed(iter/s)": 0.201931 }, { "acc": 0.75839257, "epoch": 0.526108575511454, "grad_norm": 7.78125, "learning_rate": 8.641996869318313e-06, "loss": 0.88166351, "memory(GiB)": 135.49, "step": 22550, "train_speed(iter/s)": 0.20198 }, { "acc": 0.77138615, "epoch": 0.5263418830837429, "grad_norm": 15.0, "learning_rate": 8.640702279283904e-06, "loss": 0.82558355, "memory(GiB)": 135.49, "step": 22560, "train_speed(iter/s)": 0.202028 }, { "acc": 0.78859496, "epoch": 0.5265751906560318, "grad_norm": 4.125, "learning_rate": 8.639407169540302e-06, "loss": 0.75231824, "memory(GiB)": 135.49, "step": 22570, "train_speed(iter/s)": 0.202072 }, { "acc": 0.7619205, "epoch": 0.5268084982283207, "grad_norm": 5.0, "learning_rate": 8.638111540272384e-06, "loss": 0.85614414, "memory(GiB)": 135.49, "step": 22580, "train_speed(iter/s)": 0.202117 }, { "acc": 0.77643824, "epoch": 0.5270418058006096, "grad_norm": 6.15625, "learning_rate": 8.636815391665102e-06, "loss": 0.80550213, "memory(GiB)": 135.49, "step": 22590, "train_speed(iter/s)": 0.202163 }, { "acc": 0.76234941, "epoch": 0.5272751133728985, "grad_norm": 5.5, "learning_rate": 8.635518723903478e-06, "loss": 0.84981022, "memory(GiB)": 135.49, "step": 22600, "train_speed(iter/s)": 0.202209 }, { "acc": 0.76929321, "epoch": 0.5275084209451874, "grad_norm": 5.21875, "learning_rate": 8.634221537172612e-06, "loss": 0.84304075, "memory(GiB)": 135.49, "step": 22610, "train_speed(iter/s)": 0.202254 }, { "acc": 0.74534149, "epoch": 0.5277417285174762, "grad_norm": 6.28125, "learning_rate": 8.632923831657678e-06, "loss": 0.93666344, "memory(GiB)": 135.49, "step": 22620, "train_speed(iter/s)": 0.202302 }, { "acc": 0.76843634, "epoch": 0.527975036089765, "grad_norm": 5.75, "learning_rate": 8.631625607543921e-06, "loss": 0.83468943, "memory(GiB)": 135.49, "step": 22630, "train_speed(iter/s)": 0.202348 }, { "acc": 0.77865896, "epoch": 0.5282083436620539, "grad_norm": 5.0, "learning_rate": 8.630326865016663e-06, "loss": 0.79705954, "memory(GiB)": 135.49, "step": 22640, "train_speed(iter/s)": 0.202394 }, { "acc": 0.75622969, "epoch": 0.5284416512343428, "grad_norm": 5.46875, "learning_rate": 8.629027604261303e-06, "loss": 0.89146366, "memory(GiB)": 135.49, "step": 22650, "train_speed(iter/s)": 0.202442 }, { "acc": 0.77083435, "epoch": 0.5286749588066317, "grad_norm": 7.28125, "learning_rate": 8.627727825463303e-06, "loss": 0.82840633, "memory(GiB)": 135.49, "step": 22660, "train_speed(iter/s)": 0.202492 }, { "acc": 0.77896614, "epoch": 0.5289082663789206, "grad_norm": 6.1875, "learning_rate": 8.626427528808212e-06, "loss": 0.78767605, "memory(GiB)": 135.49, "step": 22670, "train_speed(iter/s)": 0.202536 }, { "acc": 0.76126528, "epoch": 0.5291415739512095, "grad_norm": 13.5625, "learning_rate": 8.625126714481645e-06, "loss": 0.90700569, "memory(GiB)": 135.49, "step": 22680, "train_speed(iter/s)": 0.202583 }, { "acc": 0.76198072, "epoch": 0.5293748815234984, "grad_norm": 11.6875, "learning_rate": 8.623825382669291e-06, "loss": 0.87938271, "memory(GiB)": 135.49, "step": 22690, "train_speed(iter/s)": 0.20263 }, { "acc": 0.77120328, "epoch": 0.5296081890957873, "grad_norm": 7.15625, "learning_rate": 8.622523533556916e-06, "loss": 0.83929205, "memory(GiB)": 135.49, "step": 22700, "train_speed(iter/s)": 0.202679 }, { "acc": 0.76387472, "epoch": 0.5298414966680762, "grad_norm": 5.6875, "learning_rate": 8.621221167330363e-06, "loss": 0.87949638, "memory(GiB)": 135.49, "step": 22710, "train_speed(iter/s)": 0.202727 }, { "acc": 0.78230534, "epoch": 0.5300748042403651, "grad_norm": 5.875, "learning_rate": 8.619918284175537e-06, "loss": 0.76435866, "memory(GiB)": 135.49, "step": 22720, "train_speed(iter/s)": 0.202775 }, { "acc": 0.76150417, "epoch": 0.530308111812654, "grad_norm": 5.15625, "learning_rate": 8.618614884278427e-06, "loss": 0.86660538, "memory(GiB)": 135.49, "step": 22730, "train_speed(iter/s)": 0.202821 }, { "acc": 0.76483765, "epoch": 0.5305414193849429, "grad_norm": 4.96875, "learning_rate": 8.617310967825094e-06, "loss": 0.82864132, "memory(GiB)": 135.49, "step": 22740, "train_speed(iter/s)": 0.202867 }, { "acc": 0.7704339, "epoch": 0.5307747269572318, "grad_norm": 7.03125, "learning_rate": 8.616006535001673e-06, "loss": 0.84206514, "memory(GiB)": 135.49, "step": 22750, "train_speed(iter/s)": 0.20291 }, { "acc": 0.76693459, "epoch": 0.5310080345295207, "grad_norm": 7.40625, "learning_rate": 8.614701585994368e-06, "loss": 0.83635578, "memory(GiB)": 135.49, "step": 22760, "train_speed(iter/s)": 0.202957 }, { "acc": 0.78106937, "epoch": 0.5312413421018096, "grad_norm": 7.90625, "learning_rate": 8.613396120989463e-06, "loss": 0.78327541, "memory(GiB)": 135.49, "step": 22770, "train_speed(iter/s)": 0.203001 }, { "acc": 0.77457027, "epoch": 0.5314746496740985, "grad_norm": 5.65625, "learning_rate": 8.61209014017331e-06, "loss": 0.80380011, "memory(GiB)": 135.49, "step": 22780, "train_speed(iter/s)": 0.203051 }, { "acc": 0.74233837, "epoch": 0.5317079572463874, "grad_norm": 10.75, "learning_rate": 8.610783643732339e-06, "loss": 0.92551575, "memory(GiB)": 135.49, "step": 22790, "train_speed(iter/s)": 0.203099 }, { "acc": 0.7698678, "epoch": 0.5319412648186763, "grad_norm": 5.40625, "learning_rate": 8.60947663185305e-06, "loss": 0.85055923, "memory(GiB)": 135.49, "step": 22800, "train_speed(iter/s)": 0.203146 }, { "acc": 0.74941635, "epoch": 0.5321745723909652, "grad_norm": 5.125, "learning_rate": 8.608169104722024e-06, "loss": 0.91974421, "memory(GiB)": 135.49, "step": 22810, "train_speed(iter/s)": 0.203191 }, { "acc": 0.73897223, "epoch": 0.532407879963254, "grad_norm": 5.84375, "learning_rate": 8.606861062525904e-06, "loss": 0.95769196, "memory(GiB)": 135.49, "step": 22820, "train_speed(iter/s)": 0.203234 }, { "acc": 0.75574875, "epoch": 0.5326411875355429, "grad_norm": 6.09375, "learning_rate": 8.605552505451417e-06, "loss": 0.89433231, "memory(GiB)": 135.49, "step": 22830, "train_speed(iter/s)": 0.203282 }, { "acc": 0.76349201, "epoch": 0.5328744951078318, "grad_norm": 6.3125, "learning_rate": 8.604243433685356e-06, "loss": 0.88474665, "memory(GiB)": 135.49, "step": 22840, "train_speed(iter/s)": 0.203332 }, { "acc": 0.77740908, "epoch": 0.5331078026801207, "grad_norm": 6.90625, "learning_rate": 8.602933847414592e-06, "loss": 0.80575705, "memory(GiB)": 135.49, "step": 22850, "train_speed(iter/s)": 0.20338 }, { "acc": 0.77938728, "epoch": 0.5333411102524096, "grad_norm": 5.21875, "learning_rate": 8.601623746826068e-06, "loss": 0.80406418, "memory(GiB)": 135.49, "step": 22860, "train_speed(iter/s)": 0.203427 }, { "acc": 0.77814388, "epoch": 0.5335744178246985, "grad_norm": 6.15625, "learning_rate": 8.600313132106801e-06, "loss": 0.82497854, "memory(GiB)": 135.49, "step": 22870, "train_speed(iter/s)": 0.203477 }, { "acc": 0.74938765, "epoch": 0.5338077253969874, "grad_norm": 7.6875, "learning_rate": 8.599002003443879e-06, "loss": 0.89406528, "memory(GiB)": 135.49, "step": 22880, "train_speed(iter/s)": 0.203524 }, { "acc": 0.76606164, "epoch": 0.5340410329692763, "grad_norm": 7.4375, "learning_rate": 8.597690361024468e-06, "loss": 0.84207964, "memory(GiB)": 135.49, "step": 22890, "train_speed(iter/s)": 0.20357 }, { "acc": 0.76744127, "epoch": 0.5342743405415652, "grad_norm": 9.3125, "learning_rate": 8.596378205035803e-06, "loss": 0.86165237, "memory(GiB)": 135.49, "step": 22900, "train_speed(iter/s)": 0.203616 }, { "acc": 0.76862221, "epoch": 0.5345076481138541, "grad_norm": 5.46875, "learning_rate": 8.595065535665192e-06, "loss": 0.83547659, "memory(GiB)": 135.49, "step": 22910, "train_speed(iter/s)": 0.203663 }, { "acc": 0.7635561, "epoch": 0.534740955686143, "grad_norm": 6.0625, "learning_rate": 8.593752353100022e-06, "loss": 0.87516937, "memory(GiB)": 135.49, "step": 22920, "train_speed(iter/s)": 0.203708 }, { "acc": 0.75567832, "epoch": 0.5349742632584319, "grad_norm": 7.5, "learning_rate": 8.592438657527746e-06, "loss": 0.90006075, "memory(GiB)": 135.49, "step": 22930, "train_speed(iter/s)": 0.203752 }, { "acc": 0.76826582, "epoch": 0.5352075708307208, "grad_norm": 5.96875, "learning_rate": 8.591124449135897e-06, "loss": 0.80793161, "memory(GiB)": 135.49, "step": 22940, "train_speed(iter/s)": 0.2038 }, { "acc": 0.77429943, "epoch": 0.5354408784030097, "grad_norm": 5.25, "learning_rate": 8.589809728112076e-06, "loss": 0.81636438, "memory(GiB)": 135.49, "step": 22950, "train_speed(iter/s)": 0.203846 }, { "acc": 0.76664867, "epoch": 0.5356741859752986, "grad_norm": 5.15625, "learning_rate": 8.588494494643959e-06, "loss": 0.85079727, "memory(GiB)": 135.49, "step": 22960, "train_speed(iter/s)": 0.203893 }, { "acc": 0.77166595, "epoch": 0.5359074935475875, "grad_norm": 6.875, "learning_rate": 8.587178748919294e-06, "loss": 0.84034262, "memory(GiB)": 135.49, "step": 22970, "train_speed(iter/s)": 0.203936 }, { "acc": 0.7592371, "epoch": 0.5361408011198764, "grad_norm": 6.15625, "learning_rate": 8.585862491125906e-06, "loss": 0.87178383, "memory(GiB)": 135.49, "step": 22980, "train_speed(iter/s)": 0.203983 }, { "acc": 0.7613801, "epoch": 0.5363741086921653, "grad_norm": 7.09375, "learning_rate": 8.584545721451689e-06, "loss": 0.86859055, "memory(GiB)": 135.49, "step": 22990, "train_speed(iter/s)": 0.204029 }, { "acc": 0.77089453, "epoch": 0.5366074162644542, "grad_norm": 6.28125, "learning_rate": 8.583228440084612e-06, "loss": 0.81495457, "memory(GiB)": 135.49, "step": 23000, "train_speed(iter/s)": 0.204078 }, { "epoch": 0.5366074162644542, "eval_acc": 0.7335970751761057, "eval_loss": 0.8402589559555054, "eval_runtime": 1262.5134, "eval_samples_per_second": 28.507, "eval_steps_per_second": 14.254, "step": 23000 }, { "acc": 0.73282828, "epoch": 0.536840723836743, "grad_norm": 26.125, "learning_rate": 8.581910647212714e-06, "loss": 0.98265486, "memory(GiB)": 135.49, "step": 23010, "train_speed(iter/s)": 0.201815 }, { "acc": 0.75975924, "epoch": 0.5370740314090319, "grad_norm": 6.875, "learning_rate": 8.580592343024114e-06, "loss": 0.88351765, "memory(GiB)": 135.49, "step": 23020, "train_speed(iter/s)": 0.20186 }, { "acc": 0.76980691, "epoch": 0.5373073389813208, "grad_norm": 5.125, "learning_rate": 8.579273527706997e-06, "loss": 0.83077517, "memory(GiB)": 135.49, "step": 23030, "train_speed(iter/s)": 0.201904 }, { "acc": 0.7578351, "epoch": 0.5375406465536097, "grad_norm": 6.46875, "learning_rate": 8.577954201449621e-06, "loss": 0.87964821, "memory(GiB)": 135.49, "step": 23040, "train_speed(iter/s)": 0.201949 }, { "acc": 0.76563787, "epoch": 0.5377739541258986, "grad_norm": 5.9375, "learning_rate": 8.576634364440327e-06, "loss": 0.8388237, "memory(GiB)": 135.49, "step": 23050, "train_speed(iter/s)": 0.201988 }, { "acc": 0.77380834, "epoch": 0.5380072616981875, "grad_norm": 7.25, "learning_rate": 8.575314016867512e-06, "loss": 0.82013531, "memory(GiB)": 135.49, "step": 23060, "train_speed(iter/s)": 0.202035 }, { "acc": 0.7680933, "epoch": 0.5382405692704764, "grad_norm": 8.5625, "learning_rate": 8.573993158919661e-06, "loss": 0.82164459, "memory(GiB)": 135.49, "step": 23070, "train_speed(iter/s)": 0.202081 }, { "acc": 0.77237864, "epoch": 0.5384738768427653, "grad_norm": 5.71875, "learning_rate": 8.572671790785325e-06, "loss": 0.81115112, "memory(GiB)": 135.49, "step": 23080, "train_speed(iter/s)": 0.202124 }, { "acc": 0.75384626, "epoch": 0.5387071844150542, "grad_norm": 5.28125, "learning_rate": 8.57134991265313e-06, "loss": 0.87708273, "memory(GiB)": 135.49, "step": 23090, "train_speed(iter/s)": 0.202172 }, { "acc": 0.77324486, "epoch": 0.5389404919873431, "grad_norm": 5.875, "learning_rate": 8.57002752471177e-06, "loss": 0.83139248, "memory(GiB)": 135.49, "step": 23100, "train_speed(iter/s)": 0.202217 }, { "acc": 0.76092548, "epoch": 0.539173799559632, "grad_norm": 5.78125, "learning_rate": 8.56870462715002e-06, "loss": 0.85643644, "memory(GiB)": 135.49, "step": 23110, "train_speed(iter/s)": 0.202264 }, { "acc": 0.75681486, "epoch": 0.5394071071319209, "grad_norm": 4.59375, "learning_rate": 8.567381220156721e-06, "loss": 0.88403416, "memory(GiB)": 135.49, "step": 23120, "train_speed(iter/s)": 0.202308 }, { "acc": 0.76574316, "epoch": 0.5396404147042098, "grad_norm": 7.3125, "learning_rate": 8.566057303920788e-06, "loss": 0.84044876, "memory(GiB)": 135.49, "step": 23130, "train_speed(iter/s)": 0.202356 }, { "acc": 0.77936935, "epoch": 0.5398737222764987, "grad_norm": 4.75, "learning_rate": 8.564732878631212e-06, "loss": 0.79117284, "memory(GiB)": 135.49, "step": 23140, "train_speed(iter/s)": 0.2024 }, { "acc": 0.76601772, "epoch": 0.5401070298487876, "grad_norm": 4.4375, "learning_rate": 8.563407944477052e-06, "loss": 0.84877987, "memory(GiB)": 135.49, "step": 23150, "train_speed(iter/s)": 0.202446 }, { "acc": 0.76673574, "epoch": 0.5403403374210765, "grad_norm": 5.90625, "learning_rate": 8.562082501647445e-06, "loss": 0.82089748, "memory(GiB)": 135.49, "step": 23160, "train_speed(iter/s)": 0.202494 }, { "acc": 0.75398397, "epoch": 0.5405736449933654, "grad_norm": 14.8125, "learning_rate": 8.560756550331594e-06, "loss": 0.88060741, "memory(GiB)": 135.49, "step": 23170, "train_speed(iter/s)": 0.202541 }, { "acc": 0.77121634, "epoch": 0.5408069525656543, "grad_norm": 5.5, "learning_rate": 8.55943009071878e-06, "loss": 0.84534807, "memory(GiB)": 135.49, "step": 23180, "train_speed(iter/s)": 0.20258 }, { "acc": 0.78186507, "epoch": 0.5410402601379432, "grad_norm": 5.46875, "learning_rate": 8.558103122998354e-06, "loss": 0.78450785, "memory(GiB)": 135.49, "step": 23190, "train_speed(iter/s)": 0.202624 }, { "acc": 0.76591082, "epoch": 0.5412735677102319, "grad_norm": 9.5625, "learning_rate": 8.556775647359744e-06, "loss": 0.84084101, "memory(GiB)": 135.49, "step": 23200, "train_speed(iter/s)": 0.202671 }, { "acc": 0.74875708, "epoch": 0.5415068752825208, "grad_norm": 19.75, "learning_rate": 8.55544766399244e-06, "loss": 0.92503014, "memory(GiB)": 135.49, "step": 23210, "train_speed(iter/s)": 0.202716 }, { "acc": 0.76273284, "epoch": 0.5417401828548097, "grad_norm": 4.78125, "learning_rate": 8.554119173086014e-06, "loss": 0.87135534, "memory(GiB)": 135.49, "step": 23220, "train_speed(iter/s)": 0.20276 }, { "acc": 0.7794795, "epoch": 0.5419734904270986, "grad_norm": 4.3125, "learning_rate": 8.552790174830112e-06, "loss": 0.78043337, "memory(GiB)": 135.49, "step": 23230, "train_speed(iter/s)": 0.202808 }, { "acc": 0.77743006, "epoch": 0.5422067979993875, "grad_norm": 6.0, "learning_rate": 8.551460669414444e-06, "loss": 0.81504974, "memory(GiB)": 135.49, "step": 23240, "train_speed(iter/s)": 0.202852 }, { "acc": 0.76667776, "epoch": 0.5424401055716764, "grad_norm": 7.8125, "learning_rate": 8.550130657028797e-06, "loss": 0.82268848, "memory(GiB)": 135.49, "step": 23250, "train_speed(iter/s)": 0.202897 }, { "acc": 0.76646795, "epoch": 0.5426734131439653, "grad_norm": 5.28125, "learning_rate": 8.548800137863028e-06, "loss": 0.84073057, "memory(GiB)": 135.49, "step": 23260, "train_speed(iter/s)": 0.20294 }, { "acc": 0.763204, "epoch": 0.5429067207162542, "grad_norm": 5.9375, "learning_rate": 8.547469112107071e-06, "loss": 0.8597147, "memory(GiB)": 135.49, "step": 23270, "train_speed(iter/s)": 0.202986 }, { "acc": 0.76539373, "epoch": 0.5431400282885431, "grad_norm": 5.84375, "learning_rate": 8.54613757995093e-06, "loss": 0.87569551, "memory(GiB)": 135.49, "step": 23280, "train_speed(iter/s)": 0.203031 }, { "acc": 0.76815424, "epoch": 0.543373335860832, "grad_norm": 5.90625, "learning_rate": 8.54480554158468e-06, "loss": 0.83838701, "memory(GiB)": 135.49, "step": 23290, "train_speed(iter/s)": 0.203075 }, { "acc": 0.76461372, "epoch": 0.5436066434331209, "grad_norm": 5.3125, "learning_rate": 8.543472997198467e-06, "loss": 0.85878191, "memory(GiB)": 135.49, "step": 23300, "train_speed(iter/s)": 0.20312 }, { "acc": 0.77042131, "epoch": 0.5438399510054098, "grad_norm": 5.25, "learning_rate": 8.542139946982516e-06, "loss": 0.84053593, "memory(GiB)": 135.49, "step": 23310, "train_speed(iter/s)": 0.203165 }, { "acc": 0.76009126, "epoch": 0.5440732585776987, "grad_norm": 5.40625, "learning_rate": 8.540806391127112e-06, "loss": 0.87238865, "memory(GiB)": 135.49, "step": 23320, "train_speed(iter/s)": 0.203209 }, { "acc": 0.76382627, "epoch": 0.5443065661499876, "grad_norm": 7.53125, "learning_rate": 8.539472329822627e-06, "loss": 0.85163088, "memory(GiB)": 135.49, "step": 23330, "train_speed(iter/s)": 0.203253 }, { "acc": 0.769662, "epoch": 0.5445398737222765, "grad_norm": 5.09375, "learning_rate": 8.538137763259495e-06, "loss": 0.83273296, "memory(GiB)": 135.49, "step": 23340, "train_speed(iter/s)": 0.203302 }, { "acc": 0.7740675, "epoch": 0.5447731812945654, "grad_norm": 6.4375, "learning_rate": 8.536802691628226e-06, "loss": 0.8265089, "memory(GiB)": 135.49, "step": 23350, "train_speed(iter/s)": 0.203347 }, { "acc": 0.78265095, "epoch": 0.5450064888668543, "grad_norm": 6.78125, "learning_rate": 8.535467115119399e-06, "loss": 0.79409432, "memory(GiB)": 135.49, "step": 23360, "train_speed(iter/s)": 0.203393 }, { "acc": 0.77970076, "epoch": 0.5452397964391432, "grad_norm": 5.625, "learning_rate": 8.534131033923668e-06, "loss": 0.78820724, "memory(GiB)": 135.49, "step": 23370, "train_speed(iter/s)": 0.203436 }, { "acc": 0.73816423, "epoch": 0.5454731040114321, "grad_norm": 7.0625, "learning_rate": 8.53279444823176e-06, "loss": 0.96989307, "memory(GiB)": 135.49, "step": 23380, "train_speed(iter/s)": 0.203483 }, { "acc": 0.76140728, "epoch": 0.545706411583721, "grad_norm": 5.09375, "learning_rate": 8.531457358234469e-06, "loss": 0.85784225, "memory(GiB)": 135.49, "step": 23390, "train_speed(iter/s)": 0.20353 }, { "acc": 0.76670375, "epoch": 0.5459397191560098, "grad_norm": 5.71875, "learning_rate": 8.530119764122666e-06, "loss": 0.867729, "memory(GiB)": 135.49, "step": 23400, "train_speed(iter/s)": 0.203578 }, { "acc": 0.7645402, "epoch": 0.5461730267282987, "grad_norm": 5.5, "learning_rate": 8.528781666087294e-06, "loss": 0.86784496, "memory(GiB)": 135.49, "step": 23410, "train_speed(iter/s)": 0.203626 }, { "acc": 0.76798515, "epoch": 0.5464063343005876, "grad_norm": 7.84375, "learning_rate": 8.527443064319362e-06, "loss": 0.82815514, "memory(GiB)": 135.49, "step": 23420, "train_speed(iter/s)": 0.203672 }, { "acc": 0.77199774, "epoch": 0.5466396418728765, "grad_norm": 5.40625, "learning_rate": 8.526103959009959e-06, "loss": 0.7984251, "memory(GiB)": 135.49, "step": 23430, "train_speed(iter/s)": 0.203718 }, { "acc": 0.77022491, "epoch": 0.5468729494451654, "grad_norm": 4.3125, "learning_rate": 8.52476435035024e-06, "loss": 0.83827343, "memory(GiB)": 135.49, "step": 23440, "train_speed(iter/s)": 0.203766 }, { "acc": 0.7833993, "epoch": 0.5471062570174543, "grad_norm": 5.53125, "learning_rate": 8.523424238531435e-06, "loss": 0.78432059, "memory(GiB)": 135.49, "step": 23450, "train_speed(iter/s)": 0.203812 }, { "acc": 0.77845445, "epoch": 0.5473395645897432, "grad_norm": 6.15625, "learning_rate": 8.522083623744841e-06, "loss": 0.78911204, "memory(GiB)": 135.49, "step": 23460, "train_speed(iter/s)": 0.203857 }, { "acc": 0.77107821, "epoch": 0.5475728721620321, "grad_norm": 5.53125, "learning_rate": 8.520742506181834e-06, "loss": 0.83951378, "memory(GiB)": 135.49, "step": 23470, "train_speed(iter/s)": 0.203901 }, { "acc": 0.76855087, "epoch": 0.547806179734321, "grad_norm": 4.78125, "learning_rate": 8.519400886033858e-06, "loss": 0.83800716, "memory(GiB)": 135.49, "step": 23480, "train_speed(iter/s)": 0.203946 }, { "acc": 0.76081161, "epoch": 0.5480394873066099, "grad_norm": 7.65625, "learning_rate": 8.518058763492428e-06, "loss": 0.86975174, "memory(GiB)": 135.49, "step": 23490, "train_speed(iter/s)": 0.203993 }, { "acc": 0.75832062, "epoch": 0.5482727948788988, "grad_norm": 11.625, "learning_rate": 8.516716138749131e-06, "loss": 0.87061977, "memory(GiB)": 135.49, "step": 23500, "train_speed(iter/s)": 0.204039 }, { "epoch": 0.5482727948788988, "eval_acc": 0.7337685826672561, "eval_loss": 0.8401154279708862, "eval_runtime": 1263.9979, "eval_samples_per_second": 28.474, "eval_steps_per_second": 14.237, "step": 23500 }, { "acc": 0.75749011, "epoch": 0.5485061024511877, "grad_norm": 8.25, "learning_rate": 8.515373011995624e-06, "loss": 0.86891069, "memory(GiB)": 135.49, "step": 23510, "train_speed(iter/s)": 0.201822 }, { "acc": 0.76609678, "epoch": 0.5487394100234766, "grad_norm": 6.4375, "learning_rate": 8.514029383423644e-06, "loss": 0.83085861, "memory(GiB)": 135.49, "step": 23520, "train_speed(iter/s)": 0.201864 }, { "acc": 0.77853241, "epoch": 0.5489727175957655, "grad_norm": 5.28125, "learning_rate": 8.51268525322499e-06, "loss": 0.790413, "memory(GiB)": 135.49, "step": 23530, "train_speed(iter/s)": 0.201908 }, { "acc": 0.75937691, "epoch": 0.5492060251680544, "grad_norm": 8.875, "learning_rate": 8.511340621591536e-06, "loss": 0.8782299, "memory(GiB)": 135.49, "step": 23540, "train_speed(iter/s)": 0.201952 }, { "acc": 0.77397614, "epoch": 0.5494393327403433, "grad_norm": 6.5, "learning_rate": 8.509995488715228e-06, "loss": 0.83298111, "memory(GiB)": 135.49, "step": 23550, "train_speed(iter/s)": 0.201994 }, { "acc": 0.77391944, "epoch": 0.5496726403126322, "grad_norm": 4.875, "learning_rate": 8.508649854788085e-06, "loss": 0.81943007, "memory(GiB)": 135.49, "step": 23560, "train_speed(iter/s)": 0.202038 }, { "acc": 0.75344067, "epoch": 0.5499059478849211, "grad_norm": 7.03125, "learning_rate": 8.507303720002194e-06, "loss": 0.92825108, "memory(GiB)": 135.49, "step": 23570, "train_speed(iter/s)": 0.202084 }, { "acc": 0.77616901, "epoch": 0.55013925545721, "grad_norm": 5.4375, "learning_rate": 8.505957084549714e-06, "loss": 0.82322779, "memory(GiB)": 135.49, "step": 23580, "train_speed(iter/s)": 0.20213 }, { "acc": 0.77823744, "epoch": 0.5503725630294988, "grad_norm": 5.6875, "learning_rate": 8.50460994862288e-06, "loss": 0.8034605, "memory(GiB)": 135.49, "step": 23590, "train_speed(iter/s)": 0.202175 }, { "acc": 0.76473098, "epoch": 0.5506058706017877, "grad_norm": 6.5625, "learning_rate": 8.503262312413994e-06, "loss": 0.85402584, "memory(GiB)": 135.49, "step": 23600, "train_speed(iter/s)": 0.202218 }, { "acc": 0.77013798, "epoch": 0.5508391781740766, "grad_norm": 6.34375, "learning_rate": 8.501914176115432e-06, "loss": 0.82842979, "memory(GiB)": 135.49, "step": 23610, "train_speed(iter/s)": 0.20226 }, { "acc": 0.75260115, "epoch": 0.5510724857463655, "grad_norm": 7.59375, "learning_rate": 8.500565539919636e-06, "loss": 0.89303665, "memory(GiB)": 135.49, "step": 23620, "train_speed(iter/s)": 0.202306 }, { "acc": 0.76627574, "epoch": 0.5513057933186544, "grad_norm": 5.84375, "learning_rate": 8.499216404019129e-06, "loss": 0.86184864, "memory(GiB)": 135.49, "step": 23630, "train_speed(iter/s)": 0.202352 }, { "acc": 0.75102053, "epoch": 0.5515391008909433, "grad_norm": 7.09375, "learning_rate": 8.497866768606493e-06, "loss": 0.89998341, "memory(GiB)": 135.49, "step": 23640, "train_speed(iter/s)": 0.202395 }, { "acc": 0.76913881, "epoch": 0.5517724084632322, "grad_norm": 5.5625, "learning_rate": 8.496516633874395e-06, "loss": 0.81929016, "memory(GiB)": 135.49, "step": 23650, "train_speed(iter/s)": 0.202441 }, { "acc": 0.76153755, "epoch": 0.5520057160355211, "grad_norm": 5.21875, "learning_rate": 8.495166000015562e-06, "loss": 0.86396456, "memory(GiB)": 135.49, "step": 23660, "train_speed(iter/s)": 0.202488 }, { "acc": 0.77233257, "epoch": 0.55223902360781, "grad_norm": 7.96875, "learning_rate": 8.493814867222799e-06, "loss": 0.835639, "memory(GiB)": 135.49, "step": 23670, "train_speed(iter/s)": 0.202535 }, { "acc": 0.75422645, "epoch": 0.5524723311800989, "grad_norm": 7.625, "learning_rate": 8.492463235688977e-06, "loss": 0.87213039, "memory(GiB)": 135.49, "step": 23680, "train_speed(iter/s)": 0.202579 }, { "acc": 0.78307285, "epoch": 0.5527056387523878, "grad_norm": 5.0, "learning_rate": 8.491111105607044e-06, "loss": 0.77796531, "memory(GiB)": 135.49, "step": 23690, "train_speed(iter/s)": 0.202626 }, { "acc": 0.75421448, "epoch": 0.5529389463246767, "grad_norm": 6.65625, "learning_rate": 8.489758477170015e-06, "loss": 0.88067827, "memory(GiB)": 135.49, "step": 23700, "train_speed(iter/s)": 0.202671 }, { "acc": 0.77133131, "epoch": 0.5531722538969656, "grad_norm": 5.59375, "learning_rate": 8.488405350570976e-06, "loss": 0.80789471, "memory(GiB)": 135.49, "step": 23710, "train_speed(iter/s)": 0.202716 }, { "acc": 0.76543703, "epoch": 0.5534055614692545, "grad_norm": 7.90625, "learning_rate": 8.487051726003087e-06, "loss": 0.85050478, "memory(GiB)": 135.49, "step": 23720, "train_speed(iter/s)": 0.20276 }, { "acc": 0.76747084, "epoch": 0.5536388690415434, "grad_norm": 6.28125, "learning_rate": 8.485697603659578e-06, "loss": 0.84977312, "memory(GiB)": 135.49, "step": 23730, "train_speed(iter/s)": 0.202804 }, { "acc": 0.75640655, "epoch": 0.5538721766138323, "grad_norm": 6.625, "learning_rate": 8.484342983733747e-06, "loss": 0.88149233, "memory(GiB)": 135.49, "step": 23740, "train_speed(iter/s)": 0.202849 }, { "acc": 0.75069995, "epoch": 0.5541054841861212, "grad_norm": 6.46875, "learning_rate": 8.482987866418968e-06, "loss": 0.89634104, "memory(GiB)": 135.49, "step": 23750, "train_speed(iter/s)": 0.202893 }, { "acc": 0.7894906, "epoch": 0.55433879175841, "grad_norm": 7.25, "learning_rate": 8.481632251908684e-06, "loss": 0.75434971, "memory(GiB)": 135.49, "step": 23760, "train_speed(iter/s)": 0.202938 }, { "acc": 0.77172451, "epoch": 0.554572099330699, "grad_norm": 4.71875, "learning_rate": 8.480276140396406e-06, "loss": 0.81036205, "memory(GiB)": 135.49, "step": 23770, "train_speed(iter/s)": 0.20298 }, { "acc": 0.76706014, "epoch": 0.5548054069029877, "grad_norm": 6.125, "learning_rate": 8.478919532075723e-06, "loss": 0.8623455, "memory(GiB)": 135.49, "step": 23780, "train_speed(iter/s)": 0.203023 }, { "acc": 0.7655858, "epoch": 0.5550387144752766, "grad_norm": 7.21875, "learning_rate": 8.477562427140283e-06, "loss": 0.8615139, "memory(GiB)": 135.49, "step": 23790, "train_speed(iter/s)": 0.203069 }, { "acc": 0.7756856, "epoch": 0.5552720220475655, "grad_norm": 6.90625, "learning_rate": 8.47620482578382e-06, "loss": 0.80266457, "memory(GiB)": 135.49, "step": 23800, "train_speed(iter/s)": 0.203109 }, { "acc": 0.7771605, "epoch": 0.5555053296198544, "grad_norm": 7.5, "learning_rate": 8.474846728200125e-06, "loss": 0.82439594, "memory(GiB)": 135.49, "step": 23810, "train_speed(iter/s)": 0.203155 }, { "acc": 0.75472794, "epoch": 0.5557386371921433, "grad_norm": 4.09375, "learning_rate": 8.473488134583071e-06, "loss": 0.90375023, "memory(GiB)": 135.49, "step": 23820, "train_speed(iter/s)": 0.203194 }, { "acc": 0.77176504, "epoch": 0.5559719447644322, "grad_norm": 12.4375, "learning_rate": 8.472129045126596e-06, "loss": 0.81603556, "memory(GiB)": 135.49, "step": 23830, "train_speed(iter/s)": 0.203237 }, { "acc": 0.78797798, "epoch": 0.5562052523367211, "grad_norm": 5.3125, "learning_rate": 8.470769460024705e-06, "loss": 0.7602088, "memory(GiB)": 135.49, "step": 23840, "train_speed(iter/s)": 0.203283 }, { "acc": 0.7616663, "epoch": 0.55643855990901, "grad_norm": 6.84375, "learning_rate": 8.469409379471486e-06, "loss": 0.84912777, "memory(GiB)": 135.49, "step": 23850, "train_speed(iter/s)": 0.203326 }, { "acc": 0.76642904, "epoch": 0.5566718674812989, "grad_norm": 4.65625, "learning_rate": 8.468048803661083e-06, "loss": 0.84387083, "memory(GiB)": 135.49, "step": 23860, "train_speed(iter/s)": 0.203371 }, { "acc": 0.78258657, "epoch": 0.5569051750535878, "grad_norm": 8.375, "learning_rate": 8.466687732787721e-06, "loss": 0.78666115, "memory(GiB)": 135.49, "step": 23870, "train_speed(iter/s)": 0.203417 }, { "acc": 0.78084164, "epoch": 0.5571384826258767, "grad_norm": 5.875, "learning_rate": 8.465326167045693e-06, "loss": 0.79169846, "memory(GiB)": 135.49, "step": 23880, "train_speed(iter/s)": 0.203458 }, { "acc": 0.76451855, "epoch": 0.5573717901981656, "grad_norm": 6.40625, "learning_rate": 8.463964106629361e-06, "loss": 0.84216061, "memory(GiB)": 135.49, "step": 23890, "train_speed(iter/s)": 0.203501 }, { "acc": 0.79227715, "epoch": 0.5576050977704545, "grad_norm": 11.9375, "learning_rate": 8.46260155173316e-06, "loss": 0.73489547, "memory(GiB)": 135.49, "step": 23900, "train_speed(iter/s)": 0.203543 }, { "acc": 0.77097254, "epoch": 0.5578384053427434, "grad_norm": 9.0, "learning_rate": 8.461238502551592e-06, "loss": 0.83603802, "memory(GiB)": 135.49, "step": 23910, "train_speed(iter/s)": 0.203588 }, { "acc": 0.76235552, "epoch": 0.5580717129150323, "grad_norm": 7.34375, "learning_rate": 8.459874959279235e-06, "loss": 0.86564903, "memory(GiB)": 135.49, "step": 23920, "train_speed(iter/s)": 0.203633 }, { "acc": 0.7657094, "epoch": 0.5583050204873212, "grad_norm": 4.21875, "learning_rate": 8.45851092211073e-06, "loss": 0.83802948, "memory(GiB)": 135.49, "step": 23930, "train_speed(iter/s)": 0.203676 }, { "acc": 0.77420778, "epoch": 0.5585383280596101, "grad_norm": 4.40625, "learning_rate": 8.457146391240798e-06, "loss": 0.81684752, "memory(GiB)": 135.49, "step": 23940, "train_speed(iter/s)": 0.20372 }, { "acc": 0.76409664, "epoch": 0.558771635631899, "grad_norm": 6.65625, "learning_rate": 8.455781366864223e-06, "loss": 0.85641346, "memory(GiB)": 135.49, "step": 23950, "train_speed(iter/s)": 0.203765 }, { "acc": 0.76663518, "epoch": 0.5590049432041879, "grad_norm": 6.59375, "learning_rate": 8.45441584917586e-06, "loss": 0.84114819, "memory(GiB)": 135.49, "step": 23960, "train_speed(iter/s)": 0.203806 }, { "acc": 0.7738245, "epoch": 0.5592382507764767, "grad_norm": 6.25, "learning_rate": 8.453049838370639e-06, "loss": 0.81549778, "memory(GiB)": 135.49, "step": 23970, "train_speed(iter/s)": 0.203851 }, { "acc": 0.76086822, "epoch": 0.5594715583487656, "grad_norm": 5.78125, "learning_rate": 8.451683334643557e-06, "loss": 0.86223373, "memory(GiB)": 135.49, "step": 23980, "train_speed(iter/s)": 0.203893 }, { "acc": 0.77582927, "epoch": 0.5597048659210545, "grad_norm": 6.625, "learning_rate": 8.45031633818968e-06, "loss": 0.80273333, "memory(GiB)": 135.49, "step": 23990, "train_speed(iter/s)": 0.203936 }, { "acc": 0.75671964, "epoch": 0.5599381734933434, "grad_norm": 7.0, "learning_rate": 8.44894884920415e-06, "loss": 0.90248652, "memory(GiB)": 135.49, "step": 24000, "train_speed(iter/s)": 0.203981 }, { "epoch": 0.5599381734933434, "eval_acc": 0.7337537391214557, "eval_loss": 0.8400015234947205, "eval_runtime": 1261.546, "eval_samples_per_second": 28.529, "eval_steps_per_second": 14.265, "step": 24000 }, { "acc": 0.75485086, "epoch": 0.5601714810656323, "grad_norm": 13.75, "learning_rate": 8.447580867882172e-06, "loss": 0.8867939, "memory(GiB)": 135.49, "step": 24010, "train_speed(iter/s)": 0.201819 }, { "acc": 0.78055496, "epoch": 0.5604047886379212, "grad_norm": 5.5625, "learning_rate": 8.446212394419028e-06, "loss": 0.78347416, "memory(GiB)": 135.49, "step": 24020, "train_speed(iter/s)": 0.201864 }, { "acc": 0.76335449, "epoch": 0.5606380962102101, "grad_norm": 7.1875, "learning_rate": 8.444843429010065e-06, "loss": 0.87255392, "memory(GiB)": 135.49, "step": 24030, "train_speed(iter/s)": 0.201904 }, { "acc": 0.77163324, "epoch": 0.560871403782499, "grad_norm": 6.625, "learning_rate": 8.443473971850703e-06, "loss": 0.83303404, "memory(GiB)": 135.49, "step": 24040, "train_speed(iter/s)": 0.201946 }, { "acc": 0.76279135, "epoch": 0.5611047113547879, "grad_norm": 8.0, "learning_rate": 8.442104023136435e-06, "loss": 0.87035179, "memory(GiB)": 135.49, "step": 24050, "train_speed(iter/s)": 0.201987 }, { "acc": 0.77218361, "epoch": 0.5613380189270768, "grad_norm": 8.625, "learning_rate": 8.440733583062814e-06, "loss": 0.8095047, "memory(GiB)": 135.49, "step": 24060, "train_speed(iter/s)": 0.202031 }, { "acc": 0.76938787, "epoch": 0.5615713264993657, "grad_norm": 8.0, "learning_rate": 8.439362651825475e-06, "loss": 0.83879461, "memory(GiB)": 135.49, "step": 24070, "train_speed(iter/s)": 0.202076 }, { "acc": 0.75795879, "epoch": 0.5618046340716546, "grad_norm": 16.0, "learning_rate": 8.437991229620117e-06, "loss": 0.89380093, "memory(GiB)": 135.49, "step": 24080, "train_speed(iter/s)": 0.202119 }, { "acc": 0.76871886, "epoch": 0.5620379416439435, "grad_norm": 5.09375, "learning_rate": 8.436619316642508e-06, "loss": 0.83234882, "memory(GiB)": 135.49, "step": 24090, "train_speed(iter/s)": 0.202165 }, { "acc": 0.76712065, "epoch": 0.5622712492162324, "grad_norm": 4.875, "learning_rate": 8.435246913088492e-06, "loss": 0.84074945, "memory(GiB)": 135.49, "step": 24100, "train_speed(iter/s)": 0.202209 }, { "acc": 0.77824478, "epoch": 0.5625045567885213, "grad_norm": 7.53125, "learning_rate": 8.433874019153976e-06, "loss": 0.80724888, "memory(GiB)": 135.49, "step": 24110, "train_speed(iter/s)": 0.202251 }, { "acc": 0.77945185, "epoch": 0.5627378643608102, "grad_norm": 24.875, "learning_rate": 8.432500635034942e-06, "loss": 0.78945742, "memory(GiB)": 135.49, "step": 24120, "train_speed(iter/s)": 0.202295 }, { "acc": 0.75688677, "epoch": 0.5629711719330991, "grad_norm": 7.4375, "learning_rate": 8.43112676092744e-06, "loss": 0.8990778, "memory(GiB)": 135.49, "step": 24130, "train_speed(iter/s)": 0.202338 }, { "acc": 0.77828522, "epoch": 0.563204479505388, "grad_norm": 6.15625, "learning_rate": 8.429752397027585e-06, "loss": 0.80676413, "memory(GiB)": 135.49, "step": 24140, "train_speed(iter/s)": 0.202383 }, { "acc": 0.7591898, "epoch": 0.5634377870776769, "grad_norm": 5.28125, "learning_rate": 8.428377543531577e-06, "loss": 0.85866222, "memory(GiB)": 135.49, "step": 24150, "train_speed(iter/s)": 0.202428 }, { "acc": 0.7789494, "epoch": 0.5636710946499658, "grad_norm": 6.5625, "learning_rate": 8.427002200635669e-06, "loss": 0.81208143, "memory(GiB)": 135.49, "step": 24160, "train_speed(iter/s)": 0.202469 }, { "acc": 0.75479746, "epoch": 0.5639044022222546, "grad_norm": 5.53125, "learning_rate": 8.425626368536192e-06, "loss": 0.91409559, "memory(GiB)": 135.49, "step": 24170, "train_speed(iter/s)": 0.202513 }, { "acc": 0.7566421, "epoch": 0.5641377097945435, "grad_norm": 6.125, "learning_rate": 8.424250047429547e-06, "loss": 0.90373974, "memory(GiB)": 135.49, "step": 24180, "train_speed(iter/s)": 0.202556 }, { "acc": 0.7561944, "epoch": 0.5643710173668324, "grad_norm": 3.875, "learning_rate": 8.4228732375122e-06, "loss": 0.86911192, "memory(GiB)": 135.49, "step": 24190, "train_speed(iter/s)": 0.202598 }, { "acc": 0.76205001, "epoch": 0.5646043249391213, "grad_norm": 9.5, "learning_rate": 8.421495938980695e-06, "loss": 0.85251045, "memory(GiB)": 135.49, "step": 24200, "train_speed(iter/s)": 0.202643 }, { "acc": 0.77242785, "epoch": 0.5648376325114102, "grad_norm": 5.78125, "learning_rate": 8.420118152031638e-06, "loss": 0.82758217, "memory(GiB)": 135.49, "step": 24210, "train_speed(iter/s)": 0.20268 }, { "acc": 0.78525753, "epoch": 0.5650709400836991, "grad_norm": 6.28125, "learning_rate": 8.418739876861708e-06, "loss": 0.76835475, "memory(GiB)": 135.49, "step": 24220, "train_speed(iter/s)": 0.20272 }, { "acc": 0.77326174, "epoch": 0.565304247655988, "grad_norm": 4.71875, "learning_rate": 8.417361113667654e-06, "loss": 0.81643181, "memory(GiB)": 135.49, "step": 24230, "train_speed(iter/s)": 0.202764 }, { "acc": 0.76664114, "epoch": 0.5655375552282769, "grad_norm": 6.25, "learning_rate": 8.415981862646295e-06, "loss": 0.83870277, "memory(GiB)": 135.49, "step": 24240, "train_speed(iter/s)": 0.202805 }, { "acc": 0.76569195, "epoch": 0.5657708628005658, "grad_norm": 6.0625, "learning_rate": 8.414602123994517e-06, "loss": 0.85589161, "memory(GiB)": 135.49, "step": 24250, "train_speed(iter/s)": 0.202852 }, { "acc": 0.78505611, "epoch": 0.5660041703728547, "grad_norm": 5.34375, "learning_rate": 8.413221897909277e-06, "loss": 0.77138081, "memory(GiB)": 135.49, "step": 24260, "train_speed(iter/s)": 0.202895 }, { "acc": 0.75390816, "epoch": 0.5662374779451436, "grad_norm": 37.75, "learning_rate": 8.411841184587602e-06, "loss": 0.9118228, "memory(GiB)": 135.49, "step": 24270, "train_speed(iter/s)": 0.20294 }, { "acc": 0.77440872, "epoch": 0.5664707855174325, "grad_norm": 6.375, "learning_rate": 8.41045998422659e-06, "loss": 0.81315918, "memory(GiB)": 135.49, "step": 24280, "train_speed(iter/s)": 0.202981 }, { "acc": 0.77569914, "epoch": 0.5667040930897214, "grad_norm": 5.5, "learning_rate": 8.409078297023406e-06, "loss": 0.78277702, "memory(GiB)": 135.49, "step": 24290, "train_speed(iter/s)": 0.20302 }, { "acc": 0.76937847, "epoch": 0.5669374006620103, "grad_norm": 5.34375, "learning_rate": 8.407696123175285e-06, "loss": 0.82012405, "memory(GiB)": 135.49, "step": 24300, "train_speed(iter/s)": 0.203065 }, { "acc": 0.78213396, "epoch": 0.5671707082342992, "grad_norm": 4.40625, "learning_rate": 8.406313462879533e-06, "loss": 0.76993542, "memory(GiB)": 135.49, "step": 24310, "train_speed(iter/s)": 0.203109 }, { "acc": 0.75401402, "epoch": 0.5674040158065881, "grad_norm": 5.125, "learning_rate": 8.404930316333524e-06, "loss": 0.8963254, "memory(GiB)": 135.49, "step": 24320, "train_speed(iter/s)": 0.203148 }, { "acc": 0.74869838, "epoch": 0.567637323378877, "grad_norm": 5.625, "learning_rate": 8.4035466837347e-06, "loss": 0.89982834, "memory(GiB)": 135.49, "step": 24330, "train_speed(iter/s)": 0.203191 }, { "acc": 0.76792688, "epoch": 0.5678706309511659, "grad_norm": 4.78125, "learning_rate": 8.402162565280577e-06, "loss": 0.88082752, "memory(GiB)": 135.49, "step": 24340, "train_speed(iter/s)": 0.203233 }, { "acc": 0.76887255, "epoch": 0.5681039385234548, "grad_norm": 6.5625, "learning_rate": 8.400777961168736e-06, "loss": 0.85877419, "memory(GiB)": 135.49, "step": 24350, "train_speed(iter/s)": 0.203277 }, { "acc": 0.77055054, "epoch": 0.5683372460957435, "grad_norm": 6.90625, "learning_rate": 8.399392871596828e-06, "loss": 0.83814201, "memory(GiB)": 135.49, "step": 24360, "train_speed(iter/s)": 0.203322 }, { "acc": 0.77325368, "epoch": 0.5685705536680324, "grad_norm": 4.53125, "learning_rate": 8.398007296762576e-06, "loss": 0.83576584, "memory(GiB)": 135.49, "step": 24370, "train_speed(iter/s)": 0.203363 }, { "acc": 0.76850066, "epoch": 0.5688038612403213, "grad_norm": 5.65625, "learning_rate": 8.39662123686377e-06, "loss": 0.83671799, "memory(GiB)": 135.49, "step": 24380, "train_speed(iter/s)": 0.203407 }, { "acc": 0.77119527, "epoch": 0.5690371688126102, "grad_norm": 5.75, "learning_rate": 8.395234692098267e-06, "loss": 0.8412775, "memory(GiB)": 135.49, "step": 24390, "train_speed(iter/s)": 0.20345 }, { "acc": 0.75769415, "epoch": 0.5692704763848991, "grad_norm": 5.0, "learning_rate": 8.393847662663998e-06, "loss": 0.87856617, "memory(GiB)": 135.49, "step": 24400, "train_speed(iter/s)": 0.203491 }, { "acc": 0.78021812, "epoch": 0.569503783957188, "grad_norm": 5.0, "learning_rate": 8.392460148758962e-06, "loss": 0.80442438, "memory(GiB)": 135.49, "step": 24410, "train_speed(iter/s)": 0.203535 }, { "acc": 0.76191378, "epoch": 0.5697370915294769, "grad_norm": 5.25, "learning_rate": 8.391072150581228e-06, "loss": 0.87808361, "memory(GiB)": 135.49, "step": 24420, "train_speed(iter/s)": 0.203578 }, { "acc": 0.78470774, "epoch": 0.5699703991017658, "grad_norm": 5.875, "learning_rate": 8.389683668328927e-06, "loss": 0.79352775, "memory(GiB)": 135.49, "step": 24430, "train_speed(iter/s)": 0.203619 }, { "acc": 0.75123053, "epoch": 0.5702037066740547, "grad_norm": 6.1875, "learning_rate": 8.388294702200267e-06, "loss": 0.90477457, "memory(GiB)": 135.49, "step": 24440, "train_speed(iter/s)": 0.203663 }, { "acc": 0.77100639, "epoch": 0.5704370142463436, "grad_norm": 6.8125, "learning_rate": 8.386905252393522e-06, "loss": 0.82553024, "memory(GiB)": 135.49, "step": 24450, "train_speed(iter/s)": 0.203703 }, { "acc": 0.78606386, "epoch": 0.5706703218186325, "grad_norm": 5.40625, "learning_rate": 8.385515319107038e-06, "loss": 0.77952895, "memory(GiB)": 135.49, "step": 24460, "train_speed(iter/s)": 0.203742 }, { "acc": 0.76827765, "epoch": 0.5709036293909214, "grad_norm": 4.8125, "learning_rate": 8.384124902539225e-06, "loss": 0.83189783, "memory(GiB)": 135.49, "step": 24470, "train_speed(iter/s)": 0.203785 }, { "acc": 0.75763836, "epoch": 0.5711369369632103, "grad_norm": 6.1875, "learning_rate": 8.382734002888565e-06, "loss": 0.86217041, "memory(GiB)": 135.49, "step": 24480, "train_speed(iter/s)": 0.203827 }, { "acc": 0.76343174, "epoch": 0.5713702445354992, "grad_norm": 11.25, "learning_rate": 8.381342620353609e-06, "loss": 0.86744394, "memory(GiB)": 135.49, "step": 24490, "train_speed(iter/s)": 0.203868 }, { "acc": 0.77855673, "epoch": 0.5716035521077881, "grad_norm": 7.6875, "learning_rate": 8.379950755132975e-06, "loss": 0.78537765, "memory(GiB)": 135.49, "step": 24500, "train_speed(iter/s)": 0.203912 }, { "epoch": 0.5716035521077881, "eval_acc": 0.7338955595210053, "eval_loss": 0.839592456817627, "eval_runtime": 1264.0998, "eval_samples_per_second": 28.472, "eval_steps_per_second": 14.236, "step": 24500 }, { "acc": 0.7789557, "epoch": 0.571836859680077, "grad_norm": 5.0625, "learning_rate": 8.378558407425355e-06, "loss": 0.80478134, "memory(GiB)": 135.49, "step": 24510, "train_speed(iter/s)": 0.201792 }, { "acc": 0.76547346, "epoch": 0.5720701672523659, "grad_norm": 5.90625, "learning_rate": 8.377165577429502e-06, "loss": 0.86148453, "memory(GiB)": 135.49, "step": 24520, "train_speed(iter/s)": 0.201835 }, { "acc": 0.76648288, "epoch": 0.5723034748246548, "grad_norm": 6.90625, "learning_rate": 8.375772265344244e-06, "loss": 0.84420786, "memory(GiB)": 135.49, "step": 24530, "train_speed(iter/s)": 0.201877 }, { "acc": 0.76441922, "epoch": 0.5725367823969437, "grad_norm": 6.6875, "learning_rate": 8.374378471368476e-06, "loss": 0.85121937, "memory(GiB)": 135.49, "step": 24540, "train_speed(iter/s)": 0.20192 }, { "acc": 0.78385429, "epoch": 0.5727700899692325, "grad_norm": 7.65625, "learning_rate": 8.37298419570116e-06, "loss": 0.78713951, "memory(GiB)": 135.49, "step": 24550, "train_speed(iter/s)": 0.20196 }, { "acc": 0.75482912, "epoch": 0.5730033975415214, "grad_norm": 6.34375, "learning_rate": 8.371589438541333e-06, "loss": 0.90485573, "memory(GiB)": 135.49, "step": 24560, "train_speed(iter/s)": 0.202002 }, { "acc": 0.76971889, "epoch": 0.5732367051138103, "grad_norm": 6.5, "learning_rate": 8.370194200088091e-06, "loss": 0.82119541, "memory(GiB)": 135.49, "step": 24570, "train_speed(iter/s)": 0.202043 }, { "acc": 0.77532687, "epoch": 0.5734700126860992, "grad_norm": 6.375, "learning_rate": 8.368798480540607e-06, "loss": 0.80605164, "memory(GiB)": 135.49, "step": 24580, "train_speed(iter/s)": 0.202086 }, { "acc": 0.76634488, "epoch": 0.5737033202583881, "grad_norm": 6.40625, "learning_rate": 8.367402280098118e-06, "loss": 0.82023754, "memory(GiB)": 135.49, "step": 24590, "train_speed(iter/s)": 0.202127 }, { "acc": 0.76108031, "epoch": 0.573936627830677, "grad_norm": 8.8125, "learning_rate": 8.366005598959932e-06, "loss": 0.87079029, "memory(GiB)": 135.49, "step": 24600, "train_speed(iter/s)": 0.202171 }, { "acc": 0.77816839, "epoch": 0.5741699354029659, "grad_norm": 5.8125, "learning_rate": 8.364608437325426e-06, "loss": 0.82553062, "memory(GiB)": 135.49, "step": 24610, "train_speed(iter/s)": 0.202216 }, { "acc": 0.76279898, "epoch": 0.5744032429752548, "grad_norm": 6.84375, "learning_rate": 8.363210795394042e-06, "loss": 0.84448996, "memory(GiB)": 135.49, "step": 24620, "train_speed(iter/s)": 0.20226 }, { "acc": 0.78981194, "epoch": 0.5746365505475437, "grad_norm": 5.75, "learning_rate": 8.361812673365292e-06, "loss": 0.73890553, "memory(GiB)": 135.49, "step": 24630, "train_speed(iter/s)": 0.202303 }, { "acc": 0.77478137, "epoch": 0.5748698581198326, "grad_norm": 11.125, "learning_rate": 8.360414071438761e-06, "loss": 0.82698154, "memory(GiB)": 135.49, "step": 24640, "train_speed(iter/s)": 0.202344 }, { "acc": 0.76924529, "epoch": 0.5751031656921215, "grad_norm": 5.09375, "learning_rate": 8.359014989814099e-06, "loss": 0.84628258, "memory(GiB)": 135.49, "step": 24650, "train_speed(iter/s)": 0.202389 }, { "acc": 0.75518932, "epoch": 0.5753364732644104, "grad_norm": 9.25, "learning_rate": 8.35761542869102e-06, "loss": 0.90584297, "memory(GiB)": 135.49, "step": 24660, "train_speed(iter/s)": 0.202429 }, { "acc": 0.76356349, "epoch": 0.5755697808366993, "grad_norm": 5.75, "learning_rate": 8.356215388269316e-06, "loss": 0.84128323, "memory(GiB)": 135.49, "step": 24670, "train_speed(iter/s)": 0.202467 }, { "acc": 0.76498232, "epoch": 0.5758030884089882, "grad_norm": 6.4375, "learning_rate": 8.354814868748839e-06, "loss": 0.83564301, "memory(GiB)": 135.49, "step": 24680, "train_speed(iter/s)": 0.20251 }, { "acc": 0.75339251, "epoch": 0.5760363959812771, "grad_norm": 6.90625, "learning_rate": 8.353413870329514e-06, "loss": 0.89734612, "memory(GiB)": 135.49, "step": 24690, "train_speed(iter/s)": 0.202554 }, { "acc": 0.77146873, "epoch": 0.576269703553566, "grad_norm": 5.21875, "learning_rate": 8.352012393211336e-06, "loss": 0.83113155, "memory(GiB)": 135.49, "step": 24700, "train_speed(iter/s)": 0.202596 }, { "acc": 0.76600633, "epoch": 0.5765030111258549, "grad_norm": 7.375, "learning_rate": 8.35061043759436e-06, "loss": 0.838587, "memory(GiB)": 135.49, "step": 24710, "train_speed(iter/s)": 0.202638 }, { "acc": 0.75987773, "epoch": 0.5767363186981438, "grad_norm": 7.21875, "learning_rate": 8.349208003678716e-06, "loss": 0.85280399, "memory(GiB)": 135.49, "step": 24720, "train_speed(iter/s)": 0.202683 }, { "acc": 0.76230917, "epoch": 0.5769696262704327, "grad_norm": 5.09375, "learning_rate": 8.347805091664606e-06, "loss": 0.8803956, "memory(GiB)": 135.49, "step": 24730, "train_speed(iter/s)": 0.202725 }, { "acc": 0.76930399, "epoch": 0.5772029338427216, "grad_norm": 6.4375, "learning_rate": 8.34640170175229e-06, "loss": 0.85195999, "memory(GiB)": 135.49, "step": 24740, "train_speed(iter/s)": 0.202764 }, { "acc": 0.75377955, "epoch": 0.5774362414150104, "grad_norm": 6.3125, "learning_rate": 8.344997834142103e-06, "loss": 0.91254864, "memory(GiB)": 135.49, "step": 24750, "train_speed(iter/s)": 0.202806 }, { "acc": 0.76542883, "epoch": 0.5776695489872993, "grad_norm": 7.78125, "learning_rate": 8.343593489034447e-06, "loss": 0.88195496, "memory(GiB)": 135.49, "step": 24760, "train_speed(iter/s)": 0.202849 }, { "acc": 0.77210431, "epoch": 0.5779028565595882, "grad_norm": 5.65625, "learning_rate": 8.342188666629793e-06, "loss": 0.82355309, "memory(GiB)": 135.49, "step": 24770, "train_speed(iter/s)": 0.202893 }, { "acc": 0.7777637, "epoch": 0.5781361641318771, "grad_norm": 6.0625, "learning_rate": 8.340783367128677e-06, "loss": 0.80337086, "memory(GiB)": 135.49, "step": 24780, "train_speed(iter/s)": 0.202935 }, { "acc": 0.77260995, "epoch": 0.578369471704166, "grad_norm": 7.78125, "learning_rate": 8.339377590731705e-06, "loss": 0.80567856, "memory(GiB)": 135.49, "step": 24790, "train_speed(iter/s)": 0.202976 }, { "acc": 0.75802517, "epoch": 0.5786027792764549, "grad_norm": 7.96875, "learning_rate": 8.337971337639552e-06, "loss": 0.88261986, "memory(GiB)": 135.49, "step": 24800, "train_speed(iter/s)": 0.203018 }, { "acc": 0.77085714, "epoch": 0.5788360868487438, "grad_norm": 5.75, "learning_rate": 8.336564608052961e-06, "loss": 0.80792179, "memory(GiB)": 135.49, "step": 24810, "train_speed(iter/s)": 0.203061 }, { "acc": 0.78311849, "epoch": 0.5790693944210327, "grad_norm": 4.1875, "learning_rate": 8.335157402172743e-06, "loss": 0.77227683, "memory(GiB)": 135.49, "step": 24820, "train_speed(iter/s)": 0.203105 }, { "acc": 0.76053352, "epoch": 0.5793027019933216, "grad_norm": 6.78125, "learning_rate": 8.333749720199772e-06, "loss": 0.86424217, "memory(GiB)": 135.49, "step": 24830, "train_speed(iter/s)": 0.203148 }, { "acc": 0.76239681, "epoch": 0.5795360095656105, "grad_norm": 8.6875, "learning_rate": 8.332341562334998e-06, "loss": 0.8642477, "memory(GiB)": 135.49, "step": 24840, "train_speed(iter/s)": 0.203187 }, { "acc": 0.79270029, "epoch": 0.5797693171378994, "grad_norm": 5.8125, "learning_rate": 8.330932928779434e-06, "loss": 0.72758665, "memory(GiB)": 135.49, "step": 24850, "train_speed(iter/s)": 0.203231 }, { "acc": 0.77607317, "epoch": 0.5800026247101883, "grad_norm": 6.96875, "learning_rate": 8.329523819734161e-06, "loss": 0.79465709, "memory(GiB)": 135.49, "step": 24860, "train_speed(iter/s)": 0.203273 }, { "acc": 0.76892376, "epoch": 0.5802359322824772, "grad_norm": 7.0, "learning_rate": 8.328114235400331e-06, "loss": 0.86954517, "memory(GiB)": 135.49, "step": 24870, "train_speed(iter/s)": 0.203315 }, { "acc": 0.75647058, "epoch": 0.5804692398547661, "grad_norm": 8.625, "learning_rate": 8.326704175979162e-06, "loss": 0.88879738, "memory(GiB)": 135.49, "step": 24880, "train_speed(iter/s)": 0.203358 }, { "acc": 0.77644448, "epoch": 0.580702547427055, "grad_norm": 10.375, "learning_rate": 8.325293641671936e-06, "loss": 0.80533257, "memory(GiB)": 135.49, "step": 24890, "train_speed(iter/s)": 0.203402 }, { "acc": 0.76669006, "epoch": 0.5809358549993439, "grad_norm": 4.59375, "learning_rate": 8.32388263268001e-06, "loss": 0.85731697, "memory(GiB)": 135.49, "step": 24900, "train_speed(iter/s)": 0.203441 }, { "acc": 0.77372727, "epoch": 0.5811691625716328, "grad_norm": 4.6875, "learning_rate": 8.322471149204804e-06, "loss": 0.81530294, "memory(GiB)": 135.49, "step": 24910, "train_speed(iter/s)": 0.203481 }, { "acc": 0.75759211, "epoch": 0.5814024701439217, "grad_norm": 5.25, "learning_rate": 8.321059191447807e-06, "loss": 0.89536238, "memory(GiB)": 135.49, "step": 24920, "train_speed(iter/s)": 0.203521 }, { "acc": 0.75937347, "epoch": 0.5816357777162106, "grad_norm": 4.9375, "learning_rate": 8.319646759610573e-06, "loss": 0.89024487, "memory(GiB)": 135.49, "step": 24930, "train_speed(iter/s)": 0.203565 }, { "acc": 0.76285191, "epoch": 0.5818690852884993, "grad_norm": 7.1875, "learning_rate": 8.31823385389473e-06, "loss": 0.88355722, "memory(GiB)": 135.49, "step": 24940, "train_speed(iter/s)": 0.203607 }, { "acc": 0.77381802, "epoch": 0.5821023928607882, "grad_norm": 5.59375, "learning_rate": 8.316820474501968e-06, "loss": 0.83444757, "memory(GiB)": 135.49, "step": 24950, "train_speed(iter/s)": 0.20365 }, { "acc": 0.76418896, "epoch": 0.5823357004330771, "grad_norm": 5.4375, "learning_rate": 8.315406621634048e-06, "loss": 0.86433439, "memory(GiB)": 135.49, "step": 24960, "train_speed(iter/s)": 0.203691 }, { "acc": 0.74658213, "epoch": 0.582569008005366, "grad_norm": 5.46875, "learning_rate": 8.313992295492794e-06, "loss": 0.93567696, "memory(GiB)": 135.49, "step": 24970, "train_speed(iter/s)": 0.203733 }, { "acc": 0.76013808, "epoch": 0.5828023155776549, "grad_norm": 5.8125, "learning_rate": 8.312577496280103e-06, "loss": 0.87076502, "memory(GiB)": 135.49, "step": 24980, "train_speed(iter/s)": 0.203775 }, { "acc": 0.76078329, "epoch": 0.5830356231499438, "grad_norm": 6.75, "learning_rate": 8.311162224197938e-06, "loss": 0.86118374, "memory(GiB)": 135.49, "step": 24990, "train_speed(iter/s)": 0.203817 }, { "acc": 0.76700444, "epoch": 0.5832689307222327, "grad_norm": 5.0625, "learning_rate": 8.309746479448324e-06, "loss": 0.83234472, "memory(GiB)": 135.49, "step": 25000, "train_speed(iter/s)": 0.203861 }, { "epoch": 0.5832689307222327, "eval_acc": 0.7339426716446326, "eval_loss": 0.8393212556838989, "eval_runtime": 1263.0095, "eval_samples_per_second": 28.496, "eval_steps_per_second": 14.249, "step": 25000 }, { "acc": 0.76209784, "epoch": 0.5835022382945216, "grad_norm": 11.3125, "learning_rate": 8.308330262233366e-06, "loss": 0.85937824, "memory(GiB)": 135.49, "step": 25010, "train_speed(iter/s)": 0.201782 }, { "acc": 0.77981024, "epoch": 0.5837355458668105, "grad_norm": 6.46875, "learning_rate": 8.306913572755221e-06, "loss": 0.81295166, "memory(GiB)": 135.49, "step": 25020, "train_speed(iter/s)": 0.201823 }, { "acc": 0.77236223, "epoch": 0.5839688534390994, "grad_norm": 9.875, "learning_rate": 8.305496411216125e-06, "loss": 0.82159195, "memory(GiB)": 135.49, "step": 25030, "train_speed(iter/s)": 0.201863 }, { "acc": 0.75648775, "epoch": 0.5842021610113883, "grad_norm": 6.9375, "learning_rate": 8.304078777818377e-06, "loss": 0.90640936, "memory(GiB)": 135.49, "step": 25040, "train_speed(iter/s)": 0.201909 }, { "acc": 0.78109398, "epoch": 0.5844354685836772, "grad_norm": 6.0, "learning_rate": 8.302660672764343e-06, "loss": 0.79655976, "memory(GiB)": 135.49, "step": 25050, "train_speed(iter/s)": 0.201948 }, { "acc": 0.7578063, "epoch": 0.5846687761559661, "grad_norm": 6.625, "learning_rate": 8.301242096256457e-06, "loss": 0.89277372, "memory(GiB)": 135.49, "step": 25060, "train_speed(iter/s)": 0.20199 }, { "acc": 0.769561, "epoch": 0.584902083728255, "grad_norm": 5.40625, "learning_rate": 8.299823048497221e-06, "loss": 0.83420868, "memory(GiB)": 135.49, "step": 25070, "train_speed(iter/s)": 0.202036 }, { "acc": 0.7753747, "epoch": 0.5851353913005439, "grad_norm": 6.59375, "learning_rate": 8.298403529689204e-06, "loss": 0.81032734, "memory(GiB)": 135.49, "step": 25080, "train_speed(iter/s)": 0.202076 }, { "acc": 0.77332239, "epoch": 0.5853686988728328, "grad_norm": 4.53125, "learning_rate": 8.296983540035041e-06, "loss": 0.80533943, "memory(GiB)": 135.49, "step": 25090, "train_speed(iter/s)": 0.202115 }, { "acc": 0.75597601, "epoch": 0.5856020064451217, "grad_norm": 5.5, "learning_rate": 8.295563079737436e-06, "loss": 0.88431931, "memory(GiB)": 135.49, "step": 25100, "train_speed(iter/s)": 0.202157 }, { "acc": 0.77551785, "epoch": 0.5858353140174106, "grad_norm": 4.71875, "learning_rate": 8.294142148999157e-06, "loss": 0.80018835, "memory(GiB)": 135.49, "step": 25110, "train_speed(iter/s)": 0.202198 }, { "acc": 0.76115618, "epoch": 0.5860686215896995, "grad_norm": 4.59375, "learning_rate": 8.292720748023045e-06, "loss": 0.85160046, "memory(GiB)": 135.49, "step": 25120, "train_speed(iter/s)": 0.20224 }, { "acc": 0.78035097, "epoch": 0.5863019291619883, "grad_norm": 6.46875, "learning_rate": 8.291298877012002e-06, "loss": 0.77919559, "memory(GiB)": 135.49, "step": 25130, "train_speed(iter/s)": 0.20228 }, { "acc": 0.78368254, "epoch": 0.5865352367342772, "grad_norm": 6.6875, "learning_rate": 8.289876536169002e-06, "loss": 0.78788552, "memory(GiB)": 135.49, "step": 25140, "train_speed(iter/s)": 0.202321 }, { "acc": 0.77261596, "epoch": 0.5867685443065661, "grad_norm": 5.59375, "learning_rate": 8.28845372569708e-06, "loss": 0.82084064, "memory(GiB)": 135.49, "step": 25150, "train_speed(iter/s)": 0.202364 }, { "acc": 0.7611661, "epoch": 0.587001851878855, "grad_norm": 7.4375, "learning_rate": 8.287030445799345e-06, "loss": 0.86596851, "memory(GiB)": 135.49, "step": 25160, "train_speed(iter/s)": 0.202409 }, { "acc": 0.76365414, "epoch": 0.5872351594511439, "grad_norm": 5.53125, "learning_rate": 8.285606696678969e-06, "loss": 0.84006405, "memory(GiB)": 135.49, "step": 25170, "train_speed(iter/s)": 0.202448 }, { "acc": 0.7594389, "epoch": 0.5874684670234328, "grad_norm": 5.875, "learning_rate": 8.28418247853919e-06, "loss": 0.85481644, "memory(GiB)": 135.49, "step": 25180, "train_speed(iter/s)": 0.202488 }, { "acc": 0.76785374, "epoch": 0.5877017745957217, "grad_norm": 11.5, "learning_rate": 8.282757791583316e-06, "loss": 0.83124752, "memory(GiB)": 135.49, "step": 25190, "train_speed(iter/s)": 0.202532 }, { "acc": 0.75368061, "epoch": 0.5879350821680106, "grad_norm": 5.375, "learning_rate": 8.281332636014723e-06, "loss": 0.90581379, "memory(GiB)": 135.49, "step": 25200, "train_speed(iter/s)": 0.202575 }, { "acc": 0.76902542, "epoch": 0.5881683897402995, "grad_norm": 6.9375, "learning_rate": 8.279907012036849e-06, "loss": 0.84575863, "memory(GiB)": 135.49, "step": 25210, "train_speed(iter/s)": 0.202615 }, { "acc": 0.7615344, "epoch": 0.5884016973125884, "grad_norm": 6.03125, "learning_rate": 8.2784809198532e-06, "loss": 0.85775738, "memory(GiB)": 135.49, "step": 25220, "train_speed(iter/s)": 0.202657 }, { "acc": 0.78186736, "epoch": 0.5886350048848773, "grad_norm": 5.53125, "learning_rate": 8.277054359667355e-06, "loss": 0.78428698, "memory(GiB)": 135.49, "step": 25230, "train_speed(iter/s)": 0.202699 }, { "acc": 0.75303011, "epoch": 0.5888683124571662, "grad_norm": 5.3125, "learning_rate": 8.27562733168295e-06, "loss": 0.88945141, "memory(GiB)": 135.49, "step": 25240, "train_speed(iter/s)": 0.20274 }, { "acc": 0.76686587, "epoch": 0.5891016200294551, "grad_norm": 7.96875, "learning_rate": 8.274199836103696e-06, "loss": 0.84210701, "memory(GiB)": 135.49, "step": 25250, "train_speed(iter/s)": 0.202781 }, { "acc": 0.77274065, "epoch": 0.589334927601744, "grad_norm": 5.0, "learning_rate": 8.272771873133365e-06, "loss": 0.81848717, "memory(GiB)": 135.49, "step": 25260, "train_speed(iter/s)": 0.20282 }, { "acc": 0.77162447, "epoch": 0.5895682351740329, "grad_norm": 6.5, "learning_rate": 8.271343442975803e-06, "loss": 0.82287512, "memory(GiB)": 135.49, "step": 25270, "train_speed(iter/s)": 0.202862 }, { "acc": 0.76719112, "epoch": 0.5898015427463218, "grad_norm": 7.625, "learning_rate": 8.269914545834911e-06, "loss": 0.84467831, "memory(GiB)": 135.49, "step": 25280, "train_speed(iter/s)": 0.202904 }, { "acc": 0.76279979, "epoch": 0.5900348503186107, "grad_norm": 15.0625, "learning_rate": 8.26848518191467e-06, "loss": 0.86554461, "memory(GiB)": 135.49, "step": 25290, "train_speed(iter/s)": 0.202946 }, { "acc": 0.77388592, "epoch": 0.5902681578908996, "grad_norm": 4.71875, "learning_rate": 8.267055351419117e-06, "loss": 0.80043936, "memory(GiB)": 135.49, "step": 25300, "train_speed(iter/s)": 0.202987 }, { "acc": 0.76615009, "epoch": 0.5905014654631885, "grad_norm": 5.53125, "learning_rate": 8.265625054552363e-06, "loss": 0.86635094, "memory(GiB)": 135.49, "step": 25310, "train_speed(iter/s)": 0.203028 }, { "acc": 0.76060672, "epoch": 0.5907347730354773, "grad_norm": 9.8125, "learning_rate": 8.264194291518583e-06, "loss": 0.87668972, "memory(GiB)": 135.49, "step": 25320, "train_speed(iter/s)": 0.203071 }, { "acc": 0.77020688, "epoch": 0.5909680806077662, "grad_norm": 7.84375, "learning_rate": 8.262763062522013e-06, "loss": 0.82255249, "memory(GiB)": 135.49, "step": 25330, "train_speed(iter/s)": 0.203114 }, { "acc": 0.74745989, "epoch": 0.5912013881800551, "grad_norm": 5.53125, "learning_rate": 8.261331367766965e-06, "loss": 0.89549646, "memory(GiB)": 135.49, "step": 25340, "train_speed(iter/s)": 0.203156 }, { "acc": 0.7629518, "epoch": 0.591434695752344, "grad_norm": 5.28125, "learning_rate": 8.25989920745781e-06, "loss": 0.84561253, "memory(GiB)": 135.49, "step": 25350, "train_speed(iter/s)": 0.203198 }, { "acc": 0.75315294, "epoch": 0.5916680033246329, "grad_norm": 6.5, "learning_rate": 8.258466581798992e-06, "loss": 0.90241127, "memory(GiB)": 135.49, "step": 25360, "train_speed(iter/s)": 0.203239 }, { "acc": 0.75776558, "epoch": 0.5919013108969218, "grad_norm": 5.75, "learning_rate": 8.257033490995017e-06, "loss": 0.86265593, "memory(GiB)": 135.49, "step": 25370, "train_speed(iter/s)": 0.203279 }, { "acc": 0.77211366, "epoch": 0.5921346184692107, "grad_norm": 5.46875, "learning_rate": 8.255599935250456e-06, "loss": 0.8089241, "memory(GiB)": 135.49, "step": 25380, "train_speed(iter/s)": 0.203318 }, { "acc": 0.76005793, "epoch": 0.5923679260414996, "grad_norm": 5.53125, "learning_rate": 8.254165914769949e-06, "loss": 0.8687521, "memory(GiB)": 135.49, "step": 25390, "train_speed(iter/s)": 0.20336 }, { "acc": 0.74438181, "epoch": 0.5926012336137885, "grad_norm": 4.5, "learning_rate": 8.252731429758205e-06, "loss": 0.94144077, "memory(GiB)": 135.49, "step": 25400, "train_speed(iter/s)": 0.203401 }, { "acc": 0.77221847, "epoch": 0.5928345411860774, "grad_norm": 5.4375, "learning_rate": 8.251296480419992e-06, "loss": 0.81281652, "memory(GiB)": 135.49, "step": 25410, "train_speed(iter/s)": 0.203442 }, { "acc": 0.75362692, "epoch": 0.5930678487583663, "grad_norm": 6.25, "learning_rate": 8.249861066960154e-06, "loss": 0.89317455, "memory(GiB)": 135.49, "step": 25420, "train_speed(iter/s)": 0.203484 }, { "acc": 0.76079087, "epoch": 0.5933011563306552, "grad_norm": 5.875, "learning_rate": 8.248425189583589e-06, "loss": 0.87588558, "memory(GiB)": 135.49, "step": 25430, "train_speed(iter/s)": 0.203524 }, { "acc": 0.78229065, "epoch": 0.5935344639029441, "grad_norm": 5.53125, "learning_rate": 8.246988848495275e-06, "loss": 0.79455709, "memory(GiB)": 135.49, "step": 25440, "train_speed(iter/s)": 0.203565 }, { "acc": 0.76851139, "epoch": 0.593767771475233, "grad_norm": 4.5625, "learning_rate": 8.245552043900245e-06, "loss": 0.82855873, "memory(GiB)": 135.49, "step": 25450, "train_speed(iter/s)": 0.203608 }, { "acc": 0.78094959, "epoch": 0.5940010790475219, "grad_norm": 5.96875, "learning_rate": 8.244114776003605e-06, "loss": 0.79457731, "memory(GiB)": 135.49, "step": 25460, "train_speed(iter/s)": 0.203646 }, { "acc": 0.77298074, "epoch": 0.5942343866198108, "grad_norm": 8.25, "learning_rate": 8.24267704501052e-06, "loss": 0.83030329, "memory(GiB)": 135.49, "step": 25470, "train_speed(iter/s)": 0.20369 }, { "acc": 0.77497997, "epoch": 0.5944676941920997, "grad_norm": 5.28125, "learning_rate": 8.241238851126231e-06, "loss": 0.83005123, "memory(GiB)": 135.49, "step": 25480, "train_speed(iter/s)": 0.203733 }, { "acc": 0.78340192, "epoch": 0.5947010017643886, "grad_norm": 4.84375, "learning_rate": 8.239800194556036e-06, "loss": 0.7847209, "memory(GiB)": 135.49, "step": 25490, "train_speed(iter/s)": 0.203774 }, { "acc": 0.75260081, "epoch": 0.5949343093366775, "grad_norm": 5.375, "learning_rate": 8.238361075505307e-06, "loss": 0.92171764, "memory(GiB)": 135.49, "step": 25500, "train_speed(iter/s)": 0.203813 }, { "epoch": 0.5949343093366775, "eval_acc": 0.7339170181252601, "eval_loss": 0.8390910029411316, "eval_runtime": 1262.7513, "eval_samples_per_second": 28.502, "eval_steps_per_second": 14.251, "step": 25500 }, { "acc": 0.75602245, "epoch": 0.5951676169089664, "grad_norm": 7.09375, "learning_rate": 8.236921494179474e-06, "loss": 0.90613184, "memory(GiB)": 135.49, "step": 25510, "train_speed(iter/s)": 0.201775 }, { "acc": 0.77085948, "epoch": 0.5954009244812551, "grad_norm": 5.78125, "learning_rate": 8.235481450784037e-06, "loss": 0.83134451, "memory(GiB)": 135.49, "step": 25520, "train_speed(iter/s)": 0.201816 }, { "acc": 0.78493161, "epoch": 0.595634232053544, "grad_norm": 8.75, "learning_rate": 8.234040945524563e-06, "loss": 0.79396429, "memory(GiB)": 135.49, "step": 25530, "train_speed(iter/s)": 0.201858 }, { "acc": 0.77554154, "epoch": 0.5958675396258329, "grad_norm": 4.84375, "learning_rate": 8.232599978606683e-06, "loss": 0.80468273, "memory(GiB)": 135.49, "step": 25540, "train_speed(iter/s)": 0.2019 }, { "acc": 0.75647626, "epoch": 0.5961008471981218, "grad_norm": 10.25, "learning_rate": 8.231158550236098e-06, "loss": 0.89243793, "memory(GiB)": 135.49, "step": 25550, "train_speed(iter/s)": 0.201943 }, { "acc": 0.77998843, "epoch": 0.5963341547704107, "grad_norm": 6.59375, "learning_rate": 8.229716660618567e-06, "loss": 0.79620509, "memory(GiB)": 135.49, "step": 25560, "train_speed(iter/s)": 0.201986 }, { "acc": 0.78326197, "epoch": 0.5965674623426996, "grad_norm": 5.3125, "learning_rate": 8.22827430995992e-06, "loss": 0.76009641, "memory(GiB)": 135.49, "step": 25570, "train_speed(iter/s)": 0.202025 }, { "acc": 0.74237843, "epoch": 0.5968007699149885, "grad_norm": 6.28125, "learning_rate": 8.226831498466054e-06, "loss": 0.93370266, "memory(GiB)": 135.49, "step": 25580, "train_speed(iter/s)": 0.202068 }, { "acc": 0.75410166, "epoch": 0.5970340774872774, "grad_norm": 7.78125, "learning_rate": 8.22538822634293e-06, "loss": 0.88475266, "memory(GiB)": 135.49, "step": 25590, "train_speed(iter/s)": 0.202109 }, { "acc": 0.76392784, "epoch": 0.5972673850595663, "grad_norm": 5.96875, "learning_rate": 8.223944493796572e-06, "loss": 0.842031, "memory(GiB)": 135.49, "step": 25600, "train_speed(iter/s)": 0.20215 }, { "acc": 0.79064054, "epoch": 0.5975006926318552, "grad_norm": 5.90625, "learning_rate": 8.222500301033075e-06, "loss": 0.76025066, "memory(GiB)": 135.49, "step": 25610, "train_speed(iter/s)": 0.202195 }, { "acc": 0.77819157, "epoch": 0.5977340002041441, "grad_norm": 6.1875, "learning_rate": 8.221055648258596e-06, "loss": 0.79666338, "memory(GiB)": 135.49, "step": 25620, "train_speed(iter/s)": 0.202234 }, { "acc": 0.76438541, "epoch": 0.597967307776433, "grad_norm": 10.4375, "learning_rate": 8.21961053567936e-06, "loss": 0.84868813, "memory(GiB)": 135.49, "step": 25630, "train_speed(iter/s)": 0.20228 }, { "acc": 0.77430286, "epoch": 0.5982006153487219, "grad_norm": 18.25, "learning_rate": 8.218164963501651e-06, "loss": 0.81446762, "memory(GiB)": 135.49, "step": 25640, "train_speed(iter/s)": 0.202323 }, { "acc": 0.77525482, "epoch": 0.5984339229210108, "grad_norm": 6.03125, "learning_rate": 8.216718931931832e-06, "loss": 0.79652071, "memory(GiB)": 135.49, "step": 25650, "train_speed(iter/s)": 0.202365 }, { "acc": 0.77748456, "epoch": 0.5986672304932997, "grad_norm": 7.65625, "learning_rate": 8.21527244117632e-06, "loss": 0.80770845, "memory(GiB)": 135.49, "step": 25660, "train_speed(iter/s)": 0.202407 }, { "acc": 0.78290329, "epoch": 0.5989005380655886, "grad_norm": 4.75, "learning_rate": 8.2138254914416e-06, "loss": 0.76492586, "memory(GiB)": 135.49, "step": 25670, "train_speed(iter/s)": 0.202447 }, { "acc": 0.76808548, "epoch": 0.5991338456378775, "grad_norm": 6.15625, "learning_rate": 8.212378082934225e-06, "loss": 0.83029194, "memory(GiB)": 135.49, "step": 25680, "train_speed(iter/s)": 0.202487 }, { "acc": 0.77649622, "epoch": 0.5993671532101664, "grad_norm": 5.53125, "learning_rate": 8.210930215860812e-06, "loss": 0.7960465, "memory(GiB)": 135.49, "step": 25690, "train_speed(iter/s)": 0.202528 }, { "acc": 0.75436945, "epoch": 0.5996004607824553, "grad_norm": 5.09375, "learning_rate": 8.209481890428044e-06, "loss": 0.89597979, "memory(GiB)": 135.49, "step": 25700, "train_speed(iter/s)": 0.202569 }, { "acc": 0.77021813, "epoch": 0.5998337683547441, "grad_norm": 5.03125, "learning_rate": 8.208033106842668e-06, "loss": 0.82514133, "memory(GiB)": 135.49, "step": 25710, "train_speed(iter/s)": 0.202606 }, { "acc": 0.76905627, "epoch": 0.600067075927033, "grad_norm": 9.1875, "learning_rate": 8.206583865311497e-06, "loss": 0.82045012, "memory(GiB)": 135.49, "step": 25720, "train_speed(iter/s)": 0.202647 }, { "acc": 0.76189842, "epoch": 0.6003003834993219, "grad_norm": 5.25, "learning_rate": 8.205134166041412e-06, "loss": 0.88493633, "memory(GiB)": 135.49, "step": 25730, "train_speed(iter/s)": 0.202688 }, { "acc": 0.77640295, "epoch": 0.6005336910716108, "grad_norm": 4.9375, "learning_rate": 8.203684009239356e-06, "loss": 0.80714579, "memory(GiB)": 135.49, "step": 25740, "train_speed(iter/s)": 0.202725 }, { "acc": 0.7709527, "epoch": 0.6007669986438997, "grad_norm": 8.1875, "learning_rate": 8.202233395112338e-06, "loss": 0.8069706, "memory(GiB)": 135.49, "step": 25750, "train_speed(iter/s)": 0.202765 }, { "acc": 0.78961029, "epoch": 0.6010003062161886, "grad_norm": 7.0, "learning_rate": 8.200782323867432e-06, "loss": 0.77773442, "memory(GiB)": 135.49, "step": 25760, "train_speed(iter/s)": 0.202806 }, { "acc": 0.76734123, "epoch": 0.6012336137884775, "grad_norm": 4.96875, "learning_rate": 8.19933079571178e-06, "loss": 0.82412434, "memory(GiB)": 135.49, "step": 25770, "train_speed(iter/s)": 0.202847 }, { "acc": 0.79435129, "epoch": 0.6014669213607664, "grad_norm": 5.03125, "learning_rate": 8.197878810852587e-06, "loss": 0.73924217, "memory(GiB)": 135.49, "step": 25780, "train_speed(iter/s)": 0.202886 }, { "acc": 0.76705256, "epoch": 0.6017002289330553, "grad_norm": 6.0625, "learning_rate": 8.196426369497121e-06, "loss": 0.83321819, "memory(GiB)": 135.49, "step": 25790, "train_speed(iter/s)": 0.202927 }, { "acc": 0.75777159, "epoch": 0.6019335365053442, "grad_norm": 6.5625, "learning_rate": 8.19497347185272e-06, "loss": 0.88663998, "memory(GiB)": 135.49, "step": 25800, "train_speed(iter/s)": 0.202966 }, { "acc": 0.74939609, "epoch": 0.6021668440776331, "grad_norm": 5.875, "learning_rate": 8.193520118126785e-06, "loss": 0.89984512, "memory(GiB)": 135.49, "step": 25810, "train_speed(iter/s)": 0.203009 }, { "acc": 0.77597027, "epoch": 0.602400151649922, "grad_norm": 6.0, "learning_rate": 8.19206630852678e-06, "loss": 0.78853464, "memory(GiB)": 135.49, "step": 25820, "train_speed(iter/s)": 0.203052 }, { "acc": 0.76595383, "epoch": 0.6026334592222109, "grad_norm": 4.5625, "learning_rate": 8.190612043260238e-06, "loss": 0.84532881, "memory(GiB)": 135.49, "step": 25830, "train_speed(iter/s)": 0.203092 }, { "acc": 0.74006224, "epoch": 0.6028667667944998, "grad_norm": 4.625, "learning_rate": 8.189157322534753e-06, "loss": 0.92300282, "memory(GiB)": 135.49, "step": 25840, "train_speed(iter/s)": 0.203131 }, { "acc": 0.75753255, "epoch": 0.6031000743667887, "grad_norm": 6.65625, "learning_rate": 8.187702146557986e-06, "loss": 0.8854948, "memory(GiB)": 135.49, "step": 25850, "train_speed(iter/s)": 0.203172 }, { "acc": 0.76122169, "epoch": 0.6033333819390776, "grad_norm": 4.8125, "learning_rate": 8.186246515537664e-06, "loss": 0.86791782, "memory(GiB)": 135.49, "step": 25860, "train_speed(iter/s)": 0.203212 }, { "acc": 0.7631247, "epoch": 0.6035666895113665, "grad_norm": 10.75, "learning_rate": 8.184790429681577e-06, "loss": 0.86330433, "memory(GiB)": 135.49, "step": 25870, "train_speed(iter/s)": 0.203254 }, { "acc": 0.76304603, "epoch": 0.6037999970836554, "grad_norm": 4.4375, "learning_rate": 8.183333889197582e-06, "loss": 0.85228939, "memory(GiB)": 135.49, "step": 25880, "train_speed(iter/s)": 0.203294 }, { "acc": 0.7756114, "epoch": 0.6040333046559443, "grad_norm": 5.8125, "learning_rate": 8.181876894293601e-06, "loss": 0.78088102, "memory(GiB)": 135.49, "step": 25890, "train_speed(iter/s)": 0.203334 }, { "acc": 0.76329818, "epoch": 0.6042666122282331, "grad_norm": 6.96875, "learning_rate": 8.180419445177614e-06, "loss": 0.85486784, "memory(GiB)": 135.49, "step": 25900, "train_speed(iter/s)": 0.203375 }, { "acc": 0.77256145, "epoch": 0.604499919800522, "grad_norm": 5.53125, "learning_rate": 8.178961542057677e-06, "loss": 0.81021471, "memory(GiB)": 135.49, "step": 25910, "train_speed(iter/s)": 0.203415 }, { "acc": 0.77258301, "epoch": 0.6047332273728109, "grad_norm": 5.65625, "learning_rate": 8.177503185141904e-06, "loss": 0.83756752, "memory(GiB)": 135.49, "step": 25920, "train_speed(iter/s)": 0.203453 }, { "acc": 0.75234871, "epoch": 0.6049665349450998, "grad_norm": 4.4375, "learning_rate": 8.176044374638473e-06, "loss": 0.88097897, "memory(GiB)": 135.49, "step": 25930, "train_speed(iter/s)": 0.203494 }, { "acc": 0.77932882, "epoch": 0.6051998425173887, "grad_norm": 7.15625, "learning_rate": 8.174585110755631e-06, "loss": 0.78235216, "memory(GiB)": 135.49, "step": 25940, "train_speed(iter/s)": 0.203533 }, { "acc": 0.77629814, "epoch": 0.6054331500896776, "grad_norm": 6.125, "learning_rate": 8.173125393701686e-06, "loss": 0.80188866, "memory(GiB)": 135.49, "step": 25950, "train_speed(iter/s)": 0.203575 }, { "acc": 0.77888889, "epoch": 0.6056664576619665, "grad_norm": 7.0, "learning_rate": 8.171665223685014e-06, "loss": 0.8051487, "memory(GiB)": 135.49, "step": 25960, "train_speed(iter/s)": 0.203614 }, { "acc": 0.76836891, "epoch": 0.6058997652342554, "grad_norm": 4.6875, "learning_rate": 8.170204600914051e-06, "loss": 0.83836594, "memory(GiB)": 135.49, "step": 25970, "train_speed(iter/s)": 0.203654 }, { "acc": 0.77345543, "epoch": 0.6061330728065443, "grad_norm": 5.25, "learning_rate": 8.168743525597304e-06, "loss": 0.81917725, "memory(GiB)": 135.49, "step": 25980, "train_speed(iter/s)": 0.203694 }, { "acc": 0.75752845, "epoch": 0.6063663803788332, "grad_norm": 6.3125, "learning_rate": 8.167281997943338e-06, "loss": 0.87502155, "memory(GiB)": 135.49, "step": 25990, "train_speed(iter/s)": 0.203737 }, { "acc": 0.75441628, "epoch": 0.6065996879511221, "grad_norm": 6.34375, "learning_rate": 8.165820018160787e-06, "loss": 0.8976305, "memory(GiB)": 135.49, "step": 26000, "train_speed(iter/s)": 0.203774 }, { "epoch": 0.6065996879511221, "eval_acc": 0.7340015617991669, "eval_loss": 0.8386362195014954, "eval_runtime": 1263.1583, "eval_samples_per_second": 28.493, "eval_steps_per_second": 14.247, "step": 26000 }, { "acc": 0.76705179, "epoch": 0.606832995523411, "grad_norm": 6.40625, "learning_rate": 8.164357586458348e-06, "loss": 0.8483676, "memory(GiB)": 135.49, "step": 26010, "train_speed(iter/s)": 0.201777 }, { "acc": 0.77568212, "epoch": 0.6070663030956999, "grad_norm": 6.46875, "learning_rate": 8.162894703044783e-06, "loss": 0.81639938, "memory(GiB)": 135.49, "step": 26020, "train_speed(iter/s)": 0.201819 }, { "acc": 0.76290741, "epoch": 0.6072996106679888, "grad_norm": 5.5625, "learning_rate": 8.161431368128919e-06, "loss": 0.8517725, "memory(GiB)": 135.49, "step": 26030, "train_speed(iter/s)": 0.201861 }, { "acc": 0.77222137, "epoch": 0.6075329182402777, "grad_norm": 5.65625, "learning_rate": 8.159967581919644e-06, "loss": 0.81989021, "memory(GiB)": 135.49, "step": 26040, "train_speed(iter/s)": 0.201902 }, { "acc": 0.78696246, "epoch": 0.6077662258125666, "grad_norm": 5.65625, "learning_rate": 8.158503344625915e-06, "loss": 0.79083347, "memory(GiB)": 135.49, "step": 26050, "train_speed(iter/s)": 0.201941 }, { "acc": 0.74508247, "epoch": 0.6079995333848555, "grad_norm": 6.46875, "learning_rate": 8.157038656456752e-06, "loss": 0.93786983, "memory(GiB)": 135.49, "step": 26060, "train_speed(iter/s)": 0.201983 }, { "acc": 0.77679863, "epoch": 0.6082328409571444, "grad_norm": 4.875, "learning_rate": 8.155573517621238e-06, "loss": 0.80609989, "memory(GiB)": 135.49, "step": 26070, "train_speed(iter/s)": 0.202022 }, { "acc": 0.74756947, "epoch": 0.6084661485294333, "grad_norm": 12.0, "learning_rate": 8.154107928328521e-06, "loss": 0.94404678, "memory(GiB)": 135.49, "step": 26080, "train_speed(iter/s)": 0.202063 }, { "acc": 0.76765032, "epoch": 0.608699456101722, "grad_norm": 8.8125, "learning_rate": 8.152641888787812e-06, "loss": 0.83548355, "memory(GiB)": 135.49, "step": 26090, "train_speed(iter/s)": 0.202102 }, { "acc": 0.76142464, "epoch": 0.608932763674011, "grad_norm": 7.34375, "learning_rate": 8.15117539920839e-06, "loss": 0.86243286, "memory(GiB)": 135.49, "step": 26100, "train_speed(iter/s)": 0.202144 }, { "acc": 0.75862079, "epoch": 0.6091660712462998, "grad_norm": 7.09375, "learning_rate": 8.149708459799595e-06, "loss": 0.87197685, "memory(GiB)": 135.49, "step": 26110, "train_speed(iter/s)": 0.202186 }, { "acc": 0.7474113, "epoch": 0.6093993788185887, "grad_norm": 5.59375, "learning_rate": 8.148241070770834e-06, "loss": 0.89805717, "memory(GiB)": 135.49, "step": 26120, "train_speed(iter/s)": 0.20223 }, { "acc": 0.76918221, "epoch": 0.6096326863908776, "grad_norm": 12.75, "learning_rate": 8.146773232331574e-06, "loss": 0.8256732, "memory(GiB)": 135.49, "step": 26130, "train_speed(iter/s)": 0.202271 }, { "acc": 0.75791473, "epoch": 0.6098659939631665, "grad_norm": 5.96875, "learning_rate": 8.145304944691347e-06, "loss": 0.87835732, "memory(GiB)": 135.49, "step": 26140, "train_speed(iter/s)": 0.202313 }, { "acc": 0.76345959, "epoch": 0.6100993015354554, "grad_norm": 5.78125, "learning_rate": 8.143836208059754e-06, "loss": 0.84864016, "memory(GiB)": 135.49, "step": 26150, "train_speed(iter/s)": 0.202355 }, { "acc": 0.76515064, "epoch": 0.6103326091077443, "grad_norm": 4.9375, "learning_rate": 8.142367022646457e-06, "loss": 0.85870914, "memory(GiB)": 135.49, "step": 26160, "train_speed(iter/s)": 0.202395 }, { "acc": 0.78016558, "epoch": 0.6105659166800332, "grad_norm": 5.0625, "learning_rate": 8.14089738866118e-06, "loss": 0.81447334, "memory(GiB)": 135.49, "step": 26170, "train_speed(iter/s)": 0.202431 }, { "acc": 0.76828427, "epoch": 0.6107992242523221, "grad_norm": 5.0, "learning_rate": 8.139427306313713e-06, "loss": 0.82709599, "memory(GiB)": 135.49, "step": 26180, "train_speed(iter/s)": 0.202467 }, { "acc": 0.77300539, "epoch": 0.611032531824611, "grad_norm": 7.6875, "learning_rate": 8.137956775813909e-06, "loss": 0.82861233, "memory(GiB)": 135.49, "step": 26190, "train_speed(iter/s)": 0.20251 }, { "acc": 0.76799994, "epoch": 0.6112658393968999, "grad_norm": 5.75, "learning_rate": 8.136485797371687e-06, "loss": 0.84132051, "memory(GiB)": 135.49, "step": 26200, "train_speed(iter/s)": 0.20255 }, { "acc": 0.77771664, "epoch": 0.6114991469691888, "grad_norm": 5.21875, "learning_rate": 8.13501437119703e-06, "loss": 0.81489172, "memory(GiB)": 135.49, "step": 26210, "train_speed(iter/s)": 0.202592 }, { "acc": 0.75387483, "epoch": 0.6117324545414777, "grad_norm": 7.0625, "learning_rate": 8.133542497499981e-06, "loss": 0.88717518, "memory(GiB)": 135.49, "step": 26220, "train_speed(iter/s)": 0.202631 }, { "acc": 0.79006443, "epoch": 0.6119657621137666, "grad_norm": 4.46875, "learning_rate": 8.132070176490652e-06, "loss": 0.76933956, "memory(GiB)": 135.49, "step": 26230, "train_speed(iter/s)": 0.202671 }, { "acc": 0.77178202, "epoch": 0.6121990696860555, "grad_norm": 5.1875, "learning_rate": 8.130597408379214e-06, "loss": 0.8304903, "memory(GiB)": 135.49, "step": 26240, "train_speed(iter/s)": 0.202711 }, { "acc": 0.76692171, "epoch": 0.6124323772583444, "grad_norm": 5.84375, "learning_rate": 8.129124193375906e-06, "loss": 0.84440994, "memory(GiB)": 135.49, "step": 26250, "train_speed(iter/s)": 0.202753 }, { "acc": 0.78004012, "epoch": 0.6126656848306333, "grad_norm": 5.5625, "learning_rate": 8.127650531691028e-06, "loss": 0.79581604, "memory(GiB)": 135.49, "step": 26260, "train_speed(iter/s)": 0.202792 }, { "acc": 0.75387669, "epoch": 0.6128989924029222, "grad_norm": 5.90625, "learning_rate": 8.126176423534945e-06, "loss": 0.88765812, "memory(GiB)": 135.49, "step": 26270, "train_speed(iter/s)": 0.202833 }, { "acc": 0.77342339, "epoch": 0.6131322999752111, "grad_norm": 6.15625, "learning_rate": 8.124701869118086e-06, "loss": 0.82173004, "memory(GiB)": 135.49, "step": 26280, "train_speed(iter/s)": 0.20287 }, { "acc": 0.75532489, "epoch": 0.6133656075474999, "grad_norm": 5.65625, "learning_rate": 8.123226868650944e-06, "loss": 0.90019913, "memory(GiB)": 135.49, "step": 26290, "train_speed(iter/s)": 0.202908 }, { "acc": 0.76817436, "epoch": 0.6135989151197888, "grad_norm": 7.21875, "learning_rate": 8.121751422344072e-06, "loss": 0.8288271, "memory(GiB)": 135.49, "step": 26300, "train_speed(iter/s)": 0.202951 }, { "acc": 0.7639535, "epoch": 0.6138322226920777, "grad_norm": 13.1875, "learning_rate": 8.120275530408092e-06, "loss": 0.8539258, "memory(GiB)": 135.49, "step": 26310, "train_speed(iter/s)": 0.202989 }, { "acc": 0.75322371, "epoch": 0.6140655302643666, "grad_norm": 5.84375, "learning_rate": 8.118799193053686e-06, "loss": 0.89668293, "memory(GiB)": 135.49, "step": 26320, "train_speed(iter/s)": 0.203029 }, { "acc": 0.76399355, "epoch": 0.6142988378366555, "grad_norm": 7.90625, "learning_rate": 8.117322410491602e-06, "loss": 0.85157032, "memory(GiB)": 135.49, "step": 26330, "train_speed(iter/s)": 0.203067 }, { "acc": 0.75577831, "epoch": 0.6145321454089444, "grad_norm": 10.9375, "learning_rate": 8.11584518293265e-06, "loss": 0.88066349, "memory(GiB)": 135.49, "step": 26340, "train_speed(iter/s)": 0.203109 }, { "acc": 0.76082258, "epoch": 0.6147654529812333, "grad_norm": 5.28125, "learning_rate": 8.114367510587701e-06, "loss": 0.86330738, "memory(GiB)": 135.49, "step": 26350, "train_speed(iter/s)": 0.20315 }, { "acc": 0.75768723, "epoch": 0.6149987605535222, "grad_norm": 6.625, "learning_rate": 8.112889393667698e-06, "loss": 0.88470192, "memory(GiB)": 135.49, "step": 26360, "train_speed(iter/s)": 0.203192 }, { "acc": 0.78070073, "epoch": 0.6152320681258111, "grad_norm": 4.3125, "learning_rate": 8.111410832383635e-06, "loss": 0.77236452, "memory(GiB)": 135.49, "step": 26370, "train_speed(iter/s)": 0.20323 }, { "acc": 0.76793127, "epoch": 0.6154653756981, "grad_norm": 6.65625, "learning_rate": 8.109931826946582e-06, "loss": 0.84716873, "memory(GiB)": 135.49, "step": 26380, "train_speed(iter/s)": 0.20327 }, { "acc": 0.77840319, "epoch": 0.6156986832703889, "grad_norm": 5.65625, "learning_rate": 8.108452377567663e-06, "loss": 0.80551405, "memory(GiB)": 135.49, "step": 26390, "train_speed(iter/s)": 0.20331 }, { "acc": 0.76713343, "epoch": 0.6159319908426778, "grad_norm": 10.1875, "learning_rate": 8.10697248445807e-06, "loss": 0.83567467, "memory(GiB)": 135.49, "step": 26400, "train_speed(iter/s)": 0.203349 }, { "acc": 0.77320881, "epoch": 0.6161652984149667, "grad_norm": 5.78125, "learning_rate": 8.105492147829059e-06, "loss": 0.810427, "memory(GiB)": 135.49, "step": 26410, "train_speed(iter/s)": 0.203391 }, { "acc": 0.77742152, "epoch": 0.6163986059872556, "grad_norm": 7.53125, "learning_rate": 8.104011367891944e-06, "loss": 0.83462811, "memory(GiB)": 135.49, "step": 26420, "train_speed(iter/s)": 0.203429 }, { "acc": 0.7777009, "epoch": 0.6166319135595445, "grad_norm": 5.90625, "learning_rate": 8.102530144858109e-06, "loss": 0.80389175, "memory(GiB)": 135.49, "step": 26430, "train_speed(iter/s)": 0.203469 }, { "acc": 0.77913895, "epoch": 0.6168652211318334, "grad_norm": 11.25, "learning_rate": 8.101048478938997e-06, "loss": 0.77363768, "memory(GiB)": 135.49, "step": 26440, "train_speed(iter/s)": 0.20351 }, { "acc": 0.7755537, "epoch": 0.6170985287041223, "grad_norm": 5.375, "learning_rate": 8.099566370346115e-06, "loss": 0.82108278, "memory(GiB)": 135.49, "step": 26450, "train_speed(iter/s)": 0.203553 }, { "acc": 0.7845252, "epoch": 0.6173318362764112, "grad_norm": 5.375, "learning_rate": 8.098083819291034e-06, "loss": 0.77753963, "memory(GiB)": 135.49, "step": 26460, "train_speed(iter/s)": 0.203594 }, { "acc": 0.75529957, "epoch": 0.6175651438487001, "grad_norm": 5.5, "learning_rate": 8.096600825985388e-06, "loss": 0.88239994, "memory(GiB)": 135.49, "step": 26470, "train_speed(iter/s)": 0.203636 }, { "acc": 0.76516547, "epoch": 0.6177984514209889, "grad_norm": 7.6875, "learning_rate": 8.095117390640875e-06, "loss": 0.83290586, "memory(GiB)": 135.49, "step": 26480, "train_speed(iter/s)": 0.203675 }, { "acc": 0.79196672, "epoch": 0.6180317589932778, "grad_norm": 6.84375, "learning_rate": 8.093633513469252e-06, "loss": 0.74673676, "memory(GiB)": 135.49, "step": 26490, "train_speed(iter/s)": 0.203717 }, { "acc": 0.78214278, "epoch": 0.6182650665655667, "grad_norm": 6.5625, "learning_rate": 8.092149194682343e-06, "loss": 0.79779549, "memory(GiB)": 135.49, "step": 26500, "train_speed(iter/s)": 0.203757 }, { "epoch": 0.6182650665655667, "eval_acc": 0.7340567410672509, "eval_loss": 0.8385833501815796, "eval_runtime": 1262.9222, "eval_samples_per_second": 28.498, "eval_steps_per_second": 14.249, "step": 26500 }, { "acc": 0.77880516, "epoch": 0.6184983741378556, "grad_norm": 4.1875, "learning_rate": 8.090664434492037e-06, "loss": 0.79661665, "memory(GiB)": 135.49, "step": 26510, "train_speed(iter/s)": 0.201801 }, { "acc": 0.7707437, "epoch": 0.6187316817101445, "grad_norm": 57.25, "learning_rate": 8.08917923311028e-06, "loss": 0.8659133, "memory(GiB)": 135.49, "step": 26520, "train_speed(iter/s)": 0.201839 }, { "acc": 0.75570502, "epoch": 0.6189649892824334, "grad_norm": 8.6875, "learning_rate": 8.087693590749083e-06, "loss": 0.88205557, "memory(GiB)": 135.49, "step": 26530, "train_speed(iter/s)": 0.201876 }, { "acc": 0.7877964, "epoch": 0.6191982968547223, "grad_norm": 6.25, "learning_rate": 8.086207507620524e-06, "loss": 0.75233488, "memory(GiB)": 135.49, "step": 26540, "train_speed(iter/s)": 0.201914 }, { "acc": 0.77488666, "epoch": 0.6194316044270112, "grad_norm": 6.875, "learning_rate": 8.084720983936742e-06, "loss": 0.82435246, "memory(GiB)": 135.49, "step": 26550, "train_speed(iter/s)": 0.201955 }, { "acc": 0.73403945, "epoch": 0.6196649119993001, "grad_norm": 6.5, "learning_rate": 8.083234019909933e-06, "loss": 0.97836504, "memory(GiB)": 135.49, "step": 26560, "train_speed(iter/s)": 0.201992 }, { "acc": 0.76793642, "epoch": 0.619898219571589, "grad_norm": 8.375, "learning_rate": 8.081746615752365e-06, "loss": 0.84815531, "memory(GiB)": 135.49, "step": 26570, "train_speed(iter/s)": 0.202031 }, { "acc": 0.76675844, "epoch": 0.6201315271438779, "grad_norm": 6.53125, "learning_rate": 8.080258771676363e-06, "loss": 0.82447166, "memory(GiB)": 135.49, "step": 26580, "train_speed(iter/s)": 0.202071 }, { "acc": 0.77442513, "epoch": 0.6203648347161668, "grad_norm": 8.0, "learning_rate": 8.078770487894314e-06, "loss": 0.812784, "memory(GiB)": 135.49, "step": 26590, "train_speed(iter/s)": 0.202107 }, { "acc": 0.75438404, "epoch": 0.6205981422884557, "grad_norm": 5.4375, "learning_rate": 8.077281764618674e-06, "loss": 0.89564629, "memory(GiB)": 135.49, "step": 26600, "train_speed(iter/s)": 0.202148 }, { "acc": 0.77537041, "epoch": 0.6208314498607446, "grad_norm": 6.90625, "learning_rate": 8.075792602061955e-06, "loss": 0.79871836, "memory(GiB)": 135.49, "step": 26610, "train_speed(iter/s)": 0.202187 }, { "acc": 0.77573071, "epoch": 0.6210647574330335, "grad_norm": 5.6875, "learning_rate": 8.074303000436737e-06, "loss": 0.81978054, "memory(GiB)": 135.49, "step": 26620, "train_speed(iter/s)": 0.202228 }, { "acc": 0.76133389, "epoch": 0.6212980650053224, "grad_norm": 6.09375, "learning_rate": 8.072812959955657e-06, "loss": 0.88510962, "memory(GiB)": 135.49, "step": 26630, "train_speed(iter/s)": 0.202268 }, { "acc": 0.7703104, "epoch": 0.6215313725776113, "grad_norm": 6.28125, "learning_rate": 8.071322480831422e-06, "loss": 0.82699738, "memory(GiB)": 135.49, "step": 26640, "train_speed(iter/s)": 0.202307 }, { "acc": 0.76226177, "epoch": 0.6217646801499002, "grad_norm": 5.28125, "learning_rate": 8.069831563276793e-06, "loss": 0.87262077, "memory(GiB)": 135.49, "step": 26650, "train_speed(iter/s)": 0.202347 }, { "acc": 0.76208782, "epoch": 0.6219979877221891, "grad_norm": 5.15625, "learning_rate": 8.068340207504601e-06, "loss": 0.8816762, "memory(GiB)": 135.49, "step": 26660, "train_speed(iter/s)": 0.202386 }, { "acc": 0.76370525, "epoch": 0.6222312952944778, "grad_norm": 6.15625, "learning_rate": 8.066848413727736e-06, "loss": 0.86626339, "memory(GiB)": 135.49, "step": 26670, "train_speed(iter/s)": 0.202423 }, { "acc": 0.75106564, "epoch": 0.6224646028667667, "grad_norm": 9.9375, "learning_rate": 8.06535618215915e-06, "loss": 0.91045132, "memory(GiB)": 135.49, "step": 26680, "train_speed(iter/s)": 0.202465 }, { "acc": 0.77608376, "epoch": 0.6226979104390556, "grad_norm": 5.625, "learning_rate": 8.06386351301186e-06, "loss": 0.78377199, "memory(GiB)": 135.49, "step": 26690, "train_speed(iter/s)": 0.202503 }, { "acc": 0.76760941, "epoch": 0.6229312180113445, "grad_norm": 6.0625, "learning_rate": 8.062370406498944e-06, "loss": 0.84575386, "memory(GiB)": 135.49, "step": 26700, "train_speed(iter/s)": 0.202543 }, { "acc": 0.76852503, "epoch": 0.6231645255836334, "grad_norm": 4.53125, "learning_rate": 8.060876862833543e-06, "loss": 0.84211216, "memory(GiB)": 135.49, "step": 26710, "train_speed(iter/s)": 0.202582 }, { "acc": 0.76144214, "epoch": 0.6233978331559223, "grad_norm": 5.0, "learning_rate": 8.059382882228857e-06, "loss": 0.87267609, "memory(GiB)": 135.49, "step": 26720, "train_speed(iter/s)": 0.202623 }, { "acc": 0.77035346, "epoch": 0.6236311407282112, "grad_norm": 5.03125, "learning_rate": 8.057888464898153e-06, "loss": 0.82407818, "memory(GiB)": 135.49, "step": 26730, "train_speed(iter/s)": 0.202661 }, { "acc": 0.76023579, "epoch": 0.6238644483005001, "grad_norm": 15.0625, "learning_rate": 8.056393611054761e-06, "loss": 0.84788218, "memory(GiB)": 135.49, "step": 26740, "train_speed(iter/s)": 0.202701 }, { "acc": 0.76369381, "epoch": 0.624097755872789, "grad_norm": 5.1875, "learning_rate": 8.054898320912069e-06, "loss": 0.86801701, "memory(GiB)": 135.49, "step": 26750, "train_speed(iter/s)": 0.202743 }, { "acc": 0.77449284, "epoch": 0.6243310634450779, "grad_norm": 8.5625, "learning_rate": 8.053402594683527e-06, "loss": 0.82818737, "memory(GiB)": 135.49, "step": 26760, "train_speed(iter/s)": 0.202785 }, { "acc": 0.76777878, "epoch": 0.6245643710173668, "grad_norm": 6.28125, "learning_rate": 8.051906432582651e-06, "loss": 0.8671772, "memory(GiB)": 135.49, "step": 26770, "train_speed(iter/s)": 0.202825 }, { "acc": 0.7700264, "epoch": 0.6247976785896557, "grad_norm": 5.40625, "learning_rate": 8.050409834823021e-06, "loss": 0.82168827, "memory(GiB)": 135.49, "step": 26780, "train_speed(iter/s)": 0.202865 }, { "acc": 0.77923937, "epoch": 0.6250309861619446, "grad_norm": 5.21875, "learning_rate": 8.04891280161827e-06, "loss": 0.79964075, "memory(GiB)": 135.49, "step": 26790, "train_speed(iter/s)": 0.202905 }, { "acc": 0.75467615, "epoch": 0.6252642937342335, "grad_norm": 6.21875, "learning_rate": 8.047415333182105e-06, "loss": 0.89148998, "memory(GiB)": 135.49, "step": 26800, "train_speed(iter/s)": 0.202945 }, { "acc": 0.77384977, "epoch": 0.6254976013065224, "grad_norm": 9.0625, "learning_rate": 8.045917429728286e-06, "loss": 0.81179743, "memory(GiB)": 135.49, "step": 26810, "train_speed(iter/s)": 0.202984 }, { "acc": 0.76612177, "epoch": 0.6257309088788113, "grad_norm": 7.0625, "learning_rate": 8.044419091470638e-06, "loss": 0.83759737, "memory(GiB)": 135.49, "step": 26820, "train_speed(iter/s)": 0.203023 }, { "acc": 0.79365482, "epoch": 0.6259642164511002, "grad_norm": 5.28125, "learning_rate": 8.042920318623051e-06, "loss": 0.74877462, "memory(GiB)": 135.49, "step": 26830, "train_speed(iter/s)": 0.203064 }, { "acc": 0.76441841, "epoch": 0.6261975240233891, "grad_norm": 5.3125, "learning_rate": 8.04142111139947e-06, "loss": 0.83741837, "memory(GiB)": 135.49, "step": 26840, "train_speed(iter/s)": 0.203104 }, { "acc": 0.75787349, "epoch": 0.626430831595678, "grad_norm": 5.125, "learning_rate": 8.039921470013912e-06, "loss": 0.88595181, "memory(GiB)": 135.49, "step": 26850, "train_speed(iter/s)": 0.203142 }, { "acc": 0.76140709, "epoch": 0.6266641391679668, "grad_norm": 6.75, "learning_rate": 8.038421394680445e-06, "loss": 0.86222057, "memory(GiB)": 135.49, "step": 26860, "train_speed(iter/s)": 0.203182 }, { "acc": 0.78027782, "epoch": 0.6268974467402557, "grad_norm": 6.0, "learning_rate": 8.036920885613206e-06, "loss": 0.78418803, "memory(GiB)": 135.49, "step": 26870, "train_speed(iter/s)": 0.20322 }, { "acc": 0.78027434, "epoch": 0.6271307543125446, "grad_norm": 5.21875, "learning_rate": 8.035419943026395e-06, "loss": 0.78440542, "memory(GiB)": 135.49, "step": 26880, "train_speed(iter/s)": 0.203257 }, { "acc": 0.76543207, "epoch": 0.6273640618848335, "grad_norm": 4.59375, "learning_rate": 8.033918567134266e-06, "loss": 0.85047245, "memory(GiB)": 135.49, "step": 26890, "train_speed(iter/s)": 0.203294 }, { "acc": 0.75999284, "epoch": 0.6275973694571224, "grad_norm": 4.625, "learning_rate": 8.032416758151144e-06, "loss": 0.88216581, "memory(GiB)": 135.49, "step": 26900, "train_speed(iter/s)": 0.203332 }, { "acc": 0.76236706, "epoch": 0.6278306770294113, "grad_norm": 9.1875, "learning_rate": 8.030914516291413e-06, "loss": 0.83522234, "memory(GiB)": 135.49, "step": 26910, "train_speed(iter/s)": 0.203372 }, { "acc": 0.77721143, "epoch": 0.6280639846017002, "grad_norm": 5.40625, "learning_rate": 8.029411841769515e-06, "loss": 0.81465416, "memory(GiB)": 135.49, "step": 26920, "train_speed(iter/s)": 0.203411 }, { "acc": 0.77981124, "epoch": 0.6282972921739891, "grad_norm": 5.09375, "learning_rate": 8.027908734799954e-06, "loss": 0.78808117, "memory(GiB)": 135.49, "step": 26930, "train_speed(iter/s)": 0.203448 }, { "acc": 0.78238392, "epoch": 0.628530599746278, "grad_norm": 5.53125, "learning_rate": 8.026405195597302e-06, "loss": 0.78049965, "memory(GiB)": 135.49, "step": 26940, "train_speed(iter/s)": 0.203489 }, { "acc": 0.7622592, "epoch": 0.6287639073185669, "grad_norm": 6.71875, "learning_rate": 8.024901224376186e-06, "loss": 0.84679203, "memory(GiB)": 135.49, "step": 26950, "train_speed(iter/s)": 0.20353 }, { "acc": 0.7593092, "epoch": 0.6289972148908558, "grad_norm": 6.125, "learning_rate": 8.023396821351302e-06, "loss": 0.88364334, "memory(GiB)": 135.49, "step": 26960, "train_speed(iter/s)": 0.20357 }, { "acc": 0.76687098, "epoch": 0.6292305224631447, "grad_norm": 7.21875, "learning_rate": 8.021891986737399e-06, "loss": 0.82395229, "memory(GiB)": 135.49, "step": 26970, "train_speed(iter/s)": 0.203609 }, { "acc": 0.77052889, "epoch": 0.6294638300354336, "grad_norm": 5.8125, "learning_rate": 8.020386720749292e-06, "loss": 0.83412533, "memory(GiB)": 135.49, "step": 26980, "train_speed(iter/s)": 0.203649 }, { "acc": 0.7800168, "epoch": 0.6296971376077225, "grad_norm": 4.46875, "learning_rate": 8.018881023601858e-06, "loss": 0.80071144, "memory(GiB)": 135.49, "step": 26990, "train_speed(iter/s)": 0.203689 }, { "acc": 0.78654366, "epoch": 0.6299304451800114, "grad_norm": 5.84375, "learning_rate": 8.017374895510035e-06, "loss": 0.76416159, "memory(GiB)": 135.49, "step": 27000, "train_speed(iter/s)": 0.203728 }, { "epoch": 0.6299304451800114, "eval_acc": 0.7341051439339914, "eval_loss": 0.8383846879005432, "eval_runtime": 1262.1882, "eval_samples_per_second": 28.515, "eval_steps_per_second": 14.258, "step": 27000 }, { "acc": 0.76437502, "epoch": 0.6301637527523003, "grad_norm": 5.15625, "learning_rate": 8.015868336688822e-06, "loss": 0.85710135, "memory(GiB)": 135.49, "step": 27010, "train_speed(iter/s)": 0.201808 }, { "acc": 0.77605839, "epoch": 0.6303970603245892, "grad_norm": 5.5, "learning_rate": 8.01436134735328e-06, "loss": 0.81282873, "memory(GiB)": 135.49, "step": 27020, "train_speed(iter/s)": 0.201847 }, { "acc": 0.79097385, "epoch": 0.6306303678968781, "grad_norm": 5.34375, "learning_rate": 8.012853927718532e-06, "loss": 0.7400207, "memory(GiB)": 135.49, "step": 27030, "train_speed(iter/s)": 0.201885 }, { "acc": 0.76154871, "epoch": 0.630863675469167, "grad_norm": 9.875, "learning_rate": 8.011346077999762e-06, "loss": 0.86349401, "memory(GiB)": 135.49, "step": 27040, "train_speed(iter/s)": 0.201922 }, { "acc": 0.74521198, "epoch": 0.6310969830414559, "grad_norm": 7.0625, "learning_rate": 8.009837798412213e-06, "loss": 0.92456331, "memory(GiB)": 135.49, "step": 27050, "train_speed(iter/s)": 0.201961 }, { "acc": 0.76841507, "epoch": 0.6313302906137447, "grad_norm": 5.21875, "learning_rate": 8.008329089171192e-06, "loss": 0.8208066, "memory(GiB)": 135.49, "step": 27060, "train_speed(iter/s)": 0.202 }, { "acc": 0.77050219, "epoch": 0.6315635981860336, "grad_norm": 5.09375, "learning_rate": 8.006819950492067e-06, "loss": 0.82688265, "memory(GiB)": 135.49, "step": 27070, "train_speed(iter/s)": 0.202038 }, { "acc": 0.75582962, "epoch": 0.6317969057583225, "grad_norm": 7.90625, "learning_rate": 8.00531038259027e-06, "loss": 0.865378, "memory(GiB)": 135.49, "step": 27080, "train_speed(iter/s)": 0.202077 }, { "acc": 0.78286657, "epoch": 0.6320302133306114, "grad_norm": 5.71875, "learning_rate": 8.003800385681287e-06, "loss": 0.77129941, "memory(GiB)": 135.49, "step": 27090, "train_speed(iter/s)": 0.202115 }, { "acc": 0.76713624, "epoch": 0.6322635209029003, "grad_norm": 4.84375, "learning_rate": 8.002289959980672e-06, "loss": 0.83562679, "memory(GiB)": 135.49, "step": 27100, "train_speed(iter/s)": 0.202155 }, { "acc": 0.76295204, "epoch": 0.6324968284751892, "grad_norm": 6.59375, "learning_rate": 8.000779105704037e-06, "loss": 0.83708286, "memory(GiB)": 135.49, "step": 27110, "train_speed(iter/s)": 0.202193 }, { "acc": 0.76064711, "epoch": 0.6327301360474781, "grad_norm": 7.125, "learning_rate": 7.999267823067056e-06, "loss": 0.88228626, "memory(GiB)": 135.49, "step": 27120, "train_speed(iter/s)": 0.202233 }, { "acc": 0.76847858, "epoch": 0.632963443619767, "grad_norm": 5.59375, "learning_rate": 7.997756112285467e-06, "loss": 0.8320076, "memory(GiB)": 135.49, "step": 27130, "train_speed(iter/s)": 0.202271 }, { "acc": 0.77316833, "epoch": 0.6331967511920559, "grad_norm": 4.9375, "learning_rate": 7.996243973575062e-06, "loss": 0.8570303, "memory(GiB)": 135.49, "step": 27140, "train_speed(iter/s)": 0.20231 }, { "acc": 0.77561502, "epoch": 0.6334300587643448, "grad_norm": 5.84375, "learning_rate": 7.994731407151702e-06, "loss": 0.80374851, "memory(GiB)": 135.49, "step": 27150, "train_speed(iter/s)": 0.202348 }, { "acc": 0.75557976, "epoch": 0.6336633663366337, "grad_norm": 5.28125, "learning_rate": 7.9932184132313e-06, "loss": 0.88049164, "memory(GiB)": 135.49, "step": 27160, "train_speed(iter/s)": 0.202386 }, { "acc": 0.76377411, "epoch": 0.6338966739089226, "grad_norm": 4.90625, "learning_rate": 7.99170499202984e-06, "loss": 0.86145134, "memory(GiB)": 135.49, "step": 27170, "train_speed(iter/s)": 0.202424 }, { "acc": 0.74147487, "epoch": 0.6341299814812115, "grad_norm": 6.5, "learning_rate": 7.990191143763364e-06, "loss": 0.92697124, "memory(GiB)": 135.49, "step": 27180, "train_speed(iter/s)": 0.202462 }, { "acc": 0.76796284, "epoch": 0.6343632890535004, "grad_norm": 6.03125, "learning_rate": 7.988676868647969e-06, "loss": 0.84087086, "memory(GiB)": 135.49, "step": 27190, "train_speed(iter/s)": 0.202501 }, { "acc": 0.76078849, "epoch": 0.6345965966257893, "grad_norm": 6.84375, "learning_rate": 7.98716216689982e-06, "loss": 0.87098112, "memory(GiB)": 135.49, "step": 27200, "train_speed(iter/s)": 0.202541 }, { "acc": 0.77481918, "epoch": 0.6348299041980782, "grad_norm": 4.78125, "learning_rate": 7.985647038735139e-06, "loss": 0.83479147, "memory(GiB)": 135.49, "step": 27210, "train_speed(iter/s)": 0.202582 }, { "acc": 0.76346474, "epoch": 0.6350632117703671, "grad_norm": 5.09375, "learning_rate": 7.98413148437021e-06, "loss": 0.849786, "memory(GiB)": 135.49, "step": 27220, "train_speed(iter/s)": 0.202618 }, { "acc": 0.768367, "epoch": 0.635296519342656, "grad_norm": 6.53125, "learning_rate": 7.98261550402138e-06, "loss": 0.83857031, "memory(GiB)": 135.49, "step": 27230, "train_speed(iter/s)": 0.202657 }, { "acc": 0.77152929, "epoch": 0.6355298269149449, "grad_norm": 6.6875, "learning_rate": 7.981099097905051e-06, "loss": 0.82611742, "memory(GiB)": 135.49, "step": 27240, "train_speed(iter/s)": 0.202697 }, { "acc": 0.76331477, "epoch": 0.6357631344872336, "grad_norm": 5.6875, "learning_rate": 7.979582266237695e-06, "loss": 0.88298092, "memory(GiB)": 135.49, "step": 27250, "train_speed(iter/s)": 0.202735 }, { "acc": 0.78458672, "epoch": 0.6359964420595225, "grad_norm": 6.5625, "learning_rate": 7.978065009235834e-06, "loss": 0.76901836, "memory(GiB)": 135.49, "step": 27260, "train_speed(iter/s)": 0.202774 }, { "acc": 0.7700531, "epoch": 0.6362297496318114, "grad_norm": 6.8125, "learning_rate": 7.976547327116058e-06, "loss": 0.81084709, "memory(GiB)": 135.49, "step": 27270, "train_speed(iter/s)": 0.202813 }, { "acc": 0.75314689, "epoch": 0.6364630572041003, "grad_norm": 6.96875, "learning_rate": 7.975029220095016e-06, "loss": 0.89704304, "memory(GiB)": 135.49, "step": 27280, "train_speed(iter/s)": 0.202854 }, { "acc": 0.7581459, "epoch": 0.6366963647763892, "grad_norm": 5.5625, "learning_rate": 7.973510688389417e-06, "loss": 0.86853962, "memory(GiB)": 135.49, "step": 27290, "train_speed(iter/s)": 0.202891 }, { "acc": 0.7669148, "epoch": 0.6369296723486781, "grad_norm": 6.125, "learning_rate": 7.971991732216032e-06, "loss": 0.83967676, "memory(GiB)": 135.49, "step": 27300, "train_speed(iter/s)": 0.202932 }, { "acc": 0.76242847, "epoch": 0.637162979920967, "grad_norm": 5.5, "learning_rate": 7.97047235179169e-06, "loss": 0.84317446, "memory(GiB)": 135.49, "step": 27310, "train_speed(iter/s)": 0.202973 }, { "acc": 0.77377458, "epoch": 0.6373962874932559, "grad_norm": 6.0625, "learning_rate": 7.968952547333281e-06, "loss": 0.80687599, "memory(GiB)": 135.49, "step": 27320, "train_speed(iter/s)": 0.203012 }, { "acc": 0.78364239, "epoch": 0.6376295950655448, "grad_norm": 5.96875, "learning_rate": 7.967432319057762e-06, "loss": 0.76992836, "memory(GiB)": 135.49, "step": 27330, "train_speed(iter/s)": 0.203048 }, { "acc": 0.76945934, "epoch": 0.6378629026378337, "grad_norm": 5.8125, "learning_rate": 7.965911667182138e-06, "loss": 0.82784805, "memory(GiB)": 135.49, "step": 27340, "train_speed(iter/s)": 0.203087 }, { "acc": 0.78532553, "epoch": 0.6380962102101226, "grad_norm": 5.40625, "learning_rate": 7.964390591923487e-06, "loss": 0.77153301, "memory(GiB)": 135.49, "step": 27350, "train_speed(iter/s)": 0.203124 }, { "acc": 0.75179567, "epoch": 0.6383295177824115, "grad_norm": 6.15625, "learning_rate": 7.962869093498939e-06, "loss": 0.91723061, "memory(GiB)": 135.49, "step": 27360, "train_speed(iter/s)": 0.203161 }, { "acc": 0.77853403, "epoch": 0.6385628253547004, "grad_norm": 5.21875, "learning_rate": 7.961347172125689e-06, "loss": 0.80990219, "memory(GiB)": 135.49, "step": 27370, "train_speed(iter/s)": 0.203201 }, { "acc": 0.75443606, "epoch": 0.6387961329269893, "grad_norm": 6.5, "learning_rate": 7.959824828020991e-06, "loss": 0.91891155, "memory(GiB)": 135.49, "step": 27380, "train_speed(iter/s)": 0.203236 }, { "acc": 0.76737123, "epoch": 0.6390294404992782, "grad_norm": 5.96875, "learning_rate": 7.958302061402159e-06, "loss": 0.83611603, "memory(GiB)": 135.49, "step": 27390, "train_speed(iter/s)": 0.203272 }, { "acc": 0.77032247, "epoch": 0.6392627480715671, "grad_norm": 5.46875, "learning_rate": 7.956778872486566e-06, "loss": 0.83638506, "memory(GiB)": 135.49, "step": 27400, "train_speed(iter/s)": 0.203309 }, { "acc": 0.7646183, "epoch": 0.639496055643856, "grad_norm": 5.4375, "learning_rate": 7.955255261491648e-06, "loss": 0.83866177, "memory(GiB)": 135.49, "step": 27410, "train_speed(iter/s)": 0.203349 }, { "acc": 0.78309765, "epoch": 0.6397293632161449, "grad_norm": 5.78125, "learning_rate": 7.9537312286349e-06, "loss": 0.78610535, "memory(GiB)": 135.49, "step": 27420, "train_speed(iter/s)": 0.203389 }, { "acc": 0.76351881, "epoch": 0.6399626707884338, "grad_norm": 5.96875, "learning_rate": 7.952206774133878e-06, "loss": 0.84698591, "memory(GiB)": 135.49, "step": 27430, "train_speed(iter/s)": 0.203427 }, { "acc": 0.78121519, "epoch": 0.6401959783607226, "grad_norm": 7.90625, "learning_rate": 7.950681898206197e-06, "loss": 0.79555488, "memory(GiB)": 135.49, "step": 27440, "train_speed(iter/s)": 0.203463 }, { "acc": 0.75235415, "epoch": 0.6404292859330115, "grad_norm": 9.4375, "learning_rate": 7.949156601069531e-06, "loss": 0.90403576, "memory(GiB)": 135.49, "step": 27450, "train_speed(iter/s)": 0.2035 }, { "acc": 0.78830032, "epoch": 0.6406625935053004, "grad_norm": 5.0, "learning_rate": 7.947630882941617e-06, "loss": 0.74861574, "memory(GiB)": 135.49, "step": 27460, "train_speed(iter/s)": 0.203539 }, { "acc": 0.76749082, "epoch": 0.6408959010775893, "grad_norm": 8.0625, "learning_rate": 7.94610474404025e-06, "loss": 0.85365829, "memory(GiB)": 135.49, "step": 27470, "train_speed(iter/s)": 0.203578 }, { "acc": 0.77542858, "epoch": 0.6411292086498782, "grad_norm": 6.5625, "learning_rate": 7.944578184583289e-06, "loss": 0.80610075, "memory(GiB)": 135.49, "step": 27480, "train_speed(iter/s)": 0.203616 }, { "acc": 0.77051735, "epoch": 0.6413625162221671, "grad_norm": 4.8125, "learning_rate": 7.943051204788646e-06, "loss": 0.8466486, "memory(GiB)": 135.49, "step": 27490, "train_speed(iter/s)": 0.203656 }, { "acc": 0.78912535, "epoch": 0.641595823794456, "grad_norm": 6.3125, "learning_rate": 7.941523804874298e-06, "loss": 0.79532938, "memory(GiB)": 135.49, "step": 27500, "train_speed(iter/s)": 0.203693 }, { "epoch": 0.641595823794456, "eval_acc": 0.7341209555371266, "eval_loss": 0.8380324840545654, "eval_runtime": 1262.2954, "eval_samples_per_second": 28.512, "eval_steps_per_second": 14.257, "step": 27500 }, { "acc": 0.75207825, "epoch": 0.6418291313667449, "grad_norm": 6.25, "learning_rate": 7.939995985058282e-06, "loss": 0.9187892, "memory(GiB)": 135.49, "step": 27510, "train_speed(iter/s)": 0.201804 }, { "acc": 0.76684012, "epoch": 0.6420624389390338, "grad_norm": 5.0625, "learning_rate": 7.938467745558693e-06, "loss": 0.85394001, "memory(GiB)": 135.49, "step": 27520, "train_speed(iter/s)": 0.201841 }, { "acc": 0.77771626, "epoch": 0.6422957465113227, "grad_norm": 8.5, "learning_rate": 7.936939086593688e-06, "loss": 0.80776768, "memory(GiB)": 135.49, "step": 27530, "train_speed(iter/s)": 0.201875 }, { "acc": 0.78615322, "epoch": 0.6425290540836116, "grad_norm": 11.75, "learning_rate": 7.935410008381482e-06, "loss": 0.75615845, "memory(GiB)": 135.49, "step": 27540, "train_speed(iter/s)": 0.201915 }, { "acc": 0.76556025, "epoch": 0.6427623616559005, "grad_norm": 8.0, "learning_rate": 7.933880511140349e-06, "loss": 0.84549389, "memory(GiB)": 135.49, "step": 27550, "train_speed(iter/s)": 0.201955 }, { "acc": 0.77058616, "epoch": 0.6429956692281894, "grad_norm": 8.5625, "learning_rate": 7.932350595088623e-06, "loss": 0.85118876, "memory(GiB)": 135.49, "step": 27560, "train_speed(iter/s)": 0.201992 }, { "acc": 0.76359625, "epoch": 0.6432289768004783, "grad_norm": 6.15625, "learning_rate": 7.930820260444705e-06, "loss": 0.87004967, "memory(GiB)": 135.49, "step": 27570, "train_speed(iter/s)": 0.202031 }, { "acc": 0.75711451, "epoch": 0.6434622843727672, "grad_norm": 5.78125, "learning_rate": 7.929289507427044e-06, "loss": 0.85604267, "memory(GiB)": 135.49, "step": 27580, "train_speed(iter/s)": 0.202071 }, { "acc": 0.77626848, "epoch": 0.6436955919450561, "grad_norm": 5.3125, "learning_rate": 7.927758336254156e-06, "loss": 0.82054424, "memory(GiB)": 135.49, "step": 27590, "train_speed(iter/s)": 0.202109 }, { "acc": 0.76323376, "epoch": 0.643928899517345, "grad_norm": 8.3125, "learning_rate": 7.926226747144618e-06, "loss": 0.85086632, "memory(GiB)": 135.49, "step": 27600, "train_speed(iter/s)": 0.202146 }, { "acc": 0.76386838, "epoch": 0.6441622070896339, "grad_norm": 6.65625, "learning_rate": 7.924694740317063e-06, "loss": 0.85122118, "memory(GiB)": 135.49, "step": 27610, "train_speed(iter/s)": 0.202185 }, { "acc": 0.76744747, "epoch": 0.6443955146619228, "grad_norm": 6.9375, "learning_rate": 7.923162315990181e-06, "loss": 0.82530022, "memory(GiB)": 135.49, "step": 27620, "train_speed(iter/s)": 0.202223 }, { "acc": 0.7603775, "epoch": 0.6446288222342116, "grad_norm": 4.625, "learning_rate": 7.92162947438273e-06, "loss": 0.86902523, "memory(GiB)": 135.49, "step": 27630, "train_speed(iter/s)": 0.202261 }, { "acc": 0.77383785, "epoch": 0.6448621298065005, "grad_norm": 5.4375, "learning_rate": 7.920096215713518e-06, "loss": 0.81223202, "memory(GiB)": 135.49, "step": 27640, "train_speed(iter/s)": 0.202299 }, { "acc": 0.77821484, "epoch": 0.6450954373787894, "grad_norm": 6.8125, "learning_rate": 7.91856254020142e-06, "loss": 0.82064657, "memory(GiB)": 135.49, "step": 27650, "train_speed(iter/s)": 0.202336 }, { "acc": 0.7584342, "epoch": 0.6453287449510783, "grad_norm": 7.125, "learning_rate": 7.917028448065368e-06, "loss": 0.85957165, "memory(GiB)": 135.49, "step": 27660, "train_speed(iter/s)": 0.202372 }, { "acc": 0.7720767, "epoch": 0.6455620525233672, "grad_norm": 4.40625, "learning_rate": 7.915493939524352e-06, "loss": 0.82808685, "memory(GiB)": 135.49, "step": 27670, "train_speed(iter/s)": 0.202407 }, { "acc": 0.78341627, "epoch": 0.6457953600956561, "grad_norm": 6.28125, "learning_rate": 7.913959014797424e-06, "loss": 0.76799898, "memory(GiB)": 135.49, "step": 27680, "train_speed(iter/s)": 0.202445 }, { "acc": 0.75408554, "epoch": 0.646028667667945, "grad_norm": 7.96875, "learning_rate": 7.91242367410369e-06, "loss": 0.91106434, "memory(GiB)": 135.49, "step": 27690, "train_speed(iter/s)": 0.202484 }, { "acc": 0.7685318, "epoch": 0.6462619752402339, "grad_norm": 4.625, "learning_rate": 7.910887917662326e-06, "loss": 0.8257803, "memory(GiB)": 135.49, "step": 27700, "train_speed(iter/s)": 0.202523 }, { "acc": 0.75838747, "epoch": 0.6464952828125228, "grad_norm": 6.09375, "learning_rate": 7.909351745692557e-06, "loss": 0.88021412, "memory(GiB)": 135.49, "step": 27710, "train_speed(iter/s)": 0.202558 }, { "acc": 0.74146585, "epoch": 0.6467285903848117, "grad_norm": 6.46875, "learning_rate": 7.907815158413669e-06, "loss": 0.94747868, "memory(GiB)": 135.49, "step": 27720, "train_speed(iter/s)": 0.202596 }, { "acc": 0.76418734, "epoch": 0.6469618979571006, "grad_norm": 10.625, "learning_rate": 7.906278156045015e-06, "loss": 0.85708141, "memory(GiB)": 135.49, "step": 27730, "train_speed(iter/s)": 0.202634 }, { "acc": 0.76008587, "epoch": 0.6471952055293895, "grad_norm": 6.0, "learning_rate": 7.904740738805996e-06, "loss": 0.87962961, "memory(GiB)": 135.49, "step": 27740, "train_speed(iter/s)": 0.202673 }, { "acc": 0.75891833, "epoch": 0.6474285131016784, "grad_norm": 7.5625, "learning_rate": 7.90320290691608e-06, "loss": 0.87185316, "memory(GiB)": 135.49, "step": 27750, "train_speed(iter/s)": 0.202713 }, { "acc": 0.77476673, "epoch": 0.6476618206739673, "grad_norm": 5.15625, "learning_rate": 7.901664660594794e-06, "loss": 0.80513058, "memory(GiB)": 135.49, "step": 27760, "train_speed(iter/s)": 0.202753 }, { "acc": 0.76863918, "epoch": 0.6478951282462562, "grad_norm": 5.25, "learning_rate": 7.90012600006172e-06, "loss": 0.83673897, "memory(GiB)": 135.49, "step": 27770, "train_speed(iter/s)": 0.202792 }, { "acc": 0.77349844, "epoch": 0.6481284358185451, "grad_norm": 4.9375, "learning_rate": 7.898586925536504e-06, "loss": 0.81495485, "memory(GiB)": 135.49, "step": 27780, "train_speed(iter/s)": 0.20283 }, { "acc": 0.77584057, "epoch": 0.648361743390834, "grad_norm": 5.96875, "learning_rate": 7.897047437238845e-06, "loss": 0.82006063, "memory(GiB)": 135.49, "step": 27790, "train_speed(iter/s)": 0.202866 }, { "acc": 0.77666302, "epoch": 0.6485950509631229, "grad_norm": 5.15625, "learning_rate": 7.895507535388506e-06, "loss": 0.80285645, "memory(GiB)": 135.49, "step": 27800, "train_speed(iter/s)": 0.202906 }, { "acc": 0.77431231, "epoch": 0.6488283585354118, "grad_norm": 5.96875, "learning_rate": 7.893967220205307e-06, "loss": 0.83906975, "memory(GiB)": 135.49, "step": 27810, "train_speed(iter/s)": 0.202943 }, { "acc": 0.76926565, "epoch": 0.6490616661077007, "grad_norm": 6.03125, "learning_rate": 7.89242649190913e-06, "loss": 0.83818226, "memory(GiB)": 135.49, "step": 27820, "train_speed(iter/s)": 0.202981 }, { "acc": 0.77821865, "epoch": 0.6492949736799895, "grad_norm": 7.78125, "learning_rate": 7.890885350719907e-06, "loss": 0.79327741, "memory(GiB)": 135.49, "step": 27830, "train_speed(iter/s)": 0.203018 }, { "acc": 0.76916466, "epoch": 0.6495282812522783, "grad_norm": 5.375, "learning_rate": 7.889343796857645e-06, "loss": 0.83854179, "memory(GiB)": 135.49, "step": 27840, "train_speed(iter/s)": 0.203057 }, { "acc": 0.77355251, "epoch": 0.6497615888245672, "grad_norm": 7.625, "learning_rate": 7.887801830542392e-06, "loss": 0.84079943, "memory(GiB)": 135.49, "step": 27850, "train_speed(iter/s)": 0.203096 }, { "acc": 0.76801319, "epoch": 0.6499948963968561, "grad_norm": 6.46875, "learning_rate": 7.886259451994267e-06, "loss": 0.84022713, "memory(GiB)": 135.49, "step": 27860, "train_speed(iter/s)": 0.203133 }, { "acc": 0.75025473, "epoch": 0.650228203969145, "grad_norm": 9.4375, "learning_rate": 7.884716661433444e-06, "loss": 0.91349478, "memory(GiB)": 135.49, "step": 27870, "train_speed(iter/s)": 0.20317 }, { "acc": 0.76460094, "epoch": 0.650461511541434, "grad_norm": 6.84375, "learning_rate": 7.883173459080159e-06, "loss": 0.83846169, "memory(GiB)": 135.49, "step": 27880, "train_speed(iter/s)": 0.203209 }, { "acc": 0.77396183, "epoch": 0.6506948191137228, "grad_norm": 5.34375, "learning_rate": 7.881629845154696e-06, "loss": 0.8086833, "memory(GiB)": 135.49, "step": 27890, "train_speed(iter/s)": 0.203247 }, { "acc": 0.78768616, "epoch": 0.6509281266860117, "grad_norm": 5.5, "learning_rate": 7.880085819877411e-06, "loss": 0.77090397, "memory(GiB)": 135.49, "step": 27900, "train_speed(iter/s)": 0.203285 }, { "acc": 0.76796641, "epoch": 0.6511614342583006, "grad_norm": 5.46875, "learning_rate": 7.878541383468712e-06, "loss": 0.83645468, "memory(GiB)": 135.49, "step": 27910, "train_speed(iter/s)": 0.203323 }, { "acc": 0.78410215, "epoch": 0.6513947418305895, "grad_norm": 6.375, "learning_rate": 7.876996536149067e-06, "loss": 0.77533784, "memory(GiB)": 135.49, "step": 27920, "train_speed(iter/s)": 0.20336 }, { "acc": 0.79187174, "epoch": 0.6516280494028784, "grad_norm": 6.375, "learning_rate": 7.875451278139001e-06, "loss": 0.74810696, "memory(GiB)": 135.49, "step": 27930, "train_speed(iter/s)": 0.203398 }, { "acc": 0.75581779, "epoch": 0.6518613569751673, "grad_norm": 6.46875, "learning_rate": 7.873905609659102e-06, "loss": 0.93566113, "memory(GiB)": 135.49, "step": 27940, "train_speed(iter/s)": 0.203436 }, { "acc": 0.75583773, "epoch": 0.6520946645474562, "grad_norm": 5.75, "learning_rate": 7.872359530930011e-06, "loss": 0.87403564, "memory(GiB)": 135.49, "step": 27950, "train_speed(iter/s)": 0.203473 }, { "acc": 0.76752958, "epoch": 0.6523279721197451, "grad_norm": 6.46875, "learning_rate": 7.870813042172432e-06, "loss": 0.84181881, "memory(GiB)": 135.49, "step": 27960, "train_speed(iter/s)": 0.20351 }, { "acc": 0.77773209, "epoch": 0.652561279692034, "grad_norm": 5.5625, "learning_rate": 7.869266143607124e-06, "loss": 0.78560009, "memory(GiB)": 135.49, "step": 27970, "train_speed(iter/s)": 0.203548 }, { "acc": 0.76012878, "epoch": 0.6527945872643229, "grad_norm": 8.1875, "learning_rate": 7.86771883545491e-06, "loss": 0.89276485, "memory(GiB)": 135.49, "step": 27980, "train_speed(iter/s)": 0.203588 }, { "acc": 0.75869727, "epoch": 0.6530278948366118, "grad_norm": 5.84375, "learning_rate": 7.866171117936663e-06, "loss": 0.86208553, "memory(GiB)": 135.49, "step": 27990, "train_speed(iter/s)": 0.203627 }, { "acc": 0.7526351, "epoch": 0.6532612024089007, "grad_norm": 7.28125, "learning_rate": 7.864622991273322e-06, "loss": 0.87319565, "memory(GiB)": 135.49, "step": 28000, "train_speed(iter/s)": 0.203665 }, { "epoch": 0.6532612024089007, "eval_acc": 0.7342134050126009, "eval_loss": 0.8376157283782959, "eval_runtime": 1262.964, "eval_samples_per_second": 28.497, "eval_steps_per_second": 14.249, "step": 28000 }, { "acc": 0.75516691, "epoch": 0.6534945099811896, "grad_norm": 9.375, "learning_rate": 7.863074455685882e-06, "loss": 0.90308914, "memory(GiB)": 135.49, "step": 28010, "train_speed(iter/s)": 0.201814 }, { "acc": 0.76953316, "epoch": 0.6537278175534784, "grad_norm": 7.8125, "learning_rate": 7.861525511395394e-06, "loss": 0.83446846, "memory(GiB)": 135.49, "step": 28020, "train_speed(iter/s)": 0.201852 }, { "acc": 0.76302385, "epoch": 0.6539611251257673, "grad_norm": 5.4375, "learning_rate": 7.859976158622971e-06, "loss": 0.84548817, "memory(GiB)": 135.49, "step": 28030, "train_speed(iter/s)": 0.201887 }, { "acc": 0.75310764, "epoch": 0.6541944326980562, "grad_norm": 7.0625, "learning_rate": 7.858426397589783e-06, "loss": 0.91987467, "memory(GiB)": 135.49, "step": 28040, "train_speed(iter/s)": 0.201926 }, { "acc": 0.76601486, "epoch": 0.6544277402703451, "grad_norm": 8.0625, "learning_rate": 7.856876228517057e-06, "loss": 0.82559624, "memory(GiB)": 135.49, "step": 28050, "train_speed(iter/s)": 0.201966 }, { "acc": 0.78004975, "epoch": 0.654661047842634, "grad_norm": 5.34375, "learning_rate": 7.85532565162608e-06, "loss": 0.80361357, "memory(GiB)": 135.49, "step": 28060, "train_speed(iter/s)": 0.202005 }, { "acc": 0.77734928, "epoch": 0.6548943554149229, "grad_norm": 4.9375, "learning_rate": 7.853774667138192e-06, "loss": 0.78006277, "memory(GiB)": 135.49, "step": 28070, "train_speed(iter/s)": 0.202046 }, { "acc": 0.7742959, "epoch": 0.6551276629872118, "grad_norm": 5.96875, "learning_rate": 7.852223275274804e-06, "loss": 0.82164822, "memory(GiB)": 135.49, "step": 28080, "train_speed(iter/s)": 0.202083 }, { "acc": 0.78195372, "epoch": 0.6553609705595007, "grad_norm": 6.96875, "learning_rate": 7.85067147625737e-06, "loss": 0.78586302, "memory(GiB)": 135.49, "step": 28090, "train_speed(iter/s)": 0.202121 }, { "acc": 0.76245651, "epoch": 0.6555942781317896, "grad_norm": 9.0625, "learning_rate": 7.84911927030741e-06, "loss": 0.86702566, "memory(GiB)": 135.49, "step": 28100, "train_speed(iter/s)": 0.20216 }, { "acc": 0.76732879, "epoch": 0.6558275857040785, "grad_norm": 4.75, "learning_rate": 7.847566657646502e-06, "loss": 0.83475609, "memory(GiB)": 135.49, "step": 28110, "train_speed(iter/s)": 0.202199 }, { "acc": 0.77329254, "epoch": 0.6560608932763674, "grad_norm": 6.15625, "learning_rate": 7.846013638496281e-06, "loss": 0.83228092, "memory(GiB)": 135.49, "step": 28120, "train_speed(iter/s)": 0.202234 }, { "acc": 0.78733501, "epoch": 0.6562942008486563, "grad_norm": 5.96875, "learning_rate": 7.84446021307844e-06, "loss": 0.76141357, "memory(GiB)": 135.49, "step": 28130, "train_speed(iter/s)": 0.202273 }, { "acc": 0.75742302, "epoch": 0.6565275084209452, "grad_norm": 9.0, "learning_rate": 7.842906381614732e-06, "loss": 0.85699387, "memory(GiB)": 135.49, "step": 28140, "train_speed(iter/s)": 0.202309 }, { "acc": 0.77642112, "epoch": 0.6567608159932341, "grad_norm": 4.9375, "learning_rate": 7.841352144326962e-06, "loss": 0.80341482, "memory(GiB)": 135.49, "step": 28150, "train_speed(iter/s)": 0.202343 }, { "acc": 0.77638535, "epoch": 0.656994123565523, "grad_norm": 5.4375, "learning_rate": 7.839797501436999e-06, "loss": 0.81114941, "memory(GiB)": 135.49, "step": 28160, "train_speed(iter/s)": 0.202381 }, { "acc": 0.75518484, "epoch": 0.6572274311378119, "grad_norm": 8.0625, "learning_rate": 7.838242453166766e-06, "loss": 0.87745237, "memory(GiB)": 135.49, "step": 28170, "train_speed(iter/s)": 0.202418 }, { "acc": 0.75040169, "epoch": 0.6574607387101008, "grad_norm": 6.40625, "learning_rate": 7.83668699973825e-06, "loss": 0.91995087, "memory(GiB)": 135.49, "step": 28180, "train_speed(iter/s)": 0.202457 }, { "acc": 0.77502036, "epoch": 0.6576940462823897, "grad_norm": 9.375, "learning_rate": 7.835131141373487e-06, "loss": 0.81505318, "memory(GiB)": 135.49, "step": 28190, "train_speed(iter/s)": 0.202495 }, { "acc": 0.79086123, "epoch": 0.6579273538546786, "grad_norm": 6.6875, "learning_rate": 7.833574878294578e-06, "loss": 0.76756587, "memory(GiB)": 135.49, "step": 28200, "train_speed(iter/s)": 0.202528 }, { "acc": 0.75423155, "epoch": 0.6581606614269674, "grad_norm": 4.53125, "learning_rate": 7.832018210723679e-06, "loss": 0.88332577, "memory(GiB)": 135.49, "step": 28210, "train_speed(iter/s)": 0.202565 }, { "acc": 0.75812111, "epoch": 0.6583939689992563, "grad_norm": 5.8125, "learning_rate": 7.830461138883e-06, "loss": 0.88021154, "memory(GiB)": 135.49, "step": 28220, "train_speed(iter/s)": 0.202604 }, { "acc": 0.76058731, "epoch": 0.6586272765715452, "grad_norm": 8.75, "learning_rate": 7.82890366299482e-06, "loss": 0.86981297, "memory(GiB)": 135.49, "step": 28230, "train_speed(iter/s)": 0.202644 }, { "acc": 0.77457695, "epoch": 0.6588605841438341, "grad_norm": 5.9375, "learning_rate": 7.827345783281462e-06, "loss": 0.82959442, "memory(GiB)": 135.49, "step": 28240, "train_speed(iter/s)": 0.202679 }, { "acc": 0.76736584, "epoch": 0.659093891716123, "grad_norm": 8.0, "learning_rate": 7.825787499965315e-06, "loss": 0.85541477, "memory(GiB)": 135.49, "step": 28250, "train_speed(iter/s)": 0.202716 }, { "acc": 0.74858651, "epoch": 0.6593271992884119, "grad_norm": 7.0625, "learning_rate": 7.824228813268823e-06, "loss": 0.92622013, "memory(GiB)": 135.49, "step": 28260, "train_speed(iter/s)": 0.202751 }, { "acc": 0.77106609, "epoch": 0.6595605068607008, "grad_norm": 10.9375, "learning_rate": 7.822669723414488e-06, "loss": 0.82412109, "memory(GiB)": 135.49, "step": 28270, "train_speed(iter/s)": 0.202788 }, { "acc": 0.76748371, "epoch": 0.6597938144329897, "grad_norm": 4.65625, "learning_rate": 7.82111023062487e-06, "loss": 0.85477009, "memory(GiB)": 135.49, "step": 28280, "train_speed(iter/s)": 0.202822 }, { "acc": 0.76726427, "epoch": 0.6600271220052786, "grad_norm": 6.09375, "learning_rate": 7.819550335122587e-06, "loss": 0.85450478, "memory(GiB)": 135.49, "step": 28290, "train_speed(iter/s)": 0.202859 }, { "acc": 0.7820507, "epoch": 0.6602604295775675, "grad_norm": 6.21875, "learning_rate": 7.817990037130312e-06, "loss": 0.77062593, "memory(GiB)": 135.49, "step": 28300, "train_speed(iter/s)": 0.202896 }, { "acc": 0.76910772, "epoch": 0.6604937371498564, "grad_norm": 8.25, "learning_rate": 7.816429336870778e-06, "loss": 0.81802711, "memory(GiB)": 135.49, "step": 28310, "train_speed(iter/s)": 0.202932 }, { "acc": 0.79276152, "epoch": 0.6607270447221453, "grad_norm": 7.40625, "learning_rate": 7.814868234566775e-06, "loss": 0.73200502, "memory(GiB)": 135.49, "step": 28320, "train_speed(iter/s)": 0.202968 }, { "acc": 0.75662432, "epoch": 0.6609603522944342, "grad_norm": 6.125, "learning_rate": 7.813306730441147e-06, "loss": 0.89180403, "memory(GiB)": 135.49, "step": 28330, "train_speed(iter/s)": 0.203004 }, { "acc": 0.77322044, "epoch": 0.6611936598667231, "grad_norm": 4.71875, "learning_rate": 7.811744824716803e-06, "loss": 0.84110184, "memory(GiB)": 135.49, "step": 28340, "train_speed(iter/s)": 0.203041 }, { "acc": 0.7781713, "epoch": 0.661426967439012, "grad_norm": 5.1875, "learning_rate": 7.810182517616702e-06, "loss": 0.80061522, "memory(GiB)": 135.49, "step": 28350, "train_speed(iter/s)": 0.203079 }, { "acc": 0.77150173, "epoch": 0.6616602750113009, "grad_norm": 6.9375, "learning_rate": 7.808619809363863e-06, "loss": 0.81741838, "memory(GiB)": 135.49, "step": 28360, "train_speed(iter/s)": 0.203114 }, { "acc": 0.77438717, "epoch": 0.6618935825835898, "grad_norm": 6.625, "learning_rate": 7.80705670018136e-06, "loss": 0.81885586, "memory(GiB)": 135.49, "step": 28370, "train_speed(iter/s)": 0.203151 }, { "acc": 0.76907768, "epoch": 0.6621268901558787, "grad_norm": 7.125, "learning_rate": 7.805493190292327e-06, "loss": 0.83498898, "memory(GiB)": 135.49, "step": 28380, "train_speed(iter/s)": 0.203185 }, { "acc": 0.76114807, "epoch": 0.6623601977281676, "grad_norm": 6.5, "learning_rate": 7.80392927991996e-06, "loss": 0.85645504, "memory(GiB)": 135.49, "step": 28390, "train_speed(iter/s)": 0.203221 }, { "acc": 0.7884244, "epoch": 0.6625935053004564, "grad_norm": 7.15625, "learning_rate": 7.802364969287501e-06, "loss": 0.76464214, "memory(GiB)": 135.49, "step": 28400, "train_speed(iter/s)": 0.203259 }, { "acc": 0.78246341, "epoch": 0.6628268128727453, "grad_norm": 5.21875, "learning_rate": 7.80080025861826e-06, "loss": 0.77127781, "memory(GiB)": 135.49, "step": 28410, "train_speed(iter/s)": 0.203297 }, { "acc": 0.78445177, "epoch": 0.6630601204450342, "grad_norm": 6.28125, "learning_rate": 7.799235148135592e-06, "loss": 0.75719695, "memory(GiB)": 135.49, "step": 28420, "train_speed(iter/s)": 0.203334 }, { "acc": 0.77215543, "epoch": 0.663293428017323, "grad_norm": 6.09375, "learning_rate": 7.797669638062921e-06, "loss": 0.81624231, "memory(GiB)": 135.49, "step": 28430, "train_speed(iter/s)": 0.203371 }, { "acc": 0.76690803, "epoch": 0.663526735589612, "grad_norm": 4.9375, "learning_rate": 7.796103728623723e-06, "loss": 0.83521976, "memory(GiB)": 135.49, "step": 28440, "train_speed(iter/s)": 0.203405 }, { "acc": 0.77364559, "epoch": 0.6637600431619008, "grad_norm": 5.0, "learning_rate": 7.794537420041527e-06, "loss": 0.81636162, "memory(GiB)": 135.49, "step": 28450, "train_speed(iter/s)": 0.20344 }, { "acc": 0.77083006, "epoch": 0.6639933507341897, "grad_norm": 6.4375, "learning_rate": 7.792970712539929e-06, "loss": 0.83847628, "memory(GiB)": 135.49, "step": 28460, "train_speed(iter/s)": 0.203477 }, { "acc": 0.75938921, "epoch": 0.6642266583064786, "grad_norm": 5.65625, "learning_rate": 7.791403606342572e-06, "loss": 0.90194645, "memory(GiB)": 135.49, "step": 28470, "train_speed(iter/s)": 0.203515 }, { "acc": 0.76458063, "epoch": 0.6644599658787675, "grad_norm": 10.125, "learning_rate": 7.78983610167316e-06, "loss": 0.85806103, "memory(GiB)": 135.49, "step": 28480, "train_speed(iter/s)": 0.203553 }, { "acc": 0.76429634, "epoch": 0.6646932734510564, "grad_norm": 8.125, "learning_rate": 7.788268198755456e-06, "loss": 0.85033817, "memory(GiB)": 135.49, "step": 28490, "train_speed(iter/s)": 0.20359 }, { "acc": 0.75650692, "epoch": 0.6649265810233453, "grad_norm": 5.9375, "learning_rate": 7.786699897813277e-06, "loss": 0.86377296, "memory(GiB)": 135.49, "step": 28500, "train_speed(iter/s)": 0.203625 }, { "epoch": 0.6649265810233453, "eval_acc": 0.7342526113346607, "eval_loss": 0.8375863432884216, "eval_runtime": 1262.516, "eval_samples_per_second": 28.507, "eval_steps_per_second": 14.254, "step": 28500 }, { "acc": 0.76109433, "epoch": 0.6651598885956342, "grad_norm": 6.84375, "learning_rate": 7.785131199070497e-06, "loss": 0.87187109, "memory(GiB)": 135.49, "step": 28510, "train_speed(iter/s)": 0.201805 }, { "acc": 0.78141322, "epoch": 0.6653931961679231, "grad_norm": 5.15625, "learning_rate": 7.783562102751048e-06, "loss": 0.80011148, "memory(GiB)": 135.49, "step": 28520, "train_speed(iter/s)": 0.201838 }, { "acc": 0.76257648, "epoch": 0.665626503740212, "grad_norm": 6.1875, "learning_rate": 7.781992609078916e-06, "loss": 0.8450964, "memory(GiB)": 135.49, "step": 28530, "train_speed(iter/s)": 0.201874 }, { "acc": 0.75891452, "epoch": 0.6658598113125009, "grad_norm": 6.53125, "learning_rate": 7.780422718278148e-06, "loss": 0.8637846, "memory(GiB)": 135.49, "step": 28540, "train_speed(iter/s)": 0.201909 }, { "acc": 0.75063796, "epoch": 0.6660931188847898, "grad_norm": 6.0625, "learning_rate": 7.778852430572846e-06, "loss": 0.92087536, "memory(GiB)": 135.49, "step": 28550, "train_speed(iter/s)": 0.201946 }, { "acc": 0.76128788, "epoch": 0.6663264264570787, "grad_norm": 7.46875, "learning_rate": 7.777281746187163e-06, "loss": 0.84818668, "memory(GiB)": 135.49, "step": 28560, "train_speed(iter/s)": 0.201984 }, { "acc": 0.77570424, "epoch": 0.6665597340293676, "grad_norm": 6.21875, "learning_rate": 7.775710665345322e-06, "loss": 0.82224083, "memory(GiB)": 135.49, "step": 28570, "train_speed(iter/s)": 0.20202 }, { "acc": 0.74976921, "epoch": 0.6667930416016565, "grad_norm": 4.46875, "learning_rate": 7.774139188271588e-06, "loss": 0.93118334, "memory(GiB)": 135.49, "step": 28580, "train_speed(iter/s)": 0.202058 }, { "acc": 0.76208239, "epoch": 0.6670263491739454, "grad_norm": 7.625, "learning_rate": 7.772567315190291e-06, "loss": 0.86769552, "memory(GiB)": 135.49, "step": 28590, "train_speed(iter/s)": 0.202094 }, { "acc": 0.77244534, "epoch": 0.6672596567462342, "grad_norm": 5.40625, "learning_rate": 7.770995046325813e-06, "loss": 0.81451397, "memory(GiB)": 135.49, "step": 28600, "train_speed(iter/s)": 0.202131 }, { "acc": 0.75187464, "epoch": 0.6674929643185231, "grad_norm": 5.75, "learning_rate": 7.769422381902601e-06, "loss": 0.88971424, "memory(GiB)": 135.49, "step": 28610, "train_speed(iter/s)": 0.202171 }, { "acc": 0.76840744, "epoch": 0.667726271890812, "grad_norm": 7.375, "learning_rate": 7.767849322145144e-06, "loss": 0.85218048, "memory(GiB)": 135.49, "step": 28620, "train_speed(iter/s)": 0.202209 }, { "acc": 0.75663261, "epoch": 0.6679595794631009, "grad_norm": 5.6875, "learning_rate": 7.766275867278004e-06, "loss": 0.90435848, "memory(GiB)": 135.49, "step": 28630, "train_speed(iter/s)": 0.202245 }, { "acc": 0.77674093, "epoch": 0.6681928870353898, "grad_norm": 4.96875, "learning_rate": 7.764702017525787e-06, "loss": 0.79779577, "memory(GiB)": 135.49, "step": 28640, "train_speed(iter/s)": 0.202277 }, { "acc": 0.77028193, "epoch": 0.6684261946076787, "grad_norm": 5.59375, "learning_rate": 7.763127773113159e-06, "loss": 0.82342319, "memory(GiB)": 135.49, "step": 28650, "train_speed(iter/s)": 0.202315 }, { "acc": 0.78031445, "epoch": 0.6686595021799676, "grad_norm": 6.1875, "learning_rate": 7.761553134264844e-06, "loss": 0.80032358, "memory(GiB)": 135.49, "step": 28660, "train_speed(iter/s)": 0.202351 }, { "acc": 0.78026819, "epoch": 0.6688928097522565, "grad_norm": 5.5625, "learning_rate": 7.759978101205623e-06, "loss": 0.78050585, "memory(GiB)": 135.49, "step": 28670, "train_speed(iter/s)": 0.202387 }, { "acc": 0.77110233, "epoch": 0.6691261173245454, "grad_norm": 6.09375, "learning_rate": 7.758402674160328e-06, "loss": 0.81672707, "memory(GiB)": 135.49, "step": 28680, "train_speed(iter/s)": 0.202424 }, { "acc": 0.75303931, "epoch": 0.6693594248968343, "grad_norm": 7.84375, "learning_rate": 7.756826853353854e-06, "loss": 0.90334129, "memory(GiB)": 135.49, "step": 28690, "train_speed(iter/s)": 0.202459 }, { "acc": 0.75802727, "epoch": 0.6695927324691232, "grad_norm": 5.96875, "learning_rate": 7.755250639011147e-06, "loss": 0.87267551, "memory(GiB)": 135.49, "step": 28700, "train_speed(iter/s)": 0.202498 }, { "acc": 0.763801, "epoch": 0.6698260400414121, "grad_norm": 7.625, "learning_rate": 7.75367403135721e-06, "loss": 0.84104023, "memory(GiB)": 135.49, "step": 28710, "train_speed(iter/s)": 0.202538 }, { "acc": 0.76521683, "epoch": 0.670059347613701, "grad_norm": 7.375, "learning_rate": 7.752097030617107e-06, "loss": 0.8553648, "memory(GiB)": 135.49, "step": 28720, "train_speed(iter/s)": 0.202575 }, { "acc": 0.76649408, "epoch": 0.6702926551859899, "grad_norm": 6.90625, "learning_rate": 7.750519637015953e-06, "loss": 0.8286849, "memory(GiB)": 135.49, "step": 28730, "train_speed(iter/s)": 0.202611 }, { "acc": 0.77714939, "epoch": 0.6705259627582788, "grad_norm": 5.78125, "learning_rate": 7.748941850778917e-06, "loss": 0.77824764, "memory(GiB)": 135.49, "step": 28740, "train_speed(iter/s)": 0.202647 }, { "acc": 0.7567512, "epoch": 0.6707592703305677, "grad_norm": 9.0625, "learning_rate": 7.747363672131233e-06, "loss": 0.88108101, "memory(GiB)": 135.49, "step": 28750, "train_speed(iter/s)": 0.202684 }, { "acc": 0.78877945, "epoch": 0.6709925779028566, "grad_norm": 8.6875, "learning_rate": 7.745785101298182e-06, "loss": 0.74064379, "memory(GiB)": 135.49, "step": 28760, "train_speed(iter/s)": 0.202721 }, { "acc": 0.7598752, "epoch": 0.6712258854751455, "grad_norm": 8.9375, "learning_rate": 7.744206138505106e-06, "loss": 0.86930542, "memory(GiB)": 135.49, "step": 28770, "train_speed(iter/s)": 0.202759 }, { "acc": 0.7856174, "epoch": 0.6714591930474344, "grad_norm": 5.28125, "learning_rate": 7.7426267839774e-06, "loss": 0.7754293, "memory(GiB)": 135.49, "step": 28780, "train_speed(iter/s)": 0.202796 }, { "acc": 0.78163309, "epoch": 0.6716925006197232, "grad_norm": 7.28125, "learning_rate": 7.741047037940516e-06, "loss": 0.76844721, "memory(GiB)": 135.49, "step": 28790, "train_speed(iter/s)": 0.202833 }, { "acc": 0.75333691, "epoch": 0.6719258081920121, "grad_norm": 5.75, "learning_rate": 7.739466900619966e-06, "loss": 0.89740486, "memory(GiB)": 135.49, "step": 28800, "train_speed(iter/s)": 0.202871 }, { "acc": 0.76331949, "epoch": 0.672159115764301, "grad_norm": 8.9375, "learning_rate": 7.737886372241311e-06, "loss": 0.86295471, "memory(GiB)": 135.49, "step": 28810, "train_speed(iter/s)": 0.202905 }, { "acc": 0.76187544, "epoch": 0.6723924233365899, "grad_norm": 8.125, "learning_rate": 7.736305453030172e-06, "loss": 0.8616106, "memory(GiB)": 135.49, "step": 28820, "train_speed(iter/s)": 0.202941 }, { "acc": 0.78171129, "epoch": 0.6726257309088788, "grad_norm": 6.46875, "learning_rate": 7.734724143212224e-06, "loss": 0.79563236, "memory(GiB)": 135.49, "step": 28830, "train_speed(iter/s)": 0.202977 }, { "acc": 0.77842312, "epoch": 0.6728590384811677, "grad_norm": 6.96875, "learning_rate": 7.733142443013199e-06, "loss": 0.7888751, "memory(GiB)": 135.49, "step": 28840, "train_speed(iter/s)": 0.203011 }, { "acc": 0.77859621, "epoch": 0.6730923460534566, "grad_norm": 5.78125, "learning_rate": 7.731560352658886e-06, "loss": 0.8054018, "memory(GiB)": 135.49, "step": 28850, "train_speed(iter/s)": 0.203048 }, { "acc": 0.74896798, "epoch": 0.6733256536257455, "grad_norm": 8.1875, "learning_rate": 7.729977872375125e-06, "loss": 0.9214407, "memory(GiB)": 135.49, "step": 28860, "train_speed(iter/s)": 0.203087 }, { "acc": 0.76319513, "epoch": 0.6735589611980344, "grad_norm": 5.28125, "learning_rate": 7.728395002387815e-06, "loss": 0.86480675, "memory(GiB)": 135.49, "step": 28870, "train_speed(iter/s)": 0.203122 }, { "acc": 0.76196332, "epoch": 0.6737922687703233, "grad_norm": 7.8125, "learning_rate": 7.726811742922912e-06, "loss": 0.8620656, "memory(GiB)": 135.49, "step": 28880, "train_speed(iter/s)": 0.20316 }, { "acc": 0.77168221, "epoch": 0.6740255763426122, "grad_norm": 5.78125, "learning_rate": 7.725228094206423e-06, "loss": 0.81978989, "memory(GiB)": 135.49, "step": 28890, "train_speed(iter/s)": 0.203198 }, { "acc": 0.77251558, "epoch": 0.6742588839149011, "grad_norm": 9.3125, "learning_rate": 7.723644056464416e-06, "loss": 0.81578951, "memory(GiB)": 135.49, "step": 28900, "train_speed(iter/s)": 0.203237 }, { "acc": 0.75477643, "epoch": 0.67449219148719, "grad_norm": 6.75, "learning_rate": 7.722059629923014e-06, "loss": 0.88286076, "memory(GiB)": 135.49, "step": 28910, "train_speed(iter/s)": 0.20327 }, { "acc": 0.76766858, "epoch": 0.6747254990594789, "grad_norm": 4.90625, "learning_rate": 7.720474814808387e-06, "loss": 0.82360716, "memory(GiB)": 135.49, "step": 28920, "train_speed(iter/s)": 0.203306 }, { "acc": 0.74861059, "epoch": 0.6749588066317678, "grad_norm": 5.875, "learning_rate": 7.718889611346771e-06, "loss": 0.91390362, "memory(GiB)": 135.49, "step": 28930, "train_speed(iter/s)": 0.203342 }, { "acc": 0.74918098, "epoch": 0.6751921142040567, "grad_norm": 5.6875, "learning_rate": 7.717304019764456e-06, "loss": 0.91532822, "memory(GiB)": 135.49, "step": 28940, "train_speed(iter/s)": 0.203379 }, { "acc": 0.75819092, "epoch": 0.6754254217763456, "grad_norm": 6.3125, "learning_rate": 7.71571804028778e-06, "loss": 0.85656643, "memory(GiB)": 135.49, "step": 28950, "train_speed(iter/s)": 0.203414 }, { "acc": 0.76632853, "epoch": 0.6756587293486345, "grad_norm": 5.75, "learning_rate": 7.714131673143139e-06, "loss": 0.83468342, "memory(GiB)": 135.49, "step": 28960, "train_speed(iter/s)": 0.203452 }, { "acc": 0.77796788, "epoch": 0.6758920369209234, "grad_norm": 7.28125, "learning_rate": 7.712544918556994e-06, "loss": 0.76835852, "memory(GiB)": 135.49, "step": 28970, "train_speed(iter/s)": 0.203488 }, { "acc": 0.75563149, "epoch": 0.6761253444932122, "grad_norm": 7.03125, "learning_rate": 7.71095777675585e-06, "loss": 0.89352531, "memory(GiB)": 135.49, "step": 28980, "train_speed(iter/s)": 0.203525 }, { "acc": 0.76286697, "epoch": 0.676358652065501, "grad_norm": 6.28125, "learning_rate": 7.709370247966269e-06, "loss": 0.85611649, "memory(GiB)": 135.49, "step": 28990, "train_speed(iter/s)": 0.203562 }, { "acc": 0.75173554, "epoch": 0.67659195963779, "grad_norm": 6.65625, "learning_rate": 7.707782332414873e-06, "loss": 0.9111145, "memory(GiB)": 135.49, "step": 29000, "train_speed(iter/s)": 0.203598 }, { "epoch": 0.67659195963779, "eval_acc": 0.7343682941861703, "eval_loss": 0.8374514579772949, "eval_runtime": 1261.545, "eval_samples_per_second": 28.529, "eval_steps_per_second": 14.265, "step": 29000 }, { "acc": 0.74383545, "epoch": 0.6768252672100789, "grad_norm": 7.46875, "learning_rate": 7.706194030328336e-06, "loss": 0.92722845, "memory(GiB)": 135.49, "step": 29010, "train_speed(iter/s)": 0.201813 }, { "acc": 0.77386131, "epoch": 0.6770585747823678, "grad_norm": 6.28125, "learning_rate": 7.704605341933385e-06, "loss": 0.81485901, "memory(GiB)": 135.49, "step": 29020, "train_speed(iter/s)": 0.201849 }, { "acc": 0.76572227, "epoch": 0.6772918823546566, "grad_norm": 6.125, "learning_rate": 7.70301626745681e-06, "loss": 0.85542765, "memory(GiB)": 135.49, "step": 29030, "train_speed(iter/s)": 0.201884 }, { "acc": 0.74995356, "epoch": 0.6775251899269455, "grad_norm": 9.9375, "learning_rate": 7.701426807125447e-06, "loss": 0.89386511, "memory(GiB)": 135.49, "step": 29040, "train_speed(iter/s)": 0.201921 }, { "acc": 0.75983763, "epoch": 0.6777584974992344, "grad_norm": 6.3125, "learning_rate": 7.699836961166192e-06, "loss": 0.88416843, "memory(GiB)": 135.49, "step": 29050, "train_speed(iter/s)": 0.201956 }, { "acc": 0.76318026, "epoch": 0.6779918050715233, "grad_norm": 6.25, "learning_rate": 7.698246729805996e-06, "loss": 0.86252737, "memory(GiB)": 135.49, "step": 29060, "train_speed(iter/s)": 0.201994 }, { "acc": 0.7742053, "epoch": 0.6782251126438122, "grad_norm": 6.71875, "learning_rate": 7.696656113271863e-06, "loss": 0.80427151, "memory(GiB)": 135.49, "step": 29070, "train_speed(iter/s)": 0.202029 }, { "acc": 0.77524776, "epoch": 0.6784584202161011, "grad_norm": 6.625, "learning_rate": 7.695065111790852e-06, "loss": 0.8069046, "memory(GiB)": 135.49, "step": 29080, "train_speed(iter/s)": 0.202067 }, { "acc": 0.77106915, "epoch": 0.67869172778839, "grad_norm": 5.0625, "learning_rate": 7.693473725590079e-06, "loss": 0.84126062, "memory(GiB)": 135.49, "step": 29090, "train_speed(iter/s)": 0.202102 }, { "acc": 0.77355213, "epoch": 0.6789250353606789, "grad_norm": 5.78125, "learning_rate": 7.691881954896716e-06, "loss": 0.8134306, "memory(GiB)": 135.49, "step": 29100, "train_speed(iter/s)": 0.202137 }, { "acc": 0.79410286, "epoch": 0.6791583429329678, "grad_norm": 11.5625, "learning_rate": 7.690289799937985e-06, "loss": 0.73981142, "memory(GiB)": 135.49, "step": 29110, "train_speed(iter/s)": 0.202175 }, { "acc": 0.76534491, "epoch": 0.6793916505052567, "grad_norm": 4.8125, "learning_rate": 7.688697260941164e-06, "loss": 0.84023952, "memory(GiB)": 135.49, "step": 29120, "train_speed(iter/s)": 0.202213 }, { "acc": 0.78333211, "epoch": 0.6796249580775456, "grad_norm": 4.75, "learning_rate": 7.687104338133595e-06, "loss": 0.79640245, "memory(GiB)": 135.49, "step": 29130, "train_speed(iter/s)": 0.202248 }, { "acc": 0.76339774, "epoch": 0.6798582656498345, "grad_norm": 4.6875, "learning_rate": 7.68551103174266e-06, "loss": 0.86120672, "memory(GiB)": 135.49, "step": 29140, "train_speed(iter/s)": 0.202286 }, { "acc": 0.78185148, "epoch": 0.6800915732221234, "grad_norm": 5.3125, "learning_rate": 7.683917341995806e-06, "loss": 0.79188509, "memory(GiB)": 135.49, "step": 29150, "train_speed(iter/s)": 0.202324 }, { "acc": 0.7575397, "epoch": 0.6803248807944123, "grad_norm": 5.6875, "learning_rate": 7.68232326912053e-06, "loss": 0.87140503, "memory(GiB)": 135.49, "step": 29160, "train_speed(iter/s)": 0.202364 }, { "acc": 0.76979284, "epoch": 0.6805581883667011, "grad_norm": 6.09375, "learning_rate": 7.680728813344388e-06, "loss": 0.82970543, "memory(GiB)": 135.49, "step": 29170, "train_speed(iter/s)": 0.202401 }, { "acc": 0.75912275, "epoch": 0.68079149593899, "grad_norm": 8.875, "learning_rate": 7.679133974894984e-06, "loss": 0.88762703, "memory(GiB)": 135.49, "step": 29180, "train_speed(iter/s)": 0.202437 }, { "acc": 0.77694855, "epoch": 0.6810248035112789, "grad_norm": 5.0625, "learning_rate": 7.677538753999984e-06, "loss": 0.79625664, "memory(GiB)": 135.49, "step": 29190, "train_speed(iter/s)": 0.202471 }, { "acc": 0.77004776, "epoch": 0.6812581110835678, "grad_norm": 6.9375, "learning_rate": 7.675943150887107e-06, "loss": 0.81552019, "memory(GiB)": 135.49, "step": 29200, "train_speed(iter/s)": 0.20251 }, { "acc": 0.77045546, "epoch": 0.6814914186558567, "grad_norm": 9.875, "learning_rate": 7.674347165784122e-06, "loss": 0.82159977, "memory(GiB)": 135.49, "step": 29210, "train_speed(iter/s)": 0.202546 }, { "acc": 0.75648999, "epoch": 0.6817247262281456, "grad_norm": 4.8125, "learning_rate": 7.672750798918854e-06, "loss": 0.85807323, "memory(GiB)": 135.49, "step": 29220, "train_speed(iter/s)": 0.202581 }, { "acc": 0.75390387, "epoch": 0.6819580338004345, "grad_norm": 7.78125, "learning_rate": 7.671154050519187e-06, "loss": 0.91498032, "memory(GiB)": 135.49, "step": 29230, "train_speed(iter/s)": 0.202616 }, { "acc": 0.76486883, "epoch": 0.6821913413727234, "grad_norm": 5.125, "learning_rate": 7.669556920813056e-06, "loss": 0.85592499, "memory(GiB)": 135.49, "step": 29240, "train_speed(iter/s)": 0.202651 }, { "acc": 0.79351215, "epoch": 0.6824246489450123, "grad_norm": 6.71875, "learning_rate": 7.66795941002845e-06, "loss": 0.73773308, "memory(GiB)": 135.49, "step": 29250, "train_speed(iter/s)": 0.202685 }, { "acc": 0.79087706, "epoch": 0.6826579565173012, "grad_norm": 5.34375, "learning_rate": 7.666361518393413e-06, "loss": 0.74379201, "memory(GiB)": 135.49, "step": 29260, "train_speed(iter/s)": 0.20272 }, { "acc": 0.77690592, "epoch": 0.6828912640895901, "grad_norm": 5.15625, "learning_rate": 7.664763246136042e-06, "loss": 0.80756559, "memory(GiB)": 135.49, "step": 29270, "train_speed(iter/s)": 0.202755 }, { "acc": 0.78685894, "epoch": 0.683124571661879, "grad_norm": 4.65625, "learning_rate": 7.663164593484493e-06, "loss": 0.77242274, "memory(GiB)": 135.49, "step": 29280, "train_speed(iter/s)": 0.202791 }, { "acc": 0.76086931, "epoch": 0.6833578792341679, "grad_norm": 5.53125, "learning_rate": 7.661565560666973e-06, "loss": 0.88107281, "memory(GiB)": 135.49, "step": 29290, "train_speed(iter/s)": 0.202827 }, { "acc": 0.76811972, "epoch": 0.6835911868064568, "grad_norm": 5.1875, "learning_rate": 7.65996614791174e-06, "loss": 0.83273897, "memory(GiB)": 135.49, "step": 29300, "train_speed(iter/s)": 0.202862 }, { "acc": 0.77451668, "epoch": 0.6838244943787457, "grad_norm": 5.6875, "learning_rate": 7.658366355447115e-06, "loss": 0.82910671, "memory(GiB)": 135.49, "step": 29310, "train_speed(iter/s)": 0.202888 }, { "acc": 0.75781212, "epoch": 0.6840578019510346, "grad_norm": 5.46875, "learning_rate": 7.656766183501465e-06, "loss": 0.88279362, "memory(GiB)": 135.49, "step": 29320, "train_speed(iter/s)": 0.202921 }, { "acc": 0.76610146, "epoch": 0.6842911095233235, "grad_norm": 5.90625, "learning_rate": 7.655165632303212e-06, "loss": 0.84330101, "memory(GiB)": 135.49, "step": 29330, "train_speed(iter/s)": 0.202959 }, { "acc": 0.75190992, "epoch": 0.6845244170956124, "grad_norm": 7.40625, "learning_rate": 7.653564702080837e-06, "loss": 0.90060654, "memory(GiB)": 135.49, "step": 29340, "train_speed(iter/s)": 0.202996 }, { "acc": 0.7814827, "epoch": 0.6847577246679013, "grad_norm": 5.0, "learning_rate": 7.651963393062872e-06, "loss": 0.79791055, "memory(GiB)": 135.49, "step": 29350, "train_speed(iter/s)": 0.203032 }, { "acc": 0.78214259, "epoch": 0.6849910322401902, "grad_norm": 6.46875, "learning_rate": 7.650361705477903e-06, "loss": 0.76334648, "memory(GiB)": 135.49, "step": 29360, "train_speed(iter/s)": 0.203068 }, { "acc": 0.76659269, "epoch": 0.685224339812479, "grad_norm": 6.65625, "learning_rate": 7.648759639554571e-06, "loss": 0.84559326, "memory(GiB)": 135.49, "step": 29370, "train_speed(iter/s)": 0.203106 }, { "acc": 0.77495842, "epoch": 0.6854576473847679, "grad_norm": 7.84375, "learning_rate": 7.647157195521568e-06, "loss": 0.82010784, "memory(GiB)": 135.49, "step": 29380, "train_speed(iter/s)": 0.203142 }, { "acc": 0.77952728, "epoch": 0.6856909549570568, "grad_norm": 6.1875, "learning_rate": 7.645554373607647e-06, "loss": 0.7875299, "memory(GiB)": 135.49, "step": 29390, "train_speed(iter/s)": 0.203177 }, { "acc": 0.77067337, "epoch": 0.6859242625293457, "grad_norm": 5.96875, "learning_rate": 7.643951174041606e-06, "loss": 0.81943865, "memory(GiB)": 135.49, "step": 29400, "train_speed(iter/s)": 0.203209 }, { "acc": 0.77070804, "epoch": 0.6861575701016346, "grad_norm": 6.28125, "learning_rate": 7.642347597052303e-06, "loss": 0.81606722, "memory(GiB)": 135.49, "step": 29410, "train_speed(iter/s)": 0.203245 }, { "acc": 0.76373134, "epoch": 0.6863908776739235, "grad_norm": 4.4375, "learning_rate": 7.64074364286865e-06, "loss": 0.83013935, "memory(GiB)": 135.49, "step": 29420, "train_speed(iter/s)": 0.203281 }, { "acc": 0.77700739, "epoch": 0.6866241852462124, "grad_norm": 5.6875, "learning_rate": 7.639139311719605e-06, "loss": 0.80385571, "memory(GiB)": 135.49, "step": 29430, "train_speed(iter/s)": 0.203316 }, { "acc": 0.76509376, "epoch": 0.6868574928185013, "grad_norm": 5.9375, "learning_rate": 7.637534603834193e-06, "loss": 0.84914351, "memory(GiB)": 135.49, "step": 29440, "train_speed(iter/s)": 0.203352 }, { "acc": 0.77780399, "epoch": 0.6870908003907902, "grad_norm": 6.28125, "learning_rate": 7.635929519441483e-06, "loss": 0.80320492, "memory(GiB)": 135.49, "step": 29450, "train_speed(iter/s)": 0.203388 }, { "acc": 0.74972467, "epoch": 0.6873241079630791, "grad_norm": 5.96875, "learning_rate": 7.634324058770598e-06, "loss": 0.90303802, "memory(GiB)": 135.49, "step": 29460, "train_speed(iter/s)": 0.203425 }, { "acc": 0.75863323, "epoch": 0.687557415535368, "grad_norm": 5.03125, "learning_rate": 7.632718222050719e-06, "loss": 0.87715349, "memory(GiB)": 135.49, "step": 29470, "train_speed(iter/s)": 0.20346 }, { "acc": 0.7672586, "epoch": 0.6877907231076569, "grad_norm": 14.75, "learning_rate": 7.63111200951108e-06, "loss": 0.82872734, "memory(GiB)": 135.49, "step": 29480, "train_speed(iter/s)": 0.203495 }, { "acc": 0.74759321, "epoch": 0.6880240306799458, "grad_norm": 6.46875, "learning_rate": 7.629505421380965e-06, "loss": 0.93810482, "memory(GiB)": 135.49, "step": 29490, "train_speed(iter/s)": 0.203533 }, { "acc": 0.77637615, "epoch": 0.6882573382522347, "grad_norm": 5.1875, "learning_rate": 7.627898457889717e-06, "loss": 0.80836105, "memory(GiB)": 135.49, "step": 29500, "train_speed(iter/s)": 0.203568 }, { "epoch": 0.6882573382522347, "eval_acc": 0.7342585810215586, "eval_loss": 0.8372989296913147, "eval_runtime": 1263.0111, "eval_samples_per_second": 28.496, "eval_steps_per_second": 14.248, "step": 29500 }, { "acc": 0.78352432, "epoch": 0.6884906458245236, "grad_norm": 4.5625, "learning_rate": 7.6262911192667245e-06, "loss": 0.78174791, "memory(GiB)": 135.49, "step": 29510, "train_speed(iter/s)": 0.20181 }, { "acc": 0.7818965, "epoch": 0.6887239533968125, "grad_norm": 6.34375, "learning_rate": 7.62468340574144e-06, "loss": 0.78962369, "memory(GiB)": 135.49, "step": 29520, "train_speed(iter/s)": 0.201847 }, { "acc": 0.77680497, "epoch": 0.6889572609691014, "grad_norm": 6.0625, "learning_rate": 7.623075317543361e-06, "loss": 0.81022224, "memory(GiB)": 135.49, "step": 29530, "train_speed(iter/s)": 0.201884 }, { "acc": 0.78500633, "epoch": 0.6891905685413903, "grad_norm": 6.375, "learning_rate": 7.62146685490204e-06, "loss": 0.77696323, "memory(GiB)": 135.49, "step": 29540, "train_speed(iter/s)": 0.201922 }, { "acc": 0.76182184, "epoch": 0.6894238761136792, "grad_norm": 6.53125, "learning_rate": 7.6198580180470904e-06, "loss": 0.85575848, "memory(GiB)": 135.49, "step": 29550, "train_speed(iter/s)": 0.20196 }, { "acc": 0.75512772, "epoch": 0.689657183685968, "grad_norm": 5.84375, "learning_rate": 7.618248807208169e-06, "loss": 0.89176722, "memory(GiB)": 135.49, "step": 29560, "train_speed(iter/s)": 0.201996 }, { "acc": 0.78084784, "epoch": 0.6898904912582569, "grad_norm": 5.5625, "learning_rate": 7.61663922261499e-06, "loss": 0.78817868, "memory(GiB)": 135.49, "step": 29570, "train_speed(iter/s)": 0.202025 }, { "acc": 0.75899596, "epoch": 0.6901237988305458, "grad_norm": 5.6875, "learning_rate": 7.615029264497322e-06, "loss": 0.8771946, "memory(GiB)": 135.49, "step": 29580, "train_speed(iter/s)": 0.202061 }, { "acc": 0.75325875, "epoch": 0.6903571064028347, "grad_norm": 6.8125, "learning_rate": 7.6134189330849885e-06, "loss": 0.90175619, "memory(GiB)": 135.49, "step": 29590, "train_speed(iter/s)": 0.202098 }, { "acc": 0.76101179, "epoch": 0.6905904139751236, "grad_norm": 5.03125, "learning_rate": 7.611808228607859e-06, "loss": 0.86093674, "memory(GiB)": 135.49, "step": 29600, "train_speed(iter/s)": 0.202132 }, { "acc": 0.76534061, "epoch": 0.6908237215474125, "grad_norm": 9.125, "learning_rate": 7.610197151295865e-06, "loss": 0.83176813, "memory(GiB)": 135.49, "step": 29610, "train_speed(iter/s)": 0.202167 }, { "acc": 0.76389818, "epoch": 0.6910570291197013, "grad_norm": 4.96875, "learning_rate": 7.608585701378985e-06, "loss": 0.85893383, "memory(GiB)": 135.49, "step": 29620, "train_speed(iter/s)": 0.2022 }, { "acc": 0.7667244, "epoch": 0.6912903366919902, "grad_norm": 5.4375, "learning_rate": 7.6069738790872545e-06, "loss": 0.8511528, "memory(GiB)": 135.49, "step": 29630, "train_speed(iter/s)": 0.202236 }, { "acc": 0.77774682, "epoch": 0.6915236442642791, "grad_norm": 5.0625, "learning_rate": 7.6053616846507606e-06, "loss": 0.79887872, "memory(GiB)": 135.49, "step": 29640, "train_speed(iter/s)": 0.202273 }, { "acc": 0.76511588, "epoch": 0.691756951836568, "grad_norm": 6.15625, "learning_rate": 7.6037491182996415e-06, "loss": 0.86095028, "memory(GiB)": 135.49, "step": 29650, "train_speed(iter/s)": 0.202307 }, { "acc": 0.78870077, "epoch": 0.691990259408857, "grad_norm": 5.28125, "learning_rate": 7.602136180264094e-06, "loss": 0.75997128, "memory(GiB)": 135.49, "step": 29660, "train_speed(iter/s)": 0.202344 }, { "acc": 0.75694103, "epoch": 0.6922235669811458, "grad_norm": 7.25, "learning_rate": 7.6005228707743606e-06, "loss": 0.89193277, "memory(GiB)": 135.49, "step": 29670, "train_speed(iter/s)": 0.20238 }, { "acc": 0.77532086, "epoch": 0.6924568745534347, "grad_norm": 6.53125, "learning_rate": 7.598909190060744e-06, "loss": 0.80979176, "memory(GiB)": 135.49, "step": 29680, "train_speed(iter/s)": 0.202415 }, { "acc": 0.77639484, "epoch": 0.6926901821257236, "grad_norm": 4.125, "learning_rate": 7.597295138353596e-06, "loss": 0.79779243, "memory(GiB)": 135.49, "step": 29690, "train_speed(iter/s)": 0.20245 }, { "acc": 0.75787125, "epoch": 0.6929234896980125, "grad_norm": 7.875, "learning_rate": 7.595680715883321e-06, "loss": 0.8623291, "memory(GiB)": 135.49, "step": 29700, "train_speed(iter/s)": 0.202487 }, { "acc": 0.75996704, "epoch": 0.6931567972703014, "grad_norm": 6.25, "learning_rate": 7.594065922880378e-06, "loss": 0.86226597, "memory(GiB)": 135.49, "step": 29710, "train_speed(iter/s)": 0.202523 }, { "acc": 0.77499394, "epoch": 0.6933901048425903, "grad_norm": 4.8125, "learning_rate": 7.592450759575278e-06, "loss": 0.82837524, "memory(GiB)": 135.49, "step": 29720, "train_speed(iter/s)": 0.202559 }, { "acc": 0.77002277, "epoch": 0.6936234124148792, "grad_norm": 5.03125, "learning_rate": 7.590835226198585e-06, "loss": 0.83547287, "memory(GiB)": 135.49, "step": 29730, "train_speed(iter/s)": 0.202595 }, { "acc": 0.77883492, "epoch": 0.6938567199871681, "grad_norm": 5.125, "learning_rate": 7.589219322980916e-06, "loss": 0.78412685, "memory(GiB)": 135.49, "step": 29740, "train_speed(iter/s)": 0.202629 }, { "acc": 0.76465607, "epoch": 0.6940900275594569, "grad_norm": 5.53125, "learning_rate": 7.587603050152941e-06, "loss": 0.86185246, "memory(GiB)": 135.49, "step": 29750, "train_speed(iter/s)": 0.202661 }, { "acc": 0.78071594, "epoch": 0.6943233351317458, "grad_norm": 4.90625, "learning_rate": 7.585986407945383e-06, "loss": 0.7893683, "memory(GiB)": 135.49, "step": 29760, "train_speed(iter/s)": 0.202696 }, { "acc": 0.77200079, "epoch": 0.6945566427040347, "grad_norm": 6.8125, "learning_rate": 7.584369396589015e-06, "loss": 0.82503567, "memory(GiB)": 135.49, "step": 29770, "train_speed(iter/s)": 0.202732 }, { "acc": 0.77329698, "epoch": 0.6947899502763236, "grad_norm": 6.3125, "learning_rate": 7.582752016314669e-06, "loss": 0.80524864, "memory(GiB)": 135.49, "step": 29780, "train_speed(iter/s)": 0.202766 }, { "acc": 0.76356821, "epoch": 0.6950232578486125, "grad_norm": 4.5, "learning_rate": 7.58113426735322e-06, "loss": 0.84692316, "memory(GiB)": 135.49, "step": 29790, "train_speed(iter/s)": 0.202803 }, { "acc": 0.7799644, "epoch": 0.6952565654209014, "grad_norm": 6.71875, "learning_rate": 7.579516149935606e-06, "loss": 0.78571291, "memory(GiB)": 135.49, "step": 29800, "train_speed(iter/s)": 0.20284 }, { "acc": 0.77144923, "epoch": 0.6954898729931903, "grad_norm": 5.9375, "learning_rate": 7.577897664292811e-06, "loss": 0.82898283, "memory(GiB)": 135.49, "step": 29810, "train_speed(iter/s)": 0.202876 }, { "acc": 0.7739316, "epoch": 0.6957231805654792, "grad_norm": 7.21875, "learning_rate": 7.57627881065587e-06, "loss": 0.82423267, "memory(GiB)": 135.49, "step": 29820, "train_speed(iter/s)": 0.20291 }, { "acc": 0.75234284, "epoch": 0.6959564881377681, "grad_norm": 4.34375, "learning_rate": 7.574659589255881e-06, "loss": 0.9031086, "memory(GiB)": 135.49, "step": 29830, "train_speed(iter/s)": 0.202945 }, { "acc": 0.75815458, "epoch": 0.696189795710057, "grad_norm": 6.0625, "learning_rate": 7.573040000323984e-06, "loss": 0.85880089, "memory(GiB)": 135.49, "step": 29840, "train_speed(iter/s)": 0.202981 }, { "acc": 0.77644525, "epoch": 0.6964231032823459, "grad_norm": 6.4375, "learning_rate": 7.571420044091372e-06, "loss": 0.79489079, "memory(GiB)": 135.49, "step": 29850, "train_speed(iter/s)": 0.203015 }, { "acc": 0.75497885, "epoch": 0.6966564108546348, "grad_norm": 4.78125, "learning_rate": 7.569799720789297e-06, "loss": 0.89154387, "memory(GiB)": 135.49, "step": 29860, "train_speed(iter/s)": 0.203051 }, { "acc": 0.77968788, "epoch": 0.6968897184269237, "grad_norm": 6.59375, "learning_rate": 7.568179030649057e-06, "loss": 0.78865724, "memory(GiB)": 135.49, "step": 29870, "train_speed(iter/s)": 0.203086 }, { "acc": 0.77353277, "epoch": 0.6971230259992126, "grad_norm": 6.875, "learning_rate": 7.566557973902007e-06, "loss": 0.82582932, "memory(GiB)": 135.49, "step": 29880, "train_speed(iter/s)": 0.203122 }, { "acc": 0.76130466, "epoch": 0.6973563335715015, "grad_norm": 5.5625, "learning_rate": 7.564936550779553e-06, "loss": 0.86224556, "memory(GiB)": 135.49, "step": 29890, "train_speed(iter/s)": 0.203157 }, { "acc": 0.75666261, "epoch": 0.6975896411437904, "grad_norm": 5.09375, "learning_rate": 7.563314761513151e-06, "loss": 0.88601856, "memory(GiB)": 135.49, "step": 29900, "train_speed(iter/s)": 0.20319 }, { "acc": 0.77558813, "epoch": 0.6978229487160793, "grad_norm": 4.6875, "learning_rate": 7.56169260633431e-06, "loss": 0.80981598, "memory(GiB)": 135.49, "step": 29910, "train_speed(iter/s)": 0.203227 }, { "acc": 0.7820272, "epoch": 0.6980562562883682, "grad_norm": 5.40625, "learning_rate": 7.560070085474596e-06, "loss": 0.77382679, "memory(GiB)": 135.49, "step": 29920, "train_speed(iter/s)": 0.203264 }, { "acc": 0.76854143, "epoch": 0.6982895638606571, "grad_norm": 6.84375, "learning_rate": 7.55844719916562e-06, "loss": 0.84503422, "memory(GiB)": 135.49, "step": 29930, "train_speed(iter/s)": 0.2033 }, { "acc": 0.76472192, "epoch": 0.6985228714329459, "grad_norm": 4.34375, "learning_rate": 7.556823947639048e-06, "loss": 0.8516037, "memory(GiB)": 135.49, "step": 29940, "train_speed(iter/s)": 0.203337 }, { "acc": 0.77433224, "epoch": 0.6987561790052348, "grad_norm": 6.5, "learning_rate": 7.555200331126602e-06, "loss": 0.81957569, "memory(GiB)": 135.49, "step": 29950, "train_speed(iter/s)": 0.203372 }, { "acc": 0.75855608, "epoch": 0.6989894865775237, "grad_norm": 7.875, "learning_rate": 7.55357634986005e-06, "loss": 0.87865391, "memory(GiB)": 135.49, "step": 29960, "train_speed(iter/s)": 0.203409 }, { "acc": 0.76500187, "epoch": 0.6992227941498126, "grad_norm": 7.09375, "learning_rate": 7.551952004071217e-06, "loss": 0.84341812, "memory(GiB)": 135.49, "step": 29970, "train_speed(iter/s)": 0.203445 }, { "acc": 0.76465826, "epoch": 0.6994561017221015, "grad_norm": 7.03125, "learning_rate": 7.550327293991976e-06, "loss": 0.84774237, "memory(GiB)": 135.49, "step": 29980, "train_speed(iter/s)": 0.203476 }, { "acc": 0.75642629, "epoch": 0.6996894092943904, "grad_norm": 7.8125, "learning_rate": 7.5487022198542555e-06, "loss": 0.90608702, "memory(GiB)": 135.49, "step": 29990, "train_speed(iter/s)": 0.203512 }, { "acc": 0.76828275, "epoch": 0.6999227168666793, "grad_norm": 6.28125, "learning_rate": 7.547076781890032e-06, "loss": 0.83629513, "memory(GiB)": 135.49, "step": 30000, "train_speed(iter/s)": 0.203546 }, { "epoch": 0.6999227168666793, "eval_acc": 0.7344513857740748, "eval_loss": 0.8369702696800232, "eval_runtime": 1262.4508, "eval_samples_per_second": 28.509, "eval_steps_per_second": 14.255, "step": 30000 }, { "acc": 0.76531739, "epoch": 0.7001560244389682, "grad_norm": 9.5, "learning_rate": 7.5454509803313394e-06, "loss": 0.84379997, "memory(GiB)": 135.49, "step": 30010, "train_speed(iter/s)": 0.20182 }, { "acc": 0.76662283, "epoch": 0.7003893320112571, "grad_norm": 4.65625, "learning_rate": 7.543824815410259e-06, "loss": 0.83774624, "memory(GiB)": 135.49, "step": 30020, "train_speed(iter/s)": 0.201856 }, { "acc": 0.76076231, "epoch": 0.700622639583546, "grad_norm": 6.25, "learning_rate": 7.542198287358924e-06, "loss": 0.87203045, "memory(GiB)": 135.49, "step": 30030, "train_speed(iter/s)": 0.201891 }, { "acc": 0.76924276, "epoch": 0.7008559471558349, "grad_norm": 5.03125, "learning_rate": 7.540571396409522e-06, "loss": 0.82600813, "memory(GiB)": 135.49, "step": 30040, "train_speed(iter/s)": 0.201925 }, { "acc": 0.78140154, "epoch": 0.7010892547281238, "grad_norm": 5.15625, "learning_rate": 7.538944142794291e-06, "loss": 0.77349024, "memory(GiB)": 135.49, "step": 30050, "train_speed(iter/s)": 0.201961 }, { "acc": 0.7624053, "epoch": 0.7013225623004127, "grad_norm": 6.28125, "learning_rate": 7.537316526745522e-06, "loss": 0.86563005, "memory(GiB)": 135.49, "step": 30060, "train_speed(iter/s)": 0.201995 }, { "acc": 0.76929584, "epoch": 0.7015558698727016, "grad_norm": 5.96875, "learning_rate": 7.535688548495557e-06, "loss": 0.83428459, "memory(GiB)": 135.49, "step": 30070, "train_speed(iter/s)": 0.202028 }, { "acc": 0.77094131, "epoch": 0.7017891774449905, "grad_norm": 16.125, "learning_rate": 7.534060208276786e-06, "loss": 0.82803268, "memory(GiB)": 135.49, "step": 30080, "train_speed(iter/s)": 0.202064 }, { "acc": 0.76353426, "epoch": 0.7020224850172794, "grad_norm": 5.90625, "learning_rate": 7.532431506321657e-06, "loss": 0.87916117, "memory(GiB)": 135.49, "step": 30090, "train_speed(iter/s)": 0.202098 }, { "acc": 0.76903148, "epoch": 0.7022557925895683, "grad_norm": 6.71875, "learning_rate": 7.530802442862666e-06, "loss": 0.83185816, "memory(GiB)": 135.49, "step": 30100, "train_speed(iter/s)": 0.20213 }, { "acc": 0.77518959, "epoch": 0.7024891001618572, "grad_norm": 4.875, "learning_rate": 7.529173018132362e-06, "loss": 0.81832724, "memory(GiB)": 135.49, "step": 30110, "train_speed(iter/s)": 0.202164 }, { "acc": 0.78604441, "epoch": 0.7027224077341461, "grad_norm": 6.75, "learning_rate": 7.5275432323633446e-06, "loss": 0.77081251, "memory(GiB)": 135.49, "step": 30120, "train_speed(iter/s)": 0.202198 }, { "acc": 0.75579777, "epoch": 0.702955715306435, "grad_norm": 5.46875, "learning_rate": 7.525913085788264e-06, "loss": 0.88417435, "memory(GiB)": 135.49, "step": 30130, "train_speed(iter/s)": 0.202235 }, { "acc": 0.77028236, "epoch": 0.7031890228787238, "grad_norm": 6.21875, "learning_rate": 7.524282578639825e-06, "loss": 0.8388113, "memory(GiB)": 135.49, "step": 30140, "train_speed(iter/s)": 0.202269 }, { "acc": 0.76326919, "epoch": 0.7034223304510127, "grad_norm": 5.78125, "learning_rate": 7.522651711150781e-06, "loss": 0.8585247, "memory(GiB)": 135.49, "step": 30150, "train_speed(iter/s)": 0.202304 }, { "acc": 0.76514421, "epoch": 0.7036556380233016, "grad_norm": 7.5, "learning_rate": 7.521020483553939e-06, "loss": 0.82799749, "memory(GiB)": 135.49, "step": 30160, "train_speed(iter/s)": 0.202338 }, { "acc": 0.77464952, "epoch": 0.7038889455955905, "grad_norm": 5.0, "learning_rate": 7.519388896082154e-06, "loss": 0.79689078, "memory(GiB)": 135.49, "step": 30170, "train_speed(iter/s)": 0.202374 }, { "acc": 0.7704011, "epoch": 0.7041222531678794, "grad_norm": 5.125, "learning_rate": 7.517756948968338e-06, "loss": 0.81499815, "memory(GiB)": 135.49, "step": 30180, "train_speed(iter/s)": 0.20241 }, { "acc": 0.75606108, "epoch": 0.7043555607401683, "grad_norm": 5.0625, "learning_rate": 7.516124642445447e-06, "loss": 0.88528214, "memory(GiB)": 135.49, "step": 30190, "train_speed(iter/s)": 0.202445 }, { "acc": 0.78214951, "epoch": 0.7045888683124572, "grad_norm": 11.375, "learning_rate": 7.514491976746494e-06, "loss": 0.79449558, "memory(GiB)": 135.49, "step": 30200, "train_speed(iter/s)": 0.202481 }, { "acc": 0.76422424, "epoch": 0.704822175884746, "grad_norm": 7.21875, "learning_rate": 7.512858952104544e-06, "loss": 0.82433033, "memory(GiB)": 135.49, "step": 30210, "train_speed(iter/s)": 0.202519 }, { "acc": 0.76012383, "epoch": 0.705055483457035, "grad_norm": 9.125, "learning_rate": 7.511225568752707e-06, "loss": 0.86827412, "memory(GiB)": 135.49, "step": 30220, "train_speed(iter/s)": 0.202556 }, { "acc": 0.75598302, "epoch": 0.7052887910293238, "grad_norm": 4.40625, "learning_rate": 7.50959182692415e-06, "loss": 0.87952871, "memory(GiB)": 135.49, "step": 30230, "train_speed(iter/s)": 0.202592 }, { "acc": 0.78144822, "epoch": 0.7055220986016127, "grad_norm": 5.46875, "learning_rate": 7.507957726852087e-06, "loss": 0.7784215, "memory(GiB)": 135.49, "step": 30240, "train_speed(iter/s)": 0.202628 }, { "acc": 0.76633954, "epoch": 0.7057554061739016, "grad_norm": 5.59375, "learning_rate": 7.506323268769788e-06, "loss": 0.83292465, "memory(GiB)": 135.49, "step": 30250, "train_speed(iter/s)": 0.202665 }, { "acc": 0.76867728, "epoch": 0.7059887137461905, "grad_norm": 5.8125, "learning_rate": 7.504688452910571e-06, "loss": 0.84800682, "memory(GiB)": 135.49, "step": 30260, "train_speed(iter/s)": 0.2027 }, { "acc": 0.76142025, "epoch": 0.7062220213184794, "grad_norm": 5.5625, "learning_rate": 7.503053279507806e-06, "loss": 0.86670675, "memory(GiB)": 135.49, "step": 30270, "train_speed(iter/s)": 0.202733 }, { "acc": 0.77762432, "epoch": 0.7064553288907683, "grad_norm": 5.84375, "learning_rate": 7.501417748794911e-06, "loss": 0.8034915, "memory(GiB)": 135.49, "step": 30280, "train_speed(iter/s)": 0.20277 }, { "acc": 0.78613148, "epoch": 0.7066886364630572, "grad_norm": 6.15625, "learning_rate": 7.49978186100536e-06, "loss": 0.78480325, "memory(GiB)": 135.49, "step": 30290, "train_speed(iter/s)": 0.202802 }, { "acc": 0.78843708, "epoch": 0.7069219440353461, "grad_norm": 5.40625, "learning_rate": 7.498145616372674e-06, "loss": 0.76057506, "memory(GiB)": 135.49, "step": 30300, "train_speed(iter/s)": 0.202836 }, { "acc": 0.78782787, "epoch": 0.707155251607635, "grad_norm": 4.34375, "learning_rate": 7.4965090151304265e-06, "loss": 0.75142574, "memory(GiB)": 135.49, "step": 30310, "train_speed(iter/s)": 0.202868 }, { "acc": 0.76569653, "epoch": 0.7073885591799239, "grad_norm": 5.75, "learning_rate": 7.494872057512242e-06, "loss": 0.86738701, "memory(GiB)": 135.49, "step": 30320, "train_speed(iter/s)": 0.2029 }, { "acc": 0.76038399, "epoch": 0.7076218667522127, "grad_norm": 5.9375, "learning_rate": 7.493234743751797e-06, "loss": 0.86748848, "memory(GiB)": 135.49, "step": 30330, "train_speed(iter/s)": 0.20293 }, { "acc": 0.77935114, "epoch": 0.7078551743245016, "grad_norm": 7.34375, "learning_rate": 7.491597074082817e-06, "loss": 0.78629165, "memory(GiB)": 135.49, "step": 30340, "train_speed(iter/s)": 0.202965 }, { "acc": 0.7665904, "epoch": 0.7080884818967905, "grad_norm": 12.3125, "learning_rate": 7.489959048739079e-06, "loss": 0.84733658, "memory(GiB)": 135.49, "step": 30350, "train_speed(iter/s)": 0.202996 }, { "acc": 0.76237769, "epoch": 0.7083217894690794, "grad_norm": 11.375, "learning_rate": 7.488320667954408e-06, "loss": 0.86381168, "memory(GiB)": 135.49, "step": 30360, "train_speed(iter/s)": 0.203032 }, { "acc": 0.77635078, "epoch": 0.7085550970413683, "grad_norm": 4.75, "learning_rate": 7.486681931962686e-06, "loss": 0.79839311, "memory(GiB)": 135.49, "step": 30370, "train_speed(iter/s)": 0.203066 }, { "acc": 0.77790661, "epoch": 0.7087884046136572, "grad_norm": 5.25, "learning_rate": 7.48504284099784e-06, "loss": 0.80389252, "memory(GiB)": 135.49, "step": 30380, "train_speed(iter/s)": 0.203102 }, { "acc": 0.76305285, "epoch": 0.7090217121859461, "grad_norm": 6.59375, "learning_rate": 7.48340339529385e-06, "loss": 0.8344615, "memory(GiB)": 135.49, "step": 30390, "train_speed(iter/s)": 0.203138 }, { "acc": 0.77791061, "epoch": 0.709255019758235, "grad_norm": 7.25, "learning_rate": 7.481763595084747e-06, "loss": 0.79049816, "memory(GiB)": 135.49, "step": 30400, "train_speed(iter/s)": 0.203171 }, { "acc": 0.77360525, "epoch": 0.7094883273305239, "grad_norm": 8.4375, "learning_rate": 7.480123440604613e-06, "loss": 0.82184763, "memory(GiB)": 135.49, "step": 30410, "train_speed(iter/s)": 0.203204 }, { "acc": 0.77160139, "epoch": 0.7097216349028128, "grad_norm": 9.875, "learning_rate": 7.478482932087577e-06, "loss": 0.83148594, "memory(GiB)": 135.49, "step": 30420, "train_speed(iter/s)": 0.203238 }, { "acc": 0.78075581, "epoch": 0.7099549424751017, "grad_norm": 5.59375, "learning_rate": 7.476842069767824e-06, "loss": 0.78054819, "memory(GiB)": 135.49, "step": 30430, "train_speed(iter/s)": 0.203273 }, { "acc": 0.77380581, "epoch": 0.7101882500473906, "grad_norm": 9.1875, "learning_rate": 7.475200853879583e-06, "loss": 0.80113058, "memory(GiB)": 135.49, "step": 30440, "train_speed(iter/s)": 0.203309 }, { "acc": 0.77112603, "epoch": 0.7104215576196795, "grad_norm": 5.75, "learning_rate": 7.473559284657139e-06, "loss": 0.82226553, "memory(GiB)": 135.49, "step": 30450, "train_speed(iter/s)": 0.203344 }, { "acc": 0.75136137, "epoch": 0.7106548651919684, "grad_norm": 6.34375, "learning_rate": 7.471917362334828e-06, "loss": 0.91041164, "memory(GiB)": 135.49, "step": 30460, "train_speed(iter/s)": 0.203379 }, { "acc": 0.76851788, "epoch": 0.7108881727642573, "grad_norm": 5.5625, "learning_rate": 7.47027508714703e-06, "loss": 0.83766022, "memory(GiB)": 135.49, "step": 30470, "train_speed(iter/s)": 0.203415 }, { "acc": 0.76539278, "epoch": 0.7111214803365462, "grad_norm": 6.21875, "learning_rate": 7.468632459328181e-06, "loss": 0.85798607, "memory(GiB)": 135.49, "step": 30480, "train_speed(iter/s)": 0.20345 }, { "acc": 0.77414656, "epoch": 0.7113547879088351, "grad_norm": 5.625, "learning_rate": 7.466989479112766e-06, "loss": 0.81961927, "memory(GiB)": 135.49, "step": 30490, "train_speed(iter/s)": 0.203482 }, { "acc": 0.78865409, "epoch": 0.711588095481124, "grad_norm": 6.125, "learning_rate": 7.465346146735319e-06, "loss": 0.76299944, "memory(GiB)": 135.49, "step": 30500, "train_speed(iter/s)": 0.203519 }, { "epoch": 0.711588095481124, "eval_acc": 0.7344433186296181, "eval_loss": 0.8368908166885376, "eval_runtime": 1262.8067, "eval_samples_per_second": 28.501, "eval_steps_per_second": 14.251, "step": 30500 }, { "acc": 0.75475349, "epoch": 0.7118214030534129, "grad_norm": 6.53125, "learning_rate": 7.463702462430427e-06, "loss": 0.90547676, "memory(GiB)": 135.49, "step": 30510, "train_speed(iter/s)": 0.201819 }, { "acc": 0.77921515, "epoch": 0.7120547106257017, "grad_norm": 6.40625, "learning_rate": 7.4620584264327236e-06, "loss": 0.81535559, "memory(GiB)": 135.49, "step": 30520, "train_speed(iter/s)": 0.201854 }, { "acc": 0.76834478, "epoch": 0.7122880181979906, "grad_norm": 6.09375, "learning_rate": 7.460414038976894e-06, "loss": 0.83838253, "memory(GiB)": 135.49, "step": 30530, "train_speed(iter/s)": 0.201888 }, { "acc": 0.77193079, "epoch": 0.7125213257702795, "grad_norm": 5.53125, "learning_rate": 7.458769300297676e-06, "loss": 0.83897762, "memory(GiB)": 135.49, "step": 30540, "train_speed(iter/s)": 0.201922 }, { "acc": 0.77477484, "epoch": 0.7127546333425684, "grad_norm": 6.1875, "learning_rate": 7.457124210629853e-06, "loss": 0.78762474, "memory(GiB)": 135.49, "step": 30550, "train_speed(iter/s)": 0.201955 }, { "acc": 0.77794971, "epoch": 0.7129879409148573, "grad_norm": 4.53125, "learning_rate": 7.455478770208267e-06, "loss": 0.79307556, "memory(GiB)": 135.49, "step": 30560, "train_speed(iter/s)": 0.201989 }, { "acc": 0.7600493, "epoch": 0.7132212484871462, "grad_norm": 5.15625, "learning_rate": 7.453832979267796e-06, "loss": 0.85452671, "memory(GiB)": 135.49, "step": 30570, "train_speed(iter/s)": 0.202024 }, { "acc": 0.7757431, "epoch": 0.7134545560594351, "grad_norm": 6.5, "learning_rate": 7.452186838043381e-06, "loss": 0.80362415, "memory(GiB)": 135.49, "step": 30580, "train_speed(iter/s)": 0.202058 }, { "acc": 0.78791676, "epoch": 0.713687863631724, "grad_norm": 5.84375, "learning_rate": 7.450540346770008e-06, "loss": 0.76945915, "memory(GiB)": 135.49, "step": 30590, "train_speed(iter/s)": 0.202092 }, { "acc": 0.7705616, "epoch": 0.7139211712040129, "grad_norm": 6.125, "learning_rate": 7.4488935056827115e-06, "loss": 0.83055744, "memory(GiB)": 135.49, "step": 30600, "train_speed(iter/s)": 0.202125 }, { "acc": 0.76225252, "epoch": 0.7141544787763018, "grad_norm": 6.4375, "learning_rate": 7.447246315016579e-06, "loss": 0.86896315, "memory(GiB)": 135.49, "step": 30610, "train_speed(iter/s)": 0.202159 }, { "acc": 0.76492214, "epoch": 0.7143877863485907, "grad_norm": 6.78125, "learning_rate": 7.445598775006745e-06, "loss": 0.85779257, "memory(GiB)": 135.49, "step": 30620, "train_speed(iter/s)": 0.202196 }, { "acc": 0.77513752, "epoch": 0.7146210939208796, "grad_norm": 5.125, "learning_rate": 7.443950885888398e-06, "loss": 0.81504049, "memory(GiB)": 135.49, "step": 30630, "train_speed(iter/s)": 0.202231 }, { "acc": 0.77159925, "epoch": 0.7148544014931685, "grad_norm": 6.0, "learning_rate": 7.4423026478967706e-06, "loss": 0.82651834, "memory(GiB)": 135.49, "step": 30640, "train_speed(iter/s)": 0.202263 }, { "acc": 0.78114691, "epoch": 0.7150877090654574, "grad_norm": 5.46875, "learning_rate": 7.440654061267151e-06, "loss": 0.78425808, "memory(GiB)": 135.49, "step": 30650, "train_speed(iter/s)": 0.202298 }, { "acc": 0.76981783, "epoch": 0.7153210166377463, "grad_norm": 6.125, "learning_rate": 7.439005126234872e-06, "loss": 0.84237576, "memory(GiB)": 135.49, "step": 30660, "train_speed(iter/s)": 0.202333 }, { "acc": 0.76656733, "epoch": 0.7155543242100352, "grad_norm": 4.875, "learning_rate": 7.43735584303532e-06, "loss": 0.82887077, "memory(GiB)": 135.49, "step": 30670, "train_speed(iter/s)": 0.202368 }, { "acc": 0.76514053, "epoch": 0.7157876317823241, "grad_norm": 5.59375, "learning_rate": 7.435706211903929e-06, "loss": 0.86504021, "memory(GiB)": 135.49, "step": 30680, "train_speed(iter/s)": 0.202403 }, { "acc": 0.77078571, "epoch": 0.716020939354613, "grad_norm": 5.5, "learning_rate": 7.434056233076184e-06, "loss": 0.80922623, "memory(GiB)": 135.49, "step": 30690, "train_speed(iter/s)": 0.202434 }, { "acc": 0.763169, "epoch": 0.7162542469269019, "grad_norm": 5.125, "learning_rate": 7.43240590678762e-06, "loss": 0.8846014, "memory(GiB)": 135.49, "step": 30700, "train_speed(iter/s)": 0.202469 }, { "acc": 0.76851559, "epoch": 0.7164875544991907, "grad_norm": 4.9375, "learning_rate": 7.4307552332738184e-06, "loss": 0.82923203, "memory(GiB)": 135.49, "step": 30710, "train_speed(iter/s)": 0.202504 }, { "acc": 0.78098478, "epoch": 0.7167208620714796, "grad_norm": 5.59375, "learning_rate": 7.429104212770414e-06, "loss": 0.79543023, "memory(GiB)": 135.49, "step": 30720, "train_speed(iter/s)": 0.202536 }, { "acc": 0.78384085, "epoch": 0.7169541696437685, "grad_norm": 6.21875, "learning_rate": 7.427452845513088e-06, "loss": 0.78309216, "memory(GiB)": 135.49, "step": 30730, "train_speed(iter/s)": 0.202571 }, { "acc": 0.75105562, "epoch": 0.7171874772160574, "grad_norm": 5.25, "learning_rate": 7.4258011317375735e-06, "loss": 0.89793358, "memory(GiB)": 135.49, "step": 30740, "train_speed(iter/s)": 0.202605 }, { "acc": 0.79532223, "epoch": 0.7174207847883463, "grad_norm": 7.8125, "learning_rate": 7.424149071679654e-06, "loss": 0.72559948, "memory(GiB)": 135.49, "step": 30750, "train_speed(iter/s)": 0.202639 }, { "acc": 0.76881108, "epoch": 0.7176540923606352, "grad_norm": 6.375, "learning_rate": 7.422496665575156e-06, "loss": 0.84626827, "memory(GiB)": 135.49, "step": 30760, "train_speed(iter/s)": 0.202674 }, { "acc": 0.76651878, "epoch": 0.717887399932924, "grad_norm": 4.46875, "learning_rate": 7.420843913659965e-06, "loss": 0.85933685, "memory(GiB)": 135.49, "step": 30770, "train_speed(iter/s)": 0.202706 }, { "acc": 0.78040676, "epoch": 0.718120707505213, "grad_norm": 5.46875, "learning_rate": 7.419190816170008e-06, "loss": 0.79540796, "memory(GiB)": 135.49, "step": 30780, "train_speed(iter/s)": 0.202739 }, { "acc": 0.76034098, "epoch": 0.7183540150775019, "grad_norm": 10.9375, "learning_rate": 7.417537373341263e-06, "loss": 0.87204971, "memory(GiB)": 135.49, "step": 30790, "train_speed(iter/s)": 0.202774 }, { "acc": 0.78071833, "epoch": 0.7185873226497907, "grad_norm": 5.90625, "learning_rate": 7.415883585409762e-06, "loss": 0.79894595, "memory(GiB)": 135.49, "step": 30800, "train_speed(iter/s)": 0.202806 }, { "acc": 0.76618624, "epoch": 0.7188206302220796, "grad_norm": 7.625, "learning_rate": 7.414229452611582e-06, "loss": 0.85826979, "memory(GiB)": 135.49, "step": 30810, "train_speed(iter/s)": 0.202838 }, { "acc": 0.77082815, "epoch": 0.7190539377943685, "grad_norm": 7.78125, "learning_rate": 7.412574975182848e-06, "loss": 0.8116848, "memory(GiB)": 135.49, "step": 30820, "train_speed(iter/s)": 0.202873 }, { "acc": 0.76858549, "epoch": 0.7192872453666574, "grad_norm": 5.15625, "learning_rate": 7.410920153359736e-06, "loss": 0.84370232, "memory(GiB)": 135.49, "step": 30830, "train_speed(iter/s)": 0.20291 }, { "acc": 0.7702992, "epoch": 0.7195205529389463, "grad_norm": 12.8125, "learning_rate": 7.409264987378473e-06, "loss": 0.81769428, "memory(GiB)": 135.49, "step": 30840, "train_speed(iter/s)": 0.202945 }, { "acc": 0.7479753, "epoch": 0.7197538605112352, "grad_norm": 5.71875, "learning_rate": 7.407609477475334e-06, "loss": 0.90046587, "memory(GiB)": 135.49, "step": 30850, "train_speed(iter/s)": 0.20298 }, { "acc": 0.75194287, "epoch": 0.7199871680835241, "grad_norm": 5.75, "learning_rate": 7.405953623886642e-06, "loss": 0.90500698, "memory(GiB)": 135.49, "step": 30860, "train_speed(iter/s)": 0.203013 }, { "acc": 0.76677675, "epoch": 0.720220475655813, "grad_norm": 9.5, "learning_rate": 7.404297426848768e-06, "loss": 0.85510445, "memory(GiB)": 135.49, "step": 30870, "train_speed(iter/s)": 0.20305 }, { "acc": 0.73392177, "epoch": 0.7204537832281019, "grad_norm": 6.90625, "learning_rate": 7.4026408865981335e-06, "loss": 0.94933786, "memory(GiB)": 135.49, "step": 30880, "train_speed(iter/s)": 0.203086 }, { "acc": 0.76712651, "epoch": 0.7206870908003908, "grad_norm": 5.09375, "learning_rate": 7.400984003371211e-06, "loss": 0.83716106, "memory(GiB)": 135.49, "step": 30890, "train_speed(iter/s)": 0.20312 }, { "acc": 0.7789052, "epoch": 0.7209203983726797, "grad_norm": 5.65625, "learning_rate": 7.3993267774045206e-06, "loss": 0.79226727, "memory(GiB)": 135.49, "step": 30900, "train_speed(iter/s)": 0.203154 }, { "acc": 0.78225088, "epoch": 0.7211537059449685, "grad_norm": 6.25, "learning_rate": 7.397669208934628e-06, "loss": 0.78422041, "memory(GiB)": 135.49, "step": 30910, "train_speed(iter/s)": 0.203189 }, { "acc": 0.78960752, "epoch": 0.7213870135172574, "grad_norm": 5.28125, "learning_rate": 7.396011298198155e-06, "loss": 0.74619551, "memory(GiB)": 135.49, "step": 30920, "train_speed(iter/s)": 0.203222 }, { "acc": 0.76351528, "epoch": 0.7216203210895463, "grad_norm": 5.46875, "learning_rate": 7.394353045431765e-06, "loss": 0.84831753, "memory(GiB)": 135.49, "step": 30930, "train_speed(iter/s)": 0.203256 }, { "acc": 0.75684347, "epoch": 0.7218536286618352, "grad_norm": 7.1875, "learning_rate": 7.392694450872171e-06, "loss": 0.86795816, "memory(GiB)": 135.49, "step": 30940, "train_speed(iter/s)": 0.20329 }, { "acc": 0.77361774, "epoch": 0.7220869362341241, "grad_norm": 5.21875, "learning_rate": 7.3910355147561394e-06, "loss": 0.81870232, "memory(GiB)": 135.49, "step": 30950, "train_speed(iter/s)": 0.203324 }, { "acc": 0.76461606, "epoch": 0.722320243806413, "grad_norm": 6.84375, "learning_rate": 7.389376237320485e-06, "loss": 0.83553562, "memory(GiB)": 135.49, "step": 30960, "train_speed(iter/s)": 0.20336 }, { "acc": 0.77407084, "epoch": 0.7225535513787019, "grad_norm": 6.40625, "learning_rate": 7.387716618802064e-06, "loss": 0.81094723, "memory(GiB)": 135.49, "step": 30970, "train_speed(iter/s)": 0.203392 }, { "acc": 0.759863, "epoch": 0.7227868589509908, "grad_norm": 6.15625, "learning_rate": 7.386056659437792e-06, "loss": 0.86593027, "memory(GiB)": 135.49, "step": 30980, "train_speed(iter/s)": 0.203427 }, { "acc": 0.76759777, "epoch": 0.7230201665232797, "grad_norm": 7.1875, "learning_rate": 7.384396359464623e-06, "loss": 0.82997971, "memory(GiB)": 135.49, "step": 30990, "train_speed(iter/s)": 0.20346 }, { "acc": 0.78227673, "epoch": 0.7232534740955686, "grad_norm": 6.46875, "learning_rate": 7.382735719119568e-06, "loss": 0.77510743, "memory(GiB)": 135.49, "step": 31000, "train_speed(iter/s)": 0.203496 }, { "epoch": 0.7232534740955686, "eval_acc": 0.7345570653664582, "eval_loss": 0.8366439342498779, "eval_runtime": 1262.618, "eval_samples_per_second": 28.505, "eval_steps_per_second": 14.253, "step": 31000 }, { "acc": 0.76522532, "epoch": 0.7234867816678575, "grad_norm": 5.59375, "learning_rate": 7.38107473863968e-06, "loss": 0.84511013, "memory(GiB)": 135.49, "step": 31010, "train_speed(iter/s)": 0.201823 }, { "acc": 0.77219434, "epoch": 0.7237200892401464, "grad_norm": 5.5625, "learning_rate": 7.3794134182620646e-06, "loss": 0.84481077, "memory(GiB)": 135.49, "step": 31020, "train_speed(iter/s)": 0.201858 }, { "acc": 0.78725367, "epoch": 0.7239533968124353, "grad_norm": 4.90625, "learning_rate": 7.377751758223876e-06, "loss": 0.75467281, "memory(GiB)": 135.49, "step": 31030, "train_speed(iter/s)": 0.201893 }, { "acc": 0.76649733, "epoch": 0.7241867043847242, "grad_norm": 5.1875, "learning_rate": 7.376089758762315e-06, "loss": 0.83877449, "memory(GiB)": 135.49, "step": 31040, "train_speed(iter/s)": 0.201929 }, { "acc": 0.77166252, "epoch": 0.7244200119570131, "grad_norm": 6.625, "learning_rate": 7.374427420114629e-06, "loss": 0.83119164, "memory(GiB)": 135.49, "step": 31050, "train_speed(iter/s)": 0.201959 }, { "acc": 0.77270455, "epoch": 0.724653319529302, "grad_norm": 4.9375, "learning_rate": 7.37276474251812e-06, "loss": 0.82334099, "memory(GiB)": 135.49, "step": 31060, "train_speed(iter/s)": 0.201991 }, { "acc": 0.77385273, "epoch": 0.7248866271015909, "grad_norm": 5.1875, "learning_rate": 7.371101726210135e-06, "loss": 0.82843046, "memory(GiB)": 135.49, "step": 31070, "train_speed(iter/s)": 0.202026 }, { "acc": 0.75648241, "epoch": 0.7251199346738798, "grad_norm": 7.75, "learning_rate": 7.369438371428065e-06, "loss": 0.88075724, "memory(GiB)": 135.49, "step": 31080, "train_speed(iter/s)": 0.202061 }, { "acc": 0.76635637, "epoch": 0.7253532422461687, "grad_norm": 7.125, "learning_rate": 7.367774678409357e-06, "loss": 0.86565304, "memory(GiB)": 135.49, "step": 31090, "train_speed(iter/s)": 0.202093 }, { "acc": 0.78285856, "epoch": 0.7255865498184575, "grad_norm": 6.09375, "learning_rate": 7.366110647391501e-06, "loss": 0.78300924, "memory(GiB)": 135.49, "step": 31100, "train_speed(iter/s)": 0.202127 }, { "acc": 0.77723866, "epoch": 0.7258198573907464, "grad_norm": 8.0, "learning_rate": 7.364446278612036e-06, "loss": 0.79584332, "memory(GiB)": 135.49, "step": 31110, "train_speed(iter/s)": 0.20216 }, { "acc": 0.75060625, "epoch": 0.7260531649630353, "grad_norm": 4.8125, "learning_rate": 7.3627815723085535e-06, "loss": 0.92040253, "memory(GiB)": 135.49, "step": 31120, "train_speed(iter/s)": 0.202194 }, { "acc": 0.77877145, "epoch": 0.7262864725353242, "grad_norm": 7.28125, "learning_rate": 7.361116528718688e-06, "loss": 0.79169784, "memory(GiB)": 135.49, "step": 31130, "train_speed(iter/s)": 0.202229 }, { "acc": 0.76074963, "epoch": 0.7265197801076131, "grad_norm": 6.6875, "learning_rate": 7.359451148080123e-06, "loss": 0.86167946, "memory(GiB)": 135.49, "step": 31140, "train_speed(iter/s)": 0.202263 }, { "acc": 0.75911198, "epoch": 0.726753087679902, "grad_norm": 6.40625, "learning_rate": 7.357785430630593e-06, "loss": 0.88270855, "memory(GiB)": 135.49, "step": 31150, "train_speed(iter/s)": 0.202296 }, { "acc": 0.78220115, "epoch": 0.7269863952521909, "grad_norm": 6.1875, "learning_rate": 7.356119376607877e-06, "loss": 0.78846188, "memory(GiB)": 135.49, "step": 31160, "train_speed(iter/s)": 0.202329 }, { "acc": 0.77354059, "epoch": 0.7272197028244798, "grad_norm": 6.6875, "learning_rate": 7.354452986249805e-06, "loss": 0.81691418, "memory(GiB)": 135.49, "step": 31170, "train_speed(iter/s)": 0.202366 }, { "acc": 0.76600223, "epoch": 0.7274530103967687, "grad_norm": 11.625, "learning_rate": 7.352786259794252e-06, "loss": 0.85651207, "memory(GiB)": 135.49, "step": 31180, "train_speed(iter/s)": 0.202401 }, { "acc": 0.78149328, "epoch": 0.7276863179690576, "grad_norm": 6.03125, "learning_rate": 7.351119197479144e-06, "loss": 0.79199538, "memory(GiB)": 135.49, "step": 31190, "train_speed(iter/s)": 0.202436 }, { "acc": 0.77081318, "epoch": 0.7279196255413465, "grad_norm": 5.0625, "learning_rate": 7.349451799542455e-06, "loss": 0.82262917, "memory(GiB)": 135.49, "step": 31200, "train_speed(iter/s)": 0.202472 }, { "acc": 0.77909479, "epoch": 0.7281529331136354, "grad_norm": 4.53125, "learning_rate": 7.3477840662222045e-06, "loss": 0.78861818, "memory(GiB)": 135.49, "step": 31210, "train_speed(iter/s)": 0.202507 }, { "acc": 0.77962995, "epoch": 0.7283862406859243, "grad_norm": 17.5, "learning_rate": 7.346115997756459e-06, "loss": 0.79499588, "memory(GiB)": 135.49, "step": 31220, "train_speed(iter/s)": 0.202543 }, { "acc": 0.76875, "epoch": 0.7286195482582132, "grad_norm": 4.3125, "learning_rate": 7.3444475943833375e-06, "loss": 0.83683939, "memory(GiB)": 135.49, "step": 31230, "train_speed(iter/s)": 0.202578 }, { "acc": 0.77388468, "epoch": 0.7288528558305021, "grad_norm": 6.03125, "learning_rate": 7.342778856341002e-06, "loss": 0.83534527, "memory(GiB)": 135.49, "step": 31240, "train_speed(iter/s)": 0.20261 }, { "acc": 0.75612683, "epoch": 0.729086163402791, "grad_norm": 5.34375, "learning_rate": 7.3411097838676645e-06, "loss": 0.88731098, "memory(GiB)": 135.49, "step": 31250, "train_speed(iter/s)": 0.202644 }, { "acc": 0.76928425, "epoch": 0.7293194709750799, "grad_norm": 7.625, "learning_rate": 7.339440377201588e-06, "loss": 0.83798056, "memory(GiB)": 135.49, "step": 31260, "train_speed(iter/s)": 0.202677 }, { "acc": 0.74724503, "epoch": 0.7295527785473688, "grad_norm": 7.09375, "learning_rate": 7.337770636581075e-06, "loss": 0.92171021, "memory(GiB)": 135.49, "step": 31270, "train_speed(iter/s)": 0.20271 }, { "acc": 0.76763582, "epoch": 0.7297860861196577, "grad_norm": 6.4375, "learning_rate": 7.3361005622444834e-06, "loss": 0.85548096, "memory(GiB)": 135.49, "step": 31280, "train_speed(iter/s)": 0.202745 }, { "acc": 0.76422043, "epoch": 0.7300193936919465, "grad_norm": 7.03125, "learning_rate": 7.334430154430217e-06, "loss": 0.85881577, "memory(GiB)": 135.49, "step": 31290, "train_speed(iter/s)": 0.20278 }, { "acc": 0.76604805, "epoch": 0.7302527012642354, "grad_norm": 9.25, "learning_rate": 7.332759413376721e-06, "loss": 0.83757381, "memory(GiB)": 135.49, "step": 31300, "train_speed(iter/s)": 0.202813 }, { "acc": 0.77889051, "epoch": 0.7304860088365243, "grad_norm": 4.75, "learning_rate": 7.331088339322499e-06, "loss": 0.82205553, "memory(GiB)": 135.49, "step": 31310, "train_speed(iter/s)": 0.202845 }, { "acc": 0.76058779, "epoch": 0.7307193164088132, "grad_norm": 4.4375, "learning_rate": 7.3294169325060925e-06, "loss": 0.86132526, "memory(GiB)": 135.49, "step": 31320, "train_speed(iter/s)": 0.20288 }, { "acc": 0.75779657, "epoch": 0.7309526239811021, "grad_norm": 6.03125, "learning_rate": 7.327745193166096e-06, "loss": 0.90359535, "memory(GiB)": 135.49, "step": 31330, "train_speed(iter/s)": 0.202913 }, { "acc": 0.75761247, "epoch": 0.731185931553391, "grad_norm": 7.75, "learning_rate": 7.3260731215411484e-06, "loss": 0.88894062, "memory(GiB)": 135.49, "step": 31340, "train_speed(iter/s)": 0.202946 }, { "acc": 0.768397, "epoch": 0.7314192391256799, "grad_norm": 5.09375, "learning_rate": 7.32440071786994e-06, "loss": 0.84543056, "memory(GiB)": 135.49, "step": 31350, "train_speed(iter/s)": 0.202981 }, { "acc": 0.75336809, "epoch": 0.7316525466979688, "grad_norm": 5.46875, "learning_rate": 7.322727982391203e-06, "loss": 0.88619633, "memory(GiB)": 135.49, "step": 31360, "train_speed(iter/s)": 0.203014 }, { "acc": 0.76008153, "epoch": 0.7318858542702577, "grad_norm": 5.5625, "learning_rate": 7.321054915343722e-06, "loss": 0.87012215, "memory(GiB)": 135.49, "step": 31370, "train_speed(iter/s)": 0.20305 }, { "acc": 0.758426, "epoch": 0.7321191618425466, "grad_norm": 6.1875, "learning_rate": 7.3193815169663266e-06, "loss": 0.86424522, "memory(GiB)": 135.49, "step": 31380, "train_speed(iter/s)": 0.203084 }, { "acc": 0.74638395, "epoch": 0.7323524694148354, "grad_norm": 7.3125, "learning_rate": 7.317707787497892e-06, "loss": 0.9223649, "memory(GiB)": 135.49, "step": 31390, "train_speed(iter/s)": 0.203119 }, { "acc": 0.79841776, "epoch": 0.7325857769871243, "grad_norm": 6.59375, "learning_rate": 7.316033727177345e-06, "loss": 0.73577204, "memory(GiB)": 135.49, "step": 31400, "train_speed(iter/s)": 0.203152 }, { "acc": 0.76186571, "epoch": 0.7328190845594132, "grad_norm": 5.375, "learning_rate": 7.314359336243656e-06, "loss": 0.86153393, "memory(GiB)": 135.49, "step": 31410, "train_speed(iter/s)": 0.203187 }, { "acc": 0.7647089, "epoch": 0.7330523921317021, "grad_norm": 5.875, "learning_rate": 7.312684614935846e-06, "loss": 0.83673191, "memory(GiB)": 135.49, "step": 31420, "train_speed(iter/s)": 0.203218 }, { "acc": 0.77275014, "epoch": 0.733285699703991, "grad_norm": 4.875, "learning_rate": 7.311009563492977e-06, "loss": 0.80034552, "memory(GiB)": 135.49, "step": 31430, "train_speed(iter/s)": 0.203252 }, { "acc": 0.78162556, "epoch": 0.7335190072762799, "grad_norm": 5.84375, "learning_rate": 7.309334182154164e-06, "loss": 0.80073299, "memory(GiB)": 135.49, "step": 31440, "train_speed(iter/s)": 0.203287 }, { "acc": 0.76760168, "epoch": 0.7337523148485688, "grad_norm": 6.21875, "learning_rate": 7.307658471158567e-06, "loss": 0.8577692, "memory(GiB)": 135.49, "step": 31450, "train_speed(iter/s)": 0.203322 }, { "acc": 0.78104572, "epoch": 0.7339856224208577, "grad_norm": 9.6875, "learning_rate": 7.305982430745395e-06, "loss": 0.78912458, "memory(GiB)": 135.49, "step": 31460, "train_speed(iter/s)": 0.203356 }, { "acc": 0.76157084, "epoch": 0.7342189299931466, "grad_norm": 11.4375, "learning_rate": 7.3043060611538995e-06, "loss": 0.87535791, "memory(GiB)": 135.49, "step": 31470, "train_speed(iter/s)": 0.20339 }, { "acc": 0.77313342, "epoch": 0.7344522375654354, "grad_norm": 7.46875, "learning_rate": 7.302629362623384e-06, "loss": 0.82764997, "memory(GiB)": 135.49, "step": 31480, "train_speed(iter/s)": 0.203422 }, { "acc": 0.77299681, "epoch": 0.7346855451377243, "grad_norm": 7.5, "learning_rate": 7.3009523353931966e-06, "loss": 0.82360725, "memory(GiB)": 135.49, "step": 31490, "train_speed(iter/s)": 0.203457 }, { "acc": 0.76441636, "epoch": 0.7349188527100132, "grad_norm": 4.40625, "learning_rate": 7.299274979702732e-06, "loss": 0.85436125, "memory(GiB)": 135.49, "step": 31500, "train_speed(iter/s)": 0.20349 }, { "epoch": 0.7349188527100132, "eval_acc": 0.734506726385048, "eval_loss": 0.8364928960800171, "eval_runtime": 1264.2234, "eval_samples_per_second": 28.469, "eval_steps_per_second": 14.235, "step": 31500 }, { "acc": 0.77805948, "epoch": 0.7351521602823021, "grad_norm": 5.1875, "learning_rate": 7.29759729579143e-06, "loss": 0.80856667, "memory(GiB)": 135.49, "step": 31510, "train_speed(iter/s)": 0.201845 }, { "acc": 0.77437329, "epoch": 0.735385467854591, "grad_norm": 7.09375, "learning_rate": 7.295919283898782e-06, "loss": 0.82702351, "memory(GiB)": 135.49, "step": 31520, "train_speed(iter/s)": 0.201879 }, { "acc": 0.75750599, "epoch": 0.7356187754268799, "grad_norm": 4.46875, "learning_rate": 7.294240944264323e-06, "loss": 0.88753185, "memory(GiB)": 135.49, "step": 31530, "train_speed(iter/s)": 0.201912 }, { "acc": 0.77879477, "epoch": 0.7358520829991688, "grad_norm": 5.0625, "learning_rate": 7.292562277127637e-06, "loss": 0.78502007, "memory(GiB)": 135.49, "step": 31540, "train_speed(iter/s)": 0.201946 }, { "acc": 0.79281082, "epoch": 0.7360853905714577, "grad_norm": 5.21875, "learning_rate": 7.290883282728352e-06, "loss": 0.75411735, "memory(GiB)": 135.49, "step": 31550, "train_speed(iter/s)": 0.201978 }, { "acc": 0.77476025, "epoch": 0.7363186981437466, "grad_norm": 5.46875, "learning_rate": 7.289203961306143e-06, "loss": 0.8284317, "memory(GiB)": 135.49, "step": 31560, "train_speed(iter/s)": 0.20201 }, { "acc": 0.79054985, "epoch": 0.7365520057160355, "grad_norm": 8.3125, "learning_rate": 7.287524313100735e-06, "loss": 0.76499662, "memory(GiB)": 135.49, "step": 31570, "train_speed(iter/s)": 0.202045 }, { "acc": 0.75227427, "epoch": 0.7367853132883244, "grad_norm": 6.28125, "learning_rate": 7.285844338351894e-06, "loss": 0.89360924, "memory(GiB)": 135.49, "step": 31580, "train_speed(iter/s)": 0.202081 }, { "acc": 0.74280505, "epoch": 0.7370186208606133, "grad_norm": 5.34375, "learning_rate": 7.284164037299438e-06, "loss": 0.95064583, "memory(GiB)": 135.49, "step": 31590, "train_speed(iter/s)": 0.202113 }, { "acc": 0.78897605, "epoch": 0.7372519284329022, "grad_norm": 10.375, "learning_rate": 7.28248341018323e-06, "loss": 0.76137075, "memory(GiB)": 135.49, "step": 31600, "train_speed(iter/s)": 0.202146 }, { "acc": 0.7773303, "epoch": 0.7374852360051911, "grad_norm": 6.875, "learning_rate": 7.280802457243178e-06, "loss": 0.80659828, "memory(GiB)": 135.49, "step": 31610, "train_speed(iter/s)": 0.20218 }, { "acc": 0.74896994, "epoch": 0.73771854357748, "grad_norm": 6.71875, "learning_rate": 7.2791211787192376e-06, "loss": 0.89730034, "memory(GiB)": 135.49, "step": 31620, "train_speed(iter/s)": 0.202214 }, { "acc": 0.76203547, "epoch": 0.7379518511497689, "grad_norm": 5.84375, "learning_rate": 7.27743957485141e-06, "loss": 0.84598064, "memory(GiB)": 135.49, "step": 31630, "train_speed(iter/s)": 0.20225 }, { "acc": 0.76968899, "epoch": 0.7381851587220578, "grad_norm": 12.6875, "learning_rate": 7.2757576458797465e-06, "loss": 0.86082039, "memory(GiB)": 135.49, "step": 31640, "train_speed(iter/s)": 0.202283 }, { "acc": 0.78154497, "epoch": 0.7384184662943467, "grad_norm": 7.8125, "learning_rate": 7.27407539204434e-06, "loss": 0.78410368, "memory(GiB)": 135.49, "step": 31650, "train_speed(iter/s)": 0.202318 }, { "acc": 0.76255383, "epoch": 0.7386517738666356, "grad_norm": 8.875, "learning_rate": 7.272392813585332e-06, "loss": 0.86778812, "memory(GiB)": 135.49, "step": 31660, "train_speed(iter/s)": 0.202354 }, { "acc": 0.7992444, "epoch": 0.7388850814389245, "grad_norm": 3.84375, "learning_rate": 7.270709910742908e-06, "loss": 0.73734722, "memory(GiB)": 135.49, "step": 31670, "train_speed(iter/s)": 0.202389 }, { "acc": 0.76355066, "epoch": 0.7391183890112133, "grad_norm": 5.375, "learning_rate": 7.269026683757306e-06, "loss": 0.8527586, "memory(GiB)": 135.49, "step": 31680, "train_speed(iter/s)": 0.202423 }, { "acc": 0.7626164, "epoch": 0.7393516965835022, "grad_norm": 5.5, "learning_rate": 7.267343132868803e-06, "loss": 0.84941816, "memory(GiB)": 135.49, "step": 31690, "train_speed(iter/s)": 0.202457 }, { "acc": 0.77960076, "epoch": 0.7395850041557911, "grad_norm": 19.375, "learning_rate": 7.265659258317725e-06, "loss": 0.79699068, "memory(GiB)": 135.49, "step": 31700, "train_speed(iter/s)": 0.202492 }, { "acc": 0.77176495, "epoch": 0.73981831172808, "grad_norm": 4.65625, "learning_rate": 7.263975060344449e-06, "loss": 0.84431076, "memory(GiB)": 135.49, "step": 31710, "train_speed(iter/s)": 0.202525 }, { "acc": 0.77316489, "epoch": 0.7400516193003689, "grad_norm": 6.15625, "learning_rate": 7.26229053918939e-06, "loss": 0.83196583, "memory(GiB)": 135.49, "step": 31720, "train_speed(iter/s)": 0.202556 }, { "acc": 0.79501934, "epoch": 0.7402849268726578, "grad_norm": 5.125, "learning_rate": 7.260605695093014e-06, "loss": 0.71632643, "memory(GiB)": 135.49, "step": 31730, "train_speed(iter/s)": 0.20259 }, { "acc": 0.77350636, "epoch": 0.7405182344449467, "grad_norm": 5.5625, "learning_rate": 7.25892052829583e-06, "loss": 0.83045254, "memory(GiB)": 135.49, "step": 31740, "train_speed(iter/s)": 0.202623 }, { "acc": 0.76739392, "epoch": 0.7407515420172356, "grad_norm": 6.1875, "learning_rate": 7.257235039038397e-06, "loss": 0.84298439, "memory(GiB)": 135.49, "step": 31750, "train_speed(iter/s)": 0.202656 }, { "acc": 0.77621698, "epoch": 0.7409848495895245, "grad_norm": 8.0, "learning_rate": 7.25554922756132e-06, "loss": 0.81136847, "memory(GiB)": 135.49, "step": 31760, "train_speed(iter/s)": 0.202689 }, { "acc": 0.76866169, "epoch": 0.7412181571618134, "grad_norm": 5.34375, "learning_rate": 7.253863094105243e-06, "loss": 0.83101158, "memory(GiB)": 135.49, "step": 31770, "train_speed(iter/s)": 0.202721 }, { "acc": 0.77323771, "epoch": 0.7414514647341023, "grad_norm": 6.71875, "learning_rate": 7.252176638910867e-06, "loss": 0.81243591, "memory(GiB)": 135.49, "step": 31780, "train_speed(iter/s)": 0.202754 }, { "acc": 0.77432089, "epoch": 0.7416847723063912, "grad_norm": 6.75, "learning_rate": 7.25048986221893e-06, "loss": 0.81716862, "memory(GiB)": 135.49, "step": 31790, "train_speed(iter/s)": 0.202784 }, { "acc": 0.77698965, "epoch": 0.7419180798786801, "grad_norm": 7.0, "learning_rate": 7.248802764270217e-06, "loss": 0.7902503, "memory(GiB)": 135.49, "step": 31800, "train_speed(iter/s)": 0.202818 }, { "acc": 0.76149273, "epoch": 0.742151387450969, "grad_norm": 5.40625, "learning_rate": 7.247115345305564e-06, "loss": 0.8705555, "memory(GiB)": 135.49, "step": 31810, "train_speed(iter/s)": 0.202851 }, { "acc": 0.77488122, "epoch": 0.7423846950232579, "grad_norm": 6.21875, "learning_rate": 7.245427605565847e-06, "loss": 0.81562338, "memory(GiB)": 135.49, "step": 31820, "train_speed(iter/s)": 0.202885 }, { "acc": 0.76717925, "epoch": 0.7426180025955468, "grad_norm": 5.6875, "learning_rate": 7.243739545291994e-06, "loss": 0.84104557, "memory(GiB)": 135.49, "step": 31830, "train_speed(iter/s)": 0.202917 }, { "acc": 0.7742878, "epoch": 0.7428513101678357, "grad_norm": 5.375, "learning_rate": 7.24205116472497e-06, "loss": 0.81599722, "memory(GiB)": 135.49, "step": 31840, "train_speed(iter/s)": 0.202948 }, { "acc": 0.75339813, "epoch": 0.7430846177401246, "grad_norm": 7.25, "learning_rate": 7.240362464105795e-06, "loss": 0.88846521, "memory(GiB)": 135.49, "step": 31850, "train_speed(iter/s)": 0.20298 }, { "acc": 0.77686691, "epoch": 0.7433179253124135, "grad_norm": 6.46875, "learning_rate": 7.238673443675529e-06, "loss": 0.81155262, "memory(GiB)": 135.49, "step": 31860, "train_speed(iter/s)": 0.203014 }, { "acc": 0.78568268, "epoch": 0.7435512328847023, "grad_norm": 10.9375, "learning_rate": 7.236984103675278e-06, "loss": 0.75036492, "memory(GiB)": 135.49, "step": 31870, "train_speed(iter/s)": 0.203049 }, { "acc": 0.7447669, "epoch": 0.7437845404569912, "grad_norm": 5.3125, "learning_rate": 7.235294444346197e-06, "loss": 0.96801872, "memory(GiB)": 135.49, "step": 31880, "train_speed(iter/s)": 0.203081 }, { "acc": 0.75899253, "epoch": 0.7440178480292801, "grad_norm": 5.5, "learning_rate": 7.233604465929485e-06, "loss": 0.88703766, "memory(GiB)": 135.49, "step": 31890, "train_speed(iter/s)": 0.203114 }, { "acc": 0.77914457, "epoch": 0.744251155601569, "grad_norm": 6.03125, "learning_rate": 7.231914168666382e-06, "loss": 0.78175688, "memory(GiB)": 135.49, "step": 31900, "train_speed(iter/s)": 0.203147 }, { "acc": 0.78172417, "epoch": 0.7444844631738579, "grad_norm": 8.125, "learning_rate": 7.23022355279818e-06, "loss": 0.80395298, "memory(GiB)": 135.49, "step": 31910, "train_speed(iter/s)": 0.203181 }, { "acc": 0.74499464, "epoch": 0.7447177707461468, "grad_norm": 5.625, "learning_rate": 7.228532618566214e-06, "loss": 0.94640598, "memory(GiB)": 135.49, "step": 31920, "train_speed(iter/s)": 0.203213 }, { "acc": 0.78399668, "epoch": 0.7449510783184357, "grad_norm": 6.625, "learning_rate": 7.226841366211865e-06, "loss": 0.78285131, "memory(GiB)": 135.49, "step": 31930, "train_speed(iter/s)": 0.203247 }, { "acc": 0.76983337, "epoch": 0.7451843858907246, "grad_norm": 4.75, "learning_rate": 7.225149795976558e-06, "loss": 0.82338161, "memory(GiB)": 135.49, "step": 31940, "train_speed(iter/s)": 0.203279 }, { "acc": 0.78686285, "epoch": 0.7454176934630135, "grad_norm": 7.53125, "learning_rate": 7.223457908101763e-06, "loss": 0.76002026, "memory(GiB)": 135.49, "step": 31950, "train_speed(iter/s)": 0.203313 }, { "acc": 0.76330009, "epoch": 0.7456510010353024, "grad_norm": 5.4375, "learning_rate": 7.2217657028289974e-06, "loss": 0.87095013, "memory(GiB)": 135.49, "step": 31960, "train_speed(iter/s)": 0.203346 }, { "acc": 0.75749407, "epoch": 0.7458843086075913, "grad_norm": 6.09375, "learning_rate": 7.220073180399824e-06, "loss": 0.86348934, "memory(GiB)": 135.49, "step": 31970, "train_speed(iter/s)": 0.20338 }, { "acc": 0.76445417, "epoch": 0.7461176161798801, "grad_norm": 5.15625, "learning_rate": 7.218380341055848e-06, "loss": 0.84856453, "memory(GiB)": 135.49, "step": 31980, "train_speed(iter/s)": 0.203411 }, { "acc": 0.75914474, "epoch": 0.746350923752169, "grad_norm": 6.1875, "learning_rate": 7.216687185038724e-06, "loss": 0.87879314, "memory(GiB)": 135.49, "step": 31990, "train_speed(iter/s)": 0.203442 }, { "acc": 0.75205173, "epoch": 0.746584231324458, "grad_norm": 5.375, "learning_rate": 7.214993712590148e-06, "loss": 0.93073988, "memory(GiB)": 135.49, "step": 32000, "train_speed(iter/s)": 0.203476 }, { "epoch": 0.746584231324458, "eval_acc": 0.734645481269704, "eval_loss": 0.836223840713501, "eval_runtime": 1263.2887, "eval_samples_per_second": 28.49, "eval_steps_per_second": 14.245, "step": 32000 }, { "acc": 0.79165649, "epoch": 0.7468175388967468, "grad_norm": 5.0, "learning_rate": 7.213299923951863e-06, "loss": 0.80209103, "memory(GiB)": 135.49, "step": 32010, "train_speed(iter/s)": 0.201854 }, { "acc": 0.78245211, "epoch": 0.7470508464690357, "grad_norm": 5.46875, "learning_rate": 7.211605819365657e-06, "loss": 0.78493905, "memory(GiB)": 135.49, "step": 32020, "train_speed(iter/s)": 0.201888 }, { "acc": 0.76564789, "epoch": 0.7472841540413246, "grad_norm": 7.84375, "learning_rate": 7.209911399073361e-06, "loss": 0.85409994, "memory(GiB)": 135.49, "step": 32030, "train_speed(iter/s)": 0.201922 }, { "acc": 0.78350439, "epoch": 0.7475174616136135, "grad_norm": 7.84375, "learning_rate": 7.208216663316856e-06, "loss": 0.79119539, "memory(GiB)": 135.49, "step": 32040, "train_speed(iter/s)": 0.201957 }, { "acc": 0.78136501, "epoch": 0.7477507691859024, "grad_norm": 5.78125, "learning_rate": 7.206521612338064e-06, "loss": 0.79930515, "memory(GiB)": 135.49, "step": 32050, "train_speed(iter/s)": 0.201991 }, { "acc": 0.75664053, "epoch": 0.7479840767581912, "grad_norm": 7.59375, "learning_rate": 7.204826246378953e-06, "loss": 0.89448662, "memory(GiB)": 135.49, "step": 32060, "train_speed(iter/s)": 0.202023 }, { "acc": 0.76635666, "epoch": 0.7482173843304801, "grad_norm": 5.375, "learning_rate": 7.203130565681537e-06, "loss": 0.83034134, "memory(GiB)": 135.49, "step": 32070, "train_speed(iter/s)": 0.202054 }, { "acc": 0.76880155, "epoch": 0.748450691902769, "grad_norm": 5.875, "learning_rate": 7.201434570487871e-06, "loss": 0.83341951, "memory(GiB)": 135.49, "step": 32080, "train_speed(iter/s)": 0.202085 }, { "acc": 0.76148286, "epoch": 0.7486839994750579, "grad_norm": 6.9375, "learning_rate": 7.199738261040059e-06, "loss": 0.86638126, "memory(GiB)": 135.49, "step": 32090, "train_speed(iter/s)": 0.202119 }, { "acc": 0.78866158, "epoch": 0.7489173070473468, "grad_norm": 7.21875, "learning_rate": 7.1980416375802494e-06, "loss": 0.75557537, "memory(GiB)": 135.49, "step": 32100, "train_speed(iter/s)": 0.202153 }, { "acc": 0.76209784, "epoch": 0.7491506146196357, "grad_norm": 5.84375, "learning_rate": 7.196344700350635e-06, "loss": 0.87183704, "memory(GiB)": 135.49, "step": 32110, "train_speed(iter/s)": 0.202187 }, { "acc": 0.77241945, "epoch": 0.7493839221919246, "grad_norm": 5.59375, "learning_rate": 7.1946474495934535e-06, "loss": 0.80548906, "memory(GiB)": 135.49, "step": 32120, "train_speed(iter/s)": 0.202221 }, { "acc": 0.77580891, "epoch": 0.7496172297642135, "grad_norm": 5.125, "learning_rate": 7.192949885550986e-06, "loss": 0.8128335, "memory(GiB)": 135.49, "step": 32130, "train_speed(iter/s)": 0.202253 }, { "acc": 0.77738519, "epoch": 0.7498505373365024, "grad_norm": 5.4375, "learning_rate": 7.1912520084655594e-06, "loss": 0.8020052, "memory(GiB)": 135.49, "step": 32140, "train_speed(iter/s)": 0.202285 }, { "acc": 0.76242504, "epoch": 0.7500838449087913, "grad_norm": 4.8125, "learning_rate": 7.189553818579545e-06, "loss": 0.86575413, "memory(GiB)": 135.49, "step": 32150, "train_speed(iter/s)": 0.202318 }, { "acc": 0.77510843, "epoch": 0.7503171524810802, "grad_norm": 5.78125, "learning_rate": 7.187855316135358e-06, "loss": 0.80913887, "memory(GiB)": 135.49, "step": 32160, "train_speed(iter/s)": 0.20235 }, { "acc": 0.77517185, "epoch": 0.7505504600533691, "grad_norm": 5.84375, "learning_rate": 7.1861565013754605e-06, "loss": 0.79983697, "memory(GiB)": 135.49, "step": 32170, "train_speed(iter/s)": 0.202379 }, { "acc": 0.75861721, "epoch": 0.750783767625658, "grad_norm": 6.5625, "learning_rate": 7.18445737454236e-06, "loss": 0.87669563, "memory(GiB)": 135.49, "step": 32180, "train_speed(iter/s)": 0.202408 }, { "acc": 0.77318506, "epoch": 0.7510170751979469, "grad_norm": 7.40625, "learning_rate": 7.182757935878601e-06, "loss": 0.82265835, "memory(GiB)": 135.49, "step": 32190, "train_speed(iter/s)": 0.202443 }, { "acc": 0.76404715, "epoch": 0.7512503827702358, "grad_norm": 5.875, "learning_rate": 7.1810581856267815e-06, "loss": 0.87855015, "memory(GiB)": 135.49, "step": 32200, "train_speed(iter/s)": 0.202477 }, { "acc": 0.76944151, "epoch": 0.7514836903425247, "grad_norm": 5.375, "learning_rate": 7.17935812402954e-06, "loss": 0.81658773, "memory(GiB)": 135.49, "step": 32210, "train_speed(iter/s)": 0.202508 }, { "acc": 0.77259636, "epoch": 0.7517169979148136, "grad_norm": 4.65625, "learning_rate": 7.177657751329559e-06, "loss": 0.82084694, "memory(GiB)": 135.49, "step": 32220, "train_speed(iter/s)": 0.202538 }, { "acc": 0.76153712, "epoch": 0.7519503054871025, "grad_norm": 4.5625, "learning_rate": 7.1759570677695665e-06, "loss": 0.86768589, "memory(GiB)": 135.49, "step": 32230, "train_speed(iter/s)": 0.20257 }, { "acc": 0.75578389, "epoch": 0.7521836130593914, "grad_norm": 7.0, "learning_rate": 7.174256073592335e-06, "loss": 0.89248486, "memory(GiB)": 135.49, "step": 32240, "train_speed(iter/s)": 0.202605 }, { "acc": 0.78436146, "epoch": 0.7524169206316802, "grad_norm": 6.65625, "learning_rate": 7.172554769040681e-06, "loss": 0.80632496, "memory(GiB)": 135.49, "step": 32250, "train_speed(iter/s)": 0.202636 }, { "acc": 0.77383881, "epoch": 0.7526502282039691, "grad_norm": 5.875, "learning_rate": 7.1708531543574635e-06, "loss": 0.81358995, "memory(GiB)": 135.49, "step": 32260, "train_speed(iter/s)": 0.20267 }, { "acc": 0.77045765, "epoch": 0.752883535776258, "grad_norm": 6.34375, "learning_rate": 7.169151229785589e-06, "loss": 0.82464972, "memory(GiB)": 135.49, "step": 32270, "train_speed(iter/s)": 0.202705 }, { "acc": 0.78771482, "epoch": 0.7531168433485469, "grad_norm": 5.375, "learning_rate": 7.167448995568009e-06, "loss": 0.78627973, "memory(GiB)": 135.49, "step": 32280, "train_speed(iter/s)": 0.202736 }, { "acc": 0.77036452, "epoch": 0.7533501509208358, "grad_norm": 13.6875, "learning_rate": 7.165746451947713e-06, "loss": 0.83770742, "memory(GiB)": 135.49, "step": 32290, "train_speed(iter/s)": 0.202768 }, { "acc": 0.76245556, "epoch": 0.7535834584931247, "grad_norm": 5.53125, "learning_rate": 7.16404359916774e-06, "loss": 0.87206964, "memory(GiB)": 135.49, "step": 32300, "train_speed(iter/s)": 0.2028 }, { "acc": 0.75286222, "epoch": 0.7538167660654136, "grad_norm": 11.25, "learning_rate": 7.1623404374711715e-06, "loss": 0.92211695, "memory(GiB)": 135.49, "step": 32310, "train_speed(iter/s)": 0.202831 }, { "acc": 0.78123322, "epoch": 0.7540500736377025, "grad_norm": 5.3125, "learning_rate": 7.160636967101134e-06, "loss": 0.78774786, "memory(GiB)": 135.49, "step": 32320, "train_speed(iter/s)": 0.202865 }, { "acc": 0.78738894, "epoch": 0.7542833812099914, "grad_norm": 5.5, "learning_rate": 7.1589331883007965e-06, "loss": 0.75735531, "memory(GiB)": 135.49, "step": 32330, "train_speed(iter/s)": 0.202897 }, { "acc": 0.77453146, "epoch": 0.7545166887822803, "grad_norm": 6.53125, "learning_rate": 7.1572291013133745e-06, "loss": 0.80623207, "memory(GiB)": 135.49, "step": 32340, "train_speed(iter/s)": 0.202929 }, { "acc": 0.77494965, "epoch": 0.7547499963545692, "grad_norm": 5.3125, "learning_rate": 7.155524706382125e-06, "loss": 0.82651587, "memory(GiB)": 135.49, "step": 32350, "train_speed(iter/s)": 0.202962 }, { "acc": 0.76170378, "epoch": 0.7549833039268581, "grad_norm": 8.9375, "learning_rate": 7.15382000375035e-06, "loss": 0.86670132, "memory(GiB)": 135.49, "step": 32360, "train_speed(iter/s)": 0.202994 }, { "acc": 0.78115668, "epoch": 0.755216611499147, "grad_norm": 6.625, "learning_rate": 7.152114993661394e-06, "loss": 0.78536148, "memory(GiB)": 135.49, "step": 32370, "train_speed(iter/s)": 0.203027 }, { "acc": 0.76632404, "epoch": 0.7554499190714359, "grad_norm": 5.21875, "learning_rate": 7.150409676358649e-06, "loss": 0.8417572, "memory(GiB)": 135.49, "step": 32380, "train_speed(iter/s)": 0.203058 }, { "acc": 0.75803194, "epoch": 0.7556832266437248, "grad_norm": 5.25, "learning_rate": 7.148704052085547e-06, "loss": 0.86962547, "memory(GiB)": 135.49, "step": 32390, "train_speed(iter/s)": 0.203091 }, { "acc": 0.77272434, "epoch": 0.7559165342160137, "grad_norm": 10.375, "learning_rate": 7.146998121085566e-06, "loss": 0.83961744, "memory(GiB)": 135.49, "step": 32400, "train_speed(iter/s)": 0.203123 }, { "acc": 0.7810946, "epoch": 0.7561498417883026, "grad_norm": 5.59375, "learning_rate": 7.145291883602226e-06, "loss": 0.78020215, "memory(GiB)": 135.49, "step": 32410, "train_speed(iter/s)": 0.203155 }, { "acc": 0.76094933, "epoch": 0.7563831493605915, "grad_norm": 10.25, "learning_rate": 7.143585339879093e-06, "loss": 0.88001804, "memory(GiB)": 135.49, "step": 32420, "train_speed(iter/s)": 0.203188 }, { "acc": 0.77089195, "epoch": 0.7566164569328804, "grad_norm": 6.78125, "learning_rate": 7.141878490159777e-06, "loss": 0.84274883, "memory(GiB)": 135.49, "step": 32430, "train_speed(iter/s)": 0.203222 }, { "acc": 0.7868844, "epoch": 0.7568497645051693, "grad_norm": 5.0625, "learning_rate": 7.140171334687927e-06, "loss": 0.77596674, "memory(GiB)": 135.49, "step": 32440, "train_speed(iter/s)": 0.203256 }, { "acc": 0.75886841, "epoch": 0.7570830720774581, "grad_norm": 9.9375, "learning_rate": 7.138463873707242e-06, "loss": 0.87586117, "memory(GiB)": 135.49, "step": 32450, "train_speed(iter/s)": 0.20329 }, { "acc": 0.75892735, "epoch": 0.757316379649747, "grad_norm": 5.0625, "learning_rate": 7.13675610746146e-06, "loss": 0.87046604, "memory(GiB)": 135.49, "step": 32460, "train_speed(iter/s)": 0.203323 }, { "acc": 0.77215233, "epoch": 0.7575496872220359, "grad_norm": 6.28125, "learning_rate": 7.135048036194364e-06, "loss": 0.82401791, "memory(GiB)": 135.49, "step": 32470, "train_speed(iter/s)": 0.203356 }, { "acc": 0.75882378, "epoch": 0.7577829947943248, "grad_norm": 6.53125, "learning_rate": 7.13333966014978e-06, "loss": 0.87342663, "memory(GiB)": 135.49, "step": 32480, "train_speed(iter/s)": 0.203389 }, { "acc": 0.77842774, "epoch": 0.7580163023666137, "grad_norm": 7.75, "learning_rate": 7.131630979571581e-06, "loss": 0.79026127, "memory(GiB)": 135.49, "step": 32490, "train_speed(iter/s)": 0.203418 }, { "acc": 0.75150566, "epoch": 0.7582496099389026, "grad_norm": 5.4375, "learning_rate": 7.1299219947036795e-06, "loss": 0.91148853, "memory(GiB)": 135.49, "step": 32500, "train_speed(iter/s)": 0.203452 }, { "epoch": 0.7582496099389026, "eval_acc": 0.734701789938012, "eval_loss": 0.8361714482307434, "eval_runtime": 1262.9447, "eval_samples_per_second": 28.498, "eval_steps_per_second": 14.249, "step": 32500 }, { "acc": 0.74299264, "epoch": 0.7584829175111915, "grad_norm": 6.125, "learning_rate": 7.12821270579003e-06, "loss": 0.93753853, "memory(GiB)": 135.49, "step": 32510, "train_speed(iter/s)": 0.201856 }, { "acc": 0.7777771, "epoch": 0.7587162250834804, "grad_norm": 5.15625, "learning_rate": 7.126503113074636e-06, "loss": 0.79509411, "memory(GiB)": 135.49, "step": 32520, "train_speed(iter/s)": 0.201889 }, { "acc": 0.73758698, "epoch": 0.7589495326557693, "grad_norm": 5.96875, "learning_rate": 7.1247932168015396e-06, "loss": 0.96006432, "memory(GiB)": 135.49, "step": 32530, "train_speed(iter/s)": 0.201921 }, { "acc": 0.76981125, "epoch": 0.7591828402280582, "grad_norm": 8.0, "learning_rate": 7.123083017214829e-06, "loss": 0.83289547, "memory(GiB)": 135.49, "step": 32540, "train_speed(iter/s)": 0.201952 }, { "acc": 0.7519002, "epoch": 0.759416147800347, "grad_norm": 6.28125, "learning_rate": 7.121372514558635e-06, "loss": 0.92292547, "memory(GiB)": 135.49, "step": 32550, "train_speed(iter/s)": 0.201984 }, { "acc": 0.75627518, "epoch": 0.759649455372636, "grad_norm": 5.59375, "learning_rate": 7.1196617090771305e-06, "loss": 0.88578911, "memory(GiB)": 135.49, "step": 32560, "train_speed(iter/s)": 0.202017 }, { "acc": 0.7867939, "epoch": 0.7598827629449248, "grad_norm": 5.4375, "learning_rate": 7.1179506010145335e-06, "loss": 0.76889153, "memory(GiB)": 135.49, "step": 32570, "train_speed(iter/s)": 0.202051 }, { "acc": 0.75968094, "epoch": 0.7601160705172137, "grad_norm": 5.25, "learning_rate": 7.116239190615104e-06, "loss": 0.8683053, "memory(GiB)": 135.49, "step": 32580, "train_speed(iter/s)": 0.202084 }, { "acc": 0.75127802, "epoch": 0.7603493780895026, "grad_norm": 5.875, "learning_rate": 7.1145274781231435e-06, "loss": 0.91891174, "memory(GiB)": 135.49, "step": 32590, "train_speed(iter/s)": 0.202118 }, { "acc": 0.76619968, "epoch": 0.7605826856617915, "grad_norm": 5.46875, "learning_rate": 7.112815463782998e-06, "loss": 0.83956432, "memory(GiB)": 135.49, "step": 32600, "train_speed(iter/s)": 0.202152 }, { "acc": 0.77386999, "epoch": 0.7608159932340804, "grad_norm": 5.46875, "learning_rate": 7.111103147839062e-06, "loss": 0.81086197, "memory(GiB)": 135.49, "step": 32610, "train_speed(iter/s)": 0.202183 }, { "acc": 0.77574854, "epoch": 0.7610493008063693, "grad_norm": 5.375, "learning_rate": 7.109390530535762e-06, "loss": 0.80830059, "memory(GiB)": 135.49, "step": 32620, "train_speed(iter/s)": 0.202212 }, { "acc": 0.75750504, "epoch": 0.7612826083786582, "grad_norm": 7.09375, "learning_rate": 7.1076776121175794e-06, "loss": 0.87972698, "memory(GiB)": 135.49, "step": 32630, "train_speed(iter/s)": 0.202244 }, { "acc": 0.77956057, "epoch": 0.761515915950947, "grad_norm": 6.84375, "learning_rate": 7.105964392829029e-06, "loss": 0.78120928, "memory(GiB)": 135.49, "step": 32640, "train_speed(iter/s)": 0.202276 }, { "acc": 0.76642613, "epoch": 0.7617492235232359, "grad_norm": 5.75, "learning_rate": 7.104250872914673e-06, "loss": 0.85356979, "memory(GiB)": 135.49, "step": 32650, "train_speed(iter/s)": 0.202308 }, { "acc": 0.77473478, "epoch": 0.7619825310955248, "grad_norm": 22.375, "learning_rate": 7.102537052619116e-06, "loss": 0.79376745, "memory(GiB)": 135.49, "step": 32660, "train_speed(iter/s)": 0.20234 }, { "acc": 0.77661953, "epoch": 0.7622158386678137, "grad_norm": 6.625, "learning_rate": 7.100822932187006e-06, "loss": 0.79977713, "memory(GiB)": 135.49, "step": 32670, "train_speed(iter/s)": 0.202371 }, { "acc": 0.78884134, "epoch": 0.7624491462401026, "grad_norm": 5.3125, "learning_rate": 7.099108511863032e-06, "loss": 0.74057941, "memory(GiB)": 135.49, "step": 32680, "train_speed(iter/s)": 0.202405 }, { "acc": 0.7666934, "epoch": 0.7626824538123915, "grad_norm": 6.59375, "learning_rate": 7.097393791891929e-06, "loss": 0.83986902, "memory(GiB)": 135.49, "step": 32690, "train_speed(iter/s)": 0.20244 }, { "acc": 0.75571222, "epoch": 0.7629157613846804, "grad_norm": 8.5, "learning_rate": 7.095678772518471e-06, "loss": 0.87510262, "memory(GiB)": 135.49, "step": 32700, "train_speed(iter/s)": 0.202473 }, { "acc": 0.78188076, "epoch": 0.7631490689569693, "grad_norm": 7.75, "learning_rate": 7.093963453987476e-06, "loss": 0.78680334, "memory(GiB)": 135.49, "step": 32710, "train_speed(iter/s)": 0.202507 }, { "acc": 0.7536047, "epoch": 0.7633823765292582, "grad_norm": 8.0625, "learning_rate": 7.092247836543808e-06, "loss": 0.9105608, "memory(GiB)": 135.49, "step": 32720, "train_speed(iter/s)": 0.202538 }, { "acc": 0.77342873, "epoch": 0.7636156841015471, "grad_norm": 5.5, "learning_rate": 7.090531920432368e-06, "loss": 0.79517751, "memory(GiB)": 135.49, "step": 32730, "train_speed(iter/s)": 0.202572 }, { "acc": 0.77186055, "epoch": 0.763848991673836, "grad_norm": 8.625, "learning_rate": 7.088815705898103e-06, "loss": 0.8257617, "memory(GiB)": 135.49, "step": 32740, "train_speed(iter/s)": 0.202606 }, { "acc": 0.75718126, "epoch": 0.7640822992461249, "grad_norm": 5.25, "learning_rate": 7.0870991931860044e-06, "loss": 0.87925882, "memory(GiB)": 135.49, "step": 32750, "train_speed(iter/s)": 0.202637 }, { "acc": 0.78335104, "epoch": 0.7643156068184138, "grad_norm": 5.4375, "learning_rate": 7.0853823825411005e-06, "loss": 0.77106586, "memory(GiB)": 135.49, "step": 32760, "train_speed(iter/s)": 0.202667 }, { "acc": 0.78599453, "epoch": 0.7645489143907027, "grad_norm": 8.1875, "learning_rate": 7.083665274208469e-06, "loss": 0.76125383, "memory(GiB)": 135.49, "step": 32770, "train_speed(iter/s)": 0.202698 }, { "acc": 0.76618547, "epoch": 0.7647822219629916, "grad_norm": 6.96875, "learning_rate": 7.081947868433223e-06, "loss": 0.8239439, "memory(GiB)": 135.49, "step": 32780, "train_speed(iter/s)": 0.202732 }, { "acc": 0.75682535, "epoch": 0.7650155295352805, "grad_norm": 6.125, "learning_rate": 7.0802301654605255e-06, "loss": 0.89369793, "memory(GiB)": 135.49, "step": 32790, "train_speed(iter/s)": 0.202766 }, { "acc": 0.76944485, "epoch": 0.7652488371075694, "grad_norm": 6.375, "learning_rate": 7.078512165535576e-06, "loss": 0.83599787, "memory(GiB)": 135.49, "step": 32800, "train_speed(iter/s)": 0.202798 }, { "acc": 0.76855049, "epoch": 0.7654821446798583, "grad_norm": 7.375, "learning_rate": 7.076793868903617e-06, "loss": 0.83850698, "memory(GiB)": 135.49, "step": 32810, "train_speed(iter/s)": 0.20283 }, { "acc": 0.7679275, "epoch": 0.7657154522521472, "grad_norm": 7.9375, "learning_rate": 7.0750752758099384e-06, "loss": 0.83839092, "memory(GiB)": 135.49, "step": 32820, "train_speed(iter/s)": 0.202864 }, { "acc": 0.78628769, "epoch": 0.765948759824436, "grad_norm": 6.5625, "learning_rate": 7.073356386499865e-06, "loss": 0.77455244, "memory(GiB)": 135.49, "step": 32830, "train_speed(iter/s)": 0.202895 }, { "acc": 0.75966301, "epoch": 0.7661820673967249, "grad_norm": 6.625, "learning_rate": 7.071637201218772e-06, "loss": 0.88577118, "memory(GiB)": 135.49, "step": 32840, "train_speed(iter/s)": 0.202929 }, { "acc": 0.78861551, "epoch": 0.7664153749690138, "grad_norm": 5.78125, "learning_rate": 7.06991772021207e-06, "loss": 0.7652648, "memory(GiB)": 135.49, "step": 32850, "train_speed(iter/s)": 0.202962 }, { "acc": 0.77377348, "epoch": 0.7666486825413027, "grad_norm": 10.0625, "learning_rate": 7.068197943725214e-06, "loss": 0.82929983, "memory(GiB)": 135.49, "step": 32860, "train_speed(iter/s)": 0.202994 }, { "acc": 0.73943357, "epoch": 0.7668819901135916, "grad_norm": 10.875, "learning_rate": 7.0664778720037034e-06, "loss": 0.92212334, "memory(GiB)": 135.49, "step": 32870, "train_speed(iter/s)": 0.203029 }, { "acc": 0.77679515, "epoch": 0.7671152976858805, "grad_norm": 7.71875, "learning_rate": 7.064757505293075e-06, "loss": 0.80675335, "memory(GiB)": 135.49, "step": 32880, "train_speed(iter/s)": 0.203062 }, { "acc": 0.74726348, "epoch": 0.7673486052581694, "grad_norm": 8.0, "learning_rate": 7.063036843838913e-06, "loss": 0.90934544, "memory(GiB)": 135.49, "step": 32890, "train_speed(iter/s)": 0.203095 }, { "acc": 0.76249967, "epoch": 0.7675819128304583, "grad_norm": 6.46875, "learning_rate": 7.061315887886841e-06, "loss": 0.85747623, "memory(GiB)": 135.49, "step": 32900, "train_speed(iter/s)": 0.203127 }, { "acc": 0.77897086, "epoch": 0.7678152204027472, "grad_norm": 5.6875, "learning_rate": 7.059594637682526e-06, "loss": 0.79783416, "memory(GiB)": 135.49, "step": 32910, "train_speed(iter/s)": 0.203157 }, { "acc": 0.77268848, "epoch": 0.7680485279750361, "grad_norm": 5.375, "learning_rate": 7.057873093471673e-06, "loss": 0.82134495, "memory(GiB)": 135.49, "step": 32920, "train_speed(iter/s)": 0.203187 }, { "acc": 0.77287827, "epoch": 0.768281835547325, "grad_norm": 5.34375, "learning_rate": 7.056151255500036e-06, "loss": 0.80308228, "memory(GiB)": 135.49, "step": 32930, "train_speed(iter/s)": 0.20322 }, { "acc": 0.76616335, "epoch": 0.7685151431196139, "grad_norm": 5.625, "learning_rate": 7.0544291240134025e-06, "loss": 0.83275585, "memory(GiB)": 135.49, "step": 32940, "train_speed(iter/s)": 0.20325 }, { "acc": 0.7766037, "epoch": 0.7687484506919028, "grad_norm": 5.75, "learning_rate": 7.052706699257609e-06, "loss": 0.81560326, "memory(GiB)": 135.49, "step": 32950, "train_speed(iter/s)": 0.20328 }, { "acc": 0.76568203, "epoch": 0.7689817582641917, "grad_norm": 9.125, "learning_rate": 7.05098398147853e-06, "loss": 0.84754105, "memory(GiB)": 135.49, "step": 32960, "train_speed(iter/s)": 0.203313 }, { "acc": 0.77527127, "epoch": 0.7692150658364806, "grad_norm": 5.03125, "learning_rate": 7.0492609709220835e-06, "loss": 0.81409836, "memory(GiB)": 135.49, "step": 32970, "train_speed(iter/s)": 0.203342 }, { "acc": 0.78594308, "epoch": 0.7694483734087695, "grad_norm": 7.4375, "learning_rate": 7.04753766783423e-06, "loss": 0.77901993, "memory(GiB)": 135.49, "step": 32980, "train_speed(iter/s)": 0.203372 }, { "acc": 0.7648931, "epoch": 0.7696816809810584, "grad_norm": 8.0, "learning_rate": 7.045814072460968e-06, "loss": 0.83830662, "memory(GiB)": 135.49, "step": 32990, "train_speed(iter/s)": 0.203404 }, { "acc": 0.7738493, "epoch": 0.7699149885533473, "grad_norm": 4.875, "learning_rate": 7.044090185048343e-06, "loss": 0.83399, "memory(GiB)": 135.49, "step": 33000, "train_speed(iter/s)": 0.203436 }, { "epoch": 0.7699149885533473, "eval_acc": 0.7346125673203205, "eval_loss": 0.8360110521316528, "eval_runtime": 1262.106, "eval_samples_per_second": 28.517, "eval_steps_per_second": 14.259, "step": 33000 }, { "acc": 0.77031555, "epoch": 0.7701482961256362, "grad_norm": 5.6875, "learning_rate": 7.042366005842437e-06, "loss": 0.82693748, "memory(GiB)": 135.49, "step": 33010, "train_speed(iter/s)": 0.201866 }, { "acc": 0.77577066, "epoch": 0.770381603697925, "grad_norm": 4.875, "learning_rate": 7.040641535089377e-06, "loss": 0.80674667, "memory(GiB)": 135.49, "step": 33020, "train_speed(iter/s)": 0.201898 }, { "acc": 0.76229601, "epoch": 0.7706149112702139, "grad_norm": 4.625, "learning_rate": 7.038916773035332e-06, "loss": 0.85125818, "memory(GiB)": 135.49, "step": 33030, "train_speed(iter/s)": 0.201929 }, { "acc": 0.77196598, "epoch": 0.7708482188425028, "grad_norm": 11.0, "learning_rate": 7.037191719926507e-06, "loss": 0.83604336, "memory(GiB)": 135.49, "step": 33040, "train_speed(iter/s)": 0.201961 }, { "acc": 0.77533121, "epoch": 0.7710815264147917, "grad_norm": 5.28125, "learning_rate": 7.035466376009157e-06, "loss": 0.78843307, "memory(GiB)": 135.49, "step": 33050, "train_speed(iter/s)": 0.201994 }, { "acc": 0.76700058, "epoch": 0.7713148339870806, "grad_norm": 5.0, "learning_rate": 7.033740741529573e-06, "loss": 0.81921244, "memory(GiB)": 135.49, "step": 33060, "train_speed(iter/s)": 0.202027 }, { "acc": 0.77072287, "epoch": 0.7715481415593695, "grad_norm": 5.0625, "learning_rate": 7.03201481673409e-06, "loss": 0.79756155, "memory(GiB)": 135.49, "step": 33070, "train_speed(iter/s)": 0.202059 }, { "acc": 0.76558781, "epoch": 0.7717814491316584, "grad_norm": 6.8125, "learning_rate": 7.030288601869082e-06, "loss": 0.84936075, "memory(GiB)": 135.49, "step": 33080, "train_speed(iter/s)": 0.202094 }, { "acc": 0.73758526, "epoch": 0.7720147567039473, "grad_norm": 4.625, "learning_rate": 7.028562097180965e-06, "loss": 0.93443165, "memory(GiB)": 135.49, "step": 33090, "train_speed(iter/s)": 0.202126 }, { "acc": 0.77026672, "epoch": 0.7722480642762362, "grad_norm": 5.59375, "learning_rate": 7.026835302916198e-06, "loss": 0.82508516, "memory(GiB)": 135.49, "step": 33100, "train_speed(iter/s)": 0.202158 }, { "acc": 0.76525726, "epoch": 0.7724813718485251, "grad_norm": 6.375, "learning_rate": 7.025108219321281e-06, "loss": 0.847505, "memory(GiB)": 135.49, "step": 33110, "train_speed(iter/s)": 0.202189 }, { "acc": 0.76766987, "epoch": 0.772714679420814, "grad_norm": 12.1875, "learning_rate": 7.023380846642754e-06, "loss": 0.83569603, "memory(GiB)": 135.49, "step": 33120, "train_speed(iter/s)": 0.202222 }, { "acc": 0.76515474, "epoch": 0.7729479869931029, "grad_norm": 5.65625, "learning_rate": 7.021653185127197e-06, "loss": 0.84452934, "memory(GiB)": 135.49, "step": 33130, "train_speed(iter/s)": 0.202252 }, { "acc": 0.77313066, "epoch": 0.7731812945653918, "grad_norm": 7.75, "learning_rate": 7.019925235021237e-06, "loss": 0.81566048, "memory(GiB)": 135.49, "step": 33140, "train_speed(iter/s)": 0.202284 }, { "acc": 0.77677813, "epoch": 0.7734146021376807, "grad_norm": 4.5625, "learning_rate": 7.018196996571538e-06, "loss": 0.79558616, "memory(GiB)": 135.49, "step": 33150, "train_speed(iter/s)": 0.202317 }, { "acc": 0.78798265, "epoch": 0.7736479097099696, "grad_norm": 4.84375, "learning_rate": 7.016468470024802e-06, "loss": 0.75749788, "memory(GiB)": 135.49, "step": 33160, "train_speed(iter/s)": 0.202349 }, { "acc": 0.76569991, "epoch": 0.7738812172822584, "grad_norm": 6.0625, "learning_rate": 7.014739655627778e-06, "loss": 0.83953876, "memory(GiB)": 135.49, "step": 33170, "train_speed(iter/s)": 0.202381 }, { "acc": 0.74967184, "epoch": 0.7741145248545473, "grad_norm": 5.28125, "learning_rate": 7.013010553627253e-06, "loss": 0.90845585, "memory(GiB)": 135.49, "step": 33180, "train_speed(iter/s)": 0.202412 }, { "acc": 0.7755722, "epoch": 0.7743478324268362, "grad_norm": 4.46875, "learning_rate": 7.011281164270056e-06, "loss": 0.80650349, "memory(GiB)": 135.49, "step": 33190, "train_speed(iter/s)": 0.202444 }, { "acc": 0.77542038, "epoch": 0.7745811399991251, "grad_norm": 6.0625, "learning_rate": 7.009551487803058e-06, "loss": 0.80426903, "memory(GiB)": 135.49, "step": 33200, "train_speed(iter/s)": 0.202477 }, { "acc": 0.78277206, "epoch": 0.774814447571414, "grad_norm": 5.5625, "learning_rate": 7.0078215244731685e-06, "loss": 0.78186183, "memory(GiB)": 135.49, "step": 33210, "train_speed(iter/s)": 0.20251 }, { "acc": 0.7610095, "epoch": 0.7750477551437028, "grad_norm": 6.1875, "learning_rate": 7.00609127452734e-06, "loss": 0.8866272, "memory(GiB)": 135.49, "step": 33220, "train_speed(iter/s)": 0.202542 }, { "acc": 0.79169698, "epoch": 0.7752810627159917, "grad_norm": 8.8125, "learning_rate": 7.0043607382125645e-06, "loss": 0.73792334, "memory(GiB)": 135.49, "step": 33230, "train_speed(iter/s)": 0.202572 }, { "acc": 0.75568266, "epoch": 0.7755143702882806, "grad_norm": 4.90625, "learning_rate": 7.002629915775876e-06, "loss": 0.89212523, "memory(GiB)": 135.49, "step": 33240, "train_speed(iter/s)": 0.202601 }, { "acc": 0.7632627, "epoch": 0.7757476778605695, "grad_norm": 9.875, "learning_rate": 7.000898807464349e-06, "loss": 0.86013451, "memory(GiB)": 135.49, "step": 33250, "train_speed(iter/s)": 0.20263 }, { "acc": 0.77089071, "epoch": 0.7759809854328584, "grad_norm": 7.1875, "learning_rate": 6.999167413525099e-06, "loss": 0.80118637, "memory(GiB)": 135.49, "step": 33260, "train_speed(iter/s)": 0.202663 }, { "acc": 0.76848154, "epoch": 0.7762142930051473, "grad_norm": 7.625, "learning_rate": 6.9974357342052805e-06, "loss": 0.83356419, "memory(GiB)": 135.49, "step": 33270, "train_speed(iter/s)": 0.202696 }, { "acc": 0.76814375, "epoch": 0.7764476005774362, "grad_norm": 5.3125, "learning_rate": 6.995703769752091e-06, "loss": 0.82017965, "memory(GiB)": 135.49, "step": 33280, "train_speed(iter/s)": 0.202728 }, { "acc": 0.77799892, "epoch": 0.7766809081497251, "grad_norm": 5.375, "learning_rate": 6.993971520412769e-06, "loss": 0.78662944, "memory(GiB)": 135.49, "step": 33290, "train_speed(iter/s)": 0.20276 }, { "acc": 0.75689878, "epoch": 0.776914215722014, "grad_norm": 7.75, "learning_rate": 6.992238986434591e-06, "loss": 0.88487778, "memory(GiB)": 135.49, "step": 33300, "train_speed(iter/s)": 0.202791 }, { "acc": 0.77544413, "epoch": 0.7771475232943029, "grad_norm": 4.875, "learning_rate": 6.9905061680648765e-06, "loss": 0.81161795, "memory(GiB)": 135.49, "step": 33310, "train_speed(iter/s)": 0.202821 }, { "acc": 0.76364164, "epoch": 0.7773808308665918, "grad_norm": 4.96875, "learning_rate": 6.9887730655509855e-06, "loss": 0.86026726, "memory(GiB)": 135.49, "step": 33320, "train_speed(iter/s)": 0.202852 }, { "acc": 0.77247114, "epoch": 0.7776141384388807, "grad_norm": 6.5, "learning_rate": 6.987039679140316e-06, "loss": 0.81884499, "memory(GiB)": 135.49, "step": 33330, "train_speed(iter/s)": 0.202884 }, { "acc": 0.77582769, "epoch": 0.7778474460111696, "grad_norm": 14.125, "learning_rate": 6.9853060090803105e-06, "loss": 0.81818762, "memory(GiB)": 135.49, "step": 33340, "train_speed(iter/s)": 0.202915 }, { "acc": 0.77656264, "epoch": 0.7780807535834585, "grad_norm": 6.25, "learning_rate": 6.983572055618449e-06, "loss": 0.79645224, "memory(GiB)": 135.49, "step": 33350, "train_speed(iter/s)": 0.202948 }, { "acc": 0.77507391, "epoch": 0.7783140611557474, "grad_norm": 5.75, "learning_rate": 6.981837819002252e-06, "loss": 0.87807674, "memory(GiB)": 135.49, "step": 33360, "train_speed(iter/s)": 0.20298 }, { "acc": 0.77528234, "epoch": 0.7785473687280363, "grad_norm": 4.6875, "learning_rate": 6.980103299479281e-06, "loss": 0.8295805, "memory(GiB)": 135.49, "step": 33370, "train_speed(iter/s)": 0.20301 }, { "acc": 0.76956034, "epoch": 0.7787806763003252, "grad_norm": 6.34375, "learning_rate": 6.978368497297143e-06, "loss": 0.84135828, "memory(GiB)": 135.49, "step": 33380, "train_speed(iter/s)": 0.203042 }, { "acc": 0.7721858, "epoch": 0.7790139838726141, "grad_norm": 5.0, "learning_rate": 6.976633412703474e-06, "loss": 0.81510601, "memory(GiB)": 135.49, "step": 33390, "train_speed(iter/s)": 0.203074 }, { "acc": 0.77364769, "epoch": 0.779247291444903, "grad_norm": 6.71875, "learning_rate": 6.974898045945959e-06, "loss": 0.81708765, "memory(GiB)": 135.49, "step": 33400, "train_speed(iter/s)": 0.203104 }, { "acc": 0.76453128, "epoch": 0.7794805990171918, "grad_norm": 4.90625, "learning_rate": 6.973162397272323e-06, "loss": 0.85510845, "memory(GiB)": 135.49, "step": 33410, "train_speed(iter/s)": 0.203137 }, { "acc": 0.78361568, "epoch": 0.7797139065894807, "grad_norm": 7.03125, "learning_rate": 6.971426466930327e-06, "loss": 0.76267638, "memory(GiB)": 135.49, "step": 33420, "train_speed(iter/s)": 0.20317 }, { "acc": 0.75880518, "epoch": 0.7799472141617696, "grad_norm": 5.34375, "learning_rate": 6.969690255167777e-06, "loss": 0.86496983, "memory(GiB)": 135.49, "step": 33430, "train_speed(iter/s)": 0.203203 }, { "acc": 0.77461901, "epoch": 0.7801805217340585, "grad_norm": 7.0625, "learning_rate": 6.9679537622325154e-06, "loss": 0.82245407, "memory(GiB)": 135.49, "step": 33440, "train_speed(iter/s)": 0.203237 }, { "acc": 0.77793002, "epoch": 0.7804138293063474, "grad_norm": 5.5625, "learning_rate": 6.966216988372424e-06, "loss": 0.79502907, "memory(GiB)": 135.49, "step": 33450, "train_speed(iter/s)": 0.203268 }, { "acc": 0.7669158, "epoch": 0.7806471368786363, "grad_norm": 8.0625, "learning_rate": 6.964479933835429e-06, "loss": 0.8346529, "memory(GiB)": 135.49, "step": 33460, "train_speed(iter/s)": 0.2033 }, { "acc": 0.74899626, "epoch": 0.7808804444509252, "grad_norm": 9.0625, "learning_rate": 6.962742598869495e-06, "loss": 0.91430874, "memory(GiB)": 135.49, "step": 33470, "train_speed(iter/s)": 0.203333 }, { "acc": 0.75904198, "epoch": 0.7811137520232141, "grad_norm": 7.375, "learning_rate": 6.961004983722625e-06, "loss": 0.89857502, "memory(GiB)": 135.49, "step": 33480, "train_speed(iter/s)": 0.203362 }, { "acc": 0.75413561, "epoch": 0.781347059595503, "grad_norm": 9.5625, "learning_rate": 6.959267088642864e-06, "loss": 0.89365044, "memory(GiB)": 135.49, "step": 33490, "train_speed(iter/s)": 0.203391 }, { "acc": 0.77233696, "epoch": 0.7815803671677919, "grad_norm": 6.75, "learning_rate": 6.9575289138782944e-06, "loss": 0.81606398, "memory(GiB)": 135.49, "step": 33500, "train_speed(iter/s)": 0.203421 }, { "epoch": 0.7815803671677919, "eval_acc": 0.7345777172562674, "eval_loss": 0.8359377980232239, "eval_runtime": 1263.7504, "eval_samples_per_second": 28.48, "eval_steps_per_second": 14.24, "step": 33500 }, { "acc": 0.76813679, "epoch": 0.7818136747400808, "grad_norm": 5.6875, "learning_rate": 6.955790459677041e-06, "loss": 0.83076439, "memory(GiB)": 135.49, "step": 33510, "train_speed(iter/s)": 0.201871 }, { "acc": 0.7749506, "epoch": 0.7820469823123697, "grad_norm": 4.875, "learning_rate": 6.9540517262872675e-06, "loss": 0.82679291, "memory(GiB)": 135.49, "step": 33520, "train_speed(iter/s)": 0.201904 }, { "acc": 0.79052272, "epoch": 0.7822802898846586, "grad_norm": 7.1875, "learning_rate": 6.952312713957179e-06, "loss": 0.74970398, "memory(GiB)": 135.49, "step": 33530, "train_speed(iter/s)": 0.201934 }, { "acc": 0.77372274, "epoch": 0.7825135974569475, "grad_norm": 5.15625, "learning_rate": 6.9505734229350155e-06, "loss": 0.80560436, "memory(GiB)": 135.49, "step": 33540, "train_speed(iter/s)": 0.201962 }, { "acc": 0.77963905, "epoch": 0.7827469050292364, "grad_norm": 6.53125, "learning_rate": 6.948833853469065e-06, "loss": 0.78983154, "memory(GiB)": 135.49, "step": 33550, "train_speed(iter/s)": 0.201994 }, { "acc": 0.77383242, "epoch": 0.7829802126015253, "grad_norm": 5.28125, "learning_rate": 6.947094005807646e-06, "loss": 0.83015909, "memory(GiB)": 135.49, "step": 33560, "train_speed(iter/s)": 0.202026 }, { "acc": 0.77761059, "epoch": 0.7832135201738142, "grad_norm": 6.375, "learning_rate": 6.945353880199124e-06, "loss": 0.80975037, "memory(GiB)": 135.49, "step": 33570, "train_speed(iter/s)": 0.202055 }, { "acc": 0.76007242, "epoch": 0.7834468277461031, "grad_norm": 5.125, "learning_rate": 6.943613476891902e-06, "loss": 0.85215111, "memory(GiB)": 135.49, "step": 33580, "train_speed(iter/s)": 0.202087 }, { "acc": 0.75898762, "epoch": 0.783680135318392, "grad_norm": 5.28125, "learning_rate": 6.941872796134419e-06, "loss": 0.87166271, "memory(GiB)": 135.49, "step": 33590, "train_speed(iter/s)": 0.202119 }, { "acc": 0.76779656, "epoch": 0.7839134428906808, "grad_norm": 4.59375, "learning_rate": 6.940131838175159e-06, "loss": 0.85042667, "memory(GiB)": 135.49, "step": 33600, "train_speed(iter/s)": 0.202151 }, { "acc": 0.77405601, "epoch": 0.7841467504629697, "grad_norm": 6.8125, "learning_rate": 6.938390603262644e-06, "loss": 0.80653954, "memory(GiB)": 135.49, "step": 33610, "train_speed(iter/s)": 0.202183 }, { "acc": 0.75876932, "epoch": 0.7843800580352586, "grad_norm": 5.1875, "learning_rate": 6.936649091645431e-06, "loss": 0.89026585, "memory(GiB)": 135.49, "step": 33620, "train_speed(iter/s)": 0.202212 }, { "acc": 0.79509125, "epoch": 0.7846133656075475, "grad_norm": 5.59375, "learning_rate": 6.9349073035721235e-06, "loss": 0.73102217, "memory(GiB)": 135.49, "step": 33630, "train_speed(iter/s)": 0.202243 }, { "acc": 0.77387857, "epoch": 0.7848466731798364, "grad_norm": 4.625, "learning_rate": 6.933165239291362e-06, "loss": 0.79498205, "memory(GiB)": 135.49, "step": 33640, "train_speed(iter/s)": 0.202274 }, { "acc": 0.76256399, "epoch": 0.7850799807521253, "grad_norm": 5.6875, "learning_rate": 6.931422899051823e-06, "loss": 0.84475698, "memory(GiB)": 135.49, "step": 33650, "train_speed(iter/s)": 0.202304 }, { "acc": 0.75534487, "epoch": 0.7853132883244142, "grad_norm": 5.1875, "learning_rate": 6.929680283102227e-06, "loss": 0.88285789, "memory(GiB)": 135.49, "step": 33660, "train_speed(iter/s)": 0.202335 }, { "acc": 0.78323469, "epoch": 0.7855465958967031, "grad_norm": 21.5, "learning_rate": 6.9279373916913305e-06, "loss": 0.81194954, "memory(GiB)": 135.49, "step": 33670, "train_speed(iter/s)": 0.202367 }, { "acc": 0.78215952, "epoch": 0.785779903468992, "grad_norm": 6.03125, "learning_rate": 6.926194225067932e-06, "loss": 0.7805604, "memory(GiB)": 135.49, "step": 33680, "train_speed(iter/s)": 0.202398 }, { "acc": 0.76635623, "epoch": 0.7860132110412809, "grad_norm": 5.75, "learning_rate": 6.924450783480866e-06, "loss": 0.85554867, "memory(GiB)": 135.49, "step": 33690, "train_speed(iter/s)": 0.202428 }, { "acc": 0.77383561, "epoch": 0.7862465186135698, "grad_norm": 4.34375, "learning_rate": 6.922707067179011e-06, "loss": 0.80926981, "memory(GiB)": 135.49, "step": 33700, "train_speed(iter/s)": 0.20246 }, { "acc": 0.7825922, "epoch": 0.7864798261858587, "grad_norm": 7.28125, "learning_rate": 6.92096307641128e-06, "loss": 0.79526405, "memory(GiB)": 135.49, "step": 33710, "train_speed(iter/s)": 0.202491 }, { "acc": 0.79149861, "epoch": 0.7867131337581476, "grad_norm": 5.625, "learning_rate": 6.919218811426629e-06, "loss": 0.74586372, "memory(GiB)": 135.49, "step": 33720, "train_speed(iter/s)": 0.202523 }, { "acc": 0.78557248, "epoch": 0.7869464413304365, "grad_norm": 5.46875, "learning_rate": 6.91747427247405e-06, "loss": 0.75919523, "memory(GiB)": 135.49, "step": 33730, "train_speed(iter/s)": 0.202553 }, { "acc": 0.76386924, "epoch": 0.7871797489027254, "grad_norm": 6.15625, "learning_rate": 6.915729459802575e-06, "loss": 0.84220314, "memory(GiB)": 135.49, "step": 33740, "train_speed(iter/s)": 0.202586 }, { "acc": 0.78714857, "epoch": 0.7874130564750143, "grad_norm": 5.09375, "learning_rate": 6.913984373661275e-06, "loss": 0.751373, "memory(GiB)": 135.49, "step": 33750, "train_speed(iter/s)": 0.202618 }, { "acc": 0.77668886, "epoch": 0.7876463640473031, "grad_norm": 6.625, "learning_rate": 6.9122390142992634e-06, "loss": 0.80916176, "memory(GiB)": 135.49, "step": 33760, "train_speed(iter/s)": 0.202651 }, { "acc": 0.75773368, "epoch": 0.787879671619592, "grad_norm": 10.75, "learning_rate": 6.910493381965687e-06, "loss": 0.87770319, "memory(GiB)": 135.49, "step": 33770, "train_speed(iter/s)": 0.202684 }, { "acc": 0.7566967, "epoch": 0.788112979191881, "grad_norm": 7.65625, "learning_rate": 6.9087474769097366e-06, "loss": 0.87173691, "memory(GiB)": 135.49, "step": 33780, "train_speed(iter/s)": 0.202716 }, { "acc": 0.76255136, "epoch": 0.7883462867641697, "grad_norm": 7.59375, "learning_rate": 6.907001299380639e-06, "loss": 0.8658247, "memory(GiB)": 135.49, "step": 33790, "train_speed(iter/s)": 0.202746 }, { "acc": 0.77813082, "epoch": 0.7885795943364586, "grad_norm": 5.46875, "learning_rate": 6.905254849627658e-06, "loss": 0.80424032, "memory(GiB)": 135.49, "step": 33800, "train_speed(iter/s)": 0.202776 }, { "acc": 0.74670811, "epoch": 0.7888129019087475, "grad_norm": 8.25, "learning_rate": 6.9035081279001e-06, "loss": 0.94181862, "memory(GiB)": 135.49, "step": 33810, "train_speed(iter/s)": 0.202808 }, { "acc": 0.78592548, "epoch": 0.7890462094810364, "grad_norm": 5.6875, "learning_rate": 6.901761134447311e-06, "loss": 0.78315125, "memory(GiB)": 135.49, "step": 33820, "train_speed(iter/s)": 0.202838 }, { "acc": 0.78608665, "epoch": 0.7892795170533253, "grad_norm": 6.0625, "learning_rate": 6.900013869518673e-06, "loss": 0.76426868, "memory(GiB)": 135.49, "step": 33830, "train_speed(iter/s)": 0.202866 }, { "acc": 0.76866188, "epoch": 0.7895128246256142, "grad_norm": 7.90625, "learning_rate": 6.898266333363607e-06, "loss": 0.84759502, "memory(GiB)": 135.49, "step": 33840, "train_speed(iter/s)": 0.202899 }, { "acc": 0.76788983, "epoch": 0.7897461321979031, "grad_norm": 7.25, "learning_rate": 6.8965185262315725e-06, "loss": 0.82620926, "memory(GiB)": 135.49, "step": 33850, "train_speed(iter/s)": 0.20293 }, { "acc": 0.76346216, "epoch": 0.789979439770192, "grad_norm": 5.5625, "learning_rate": 6.89477044837207e-06, "loss": 0.83825531, "memory(GiB)": 135.49, "step": 33860, "train_speed(iter/s)": 0.202958 }, { "acc": 0.76284146, "epoch": 0.7902127473424809, "grad_norm": 5.40625, "learning_rate": 6.893022100034636e-06, "loss": 0.85629311, "memory(GiB)": 135.49, "step": 33870, "train_speed(iter/s)": 0.202989 }, { "acc": 0.76758823, "epoch": 0.7904460549147698, "grad_norm": 4.4375, "learning_rate": 6.891273481468847e-06, "loss": 0.8551836, "memory(GiB)": 135.49, "step": 33880, "train_speed(iter/s)": 0.203019 }, { "acc": 0.76683021, "epoch": 0.7906793624870587, "grad_norm": 7.21875, "learning_rate": 6.889524592924319e-06, "loss": 0.82830162, "memory(GiB)": 135.49, "step": 33890, "train_speed(iter/s)": 0.203051 }, { "acc": 0.79055734, "epoch": 0.7909126700593476, "grad_norm": 5.90625, "learning_rate": 6.887775434650704e-06, "loss": 0.74957256, "memory(GiB)": 135.49, "step": 33900, "train_speed(iter/s)": 0.203084 }, { "acc": 0.76347241, "epoch": 0.7911459776316365, "grad_norm": 6.125, "learning_rate": 6.8860260068976935e-06, "loss": 0.8620595, "memory(GiB)": 135.49, "step": 33910, "train_speed(iter/s)": 0.203114 }, { "acc": 0.76932597, "epoch": 0.7913792852039254, "grad_norm": 11.0, "learning_rate": 6.884276309915018e-06, "loss": 0.8094449, "memory(GiB)": 135.49, "step": 33920, "train_speed(iter/s)": 0.203147 }, { "acc": 0.7825582, "epoch": 0.7916125927762143, "grad_norm": 9.6875, "learning_rate": 6.882526343952448e-06, "loss": 0.80701618, "memory(GiB)": 135.49, "step": 33930, "train_speed(iter/s)": 0.203178 }, { "acc": 0.75771642, "epoch": 0.7918459003485032, "grad_norm": 6.25, "learning_rate": 6.880776109259788e-06, "loss": 0.86543331, "memory(GiB)": 135.49, "step": 33940, "train_speed(iter/s)": 0.203207 }, { "acc": 0.76931267, "epoch": 0.7920792079207921, "grad_norm": 5.1875, "learning_rate": 6.8790256060868866e-06, "loss": 0.84588699, "memory(GiB)": 135.49, "step": 33950, "train_speed(iter/s)": 0.203239 }, { "acc": 0.75514145, "epoch": 0.792312515493081, "grad_norm": 5.90625, "learning_rate": 6.8772748346836235e-06, "loss": 0.90041656, "memory(GiB)": 135.49, "step": 33960, "train_speed(iter/s)": 0.20327 }, { "acc": 0.76013713, "epoch": 0.7925458230653699, "grad_norm": 7.96875, "learning_rate": 6.875523795299925e-06, "loss": 0.87308292, "memory(GiB)": 135.49, "step": 33970, "train_speed(iter/s)": 0.203302 }, { "acc": 0.77887325, "epoch": 0.7927791306376588, "grad_norm": 5.03125, "learning_rate": 6.873772488185747e-06, "loss": 0.79331646, "memory(GiB)": 135.49, "step": 33980, "train_speed(iter/s)": 0.203333 }, { "acc": 0.78430071, "epoch": 0.7930124382099476, "grad_norm": 6.125, "learning_rate": 6.872020913591092e-06, "loss": 0.79897051, "memory(GiB)": 135.49, "step": 33990, "train_speed(iter/s)": 0.203365 }, { "acc": 0.78303289, "epoch": 0.7932457457822365, "grad_norm": 7.59375, "learning_rate": 6.870269071765997e-06, "loss": 0.76341157, "memory(GiB)": 135.49, "step": 34000, "train_speed(iter/s)": 0.203394 }, { "epoch": 0.7932457457822365, "eval_acc": 0.7346888825068812, "eval_loss": 0.8358190655708313, "eval_runtime": 1262.7989, "eval_samples_per_second": 28.501, "eval_steps_per_second": 14.251, "step": 34000 }, { "acc": 0.75391893, "epoch": 0.7934790533545254, "grad_norm": 6.78125, "learning_rate": 6.868516962960534e-06, "loss": 0.90578289, "memory(GiB)": 135.49, "step": 34010, "train_speed(iter/s)": 0.20187 }, { "acc": 0.76771941, "epoch": 0.7937123609268143, "grad_norm": 6.59375, "learning_rate": 6.866764587424818e-06, "loss": 0.83961964, "memory(GiB)": 135.49, "step": 34020, "train_speed(iter/s)": 0.201901 }, { "acc": 0.78078098, "epoch": 0.7939456684991032, "grad_norm": 5.09375, "learning_rate": 6.865011945408998e-06, "loss": 0.79371023, "memory(GiB)": 135.49, "step": 34030, "train_speed(iter/s)": 0.201934 }, { "acc": 0.75865526, "epoch": 0.7941789760713921, "grad_norm": 6.625, "learning_rate": 6.863259037163266e-06, "loss": 0.87410622, "memory(GiB)": 135.49, "step": 34040, "train_speed(iter/s)": 0.201964 }, { "acc": 0.76854038, "epoch": 0.794412283643681, "grad_norm": 11.6875, "learning_rate": 6.8615058629378465e-06, "loss": 0.83099804, "memory(GiB)": 135.49, "step": 34050, "train_speed(iter/s)": 0.201995 }, { "acc": 0.78212996, "epoch": 0.7946455912159699, "grad_norm": 5.3125, "learning_rate": 6.859752422983006e-06, "loss": 0.77999897, "memory(GiB)": 135.49, "step": 34060, "train_speed(iter/s)": 0.202024 }, { "acc": 0.77418499, "epoch": 0.7948788987882588, "grad_norm": 7.5625, "learning_rate": 6.857998717549048e-06, "loss": 0.79674397, "memory(GiB)": 135.49, "step": 34070, "train_speed(iter/s)": 0.202056 }, { "acc": 0.76531081, "epoch": 0.7951122063605477, "grad_norm": 6.125, "learning_rate": 6.856244746886313e-06, "loss": 0.85907936, "memory(GiB)": 135.49, "step": 34080, "train_speed(iter/s)": 0.202087 }, { "acc": 0.7606319, "epoch": 0.7953455139328366, "grad_norm": 7.03125, "learning_rate": 6.85449051124518e-06, "loss": 0.85352955, "memory(GiB)": 135.49, "step": 34090, "train_speed(iter/s)": 0.202119 }, { "acc": 0.76713209, "epoch": 0.7955788215051255, "grad_norm": 7.59375, "learning_rate": 6.852736010876063e-06, "loss": 0.84425993, "memory(GiB)": 135.49, "step": 34100, "train_speed(iter/s)": 0.202148 }, { "acc": 0.76732516, "epoch": 0.7958121290774144, "grad_norm": 5.1875, "learning_rate": 6.85098124602942e-06, "loss": 0.84102831, "memory(GiB)": 135.49, "step": 34110, "train_speed(iter/s)": 0.202179 }, { "acc": 0.77172017, "epoch": 0.7960454366497033, "grad_norm": 8.0625, "learning_rate": 6.8492262169557435e-06, "loss": 0.80656309, "memory(GiB)": 135.49, "step": 34120, "train_speed(iter/s)": 0.202209 }, { "acc": 0.78501539, "epoch": 0.7962787442219922, "grad_norm": 7.3125, "learning_rate": 6.847470923905559e-06, "loss": 0.76137629, "memory(GiB)": 135.49, "step": 34130, "train_speed(iter/s)": 0.202239 }, { "acc": 0.78301067, "epoch": 0.7965120517942811, "grad_norm": 9.625, "learning_rate": 6.845715367129438e-06, "loss": 0.79006815, "memory(GiB)": 135.49, "step": 34140, "train_speed(iter/s)": 0.20227 }, { "acc": 0.77190056, "epoch": 0.79674535936657, "grad_norm": 12.75, "learning_rate": 6.843959546877985e-06, "loss": 0.82719574, "memory(GiB)": 135.49, "step": 34150, "train_speed(iter/s)": 0.202302 }, { "acc": 0.77205901, "epoch": 0.7969786669388589, "grad_norm": 5.53125, "learning_rate": 6.842203463401842e-06, "loss": 0.83580313, "memory(GiB)": 135.49, "step": 34160, "train_speed(iter/s)": 0.20233 }, { "acc": 0.7802669, "epoch": 0.7972119745111478, "grad_norm": 6.09375, "learning_rate": 6.84044711695169e-06, "loss": 0.80621891, "memory(GiB)": 135.49, "step": 34170, "train_speed(iter/s)": 0.202362 }, { "acc": 0.76920366, "epoch": 0.7974452820834366, "grad_norm": 6.71875, "learning_rate": 6.838690507778247e-06, "loss": 0.83105431, "memory(GiB)": 135.49, "step": 34180, "train_speed(iter/s)": 0.202393 }, { "acc": 0.77289371, "epoch": 0.7976785896557255, "grad_norm": 5.1875, "learning_rate": 6.836933636132267e-06, "loss": 0.81782055, "memory(GiB)": 135.49, "step": 34190, "train_speed(iter/s)": 0.202424 }, { "acc": 0.77318869, "epoch": 0.7979118972280144, "grad_norm": 7.53125, "learning_rate": 6.835176502264544e-06, "loss": 0.81166658, "memory(GiB)": 135.49, "step": 34200, "train_speed(iter/s)": 0.202454 }, { "acc": 0.7651526, "epoch": 0.7981452048003033, "grad_norm": 9.125, "learning_rate": 6.8334191064259095e-06, "loss": 0.84078064, "memory(GiB)": 135.49, "step": 34210, "train_speed(iter/s)": 0.202485 }, { "acc": 0.77373877, "epoch": 0.7983785123725922, "grad_norm": 5.78125, "learning_rate": 6.8316614488672305e-06, "loss": 0.83665915, "memory(GiB)": 135.49, "step": 34220, "train_speed(iter/s)": 0.202512 }, { "acc": 0.77314005, "epoch": 0.7986118199448811, "grad_norm": 6.125, "learning_rate": 6.829903529839411e-06, "loss": 0.8028141, "memory(GiB)": 135.49, "step": 34230, "train_speed(iter/s)": 0.202542 }, { "acc": 0.77291203, "epoch": 0.79884512751717, "grad_norm": 5.75, "learning_rate": 6.828145349593395e-06, "loss": 0.81823702, "memory(GiB)": 135.49, "step": 34240, "train_speed(iter/s)": 0.202571 }, { "acc": 0.76377535, "epoch": 0.7990784350894589, "grad_norm": 6.1875, "learning_rate": 6.82638690838016e-06, "loss": 0.86065693, "memory(GiB)": 135.49, "step": 34250, "train_speed(iter/s)": 0.202602 }, { "acc": 0.76811981, "epoch": 0.7993117426617478, "grad_norm": 5.375, "learning_rate": 6.824628206450724e-06, "loss": 0.83933563, "memory(GiB)": 135.49, "step": 34260, "train_speed(iter/s)": 0.202631 }, { "acc": 0.75537663, "epoch": 0.7995450502340367, "grad_norm": 7.5, "learning_rate": 6.822869244056143e-06, "loss": 0.91993361, "memory(GiB)": 135.49, "step": 34270, "train_speed(iter/s)": 0.202661 }, { "acc": 0.77053614, "epoch": 0.7997783578063256, "grad_norm": 5.1875, "learning_rate": 6.821110021447506e-06, "loss": 0.84447041, "memory(GiB)": 135.49, "step": 34280, "train_speed(iter/s)": 0.202689 }, { "acc": 0.75996408, "epoch": 0.8000116653786145, "grad_norm": 6.65625, "learning_rate": 6.819350538875944e-06, "loss": 0.92140913, "memory(GiB)": 135.49, "step": 34290, "train_speed(iter/s)": 0.202722 }, { "acc": 0.77737007, "epoch": 0.8002449729509034, "grad_norm": 6.84375, "learning_rate": 6.817590796592621e-06, "loss": 0.80076485, "memory(GiB)": 135.49, "step": 34300, "train_speed(iter/s)": 0.202754 }, { "acc": 0.79343424, "epoch": 0.8004782805231923, "grad_norm": 5.875, "learning_rate": 6.815830794848739e-06, "loss": 0.71997566, "memory(GiB)": 135.49, "step": 34310, "train_speed(iter/s)": 0.202784 }, { "acc": 0.7762177, "epoch": 0.8007115880954812, "grad_norm": 9.875, "learning_rate": 6.8140705338955386e-06, "loss": 0.81103458, "memory(GiB)": 135.49, "step": 34320, "train_speed(iter/s)": 0.202817 }, { "acc": 0.75501623, "epoch": 0.80094489566777, "grad_norm": 7.40625, "learning_rate": 6.812310013984296e-06, "loss": 0.89738007, "memory(GiB)": 135.49, "step": 34330, "train_speed(iter/s)": 0.202848 }, { "acc": 0.77431316, "epoch": 0.801178203240059, "grad_norm": 5.96875, "learning_rate": 6.810549235366325e-06, "loss": 0.81825151, "memory(GiB)": 135.49, "step": 34340, "train_speed(iter/s)": 0.202879 }, { "acc": 0.77017155, "epoch": 0.8014115108123478, "grad_norm": 6.34375, "learning_rate": 6.808788198292977e-06, "loss": 0.82164736, "memory(GiB)": 135.49, "step": 34350, "train_speed(iter/s)": 0.202906 }, { "acc": 0.77453251, "epoch": 0.8016448183846367, "grad_norm": 7.375, "learning_rate": 6.80702690301564e-06, "loss": 0.82077312, "memory(GiB)": 135.49, "step": 34360, "train_speed(iter/s)": 0.202936 }, { "acc": 0.76997576, "epoch": 0.8018781259569255, "grad_norm": 5.1875, "learning_rate": 6.805265349785738e-06, "loss": 0.82469397, "memory(GiB)": 135.49, "step": 34370, "train_speed(iter/s)": 0.202968 }, { "acc": 0.77813187, "epoch": 0.8021114335292144, "grad_norm": 7.46875, "learning_rate": 6.80350353885473e-06, "loss": 0.78557167, "memory(GiB)": 135.49, "step": 34380, "train_speed(iter/s)": 0.202998 }, { "acc": 0.77235298, "epoch": 0.8023447411015033, "grad_norm": 6.90625, "learning_rate": 6.801741470474117e-06, "loss": 0.84912567, "memory(GiB)": 135.49, "step": 34390, "train_speed(iter/s)": 0.203029 }, { "acc": 0.7794302, "epoch": 0.8025780486737922, "grad_norm": 5.84375, "learning_rate": 6.799979144895432e-06, "loss": 0.79952393, "memory(GiB)": 135.49, "step": 34400, "train_speed(iter/s)": 0.20306 }, { "acc": 0.76445327, "epoch": 0.8028113562460811, "grad_norm": 5.40625, "learning_rate": 6.798216562370247e-06, "loss": 0.86945219, "memory(GiB)": 135.49, "step": 34410, "train_speed(iter/s)": 0.203093 }, { "acc": 0.76434002, "epoch": 0.80304466381837, "grad_norm": 7.875, "learning_rate": 6.79645372315017e-06, "loss": 0.86826267, "memory(GiB)": 135.49, "step": 34420, "train_speed(iter/s)": 0.203123 }, { "acc": 0.77110119, "epoch": 0.8032779713906589, "grad_norm": 6.15625, "learning_rate": 6.794690627486846e-06, "loss": 0.81111212, "memory(GiB)": 135.49, "step": 34430, "train_speed(iter/s)": 0.203154 }, { "acc": 0.7715529, "epoch": 0.8035112789629478, "grad_norm": 4.8125, "learning_rate": 6.792927275631957e-06, "loss": 0.80517626, "memory(GiB)": 135.49, "step": 34440, "train_speed(iter/s)": 0.203184 }, { "acc": 0.76867113, "epoch": 0.8037445865352367, "grad_norm": 5.6875, "learning_rate": 6.791163667837219e-06, "loss": 0.81227093, "memory(GiB)": 135.49, "step": 34450, "train_speed(iter/s)": 0.203215 }, { "acc": 0.76383495, "epoch": 0.8039778941075256, "grad_norm": 9.125, "learning_rate": 6.789399804354389e-06, "loss": 0.86716166, "memory(GiB)": 135.49, "step": 34460, "train_speed(iter/s)": 0.203246 }, { "acc": 0.77731686, "epoch": 0.8042112016798145, "grad_norm": 5.125, "learning_rate": 6.787635685435255e-06, "loss": 0.80420856, "memory(GiB)": 135.49, "step": 34470, "train_speed(iter/s)": 0.203276 }, { "acc": 0.75709114, "epoch": 0.8044445092521034, "grad_norm": 6.46875, "learning_rate": 6.785871311331648e-06, "loss": 0.8854001, "memory(GiB)": 135.49, "step": 34480, "train_speed(iter/s)": 0.203308 }, { "acc": 0.75023704, "epoch": 0.8046778168243923, "grad_norm": 4.5, "learning_rate": 6.7841066822954284e-06, "loss": 0.88922024, "memory(GiB)": 135.49, "step": 34490, "train_speed(iter/s)": 0.203339 }, { "acc": 0.78992453, "epoch": 0.8049111243966812, "grad_norm": 7.3125, "learning_rate": 6.7823417985784986e-06, "loss": 0.73737144, "memory(GiB)": 135.49, "step": 34500, "train_speed(iter/s)": 0.203368 }, { "epoch": 0.8049111243966812, "eval_acc": 0.7346667785310698, "eval_loss": 0.8355593681335449, "eval_runtime": 1263.6068, "eval_samples_per_second": 28.483, "eval_steps_per_second": 14.242, "step": 34500 }, { "acc": 0.76750937, "epoch": 0.8051444319689701, "grad_norm": 6.3125, "learning_rate": 6.780576660432797e-06, "loss": 0.84354811, "memory(GiB)": 135.49, "step": 34510, "train_speed(iter/s)": 0.201866 }, { "acc": 0.76457882, "epoch": 0.805377739541259, "grad_norm": 6.46875, "learning_rate": 6.778811268110294e-06, "loss": 0.86916695, "memory(GiB)": 135.49, "step": 34520, "train_speed(iter/s)": 0.201896 }, { "acc": 0.76375875, "epoch": 0.8056110471135479, "grad_norm": 5.71875, "learning_rate": 6.777045621862997e-06, "loss": 0.84757099, "memory(GiB)": 135.49, "step": 34530, "train_speed(iter/s)": 0.201927 }, { "acc": 0.75547895, "epoch": 0.8058443546858368, "grad_norm": 8.9375, "learning_rate": 6.775279721942954e-06, "loss": 0.89722013, "memory(GiB)": 135.49, "step": 34540, "train_speed(iter/s)": 0.201958 }, { "acc": 0.7762557, "epoch": 0.8060776622581257, "grad_norm": 4.71875, "learning_rate": 6.773513568602248e-06, "loss": 0.80288525, "memory(GiB)": 135.49, "step": 34550, "train_speed(iter/s)": 0.201988 }, { "acc": 0.76571379, "epoch": 0.8063109698304145, "grad_norm": 7.8125, "learning_rate": 6.771747162092993e-06, "loss": 0.84816465, "memory(GiB)": 135.49, "step": 34560, "train_speed(iter/s)": 0.202019 }, { "acc": 0.75126104, "epoch": 0.8065442774027034, "grad_norm": 6.34375, "learning_rate": 6.769980502667348e-06, "loss": 0.86956139, "memory(GiB)": 135.49, "step": 34570, "train_speed(iter/s)": 0.202049 }, { "acc": 0.77170181, "epoch": 0.8067775849749923, "grad_norm": 7.28125, "learning_rate": 6.7682135905775e-06, "loss": 0.80313225, "memory(GiB)": 135.49, "step": 34580, "train_speed(iter/s)": 0.202079 }, { "acc": 0.76073933, "epoch": 0.8070108925472812, "grad_norm": 5.65625, "learning_rate": 6.7664464260756745e-06, "loss": 0.89092865, "memory(GiB)": 135.49, "step": 34590, "train_speed(iter/s)": 0.20211 }, { "acc": 0.77603827, "epoch": 0.8072442001195701, "grad_norm": 5.8125, "learning_rate": 6.764679009414135e-06, "loss": 0.79649267, "memory(GiB)": 135.49, "step": 34600, "train_speed(iter/s)": 0.202141 }, { "acc": 0.75948963, "epoch": 0.807477507691859, "grad_norm": 6.78125, "learning_rate": 6.76291134084518e-06, "loss": 0.85301638, "memory(GiB)": 135.49, "step": 34610, "train_speed(iter/s)": 0.202172 }, { "acc": 0.75024691, "epoch": 0.8077108152641479, "grad_norm": 4.9375, "learning_rate": 6.761143420621141e-06, "loss": 0.91304073, "memory(GiB)": 135.49, "step": 34620, "train_speed(iter/s)": 0.202202 }, { "acc": 0.76194978, "epoch": 0.8079441228364368, "grad_norm": 18.875, "learning_rate": 6.759375248994393e-06, "loss": 0.87427797, "memory(GiB)": 135.49, "step": 34630, "train_speed(iter/s)": 0.202232 }, { "acc": 0.7824307, "epoch": 0.8081774304087257, "grad_norm": 5.0, "learning_rate": 6.757606826217339e-06, "loss": 0.7506021, "memory(GiB)": 135.49, "step": 34640, "train_speed(iter/s)": 0.202261 }, { "acc": 0.77898378, "epoch": 0.8084107379810146, "grad_norm": 4.40625, "learning_rate": 6.755838152542421e-06, "loss": 0.8109951, "memory(GiB)": 135.49, "step": 34650, "train_speed(iter/s)": 0.202292 }, { "acc": 0.74959412, "epoch": 0.8086440455533035, "grad_norm": 6.40625, "learning_rate": 6.754069228222117e-06, "loss": 0.92627735, "memory(GiB)": 135.49, "step": 34660, "train_speed(iter/s)": 0.202321 }, { "acc": 0.76522765, "epoch": 0.8088773531255924, "grad_norm": 5.09375, "learning_rate": 6.752300053508939e-06, "loss": 0.85909958, "memory(GiB)": 135.49, "step": 34670, "train_speed(iter/s)": 0.202352 }, { "acc": 0.7626533, "epoch": 0.8091106606978813, "grad_norm": 8.6875, "learning_rate": 6.750530628655437e-06, "loss": 0.85916185, "memory(GiB)": 135.49, "step": 34680, "train_speed(iter/s)": 0.202382 }, { "acc": 0.76275663, "epoch": 0.8093439682701702, "grad_norm": 6.28125, "learning_rate": 6.748760953914198e-06, "loss": 0.83586407, "memory(GiB)": 135.49, "step": 34690, "train_speed(iter/s)": 0.202412 }, { "acc": 0.75889187, "epoch": 0.8095772758424591, "grad_norm": 9.8125, "learning_rate": 6.746991029537841e-06, "loss": 0.88751869, "memory(GiB)": 135.49, "step": 34700, "train_speed(iter/s)": 0.202442 }, { "acc": 0.76937108, "epoch": 0.809810583414748, "grad_norm": 5.4375, "learning_rate": 6.74522085577902e-06, "loss": 0.83060989, "memory(GiB)": 135.49, "step": 34710, "train_speed(iter/s)": 0.20247 }, { "acc": 0.76837063, "epoch": 0.8100438909870369, "grad_norm": 6.71875, "learning_rate": 6.743450432890431e-06, "loss": 0.83674498, "memory(GiB)": 135.49, "step": 34720, "train_speed(iter/s)": 0.2025 }, { "acc": 0.77199278, "epoch": 0.8102771985593258, "grad_norm": 6.875, "learning_rate": 6.741679761124798e-06, "loss": 0.79722891, "memory(GiB)": 135.49, "step": 34730, "train_speed(iter/s)": 0.202529 }, { "acc": 0.77318316, "epoch": 0.8105105061316147, "grad_norm": 7.5625, "learning_rate": 6.739908840734885e-06, "loss": 0.81356478, "memory(GiB)": 135.49, "step": 34740, "train_speed(iter/s)": 0.202559 }, { "acc": 0.77957835, "epoch": 0.8107438137039036, "grad_norm": 5.625, "learning_rate": 6.738137671973492e-06, "loss": 0.79853497, "memory(GiB)": 135.49, "step": 34750, "train_speed(iter/s)": 0.202588 }, { "acc": 0.75123153, "epoch": 0.8109771212761924, "grad_norm": 6.84375, "learning_rate": 6.736366255093449e-06, "loss": 0.89982567, "memory(GiB)": 135.49, "step": 34760, "train_speed(iter/s)": 0.202621 }, { "acc": 0.77069273, "epoch": 0.8112104288484813, "grad_norm": 5.625, "learning_rate": 6.73459459034763e-06, "loss": 0.81729832, "memory(GiB)": 135.49, "step": 34770, "train_speed(iter/s)": 0.202652 }, { "acc": 0.77292175, "epoch": 0.8114437364207702, "grad_norm": 7.25, "learning_rate": 6.732822677988935e-06, "loss": 0.82419147, "memory(GiB)": 135.49, "step": 34780, "train_speed(iter/s)": 0.202682 }, { "acc": 0.76636286, "epoch": 0.8116770439930591, "grad_norm": 6.46875, "learning_rate": 6.731050518270307e-06, "loss": 0.84070797, "memory(GiB)": 135.49, "step": 34790, "train_speed(iter/s)": 0.202711 }, { "acc": 0.77017846, "epoch": 0.811910351565348, "grad_norm": 7.6875, "learning_rate": 6.729278111444721e-06, "loss": 0.85162334, "memory(GiB)": 135.49, "step": 34800, "train_speed(iter/s)": 0.20274 }, { "acc": 0.75391207, "epoch": 0.8121436591376369, "grad_norm": 4.25, "learning_rate": 6.727505457765185e-06, "loss": 0.89621372, "memory(GiB)": 135.49, "step": 34810, "train_speed(iter/s)": 0.20277 }, { "acc": 0.77734513, "epoch": 0.8123769667099258, "grad_norm": 8.6875, "learning_rate": 6.725732557484748e-06, "loss": 0.78659868, "memory(GiB)": 135.49, "step": 34820, "train_speed(iter/s)": 0.202801 }, { "acc": 0.75833912, "epoch": 0.8126102742822147, "grad_norm": 8.375, "learning_rate": 6.723959410856489e-06, "loss": 0.86880722, "memory(GiB)": 135.49, "step": 34830, "train_speed(iter/s)": 0.202829 }, { "acc": 0.76282129, "epoch": 0.8128435818545036, "grad_norm": 12.3125, "learning_rate": 6.722186018133525e-06, "loss": 0.87164001, "memory(GiB)": 135.49, "step": 34840, "train_speed(iter/s)": 0.20286 }, { "acc": 0.76781626, "epoch": 0.8130768894267925, "grad_norm": 7.875, "learning_rate": 6.720412379569008e-06, "loss": 0.82370806, "memory(GiB)": 135.49, "step": 34850, "train_speed(iter/s)": 0.202891 }, { "acc": 0.78474965, "epoch": 0.8133101969990814, "grad_norm": 6.625, "learning_rate": 6.718638495416124e-06, "loss": 0.77818871, "memory(GiB)": 135.49, "step": 34860, "train_speed(iter/s)": 0.202923 }, { "acc": 0.77277031, "epoch": 0.8135435045713703, "grad_norm": 7.46875, "learning_rate": 6.716864365928094e-06, "loss": 0.82348652, "memory(GiB)": 135.49, "step": 34870, "train_speed(iter/s)": 0.202951 }, { "acc": 0.76650515, "epoch": 0.8137768121436592, "grad_norm": 9.8125, "learning_rate": 6.715089991358174e-06, "loss": 0.85443516, "memory(GiB)": 135.49, "step": 34880, "train_speed(iter/s)": 0.202981 }, { "acc": 0.76595945, "epoch": 0.814010119715948, "grad_norm": 4.9375, "learning_rate": 6.713315371959656e-06, "loss": 0.83511705, "memory(GiB)": 135.49, "step": 34890, "train_speed(iter/s)": 0.203013 }, { "acc": 0.75873184, "epoch": 0.814243427288237, "grad_norm": 5.75, "learning_rate": 6.7115405079858656e-06, "loss": 0.87966213, "memory(GiB)": 135.49, "step": 34900, "train_speed(iter/s)": 0.203045 }, { "acc": 0.7801652, "epoch": 0.8144767348605259, "grad_norm": 5.9375, "learning_rate": 6.709765399690164e-06, "loss": 0.79271221, "memory(GiB)": 135.49, "step": 34910, "train_speed(iter/s)": 0.203077 }, { "acc": 0.75672255, "epoch": 0.8147100424328148, "grad_norm": 5.0625, "learning_rate": 6.707990047325952e-06, "loss": 0.8850729, "memory(GiB)": 135.49, "step": 34920, "train_speed(iter/s)": 0.203107 }, { "acc": 0.75571365, "epoch": 0.8149433500051037, "grad_norm": 5.4375, "learning_rate": 6.706214451146654e-06, "loss": 0.87981462, "memory(GiB)": 135.49, "step": 34930, "train_speed(iter/s)": 0.203137 }, { "acc": 0.77973862, "epoch": 0.8151766575773925, "grad_norm": 4.40625, "learning_rate": 6.70443861140574e-06, "loss": 0.79291983, "memory(GiB)": 135.49, "step": 34940, "train_speed(iter/s)": 0.203166 }, { "acc": 0.76949816, "epoch": 0.8154099651496813, "grad_norm": 6.25, "learning_rate": 6.702662528356709e-06, "loss": 0.82350006, "memory(GiB)": 135.49, "step": 34950, "train_speed(iter/s)": 0.203196 }, { "acc": 0.76434851, "epoch": 0.8156432727219702, "grad_norm": 6.78125, "learning_rate": 6.700886202253096e-06, "loss": 0.8617939, "memory(GiB)": 135.49, "step": 34960, "train_speed(iter/s)": 0.203226 }, { "acc": 0.75825796, "epoch": 0.8158765802942591, "grad_norm": 5.625, "learning_rate": 6.699109633348473e-06, "loss": 0.89965639, "memory(GiB)": 135.49, "step": 34970, "train_speed(iter/s)": 0.203257 }, { "acc": 0.76554809, "epoch": 0.816109887866548, "grad_norm": 10.9375, "learning_rate": 6.697332821896443e-06, "loss": 0.84881496, "memory(GiB)": 135.49, "step": 34980, "train_speed(iter/s)": 0.203287 }, { "acc": 0.75921764, "epoch": 0.8163431954388369, "grad_norm": 4.46875, "learning_rate": 6.695555768150644e-06, "loss": 0.86362915, "memory(GiB)": 135.49, "step": 34990, "train_speed(iter/s)": 0.203318 }, { "acc": 0.77602119, "epoch": 0.8165765030111258, "grad_norm": 3.421875, "learning_rate": 6.693778472364754e-06, "loss": 0.83269863, "memory(GiB)": 135.49, "step": 35000, "train_speed(iter/s)": 0.203349 }, { "epoch": 0.8165765030111258, "eval_acc": 0.7347021126237904, "eval_loss": 0.8354312181472778, "eval_runtime": 1263.2183, "eval_samples_per_second": 28.492, "eval_steps_per_second": 14.246, "step": 35000 }, { "acc": 0.76070886, "epoch": 0.8168098105834147, "grad_norm": 5.78125, "learning_rate": 6.692000934792479e-06, "loss": 0.85987244, "memory(GiB)": 135.49, "step": 35010, "train_speed(iter/s)": 0.20187 }, { "acc": 0.78013678, "epoch": 0.8170431181557036, "grad_norm": 7.40625, "learning_rate": 6.6902231556875605e-06, "loss": 0.79393625, "memory(GiB)": 135.49, "step": 35020, "train_speed(iter/s)": 0.2019 }, { "acc": 0.75598922, "epoch": 0.8172764257279925, "grad_norm": 6.96875, "learning_rate": 6.688445135303779e-06, "loss": 0.85087204, "memory(GiB)": 135.49, "step": 35030, "train_speed(iter/s)": 0.20193 }, { "acc": 0.78796244, "epoch": 0.8175097333002814, "grad_norm": 8.6875, "learning_rate": 6.686666873894945e-06, "loss": 0.74264903, "memory(GiB)": 135.49, "step": 35040, "train_speed(iter/s)": 0.201957 }, { "acc": 0.76834416, "epoch": 0.8177430408725703, "grad_norm": 5.71875, "learning_rate": 6.684888371714903e-06, "loss": 0.84269524, "memory(GiB)": 135.49, "step": 35050, "train_speed(iter/s)": 0.201985 }, { "acc": 0.77359486, "epoch": 0.8179763484448592, "grad_norm": 5.71875, "learning_rate": 6.683109629017536e-06, "loss": 0.81972351, "memory(GiB)": 135.49, "step": 35060, "train_speed(iter/s)": 0.202013 }, { "acc": 0.75968628, "epoch": 0.8182096560171481, "grad_norm": 5.125, "learning_rate": 6.681330646056758e-06, "loss": 0.87021561, "memory(GiB)": 135.49, "step": 35070, "train_speed(iter/s)": 0.202045 }, { "acc": 0.75461664, "epoch": 0.818442963589437, "grad_norm": 5.09375, "learning_rate": 6.679551423086521e-06, "loss": 0.91008358, "memory(GiB)": 135.49, "step": 35080, "train_speed(iter/s)": 0.202075 }, { "acc": 0.7547791, "epoch": 0.8186762711617259, "grad_norm": 5.6875, "learning_rate": 6.677771960360806e-06, "loss": 0.88707876, "memory(GiB)": 135.49, "step": 35090, "train_speed(iter/s)": 0.202106 }, { "acc": 0.77154684, "epoch": 0.8189095787340148, "grad_norm": 21.25, "learning_rate": 6.6759922581336285e-06, "loss": 0.83411198, "memory(GiB)": 135.49, "step": 35100, "train_speed(iter/s)": 0.202136 }, { "acc": 0.77028513, "epoch": 0.8191428863063037, "grad_norm": 4.84375, "learning_rate": 6.674212316659045e-06, "loss": 0.82057581, "memory(GiB)": 135.49, "step": 35110, "train_speed(iter/s)": 0.202165 }, { "acc": 0.75147638, "epoch": 0.8193761938785926, "grad_norm": 9.3125, "learning_rate": 6.6724321361911384e-06, "loss": 0.90325918, "memory(GiB)": 135.49, "step": 35120, "train_speed(iter/s)": 0.202195 }, { "acc": 0.75582671, "epoch": 0.8196095014508815, "grad_norm": 5.53125, "learning_rate": 6.6706517169840305e-06, "loss": 0.88464231, "memory(GiB)": 135.49, "step": 35130, "train_speed(iter/s)": 0.202226 }, { "acc": 0.75063915, "epoch": 0.8198428090231703, "grad_norm": 4.53125, "learning_rate": 6.668871059291875e-06, "loss": 0.90451097, "memory(GiB)": 135.49, "step": 35140, "train_speed(iter/s)": 0.202257 }, { "acc": 0.77949123, "epoch": 0.8200761165954592, "grad_norm": 4.96875, "learning_rate": 6.667090163368863e-06, "loss": 0.78842239, "memory(GiB)": 135.49, "step": 35150, "train_speed(iter/s)": 0.202287 }, { "acc": 0.76828089, "epoch": 0.8203094241677481, "grad_norm": 5.375, "learning_rate": 6.665309029469214e-06, "loss": 0.84408112, "memory(GiB)": 135.49, "step": 35160, "train_speed(iter/s)": 0.202317 }, { "acc": 0.77359524, "epoch": 0.820542731740037, "grad_norm": 5.28125, "learning_rate": 6.663527657847182e-06, "loss": 0.8122385, "memory(GiB)": 135.49, "step": 35170, "train_speed(iter/s)": 0.202346 }, { "acc": 0.76958046, "epoch": 0.8207760393123259, "grad_norm": 5.6875, "learning_rate": 6.661746048757061e-06, "loss": 0.83712263, "memory(GiB)": 135.49, "step": 35180, "train_speed(iter/s)": 0.202377 }, { "acc": 0.77402115, "epoch": 0.8210093468846148, "grad_norm": 6.78125, "learning_rate": 6.6599642024531755e-06, "loss": 0.81916838, "memory(GiB)": 135.49, "step": 35190, "train_speed(iter/s)": 0.202407 }, { "acc": 0.78216777, "epoch": 0.8212426544569037, "grad_norm": 5.34375, "learning_rate": 6.658182119189882e-06, "loss": 0.78543687, "memory(GiB)": 135.49, "step": 35200, "train_speed(iter/s)": 0.202436 }, { "acc": 0.75692053, "epoch": 0.8214759620291926, "grad_norm": 6.03125, "learning_rate": 6.656399799221572e-06, "loss": 0.8801712, "memory(GiB)": 135.49, "step": 35210, "train_speed(iter/s)": 0.202467 }, { "acc": 0.76173639, "epoch": 0.8217092696014815, "grad_norm": 7.125, "learning_rate": 6.654617242802672e-06, "loss": 0.87202244, "memory(GiB)": 135.49, "step": 35220, "train_speed(iter/s)": 0.202495 }, { "acc": 0.76898298, "epoch": 0.8219425771737704, "grad_norm": 5.84375, "learning_rate": 6.652834450187643e-06, "loss": 0.84956436, "memory(GiB)": 135.49, "step": 35230, "train_speed(iter/s)": 0.202525 }, { "acc": 0.77897587, "epoch": 0.8221758847460593, "grad_norm": 5.34375, "learning_rate": 6.651051421630974e-06, "loss": 0.80099325, "memory(GiB)": 135.49, "step": 35240, "train_speed(iter/s)": 0.202555 }, { "acc": 0.77690496, "epoch": 0.8224091923183482, "grad_norm": 5.71875, "learning_rate": 6.649268157387195e-06, "loss": 0.81550198, "memory(GiB)": 135.49, "step": 35250, "train_speed(iter/s)": 0.202587 }, { "acc": 0.76068206, "epoch": 0.8226424998906371, "grad_norm": 5.78125, "learning_rate": 6.647484657710867e-06, "loss": 0.87270164, "memory(GiB)": 135.49, "step": 35260, "train_speed(iter/s)": 0.202617 }, { "acc": 0.78375969, "epoch": 0.822875807462926, "grad_norm": 4.8125, "learning_rate": 6.645700922856582e-06, "loss": 0.78689084, "memory(GiB)": 135.49, "step": 35270, "train_speed(iter/s)": 0.202646 }, { "acc": 0.77160439, "epoch": 0.8231091150352149, "grad_norm": 6.8125, "learning_rate": 6.643916953078966e-06, "loss": 0.83683376, "memory(GiB)": 135.49, "step": 35280, "train_speed(iter/s)": 0.202676 }, { "acc": 0.76755419, "epoch": 0.8233424226075038, "grad_norm": 4.875, "learning_rate": 6.642132748632685e-06, "loss": 0.84104385, "memory(GiB)": 135.49, "step": 35290, "train_speed(iter/s)": 0.202705 }, { "acc": 0.76896658, "epoch": 0.8235757301797927, "grad_norm": 8.6875, "learning_rate": 6.640348309772431e-06, "loss": 0.84403772, "memory(GiB)": 135.49, "step": 35300, "train_speed(iter/s)": 0.202737 }, { "acc": 0.76595445, "epoch": 0.8238090377520816, "grad_norm": 4.9375, "learning_rate": 6.638563636752932e-06, "loss": 0.8534812, "memory(GiB)": 135.49, "step": 35310, "train_speed(iter/s)": 0.202766 }, { "acc": 0.7702178, "epoch": 0.8240423453243705, "grad_norm": 8.0, "learning_rate": 6.63677872982895e-06, "loss": 0.827845, "memory(GiB)": 135.49, "step": 35320, "train_speed(iter/s)": 0.202796 }, { "acc": 0.7839695, "epoch": 0.8242756528966593, "grad_norm": 6.75, "learning_rate": 6.634993589255278e-06, "loss": 0.7924387, "memory(GiB)": 135.49, "step": 35330, "train_speed(iter/s)": 0.202825 }, { "acc": 0.77943168, "epoch": 0.8245089604689482, "grad_norm": 9.3125, "learning_rate": 6.633208215286748e-06, "loss": 0.79969611, "memory(GiB)": 135.49, "step": 35340, "train_speed(iter/s)": 0.202852 }, { "acc": 0.76438208, "epoch": 0.8247422680412371, "grad_norm": 8.8125, "learning_rate": 6.6314226081782195e-06, "loss": 0.83449993, "memory(GiB)": 135.49, "step": 35350, "train_speed(iter/s)": 0.202881 }, { "acc": 0.77479038, "epoch": 0.824975575613526, "grad_norm": 5.9375, "learning_rate": 6.6296367681845875e-06, "loss": 0.79331341, "memory(GiB)": 135.49, "step": 35360, "train_speed(iter/s)": 0.202911 }, { "acc": 0.7632163, "epoch": 0.8252088831858149, "grad_norm": 6.65625, "learning_rate": 6.62785069556078e-06, "loss": 0.87866068, "memory(GiB)": 135.49, "step": 35370, "train_speed(iter/s)": 0.202942 }, { "acc": 0.76922092, "epoch": 0.8254421907581038, "grad_norm": 7.28125, "learning_rate": 6.6260643905617605e-06, "loss": 0.8181221, "memory(GiB)": 135.49, "step": 35380, "train_speed(iter/s)": 0.202972 }, { "acc": 0.77836666, "epoch": 0.8256754983303927, "grad_norm": 7.0, "learning_rate": 6.624277853442519e-06, "loss": 0.8111557, "memory(GiB)": 135.49, "step": 35390, "train_speed(iter/s)": 0.203001 }, { "acc": 0.77737522, "epoch": 0.8259088059026816, "grad_norm": 9.4375, "learning_rate": 6.622491084458087e-06, "loss": 0.80064812, "memory(GiB)": 135.49, "step": 35400, "train_speed(iter/s)": 0.203032 }, { "acc": 0.77465582, "epoch": 0.8261421134749705, "grad_norm": 6.0, "learning_rate": 6.620704083863523e-06, "loss": 0.82855377, "memory(GiB)": 135.49, "step": 35410, "train_speed(iter/s)": 0.203061 }, { "acc": 0.76455965, "epoch": 0.8263754210472594, "grad_norm": 6.125, "learning_rate": 6.618916851913923e-06, "loss": 0.8404623, "memory(GiB)": 135.49, "step": 35420, "train_speed(iter/s)": 0.203091 }, { "acc": 0.76800165, "epoch": 0.8266087286195483, "grad_norm": 8.4375, "learning_rate": 6.617129388864412e-06, "loss": 0.84971743, "memory(GiB)": 135.49, "step": 35430, "train_speed(iter/s)": 0.20312 }, { "acc": 0.76832962, "epoch": 0.8268420361918372, "grad_norm": 9.875, "learning_rate": 6.615341694970151e-06, "loss": 0.83105669, "memory(GiB)": 135.49, "step": 35440, "train_speed(iter/s)": 0.203148 }, { "acc": 0.78073254, "epoch": 0.8270753437641261, "grad_norm": 5.65625, "learning_rate": 6.613553770486333e-06, "loss": 0.78424454, "memory(GiB)": 135.49, "step": 35450, "train_speed(iter/s)": 0.203178 }, { "acc": 0.78676023, "epoch": 0.827308651336415, "grad_norm": 4.65625, "learning_rate": 6.611765615668182e-06, "loss": 0.76358309, "memory(GiB)": 135.49, "step": 35460, "train_speed(iter/s)": 0.20321 }, { "acc": 0.78013291, "epoch": 0.8275419589087039, "grad_norm": 4.4375, "learning_rate": 6.609977230770957e-06, "loss": 0.78428259, "memory(GiB)": 135.49, "step": 35470, "train_speed(iter/s)": 0.20324 }, { "acc": 0.77036591, "epoch": 0.8277752664809928, "grad_norm": 6.9375, "learning_rate": 6.608188616049951e-06, "loss": 0.83009224, "memory(GiB)": 135.49, "step": 35480, "train_speed(iter/s)": 0.203269 }, { "acc": 0.77365985, "epoch": 0.8280085740532817, "grad_norm": 6.15625, "learning_rate": 6.606399771760487e-06, "loss": 0.82847281, "memory(GiB)": 135.49, "step": 35490, "train_speed(iter/s)": 0.203299 }, { "acc": 0.78282738, "epoch": 0.8282418816255706, "grad_norm": 6.5, "learning_rate": 6.6046106981579216e-06, "loss": 0.77873158, "memory(GiB)": 135.49, "step": 35500, "train_speed(iter/s)": 0.203327 }, { "epoch": 0.8282418816255706, "eval_acc": 0.7348641008844817, "eval_loss": 0.8353628516197205, "eval_runtime": 1262.6414, "eval_samples_per_second": 28.505, "eval_steps_per_second": 14.253, "step": 35500 }, { "acc": 0.7761445, "epoch": 0.8284751891978595, "grad_norm": 8.5625, "learning_rate": 6.6028213954976474e-06, "loss": 0.79239721, "memory(GiB)": 135.49, "step": 35510, "train_speed(iter/s)": 0.201868 }, { "acc": 0.76057062, "epoch": 0.8287084967701484, "grad_norm": 5.875, "learning_rate": 6.601031864035082e-06, "loss": 0.87912045, "memory(GiB)": 135.49, "step": 35520, "train_speed(iter/s)": 0.201897 }, { "acc": 0.78623648, "epoch": 0.8289418043424371, "grad_norm": 6.03125, "learning_rate": 6.5992421040256834e-06, "loss": 0.76814694, "memory(GiB)": 135.49, "step": 35530, "train_speed(iter/s)": 0.201927 }, { "acc": 0.75459986, "epoch": 0.829175111914726, "grad_norm": 5.65625, "learning_rate": 6.597452115724939e-06, "loss": 0.89151554, "memory(GiB)": 135.49, "step": 35540, "train_speed(iter/s)": 0.201955 }, { "acc": 0.76101375, "epoch": 0.8294084194870149, "grad_norm": 6.5, "learning_rate": 6.5956618993883716e-06, "loss": 0.8624505, "memory(GiB)": 135.49, "step": 35550, "train_speed(iter/s)": 0.201985 }, { "acc": 0.78237309, "epoch": 0.8296417270593038, "grad_norm": 5.625, "learning_rate": 6.59387145527153e-06, "loss": 0.79737096, "memory(GiB)": 135.49, "step": 35560, "train_speed(iter/s)": 0.202012 }, { "acc": 0.75929461, "epoch": 0.8298750346315927, "grad_norm": 6.34375, "learning_rate": 6.59208078363e-06, "loss": 0.86149235, "memory(GiB)": 135.49, "step": 35570, "train_speed(iter/s)": 0.202042 }, { "acc": 0.77096329, "epoch": 0.8301083422038816, "grad_norm": 5.96875, "learning_rate": 6.590289884719403e-06, "loss": 0.8376997, "memory(GiB)": 135.49, "step": 35580, "train_speed(iter/s)": 0.202072 }, { "acc": 0.77389603, "epoch": 0.8303416497761705, "grad_norm": 6.125, "learning_rate": 6.588498758795386e-06, "loss": 0.81822805, "memory(GiB)": 135.49, "step": 35590, "train_speed(iter/s)": 0.202102 }, { "acc": 0.77243719, "epoch": 0.8305749573484594, "grad_norm": 5.125, "learning_rate": 6.586707406113632e-06, "loss": 0.81115446, "memory(GiB)": 135.49, "step": 35600, "train_speed(iter/s)": 0.202132 }, { "acc": 0.76190052, "epoch": 0.8308082649207483, "grad_norm": 7.875, "learning_rate": 6.5849158269298565e-06, "loss": 0.87566128, "memory(GiB)": 135.49, "step": 35610, "train_speed(iter/s)": 0.202162 }, { "acc": 0.76043797, "epoch": 0.8310415724930372, "grad_norm": 6.78125, "learning_rate": 6.583124021499807e-06, "loss": 0.87436037, "memory(GiB)": 135.49, "step": 35620, "train_speed(iter/s)": 0.202192 }, { "acc": 0.75144691, "epoch": 0.8312748800653261, "grad_norm": 6.96875, "learning_rate": 6.581331990079264e-06, "loss": 0.89469109, "memory(GiB)": 135.49, "step": 35630, "train_speed(iter/s)": 0.202221 }, { "acc": 0.75292072, "epoch": 0.831508187637615, "grad_norm": 5.6875, "learning_rate": 6.579539732924038e-06, "loss": 0.87996674, "memory(GiB)": 135.49, "step": 35640, "train_speed(iter/s)": 0.20225 }, { "acc": 0.75121789, "epoch": 0.8317414952099039, "grad_norm": 6.5625, "learning_rate": 6.5777472502899765e-06, "loss": 0.919417, "memory(GiB)": 135.49, "step": 35650, "train_speed(iter/s)": 0.202279 }, { "acc": 0.7688859, "epoch": 0.8319748027821928, "grad_norm": 8.875, "learning_rate": 6.5759545424329514e-06, "loss": 0.84883003, "memory(GiB)": 135.49, "step": 35660, "train_speed(iter/s)": 0.202306 }, { "acc": 0.74698782, "epoch": 0.8322081103544817, "grad_norm": 5.96875, "learning_rate": 6.574161609608873e-06, "loss": 0.90708427, "memory(GiB)": 135.49, "step": 35670, "train_speed(iter/s)": 0.202337 }, { "acc": 0.75089073, "epoch": 0.8324414179267706, "grad_norm": 5.96875, "learning_rate": 6.572368452073683e-06, "loss": 0.91331244, "memory(GiB)": 135.49, "step": 35680, "train_speed(iter/s)": 0.202365 }, { "acc": 0.78096085, "epoch": 0.8326747254990595, "grad_norm": 5.40625, "learning_rate": 6.570575070083351e-06, "loss": 0.78200617, "memory(GiB)": 135.49, "step": 35690, "train_speed(iter/s)": 0.202395 }, { "acc": 0.77464838, "epoch": 0.8329080330713484, "grad_norm": 10.1875, "learning_rate": 6.5687814638938865e-06, "loss": 0.80141621, "memory(GiB)": 135.49, "step": 35700, "train_speed(iter/s)": 0.202424 }, { "acc": 0.77669387, "epoch": 0.8331413406436373, "grad_norm": 7.0625, "learning_rate": 6.566987633761323e-06, "loss": 0.80451698, "memory(GiB)": 135.49, "step": 35710, "train_speed(iter/s)": 0.202453 }, { "acc": 0.76671748, "epoch": 0.8333746482159261, "grad_norm": 6.96875, "learning_rate": 6.5651935799417295e-06, "loss": 0.8627923, "memory(GiB)": 135.49, "step": 35720, "train_speed(iter/s)": 0.202482 }, { "acc": 0.77285118, "epoch": 0.833607955788215, "grad_norm": 5.40625, "learning_rate": 6.563399302691209e-06, "loss": 0.81943178, "memory(GiB)": 135.49, "step": 35730, "train_speed(iter/s)": 0.202511 }, { "acc": 0.77007408, "epoch": 0.8338412633605039, "grad_norm": 6.21875, "learning_rate": 6.561604802265891e-06, "loss": 0.84325275, "memory(GiB)": 135.49, "step": 35740, "train_speed(iter/s)": 0.202541 }, { "acc": 0.773312, "epoch": 0.8340745709327928, "grad_norm": 4.9375, "learning_rate": 6.55981007892194e-06, "loss": 0.80160437, "memory(GiB)": 135.49, "step": 35750, "train_speed(iter/s)": 0.202572 }, { "acc": 0.78434076, "epoch": 0.8343078785050817, "grad_norm": 5.25, "learning_rate": 6.558015132915554e-06, "loss": 0.77656713, "memory(GiB)": 135.49, "step": 35760, "train_speed(iter/s)": 0.202601 }, { "acc": 0.75565214, "epoch": 0.8345411860773706, "grad_norm": 7.4375, "learning_rate": 6.556219964502961e-06, "loss": 0.88104286, "memory(GiB)": 135.49, "step": 35770, "train_speed(iter/s)": 0.20263 }, { "acc": 0.75791149, "epoch": 0.8347744936496595, "grad_norm": 7.0625, "learning_rate": 6.5544245739404196e-06, "loss": 0.86384125, "memory(GiB)": 135.49, "step": 35780, "train_speed(iter/s)": 0.202656 }, { "acc": 0.78536572, "epoch": 0.8350078012219484, "grad_norm": 6.65625, "learning_rate": 6.552628961484222e-06, "loss": 0.76137533, "memory(GiB)": 135.49, "step": 35790, "train_speed(iter/s)": 0.202685 }, { "acc": 0.7715621, "epoch": 0.8352411087942373, "grad_norm": 6.8125, "learning_rate": 6.550833127390692e-06, "loss": 0.83115578, "memory(GiB)": 135.49, "step": 35800, "train_speed(iter/s)": 0.202714 }, { "acc": 0.75033894, "epoch": 0.8354744163665262, "grad_norm": 5.4375, "learning_rate": 6.549037071916184e-06, "loss": 0.92578382, "memory(GiB)": 135.49, "step": 35810, "train_speed(iter/s)": 0.202744 }, { "acc": 0.75607443, "epoch": 0.8357077239388151, "grad_norm": 5.0625, "learning_rate": 6.547240795317081e-06, "loss": 0.86899033, "memory(GiB)": 135.49, "step": 35820, "train_speed(iter/s)": 0.202772 }, { "acc": 0.77274904, "epoch": 0.835941031511104, "grad_norm": 9.125, "learning_rate": 6.545444297849808e-06, "loss": 0.80701113, "memory(GiB)": 135.49, "step": 35830, "train_speed(iter/s)": 0.202803 }, { "acc": 0.7686852, "epoch": 0.8361743390833929, "grad_norm": 4.90625, "learning_rate": 6.543647579770806e-06, "loss": 0.8194252, "memory(GiB)": 135.49, "step": 35840, "train_speed(iter/s)": 0.202829 }, { "acc": 0.76463118, "epoch": 0.8364076466556818, "grad_norm": 6.625, "learning_rate": 6.5418506413365634e-06, "loss": 0.8247427, "memory(GiB)": 135.49, "step": 35850, "train_speed(iter/s)": 0.202858 }, { "acc": 0.77797756, "epoch": 0.8366409542279707, "grad_norm": 5.0, "learning_rate": 6.5400534828035885e-06, "loss": 0.79317331, "memory(GiB)": 135.49, "step": 35860, "train_speed(iter/s)": 0.202887 }, { "acc": 0.74962153, "epoch": 0.8368742618002596, "grad_norm": 6.46875, "learning_rate": 6.538256104428427e-06, "loss": 0.9268157, "memory(GiB)": 135.49, "step": 35870, "train_speed(iter/s)": 0.202917 }, { "acc": 0.75981746, "epoch": 0.8371075693725485, "grad_norm": 6.28125, "learning_rate": 6.536458506467654e-06, "loss": 0.87663269, "memory(GiB)": 135.49, "step": 35880, "train_speed(iter/s)": 0.202948 }, { "acc": 0.75019522, "epoch": 0.8373408769448374, "grad_norm": 4.40625, "learning_rate": 6.5346606891778755e-06, "loss": 0.89643078, "memory(GiB)": 135.49, "step": 35890, "train_speed(iter/s)": 0.202979 }, { "acc": 0.7523303, "epoch": 0.8375741845171263, "grad_norm": 6.8125, "learning_rate": 6.532862652815728e-06, "loss": 0.8930747, "memory(GiB)": 135.49, "step": 35900, "train_speed(iter/s)": 0.203011 }, { "acc": 0.76003981, "epoch": 0.8378074920894151, "grad_norm": 5.875, "learning_rate": 6.531064397637883e-06, "loss": 0.87069206, "memory(GiB)": 135.49, "step": 35910, "train_speed(iter/s)": 0.203041 }, { "acc": 0.77827349, "epoch": 0.838040799661704, "grad_norm": 7.125, "learning_rate": 6.529265923901039e-06, "loss": 0.81698723, "memory(GiB)": 135.49, "step": 35920, "train_speed(iter/s)": 0.203072 }, { "acc": 0.76007066, "epoch": 0.8382741072339929, "grad_norm": 6.3125, "learning_rate": 6.527467231861929e-06, "loss": 0.86264648, "memory(GiB)": 135.49, "step": 35930, "train_speed(iter/s)": 0.203102 }, { "acc": 0.7699296, "epoch": 0.8385074148062818, "grad_norm": 6.53125, "learning_rate": 6.525668321777317e-06, "loss": 0.82459459, "memory(GiB)": 135.49, "step": 35940, "train_speed(iter/s)": 0.203131 }, { "acc": 0.76054144, "epoch": 0.8387407223785707, "grad_norm": 5.71875, "learning_rate": 6.523869193903994e-06, "loss": 0.87803087, "memory(GiB)": 135.49, "step": 35950, "train_speed(iter/s)": 0.203162 }, { "acc": 0.76383591, "epoch": 0.8389740299508596, "grad_norm": 4.53125, "learning_rate": 6.522069848498787e-06, "loss": 0.84045429, "memory(GiB)": 135.49, "step": 35960, "train_speed(iter/s)": 0.203192 }, { "acc": 0.78461308, "epoch": 0.8392073375231485, "grad_norm": 6.125, "learning_rate": 6.5202702858185495e-06, "loss": 0.76736608, "memory(GiB)": 135.49, "step": 35970, "train_speed(iter/s)": 0.203222 }, { "acc": 0.76647739, "epoch": 0.8394406450954374, "grad_norm": 6.21875, "learning_rate": 6.518470506120171e-06, "loss": 0.83384132, "memory(GiB)": 135.49, "step": 35980, "train_speed(iter/s)": 0.20325 }, { "acc": 0.7798687, "epoch": 0.8396739526677263, "grad_norm": 7.28125, "learning_rate": 6.51667050966057e-06, "loss": 0.80682163, "memory(GiB)": 135.49, "step": 35990, "train_speed(iter/s)": 0.203282 }, { "acc": 0.76051388, "epoch": 0.8399072602400152, "grad_norm": 5.6875, "learning_rate": 6.514870296696694e-06, "loss": 0.86594753, "memory(GiB)": 135.49, "step": 36000, "train_speed(iter/s)": 0.203311 }, { "epoch": 0.8399072602400152, "eval_acc": 0.7347992410430495, "eval_loss": 0.8353312611579895, "eval_runtime": 1262.0653, "eval_samples_per_second": 28.518, "eval_steps_per_second": 14.259, "step": 36000 }, { "acc": 0.76426535, "epoch": 0.8401405678123041, "grad_norm": 6.03125, "learning_rate": 6.513069867485523e-06, "loss": 0.8566143, "memory(GiB)": 135.49, "step": 36010, "train_speed(iter/s)": 0.201874 }, { "acc": 0.75096588, "epoch": 0.840373875384593, "grad_norm": 5.03125, "learning_rate": 6.511269222284069e-06, "loss": 0.88335075, "memory(GiB)": 135.49, "step": 36020, "train_speed(iter/s)": 0.201903 }, { "acc": 0.77601719, "epoch": 0.8406071829568819, "grad_norm": 5.65625, "learning_rate": 6.509468361349371e-06, "loss": 0.80898876, "memory(GiB)": 135.49, "step": 36030, "train_speed(iter/s)": 0.201931 }, { "acc": 0.79610329, "epoch": 0.8408404905291708, "grad_norm": 4.03125, "learning_rate": 6.507667284938502e-06, "loss": 0.75573997, "memory(GiB)": 135.49, "step": 36040, "train_speed(iter/s)": 0.201961 }, { "acc": 0.76580296, "epoch": 0.8410737981014597, "grad_norm": 5.4375, "learning_rate": 6.505865993308568e-06, "loss": 0.85461864, "memory(GiB)": 135.49, "step": 36050, "train_speed(iter/s)": 0.20199 }, { "acc": 0.73387671, "epoch": 0.8413071056737486, "grad_norm": 6.21875, "learning_rate": 6.5040644867167e-06, "loss": 1.00244284, "memory(GiB)": 135.49, "step": 36060, "train_speed(iter/s)": 0.202018 }, { "acc": 0.7624814, "epoch": 0.8415404132460375, "grad_norm": 6.65625, "learning_rate": 6.502262765420064e-06, "loss": 0.88100853, "memory(GiB)": 135.49, "step": 36070, "train_speed(iter/s)": 0.202046 }, { "acc": 0.77841053, "epoch": 0.8417737208183264, "grad_norm": 8.25, "learning_rate": 6.500460829675854e-06, "loss": 0.80238352, "memory(GiB)": 135.49, "step": 36080, "train_speed(iter/s)": 0.202074 }, { "acc": 0.76954527, "epoch": 0.8420070283906153, "grad_norm": 7.34375, "learning_rate": 6.498658679741298e-06, "loss": 0.84731674, "memory(GiB)": 135.49, "step": 36090, "train_speed(iter/s)": 0.202101 }, { "acc": 0.76615686, "epoch": 0.842240335962904, "grad_norm": 5.96875, "learning_rate": 6.49685631587365e-06, "loss": 0.8495533, "memory(GiB)": 135.49, "step": 36100, "train_speed(iter/s)": 0.202131 }, { "acc": 0.7684577, "epoch": 0.8424736435351929, "grad_norm": 5.1875, "learning_rate": 6.495053738330196e-06, "loss": 0.83194046, "memory(GiB)": 135.49, "step": 36110, "train_speed(iter/s)": 0.20216 }, { "acc": 0.7729784, "epoch": 0.8427069511074818, "grad_norm": 7.1875, "learning_rate": 6.493250947368257e-06, "loss": 0.82642803, "memory(GiB)": 135.49, "step": 36120, "train_speed(iter/s)": 0.20219 }, { "acc": 0.78717041, "epoch": 0.8429402586797707, "grad_norm": 6.09375, "learning_rate": 6.491447943245179e-06, "loss": 0.76449089, "memory(GiB)": 135.49, "step": 36130, "train_speed(iter/s)": 0.202219 }, { "acc": 0.78048487, "epoch": 0.8431735662520596, "grad_norm": 7.625, "learning_rate": 6.489644726218339e-06, "loss": 0.78254175, "memory(GiB)": 135.49, "step": 36140, "train_speed(iter/s)": 0.202248 }, { "acc": 0.78323984, "epoch": 0.8434068738243485, "grad_norm": 5.25, "learning_rate": 6.4878412965451485e-06, "loss": 0.77917295, "memory(GiB)": 135.49, "step": 36150, "train_speed(iter/s)": 0.202278 }, { "acc": 0.78349152, "epoch": 0.8436401813966374, "grad_norm": 9.25, "learning_rate": 6.486037654483046e-06, "loss": 0.76649933, "memory(GiB)": 135.49, "step": 36160, "train_speed(iter/s)": 0.202308 }, { "acc": 0.76681252, "epoch": 0.8438734889689263, "grad_norm": 5.71875, "learning_rate": 6.484233800289499e-06, "loss": 0.84543715, "memory(GiB)": 135.49, "step": 36170, "train_speed(iter/s)": 0.202337 }, { "acc": 0.76467338, "epoch": 0.8441067965412152, "grad_norm": 7.15625, "learning_rate": 6.482429734222008e-06, "loss": 0.85522146, "memory(GiB)": 135.49, "step": 36180, "train_speed(iter/s)": 0.202365 }, { "acc": 0.77310014, "epoch": 0.8443401041135041, "grad_norm": 6.40625, "learning_rate": 6.4806254565381025e-06, "loss": 0.82293968, "memory(GiB)": 135.49, "step": 36190, "train_speed(iter/s)": 0.202393 }, { "acc": 0.783218, "epoch": 0.844573411685793, "grad_norm": 6.6875, "learning_rate": 6.478820967495343e-06, "loss": 0.78347168, "memory(GiB)": 135.49, "step": 36200, "train_speed(iter/s)": 0.202423 }, { "acc": 0.77448263, "epoch": 0.8448067192580819, "grad_norm": 7.09375, "learning_rate": 6.47701626735132e-06, "loss": 0.81060057, "memory(GiB)": 135.49, "step": 36210, "train_speed(iter/s)": 0.202453 }, { "acc": 0.77819977, "epoch": 0.8450400268303708, "grad_norm": 6.96875, "learning_rate": 6.475211356363655e-06, "loss": 0.76679878, "memory(GiB)": 135.49, "step": 36220, "train_speed(iter/s)": 0.202481 }, { "acc": 0.78511515, "epoch": 0.8452733344026597, "grad_norm": 4.8125, "learning_rate": 6.473406234789998e-06, "loss": 0.77728028, "memory(GiB)": 135.49, "step": 36230, "train_speed(iter/s)": 0.202509 }, { "acc": 0.77200041, "epoch": 0.8455066419749486, "grad_norm": 7.96875, "learning_rate": 6.471600902888029e-06, "loss": 0.81132793, "memory(GiB)": 135.49, "step": 36240, "train_speed(iter/s)": 0.202539 }, { "acc": 0.75738187, "epoch": 0.8457399495472375, "grad_norm": 7.75, "learning_rate": 6.4697953609154575e-06, "loss": 0.89851227, "memory(GiB)": 135.49, "step": 36250, "train_speed(iter/s)": 0.202568 }, { "acc": 0.77597265, "epoch": 0.8459732571195264, "grad_norm": 5.25, "learning_rate": 6.467989609130024e-06, "loss": 0.80828342, "memory(GiB)": 135.49, "step": 36260, "train_speed(iter/s)": 0.202595 }, { "acc": 0.74939699, "epoch": 0.8462065646918153, "grad_norm": 6.09375, "learning_rate": 6.466183647789502e-06, "loss": 0.9060256, "memory(GiB)": 135.49, "step": 36270, "train_speed(iter/s)": 0.202623 }, { "acc": 0.75386567, "epoch": 0.8464398722641042, "grad_norm": 6.53125, "learning_rate": 6.46437747715169e-06, "loss": 0.90007544, "memory(GiB)": 135.49, "step": 36280, "train_speed(iter/s)": 0.202652 }, { "acc": 0.77030287, "epoch": 0.8466731798363931, "grad_norm": 7.21875, "learning_rate": 6.462571097474419e-06, "loss": 0.82086468, "memory(GiB)": 135.49, "step": 36290, "train_speed(iter/s)": 0.20268 }, { "acc": 0.78013821, "epoch": 0.8469064874086819, "grad_norm": 5.34375, "learning_rate": 6.460764509015547e-06, "loss": 0.80401993, "memory(GiB)": 135.49, "step": 36300, "train_speed(iter/s)": 0.202707 }, { "acc": 0.77585278, "epoch": 0.8471397949809708, "grad_norm": 4.875, "learning_rate": 6.4589577120329685e-06, "loss": 0.80357723, "memory(GiB)": 135.49, "step": 36310, "train_speed(iter/s)": 0.202736 }, { "acc": 0.79904785, "epoch": 0.8473731025532597, "grad_norm": 5.96875, "learning_rate": 6.4571507067845985e-06, "loss": 0.72914448, "memory(GiB)": 135.49, "step": 36320, "train_speed(iter/s)": 0.202765 }, { "acc": 0.77416334, "epoch": 0.8476064101255486, "grad_norm": 6.59375, "learning_rate": 6.455343493528388e-06, "loss": 0.82491264, "memory(GiB)": 135.49, "step": 36330, "train_speed(iter/s)": 0.202796 }, { "acc": 0.75395832, "epoch": 0.8478397176978375, "grad_norm": 9.1875, "learning_rate": 6.4535360725223175e-06, "loss": 0.89639263, "memory(GiB)": 135.49, "step": 36340, "train_speed(iter/s)": 0.202825 }, { "acc": 0.77564607, "epoch": 0.8480730252701264, "grad_norm": 4.8125, "learning_rate": 6.451728444024394e-06, "loss": 0.81132288, "memory(GiB)": 135.49, "step": 36350, "train_speed(iter/s)": 0.202854 }, { "acc": 0.77400017, "epoch": 0.8483063328424153, "grad_norm": 7.84375, "learning_rate": 6.449920608292658e-06, "loss": 0.80483217, "memory(GiB)": 135.49, "step": 36360, "train_speed(iter/s)": 0.202881 }, { "acc": 0.75621114, "epoch": 0.8485396404147042, "grad_norm": 5.84375, "learning_rate": 6.448112565585176e-06, "loss": 0.85788298, "memory(GiB)": 135.49, "step": 36370, "train_speed(iter/s)": 0.20291 }, { "acc": 0.76833649, "epoch": 0.8487729479869931, "grad_norm": 6.375, "learning_rate": 6.446304316160046e-06, "loss": 0.83846397, "memory(GiB)": 135.49, "step": 36380, "train_speed(iter/s)": 0.202939 }, { "acc": 0.74953012, "epoch": 0.849006255559282, "grad_norm": 5.4375, "learning_rate": 6.444495860275395e-06, "loss": 0.88581829, "memory(GiB)": 135.49, "step": 36390, "train_speed(iter/s)": 0.202968 }, { "acc": 0.74788074, "epoch": 0.8492395631315709, "grad_norm": 6.6875, "learning_rate": 6.442687198189379e-06, "loss": 0.91972685, "memory(GiB)": 135.49, "step": 36400, "train_speed(iter/s)": 0.202999 }, { "acc": 0.76187749, "epoch": 0.8494728707038598, "grad_norm": 4.59375, "learning_rate": 6.440878330160185e-06, "loss": 0.86110878, "memory(GiB)": 135.49, "step": 36410, "train_speed(iter/s)": 0.203028 }, { "acc": 0.76235104, "epoch": 0.8497061782761487, "grad_norm": 6.5, "learning_rate": 6.439069256446027e-06, "loss": 0.87020512, "memory(GiB)": 135.49, "step": 36420, "train_speed(iter/s)": 0.203057 }, { "acc": 0.76487455, "epoch": 0.8499394858484376, "grad_norm": 6.65625, "learning_rate": 6.437259977305152e-06, "loss": 0.85450592, "memory(GiB)": 135.49, "step": 36430, "train_speed(iter/s)": 0.203085 }, { "acc": 0.75309591, "epoch": 0.8501727934207265, "grad_norm": 5.96875, "learning_rate": 6.435450492995833e-06, "loss": 0.89372826, "memory(GiB)": 135.49, "step": 36440, "train_speed(iter/s)": 0.203114 }, { "acc": 0.76913452, "epoch": 0.8504061009930154, "grad_norm": 7.875, "learning_rate": 6.433640803776372e-06, "loss": 0.84629211, "memory(GiB)": 135.49, "step": 36450, "train_speed(iter/s)": 0.203143 }, { "acc": 0.77584696, "epoch": 0.8506394085653043, "grad_norm": 5.84375, "learning_rate": 6.431830909905105e-06, "loss": 0.81338835, "memory(GiB)": 135.49, "step": 36460, "train_speed(iter/s)": 0.203171 }, { "acc": 0.77951775, "epoch": 0.8508727161375932, "grad_norm": 6.3125, "learning_rate": 6.43002081164039e-06, "loss": 0.80279417, "memory(GiB)": 135.49, "step": 36470, "train_speed(iter/s)": 0.2032 }, { "acc": 0.78405552, "epoch": 0.8511060237098821, "grad_norm": 10.875, "learning_rate": 6.428210509240618e-06, "loss": 0.79853077, "memory(GiB)": 135.49, "step": 36480, "train_speed(iter/s)": 0.203229 }, { "acc": 0.78300495, "epoch": 0.8513393312821709, "grad_norm": 9.3125, "learning_rate": 6.426400002964211e-06, "loss": 0.78277912, "memory(GiB)": 135.49, "step": 36490, "train_speed(iter/s)": 0.203259 }, { "acc": 0.75567656, "epoch": 0.8515726388544598, "grad_norm": 4.96875, "learning_rate": 6.42458929306962e-06, "loss": 0.90394535, "memory(GiB)": 135.49, "step": 36500, "train_speed(iter/s)": 0.203287 }, { "epoch": 0.8515726388544598, "eval_acc": 0.7348371566219962, "eval_loss": 0.8352068066596985, "eval_runtime": 1263.7867, "eval_samples_per_second": 28.479, "eval_steps_per_second": 14.24, "step": 36500 }, { "acc": 0.7794198, "epoch": 0.8518059464267487, "grad_norm": 4.84375, "learning_rate": 6.42277837981532e-06, "loss": 0.78980422, "memory(GiB)": 135.49, "step": 36510, "train_speed(iter/s)": 0.201868 }, { "acc": 0.77703648, "epoch": 0.8520392539990376, "grad_norm": 5.03125, "learning_rate": 6.420967263459821e-06, "loss": 0.80045986, "memory(GiB)": 135.49, "step": 36520, "train_speed(iter/s)": 0.201897 }, { "acc": 0.7765892, "epoch": 0.8522725615713265, "grad_norm": 6.25, "learning_rate": 6.419155944261657e-06, "loss": 0.80885487, "memory(GiB)": 135.49, "step": 36530, "train_speed(iter/s)": 0.201925 }, { "acc": 0.77154164, "epoch": 0.8525058691436154, "grad_norm": 6.6875, "learning_rate": 6.4173444224793935e-06, "loss": 0.81433783, "memory(GiB)": 135.49, "step": 36540, "train_speed(iter/s)": 0.201955 }, { "acc": 0.77375345, "epoch": 0.8527391767159043, "grad_norm": 5.09375, "learning_rate": 6.415532698371625e-06, "loss": 0.79580154, "memory(GiB)": 135.49, "step": 36550, "train_speed(iter/s)": 0.201981 }, { "acc": 0.78895602, "epoch": 0.8529724842881932, "grad_norm": 5.03125, "learning_rate": 6.413720772196976e-06, "loss": 0.74862046, "memory(GiB)": 135.49, "step": 36560, "train_speed(iter/s)": 0.202009 }, { "acc": 0.73349042, "epoch": 0.8532057918604821, "grad_norm": 6.6875, "learning_rate": 6.411908644214098e-06, "loss": 0.97448406, "memory(GiB)": 135.49, "step": 36570, "train_speed(iter/s)": 0.202039 }, { "acc": 0.76690531, "epoch": 0.853439099432771, "grad_norm": 5.375, "learning_rate": 6.410096314681671e-06, "loss": 0.83221207, "memory(GiB)": 135.49, "step": 36580, "train_speed(iter/s)": 0.20207 }, { "acc": 0.75678039, "epoch": 0.8536724070050599, "grad_norm": 5.96875, "learning_rate": 6.408283783858405e-06, "loss": 0.88160324, "memory(GiB)": 135.49, "step": 36590, "train_speed(iter/s)": 0.202099 }, { "acc": 0.77271857, "epoch": 0.8539057145773488, "grad_norm": 9.25, "learning_rate": 6.406471052003036e-06, "loss": 0.82486458, "memory(GiB)": 135.49, "step": 36600, "train_speed(iter/s)": 0.202127 }, { "acc": 0.77039185, "epoch": 0.8541390221496377, "grad_norm": 7.71875, "learning_rate": 6.4046581193743344e-06, "loss": 0.839044, "memory(GiB)": 135.49, "step": 36610, "train_speed(iter/s)": 0.202156 }, { "acc": 0.76006289, "epoch": 0.8543723297219266, "grad_norm": 8.125, "learning_rate": 6.402844986231094e-06, "loss": 0.87052498, "memory(GiB)": 135.49, "step": 36620, "train_speed(iter/s)": 0.202185 }, { "acc": 0.75323486, "epoch": 0.8546056372942155, "grad_norm": 5.375, "learning_rate": 6.401031652832141e-06, "loss": 0.88803253, "memory(GiB)": 135.49, "step": 36630, "train_speed(iter/s)": 0.202212 }, { "acc": 0.7825983, "epoch": 0.8548389448665044, "grad_norm": 5.625, "learning_rate": 6.3992181194363234e-06, "loss": 0.79396586, "memory(GiB)": 135.49, "step": 36640, "train_speed(iter/s)": 0.202241 }, { "acc": 0.7771234, "epoch": 0.8550722524387933, "grad_norm": 7.96875, "learning_rate": 6.397404386302528e-06, "loss": 0.81740055, "memory(GiB)": 135.49, "step": 36650, "train_speed(iter/s)": 0.202272 }, { "acc": 0.77157397, "epoch": 0.8553055600110822, "grad_norm": 4.15625, "learning_rate": 6.395590453689662e-06, "loss": 0.8369236, "memory(GiB)": 135.49, "step": 36660, "train_speed(iter/s)": 0.202303 }, { "acc": 0.76857042, "epoch": 0.855538867583371, "grad_norm": 5.09375, "learning_rate": 6.393776321856664e-06, "loss": 0.83402729, "memory(GiB)": 135.49, "step": 36670, "train_speed(iter/s)": 0.202331 }, { "acc": 0.75712996, "epoch": 0.8557721751556598, "grad_norm": 8.9375, "learning_rate": 6.391961991062501e-06, "loss": 0.88019085, "memory(GiB)": 135.49, "step": 36680, "train_speed(iter/s)": 0.202358 }, { "acc": 0.74991326, "epoch": 0.8560054827279487, "grad_norm": 5.09375, "learning_rate": 6.390147461566167e-06, "loss": 0.91076698, "memory(GiB)": 135.49, "step": 36690, "train_speed(iter/s)": 0.202387 }, { "acc": 0.77759185, "epoch": 0.8562387903002376, "grad_norm": 6.78125, "learning_rate": 6.388332733626689e-06, "loss": 0.77681956, "memory(GiB)": 146.85, "step": 36700, "train_speed(iter/s)": 0.202412 }, { "acc": 0.75521927, "epoch": 0.8564720978725265, "grad_norm": 7.09375, "learning_rate": 6.386517807503114e-06, "loss": 0.88957253, "memory(GiB)": 146.85, "step": 36710, "train_speed(iter/s)": 0.202441 }, { "acc": 0.7582819, "epoch": 0.8567054054448154, "grad_norm": 5.8125, "learning_rate": 6.384702683454527e-06, "loss": 0.86072292, "memory(GiB)": 146.85, "step": 36720, "train_speed(iter/s)": 0.20247 }, { "acc": 0.7583951, "epoch": 0.8569387130171043, "grad_norm": 6.59375, "learning_rate": 6.382887361740033e-06, "loss": 0.87126198, "memory(GiB)": 146.85, "step": 36730, "train_speed(iter/s)": 0.202499 }, { "acc": 0.77206464, "epoch": 0.8571720205893932, "grad_norm": 5.0, "learning_rate": 6.38107184261877e-06, "loss": 0.83784122, "memory(GiB)": 146.85, "step": 36740, "train_speed(iter/s)": 0.202526 }, { "acc": 0.77288485, "epoch": 0.8574053281616821, "grad_norm": 7.5, "learning_rate": 6.379256126349903e-06, "loss": 0.82550116, "memory(GiB)": 146.85, "step": 36750, "train_speed(iter/s)": 0.202553 }, { "acc": 0.76891479, "epoch": 0.857638635733971, "grad_norm": 5.5, "learning_rate": 6.377440213192625e-06, "loss": 0.84657898, "memory(GiB)": 146.85, "step": 36760, "train_speed(iter/s)": 0.202581 }, { "acc": 0.76705828, "epoch": 0.8578719433062599, "grad_norm": 8.8125, "learning_rate": 6.375624103406155e-06, "loss": 0.84346275, "memory(GiB)": 146.85, "step": 36770, "train_speed(iter/s)": 0.202611 }, { "acc": 0.79057465, "epoch": 0.8581052508785488, "grad_norm": 18.375, "learning_rate": 6.373807797249744e-06, "loss": 0.75275364, "memory(GiB)": 146.85, "step": 36780, "train_speed(iter/s)": 0.202639 }, { "acc": 0.77857127, "epoch": 0.8583385584508377, "grad_norm": 6.28125, "learning_rate": 6.371991294982671e-06, "loss": 0.80810919, "memory(GiB)": 146.85, "step": 36790, "train_speed(iter/s)": 0.202668 }, { "acc": 0.76918335, "epoch": 0.8585718660231266, "grad_norm": 8.5, "learning_rate": 6.370174596864238e-06, "loss": 0.84469242, "memory(GiB)": 146.85, "step": 36800, "train_speed(iter/s)": 0.202697 }, { "acc": 0.77552633, "epoch": 0.8588051735954155, "grad_norm": 4.90625, "learning_rate": 6.368357703153782e-06, "loss": 0.8026268, "memory(GiB)": 146.85, "step": 36810, "train_speed(iter/s)": 0.202726 }, { "acc": 0.77062616, "epoch": 0.8590384811677044, "grad_norm": 6.28125, "learning_rate": 6.366540614110658e-06, "loss": 0.84824009, "memory(GiB)": 146.85, "step": 36820, "train_speed(iter/s)": 0.202755 }, { "acc": 0.75651412, "epoch": 0.8592717887399933, "grad_norm": 5.4375, "learning_rate": 6.364723329994259e-06, "loss": 0.87355871, "memory(GiB)": 146.85, "step": 36830, "train_speed(iter/s)": 0.202783 }, { "acc": 0.76815929, "epoch": 0.8595050963122822, "grad_norm": 6.0, "learning_rate": 6.362905851064001e-06, "loss": 0.8414855, "memory(GiB)": 146.85, "step": 36840, "train_speed(iter/s)": 0.202812 }, { "acc": 0.74769163, "epoch": 0.8597384038845711, "grad_norm": 6.375, "learning_rate": 6.361088177579329e-06, "loss": 0.91020451, "memory(GiB)": 146.85, "step": 36850, "train_speed(iter/s)": 0.202841 }, { "acc": 0.78032255, "epoch": 0.85997171145686, "grad_norm": 6.125, "learning_rate": 6.359270309799715e-06, "loss": 0.7846859, "memory(GiB)": 146.85, "step": 36860, "train_speed(iter/s)": 0.202868 }, { "acc": 0.75718789, "epoch": 0.8602050190291488, "grad_norm": 5.15625, "learning_rate": 6.357452247984659e-06, "loss": 0.9024229, "memory(GiB)": 146.85, "step": 36870, "train_speed(iter/s)": 0.202896 }, { "acc": 0.76710005, "epoch": 0.8604383266014377, "grad_norm": 6.125, "learning_rate": 6.35563399239369e-06, "loss": 0.85293093, "memory(GiB)": 146.85, "step": 36880, "train_speed(iter/s)": 0.202924 }, { "acc": 0.77378664, "epoch": 0.8606716341737266, "grad_norm": 5.96875, "learning_rate": 6.353815543286361e-06, "loss": 0.79997177, "memory(GiB)": 146.85, "step": 36890, "train_speed(iter/s)": 0.202951 }, { "acc": 0.75631418, "epoch": 0.8609049417460155, "grad_norm": 6.5, "learning_rate": 6.351996900922257e-06, "loss": 0.8876935, "memory(GiB)": 146.85, "step": 36900, "train_speed(iter/s)": 0.20298 }, { "acc": 0.76191511, "epoch": 0.8611382493183044, "grad_norm": 5.4375, "learning_rate": 6.3501780655609875e-06, "loss": 0.8531642, "memory(GiB)": 146.85, "step": 36910, "train_speed(iter/s)": 0.203008 }, { "acc": 0.77319026, "epoch": 0.8613715568905933, "grad_norm": 6.28125, "learning_rate": 6.348359037462194e-06, "loss": 0.8259491, "memory(GiB)": 146.85, "step": 36920, "train_speed(iter/s)": 0.203036 }, { "acc": 0.76271772, "epoch": 0.8616048644628822, "grad_norm": 8.0625, "learning_rate": 6.346539816885537e-06, "loss": 0.86501427, "memory(GiB)": 146.85, "step": 36930, "train_speed(iter/s)": 0.203064 }, { "acc": 0.7567266, "epoch": 0.8618381720351711, "grad_norm": 5.78125, "learning_rate": 6.3447204040907125e-06, "loss": 0.88126659, "memory(GiB)": 146.85, "step": 36940, "train_speed(iter/s)": 0.203092 }, { "acc": 0.77447062, "epoch": 0.86207147960746, "grad_norm": 6.875, "learning_rate": 6.342900799337443e-06, "loss": 0.81148958, "memory(GiB)": 146.85, "step": 36950, "train_speed(iter/s)": 0.203121 }, { "acc": 0.78242474, "epoch": 0.8623047871797489, "grad_norm": 7.1875, "learning_rate": 6.341081002885472e-06, "loss": 0.77231989, "memory(GiB)": 146.85, "step": 36960, "train_speed(iter/s)": 0.203149 }, { "acc": 0.77273288, "epoch": 0.8625380947520378, "grad_norm": 7.75, "learning_rate": 6.33926101499458e-06, "loss": 0.8035737, "memory(GiB)": 146.85, "step": 36970, "train_speed(iter/s)": 0.203177 }, { "acc": 0.7791398, "epoch": 0.8627714023243267, "grad_norm": 9.125, "learning_rate": 6.337440835924564e-06, "loss": 0.79581847, "memory(GiB)": 146.85, "step": 36980, "train_speed(iter/s)": 0.203207 }, { "acc": 0.77498159, "epoch": 0.8630047098966156, "grad_norm": 6.53125, "learning_rate": 6.335620465935259e-06, "loss": 0.81244307, "memory(GiB)": 146.85, "step": 36990, "train_speed(iter/s)": 0.203236 }, { "acc": 0.76272602, "epoch": 0.8632380174689045, "grad_norm": 5.9375, "learning_rate": 6.333799905286519e-06, "loss": 0.86231756, "memory(GiB)": 146.85, "step": 37000, "train_speed(iter/s)": 0.203264 }, { "epoch": 0.8632380174689045, "eval_acc": 0.734903952578098, "eval_loss": 0.8351283669471741, "eval_runtime": 1262.8403, "eval_samples_per_second": 28.5, "eval_steps_per_second": 14.25, "step": 37000 }, { "acc": 0.75669231, "epoch": 0.8634713250411934, "grad_norm": 4.90625, "learning_rate": 6.331979154238232e-06, "loss": 0.89732704, "memory(GiB)": 146.85, "step": 37010, "train_speed(iter/s)": 0.201866 }, { "acc": 0.76613717, "epoch": 0.8637046326134823, "grad_norm": 5.25, "learning_rate": 6.330158213050308e-06, "loss": 0.84911232, "memory(GiB)": 146.85, "step": 37020, "train_speed(iter/s)": 0.201895 }, { "acc": 0.79158893, "epoch": 0.8639379401857712, "grad_norm": 9.0625, "learning_rate": 6.328337081982685e-06, "loss": 0.73759341, "memory(GiB)": 146.85, "step": 37030, "train_speed(iter/s)": 0.201926 }, { "acc": 0.78358526, "epoch": 0.8641712477580601, "grad_norm": 6.28125, "learning_rate": 6.326515761295328e-06, "loss": 0.77447314, "memory(GiB)": 146.85, "step": 37040, "train_speed(iter/s)": 0.201954 }, { "acc": 0.76219263, "epoch": 0.864404555330349, "grad_norm": 8.25, "learning_rate": 6.3246942512482325e-06, "loss": 0.87890377, "memory(GiB)": 146.85, "step": 37050, "train_speed(iter/s)": 0.201984 }, { "acc": 0.76982918, "epoch": 0.8646378629026379, "grad_norm": 5.59375, "learning_rate": 6.3228725521014165e-06, "loss": 0.8432498, "memory(GiB)": 146.85, "step": 37060, "train_speed(iter/s)": 0.20201 }, { "acc": 0.78085179, "epoch": 0.8648711704749267, "grad_norm": 6.6875, "learning_rate": 6.32105066411493e-06, "loss": 0.7915338, "memory(GiB)": 146.85, "step": 37070, "train_speed(iter/s)": 0.202039 }, { "acc": 0.75289202, "epoch": 0.8651044780472156, "grad_norm": 8.125, "learning_rate": 6.319228587548843e-06, "loss": 0.88680954, "memory(GiB)": 146.85, "step": 37080, "train_speed(iter/s)": 0.202065 }, { "acc": 0.77636652, "epoch": 0.8653377856195045, "grad_norm": 4.40625, "learning_rate": 6.317406322663259e-06, "loss": 0.80414228, "memory(GiB)": 146.85, "step": 37090, "train_speed(iter/s)": 0.202095 }, { "acc": 0.77660446, "epoch": 0.8655710931917934, "grad_norm": 8.0, "learning_rate": 6.315583869718306e-06, "loss": 0.82159424, "memory(GiB)": 146.85, "step": 37100, "train_speed(iter/s)": 0.202124 }, { "acc": 0.77959056, "epoch": 0.8658044007640823, "grad_norm": 7.65625, "learning_rate": 6.313761228974137e-06, "loss": 0.825383, "memory(GiB)": 146.85, "step": 37110, "train_speed(iter/s)": 0.202151 }, { "acc": 0.75777769, "epoch": 0.8660377083363712, "grad_norm": 6.96875, "learning_rate": 6.311938400690933e-06, "loss": 0.88842716, "memory(GiB)": 146.85, "step": 37120, "train_speed(iter/s)": 0.20218 }, { "acc": 0.77611876, "epoch": 0.8662710159086601, "grad_norm": 5.1875, "learning_rate": 6.310115385128905e-06, "loss": 0.79074163, "memory(GiB)": 146.85, "step": 37130, "train_speed(iter/s)": 0.202207 }, { "acc": 0.75630836, "epoch": 0.866504323480949, "grad_norm": 6.59375, "learning_rate": 6.308292182548287e-06, "loss": 0.89605522, "memory(GiB)": 146.85, "step": 37140, "train_speed(iter/s)": 0.202236 }, { "acc": 0.7565794, "epoch": 0.8667376310532379, "grad_norm": 7.5, "learning_rate": 6.3064687932093386e-06, "loss": 0.88291397, "memory(GiB)": 146.85, "step": 37150, "train_speed(iter/s)": 0.202263 }, { "acc": 0.77125406, "epoch": 0.8669709386255268, "grad_norm": 7.96875, "learning_rate": 6.3046452173723495e-06, "loss": 0.81183233, "memory(GiB)": 146.85, "step": 37160, "train_speed(iter/s)": 0.20229 }, { "acc": 0.75891685, "epoch": 0.8672042461978157, "grad_norm": 5.84375, "learning_rate": 6.302821455297635e-06, "loss": 0.88171368, "memory(GiB)": 146.85, "step": 37170, "train_speed(iter/s)": 0.202319 }, { "acc": 0.75817728, "epoch": 0.8674375537701046, "grad_norm": 6.375, "learning_rate": 6.300997507245537e-06, "loss": 0.89808273, "memory(GiB)": 146.85, "step": 37180, "train_speed(iter/s)": 0.202347 }, { "acc": 0.77158489, "epoch": 0.8676708613423935, "grad_norm": 7.15625, "learning_rate": 6.299173373476422e-06, "loss": 0.82916222, "memory(GiB)": 146.85, "step": 37190, "train_speed(iter/s)": 0.202375 }, { "acc": 0.76193256, "epoch": 0.8679041689146824, "grad_norm": 6.5625, "learning_rate": 6.2973490542506854e-06, "loss": 0.87452641, "memory(GiB)": 146.85, "step": 37200, "train_speed(iter/s)": 0.202404 }, { "acc": 0.75474472, "epoch": 0.8681374764869713, "grad_norm": 5.15625, "learning_rate": 6.295524549828747e-06, "loss": 0.87155647, "memory(GiB)": 146.85, "step": 37210, "train_speed(iter/s)": 0.202433 }, { "acc": 0.75644732, "epoch": 0.8683707840592602, "grad_norm": 8.1875, "learning_rate": 6.293699860471057e-06, "loss": 0.87375603, "memory(GiB)": 146.85, "step": 37220, "train_speed(iter/s)": 0.202462 }, { "acc": 0.75566611, "epoch": 0.8686040916315491, "grad_norm": 6.5625, "learning_rate": 6.2918749864380875e-06, "loss": 0.89887581, "memory(GiB)": 146.85, "step": 37230, "train_speed(iter/s)": 0.202492 }, { "acc": 0.77003269, "epoch": 0.868837399203838, "grad_norm": 7.09375, "learning_rate": 6.290049927990339e-06, "loss": 0.86284485, "memory(GiB)": 146.85, "step": 37240, "train_speed(iter/s)": 0.20252 }, { "acc": 0.77881246, "epoch": 0.8690707067761269, "grad_norm": 5.0, "learning_rate": 6.288224685388337e-06, "loss": 0.80547953, "memory(GiB)": 146.85, "step": 37250, "train_speed(iter/s)": 0.202548 }, { "acc": 0.76065044, "epoch": 0.8693040143484156, "grad_norm": 6.15625, "learning_rate": 6.286399258892638e-06, "loss": 0.86736526, "memory(GiB)": 146.85, "step": 37260, "train_speed(iter/s)": 0.202577 }, { "acc": 0.76645474, "epoch": 0.8695373219207045, "grad_norm": 6.34375, "learning_rate": 6.284573648763816e-06, "loss": 0.83775511, "memory(GiB)": 146.85, "step": 37270, "train_speed(iter/s)": 0.202606 }, { "acc": 0.77470179, "epoch": 0.8697706294929934, "grad_norm": 5.25, "learning_rate": 6.28274785526248e-06, "loss": 0.82792587, "memory(GiB)": 146.85, "step": 37280, "train_speed(iter/s)": 0.202635 }, { "acc": 0.77726831, "epoch": 0.8700039370652823, "grad_norm": 9.5625, "learning_rate": 6.2809218786492595e-06, "loss": 0.7971715, "memory(GiB)": 146.85, "step": 37290, "train_speed(iter/s)": 0.202665 }, { "acc": 0.77360249, "epoch": 0.8702372446375712, "grad_norm": 7.03125, "learning_rate": 6.279095719184813e-06, "loss": 0.81482363, "memory(GiB)": 146.85, "step": 37300, "train_speed(iter/s)": 0.202692 }, { "acc": 0.76850772, "epoch": 0.8704705522098601, "grad_norm": 5.5, "learning_rate": 6.277269377129826e-06, "loss": 0.82174206, "memory(GiB)": 146.85, "step": 37310, "train_speed(iter/s)": 0.202721 }, { "acc": 0.77657328, "epoch": 0.870703859782149, "grad_norm": 7.96875, "learning_rate": 6.275442852745005e-06, "loss": 0.80740891, "memory(GiB)": 146.85, "step": 37320, "train_speed(iter/s)": 0.202749 }, { "acc": 0.76441174, "epoch": 0.8709371673544379, "grad_norm": 9.625, "learning_rate": 6.273616146291086e-06, "loss": 0.84200869, "memory(GiB)": 146.85, "step": 37330, "train_speed(iter/s)": 0.202778 }, { "acc": 0.76470532, "epoch": 0.8711704749267268, "grad_norm": 5.8125, "learning_rate": 6.2717892580288335e-06, "loss": 0.82975111, "memory(GiB)": 146.85, "step": 37340, "train_speed(iter/s)": 0.202806 }, { "acc": 0.75773749, "epoch": 0.8714037824990157, "grad_norm": 5.25, "learning_rate": 6.269962188219034e-06, "loss": 0.85145454, "memory(GiB)": 146.85, "step": 37350, "train_speed(iter/s)": 0.202835 }, { "acc": 0.7837183, "epoch": 0.8716370900713046, "grad_norm": 6.21875, "learning_rate": 6.2681349371225e-06, "loss": 0.77135925, "memory(GiB)": 146.85, "step": 37360, "train_speed(iter/s)": 0.202861 }, { "acc": 0.77120924, "epoch": 0.8718703976435935, "grad_norm": 4.65625, "learning_rate": 6.266307505000073e-06, "loss": 0.8112463, "memory(GiB)": 146.85, "step": 37370, "train_speed(iter/s)": 0.202888 }, { "acc": 0.76744738, "epoch": 0.8721037052158824, "grad_norm": 5.46875, "learning_rate": 6.264479892112619e-06, "loss": 0.8714798, "memory(GiB)": 146.85, "step": 37380, "train_speed(iter/s)": 0.202917 }, { "acc": 0.78199968, "epoch": 0.8723370127881713, "grad_norm": 10.0, "learning_rate": 6.262652098721026e-06, "loss": 0.79746461, "memory(GiB)": 146.85, "step": 37390, "train_speed(iter/s)": 0.202944 }, { "acc": 0.76465569, "epoch": 0.8725703203604602, "grad_norm": 8.4375, "learning_rate": 6.260824125086212e-06, "loss": 0.84016743, "memory(GiB)": 146.85, "step": 37400, "train_speed(iter/s)": 0.202972 }, { "acc": 0.75516653, "epoch": 0.8728036279327491, "grad_norm": 5.71875, "learning_rate": 6.258995971469122e-06, "loss": 0.89221458, "memory(GiB)": 146.85, "step": 37410, "train_speed(iter/s)": 0.203 }, { "acc": 0.77340412, "epoch": 0.873036935505038, "grad_norm": 5.15625, "learning_rate": 6.2571676381307215e-06, "loss": 0.83907337, "memory(GiB)": 146.85, "step": 37420, "train_speed(iter/s)": 0.203027 }, { "acc": 0.77925186, "epoch": 0.8732702430773269, "grad_norm": 6.5, "learning_rate": 6.255339125332007e-06, "loss": 0.77945838, "memory(GiB)": 146.85, "step": 37430, "train_speed(iter/s)": 0.203055 }, { "acc": 0.74956226, "epoch": 0.8735035506496158, "grad_norm": 7.75, "learning_rate": 6.253510433333996e-06, "loss": 0.91372099, "memory(GiB)": 146.85, "step": 37440, "train_speed(iter/s)": 0.20308 }, { "acc": 0.75774374, "epoch": 0.8737368582219046, "grad_norm": 7.15625, "learning_rate": 6.251681562397736e-06, "loss": 0.88292027, "memory(GiB)": 146.85, "step": 37450, "train_speed(iter/s)": 0.203109 }, { "acc": 0.7692791, "epoch": 0.8739701657941935, "grad_norm": 5.21875, "learning_rate": 6.2498525127842955e-06, "loss": 0.81767292, "memory(GiB)": 146.85, "step": 37460, "train_speed(iter/s)": 0.203135 }, { "acc": 0.76890182, "epoch": 0.8742034733664824, "grad_norm": 5.25, "learning_rate": 6.248023284754772e-06, "loss": 0.84124203, "memory(GiB)": 146.85, "step": 37470, "train_speed(iter/s)": 0.203164 }, { "acc": 0.77672687, "epoch": 0.8744367809387713, "grad_norm": 6.09375, "learning_rate": 6.2461938785702866e-06, "loss": 0.81048145, "memory(GiB)": 146.85, "step": 37480, "train_speed(iter/s)": 0.203191 }, { "acc": 0.77945795, "epoch": 0.8746700885110602, "grad_norm": 6.40625, "learning_rate": 6.244364294491989e-06, "loss": 0.78468075, "memory(GiB)": 146.85, "step": 37490, "train_speed(iter/s)": 0.203219 }, { "acc": 0.77536511, "epoch": 0.8749033960833491, "grad_norm": 4.90625, "learning_rate": 6.2425345327810485e-06, "loss": 0.8240839, "memory(GiB)": 146.85, "step": 37500, "train_speed(iter/s)": 0.203246 }, { "epoch": 0.8749033960833491, "eval_acc": 0.7349549369310646, "eval_loss": 0.8349810838699341, "eval_runtime": 1262.4591, "eval_samples_per_second": 28.509, "eval_steps_per_second": 14.255, "step": 37500 }, { "acc": 0.76671543, "epoch": 0.875136703655638, "grad_norm": 5.3125, "learning_rate": 6.240704593698664e-06, "loss": 0.8332509, "memory(GiB)": 146.85, "step": 37510, "train_speed(iter/s)": 0.201865 }, { "acc": 0.75850668, "epoch": 0.8753700112279269, "grad_norm": 6.25, "learning_rate": 6.238874477506061e-06, "loss": 0.87653751, "memory(GiB)": 146.85, "step": 37520, "train_speed(iter/s)": 0.201893 }, { "acc": 0.75433121, "epoch": 0.8756033188002158, "grad_norm": 4.625, "learning_rate": 6.237044184464485e-06, "loss": 0.90926723, "memory(GiB)": 146.85, "step": 37530, "train_speed(iter/s)": 0.201922 }, { "acc": 0.76575823, "epoch": 0.8758366263725047, "grad_norm": 8.6875, "learning_rate": 6.235213714835211e-06, "loss": 0.83439875, "memory(GiB)": 146.85, "step": 37540, "train_speed(iter/s)": 0.201948 }, { "acc": 0.78383951, "epoch": 0.8760699339447936, "grad_norm": 4.78125, "learning_rate": 6.233383068879538e-06, "loss": 0.76931934, "memory(GiB)": 146.85, "step": 37550, "train_speed(iter/s)": 0.201977 }, { "acc": 0.75730791, "epoch": 0.8763032415170825, "grad_norm": 7.34375, "learning_rate": 6.231552246858791e-06, "loss": 0.87686672, "memory(GiB)": 146.85, "step": 37560, "train_speed(iter/s)": 0.202004 }, { "acc": 0.77506695, "epoch": 0.8765365490893714, "grad_norm": 6.25, "learning_rate": 6.229721249034318e-06, "loss": 0.82185688, "memory(GiB)": 146.85, "step": 37570, "train_speed(iter/s)": 0.202034 }, { "acc": 0.77035737, "epoch": 0.8767698566616603, "grad_norm": 5.28125, "learning_rate": 6.227890075667492e-06, "loss": 0.83019485, "memory(GiB)": 146.85, "step": 37580, "train_speed(iter/s)": 0.202062 }, { "acc": 0.77275186, "epoch": 0.8770031642339492, "grad_norm": 5.6875, "learning_rate": 6.226058727019717e-06, "loss": 0.81771545, "memory(GiB)": 146.85, "step": 37590, "train_speed(iter/s)": 0.20209 }, { "acc": 0.76063013, "epoch": 0.8772364718062381, "grad_norm": 4.28125, "learning_rate": 6.224227203352415e-06, "loss": 0.84243021, "memory(GiB)": 146.85, "step": 37600, "train_speed(iter/s)": 0.202118 }, { "acc": 0.77469258, "epoch": 0.877469779378527, "grad_norm": 5.34375, "learning_rate": 6.222395504927035e-06, "loss": 0.82745342, "memory(GiB)": 146.85, "step": 37610, "train_speed(iter/s)": 0.202147 }, { "acc": 0.7624403, "epoch": 0.8777030869508159, "grad_norm": 7.625, "learning_rate": 6.22056363200505e-06, "loss": 0.85810566, "memory(GiB)": 146.85, "step": 37620, "train_speed(iter/s)": 0.202175 }, { "acc": 0.76887712, "epoch": 0.8779363945231048, "grad_norm": 8.6875, "learning_rate": 6.218731584847963e-06, "loss": 0.82082443, "memory(GiB)": 146.85, "step": 37630, "train_speed(iter/s)": 0.202202 }, { "acc": 0.77949915, "epoch": 0.8781697020953936, "grad_norm": 7.34375, "learning_rate": 6.216899363717295e-06, "loss": 0.81672897, "memory(GiB)": 146.85, "step": 37640, "train_speed(iter/s)": 0.20223 }, { "acc": 0.76657209, "epoch": 0.8784030096676825, "grad_norm": 7.0625, "learning_rate": 6.215066968874596e-06, "loss": 0.8328598, "memory(GiB)": 146.85, "step": 37650, "train_speed(iter/s)": 0.202256 }, { "acc": 0.76429977, "epoch": 0.8786363172399714, "grad_norm": 5.4375, "learning_rate": 6.213234400581442e-06, "loss": 0.85189304, "memory(GiB)": 146.85, "step": 37660, "train_speed(iter/s)": 0.202285 }, { "acc": 0.76915045, "epoch": 0.8788696248122603, "grad_norm": 11.5625, "learning_rate": 6.2114016590994295e-06, "loss": 0.82643423, "memory(GiB)": 146.85, "step": 37670, "train_speed(iter/s)": 0.202312 }, { "acc": 0.76777182, "epoch": 0.8791029323845492, "grad_norm": 6.28125, "learning_rate": 6.209568744690181e-06, "loss": 0.83179169, "memory(GiB)": 146.85, "step": 37680, "train_speed(iter/s)": 0.20234 }, { "acc": 0.78301878, "epoch": 0.8793362399568381, "grad_norm": 18.25, "learning_rate": 6.207735657615346e-06, "loss": 0.7671391, "memory(GiB)": 146.85, "step": 37690, "train_speed(iter/s)": 0.202367 }, { "acc": 0.79300222, "epoch": 0.879569547529127, "grad_norm": 4.625, "learning_rate": 6.2059023981365965e-06, "loss": 0.73934245, "memory(GiB)": 146.85, "step": 37700, "train_speed(iter/s)": 0.202394 }, { "acc": 0.77109556, "epoch": 0.8798028551014159, "grad_norm": 4.46875, "learning_rate": 6.20406896651563e-06, "loss": 0.82424793, "memory(GiB)": 146.85, "step": 37710, "train_speed(iter/s)": 0.202423 }, { "acc": 0.76557026, "epoch": 0.8800361626737048, "grad_norm": 5.65625, "learning_rate": 6.202235363014169e-06, "loss": 0.84205475, "memory(GiB)": 146.85, "step": 37720, "train_speed(iter/s)": 0.202448 }, { "acc": 0.79121323, "epoch": 0.8802694702459937, "grad_norm": 7.75, "learning_rate": 6.2004015878939585e-06, "loss": 0.76249275, "memory(GiB)": 146.85, "step": 37730, "train_speed(iter/s)": 0.202477 }, { "acc": 0.78503723, "epoch": 0.8805027778182826, "grad_norm": 7.0625, "learning_rate": 6.198567641416772e-06, "loss": 0.7814281, "memory(GiB)": 146.85, "step": 37740, "train_speed(iter/s)": 0.202505 }, { "acc": 0.77440767, "epoch": 0.8807360853905715, "grad_norm": 5.59375, "learning_rate": 6.1967335238444004e-06, "loss": 0.83118162, "memory(GiB)": 146.85, "step": 37750, "train_speed(iter/s)": 0.202534 }, { "acc": 0.76371922, "epoch": 0.8809693929628604, "grad_norm": 8.875, "learning_rate": 6.194899235438666e-06, "loss": 0.84017057, "memory(GiB)": 146.85, "step": 37760, "train_speed(iter/s)": 0.202559 }, { "acc": 0.74861412, "epoch": 0.8812027005351493, "grad_norm": 6.03125, "learning_rate": 6.193064776461415e-06, "loss": 0.91881924, "memory(GiB)": 146.85, "step": 37770, "train_speed(iter/s)": 0.202589 }, { "acc": 0.77183151, "epoch": 0.8814360081074382, "grad_norm": 5.0, "learning_rate": 6.191230147174512e-06, "loss": 0.82673359, "memory(GiB)": 146.85, "step": 37780, "train_speed(iter/s)": 0.202614 }, { "acc": 0.78210192, "epoch": 0.8816693156797271, "grad_norm": 5.40625, "learning_rate": 6.1893953478398515e-06, "loss": 0.78781776, "memory(GiB)": 146.85, "step": 37790, "train_speed(iter/s)": 0.202643 }, { "acc": 0.76448927, "epoch": 0.881902623252016, "grad_norm": 5.09375, "learning_rate": 6.18756037871935e-06, "loss": 0.84595814, "memory(GiB)": 146.85, "step": 37800, "train_speed(iter/s)": 0.20267 }, { "acc": 0.75413151, "epoch": 0.8821359308243049, "grad_norm": 10.875, "learning_rate": 6.185725240074951e-06, "loss": 0.90121269, "memory(GiB)": 146.85, "step": 37810, "train_speed(iter/s)": 0.202699 }, { "acc": 0.74233022, "epoch": 0.8823692383965938, "grad_norm": 5.53125, "learning_rate": 6.1838899321686185e-06, "loss": 0.94964428, "memory(GiB)": 146.85, "step": 37820, "train_speed(iter/s)": 0.202726 }, { "acc": 0.76483107, "epoch": 0.8826025459688827, "grad_norm": 5.40625, "learning_rate": 6.1820544552623415e-06, "loss": 0.84527988, "memory(GiB)": 146.85, "step": 37830, "train_speed(iter/s)": 0.202755 }, { "acc": 0.77913465, "epoch": 0.8828358535411714, "grad_norm": 6.46875, "learning_rate": 6.180218809618135e-06, "loss": 0.7849864, "memory(GiB)": 146.85, "step": 37840, "train_speed(iter/s)": 0.20278 }, { "acc": 0.77207131, "epoch": 0.8830691611134603, "grad_norm": 5.40625, "learning_rate": 6.1783829954980345e-06, "loss": 0.83006153, "memory(GiB)": 146.85, "step": 37850, "train_speed(iter/s)": 0.202807 }, { "acc": 0.76411309, "epoch": 0.8833024686857492, "grad_norm": 9.3125, "learning_rate": 6.176547013164104e-06, "loss": 0.84969664, "memory(GiB)": 146.85, "step": 37860, "train_speed(iter/s)": 0.202836 }, { "acc": 0.75173244, "epoch": 0.8835357762580381, "grad_norm": 5.0, "learning_rate": 6.17471086287843e-06, "loss": 0.90104294, "memory(GiB)": 146.85, "step": 37870, "train_speed(iter/s)": 0.202862 }, { "acc": 0.77045193, "epoch": 0.883769083830327, "grad_norm": 7.875, "learning_rate": 6.172874544903122e-06, "loss": 0.82008591, "memory(GiB)": 146.85, "step": 37880, "train_speed(iter/s)": 0.202889 }, { "acc": 0.77010374, "epoch": 0.8840023914026159, "grad_norm": 6.28125, "learning_rate": 6.171038059500315e-06, "loss": 0.85304852, "memory(GiB)": 146.85, "step": 37890, "train_speed(iter/s)": 0.202915 }, { "acc": 0.77351198, "epoch": 0.8842356989749048, "grad_norm": 5.78125, "learning_rate": 6.169201406932163e-06, "loss": 0.81927776, "memory(GiB)": 146.85, "step": 37900, "train_speed(iter/s)": 0.202943 }, { "acc": 0.77952728, "epoch": 0.8844690065471937, "grad_norm": 5.1875, "learning_rate": 6.167364587460849e-06, "loss": 0.78908634, "memory(GiB)": 146.85, "step": 37910, "train_speed(iter/s)": 0.20297 }, { "acc": 0.7574297, "epoch": 0.8847023141194826, "grad_norm": 15.25, "learning_rate": 6.16552760134858e-06, "loss": 0.91679649, "memory(GiB)": 146.85, "step": 37920, "train_speed(iter/s)": 0.202997 }, { "acc": 0.75851927, "epoch": 0.8849356216917715, "grad_norm": 6.34375, "learning_rate": 6.1636904488575845e-06, "loss": 0.88946905, "memory(GiB)": 146.85, "step": 37930, "train_speed(iter/s)": 0.203023 }, { "acc": 0.76474538, "epoch": 0.8851689292640604, "grad_norm": 6.03125, "learning_rate": 6.161853130250117e-06, "loss": 0.87733402, "memory(GiB)": 146.85, "step": 37940, "train_speed(iter/s)": 0.20305 }, { "acc": 0.77622237, "epoch": 0.8854022368363493, "grad_norm": 7.96875, "learning_rate": 6.160015645788451e-06, "loss": 0.80736256, "memory(GiB)": 146.85, "step": 37950, "train_speed(iter/s)": 0.203078 }, { "acc": 0.77534733, "epoch": 0.8856355444086382, "grad_norm": 4.96875, "learning_rate": 6.15817799573489e-06, "loss": 0.79920568, "memory(GiB)": 146.85, "step": 37960, "train_speed(iter/s)": 0.203106 }, { "acc": 0.77506895, "epoch": 0.8858688519809271, "grad_norm": 6.03125, "learning_rate": 6.1563401803517545e-06, "loss": 0.79451303, "memory(GiB)": 146.85, "step": 37970, "train_speed(iter/s)": 0.203135 }, { "acc": 0.76340027, "epoch": 0.886102159553216, "grad_norm": 4.96875, "learning_rate": 6.154502199901396e-06, "loss": 0.84602299, "memory(GiB)": 146.85, "step": 37980, "train_speed(iter/s)": 0.203159 }, { "acc": 0.76125336, "epoch": 0.8863354671255049, "grad_norm": 4.8125, "learning_rate": 6.152664054646183e-06, "loss": 0.85883341, "memory(GiB)": 146.85, "step": 37990, "train_speed(iter/s)": 0.203187 }, { "acc": 0.74044209, "epoch": 0.8865687746977938, "grad_norm": 5.25, "learning_rate": 6.150825744848511e-06, "loss": 0.93983269, "memory(GiB)": 146.85, "step": 38000, "train_speed(iter/s)": 0.203214 }, { "epoch": 0.8865687746977938, "eval_acc": 0.7349305741548052, "eval_loss": 0.8348593711853027, "eval_runtime": 1263.0996, "eval_samples_per_second": 28.494, "eval_steps_per_second": 14.247, "step": 38000 }, { "acc": 0.78466311, "epoch": 0.8868020822700827, "grad_norm": 6.75, "learning_rate": 6.148987270770798e-06, "loss": 0.77984581, "memory(GiB)": 146.85, "step": 38010, "train_speed(iter/s)": 0.201851 }, { "acc": 0.78270993, "epoch": 0.8870353898423716, "grad_norm": 7.6875, "learning_rate": 6.147148632675486e-06, "loss": 0.79879789, "memory(GiB)": 146.85, "step": 38020, "train_speed(iter/s)": 0.201878 }, { "acc": 0.76308918, "epoch": 0.8872686974146604, "grad_norm": 12.4375, "learning_rate": 6.145309830825041e-06, "loss": 0.91158552, "memory(GiB)": 146.85, "step": 38030, "train_speed(iter/s)": 0.201907 }, { "acc": 0.76208286, "epoch": 0.8875020049869493, "grad_norm": 5.71875, "learning_rate": 6.143470865481948e-06, "loss": 0.84497128, "memory(GiB)": 146.85, "step": 38040, "train_speed(iter/s)": 0.201936 }, { "acc": 0.74673553, "epoch": 0.8877353125592382, "grad_norm": 8.8125, "learning_rate": 6.141631736908723e-06, "loss": 0.91498032, "memory(GiB)": 146.85, "step": 38050, "train_speed(iter/s)": 0.201962 }, { "acc": 0.77104225, "epoch": 0.8879686201315271, "grad_norm": 5.78125, "learning_rate": 6.1397924453679e-06, "loss": 0.83551426, "memory(GiB)": 146.85, "step": 38060, "train_speed(iter/s)": 0.201989 }, { "acc": 0.76625814, "epoch": 0.888201927703816, "grad_norm": 4.21875, "learning_rate": 6.137952991122035e-06, "loss": 0.85627079, "memory(GiB)": 146.85, "step": 38070, "train_speed(iter/s)": 0.202018 }, { "acc": 0.75740509, "epoch": 0.8884352352761049, "grad_norm": 6.09375, "learning_rate": 6.136113374433712e-06, "loss": 0.85699959, "memory(GiB)": 146.85, "step": 38080, "train_speed(iter/s)": 0.202045 }, { "acc": 0.75813513, "epoch": 0.8886685428483938, "grad_norm": 5.625, "learning_rate": 6.134273595565534e-06, "loss": 0.87442579, "memory(GiB)": 146.85, "step": 38090, "train_speed(iter/s)": 0.202073 }, { "acc": 0.75815659, "epoch": 0.8889018504206827, "grad_norm": 7.59375, "learning_rate": 6.13243365478013e-06, "loss": 0.87502232, "memory(GiB)": 146.85, "step": 38100, "train_speed(iter/s)": 0.202101 }, { "acc": 0.77947197, "epoch": 0.8891351579929716, "grad_norm": 5.71875, "learning_rate": 6.13059355234015e-06, "loss": 0.78805699, "memory(GiB)": 146.85, "step": 38110, "train_speed(iter/s)": 0.20213 }, { "acc": 0.7540503, "epoch": 0.8893684655652605, "grad_norm": 7.28125, "learning_rate": 6.128753288508271e-06, "loss": 0.89246473, "memory(GiB)": 146.85, "step": 38120, "train_speed(iter/s)": 0.202157 }, { "acc": 0.76427631, "epoch": 0.8896017731375494, "grad_norm": 7.25, "learning_rate": 6.126912863547186e-06, "loss": 0.83796892, "memory(GiB)": 146.85, "step": 38130, "train_speed(iter/s)": 0.202184 }, { "acc": 0.7746242, "epoch": 0.8898350807098383, "grad_norm": 4.6875, "learning_rate": 6.125072277719618e-06, "loss": 0.8066433, "memory(GiB)": 146.85, "step": 38140, "train_speed(iter/s)": 0.202212 }, { "acc": 0.77104359, "epoch": 0.8900683882821272, "grad_norm": 6.90625, "learning_rate": 6.123231531288308e-06, "loss": 0.8335022, "memory(GiB)": 146.85, "step": 38150, "train_speed(iter/s)": 0.202241 }, { "acc": 0.78057413, "epoch": 0.8903016958544161, "grad_norm": 7.09375, "learning_rate": 6.121390624516026e-06, "loss": 0.7830143, "memory(GiB)": 146.85, "step": 38160, "train_speed(iter/s)": 0.20227 }, { "acc": 0.77255754, "epoch": 0.890535003426705, "grad_norm": 9.0625, "learning_rate": 6.119549557665556e-06, "loss": 0.8347208, "memory(GiB)": 146.85, "step": 38170, "train_speed(iter/s)": 0.202298 }, { "acc": 0.76472239, "epoch": 0.8907683109989939, "grad_norm": 7.84375, "learning_rate": 6.117708330999712e-06, "loss": 0.85625629, "memory(GiB)": 146.85, "step": 38180, "train_speed(iter/s)": 0.202324 }, { "acc": 0.76829, "epoch": 0.8910016185712828, "grad_norm": 5.5, "learning_rate": 6.115866944781329e-06, "loss": 0.8267621, "memory(GiB)": 146.85, "step": 38190, "train_speed(iter/s)": 0.202352 }, { "acc": 0.78261499, "epoch": 0.8912349261435717, "grad_norm": 6.9375, "learning_rate": 6.114025399273264e-06, "loss": 0.76450381, "memory(GiB)": 146.85, "step": 38200, "train_speed(iter/s)": 0.202377 }, { "acc": 0.76302118, "epoch": 0.8914682337158606, "grad_norm": 8.375, "learning_rate": 6.112183694738395e-06, "loss": 0.87226353, "memory(GiB)": 146.85, "step": 38210, "train_speed(iter/s)": 0.202402 }, { "acc": 0.78515453, "epoch": 0.8917015412881494, "grad_norm": 14.8125, "learning_rate": 6.110341831439628e-06, "loss": 0.7710206, "memory(GiB)": 146.85, "step": 38220, "train_speed(iter/s)": 0.202428 }, { "acc": 0.76235952, "epoch": 0.8919348488604383, "grad_norm": 6.59375, "learning_rate": 6.108499809639887e-06, "loss": 0.86974583, "memory(GiB)": 146.85, "step": 38230, "train_speed(iter/s)": 0.202453 }, { "acc": 0.76679382, "epoch": 0.8921681564327272, "grad_norm": 10.125, "learning_rate": 6.106657629602122e-06, "loss": 0.85181799, "memory(GiB)": 146.85, "step": 38240, "train_speed(iter/s)": 0.202482 }, { "acc": 0.77637691, "epoch": 0.8924014640050161, "grad_norm": 5.3125, "learning_rate": 6.104815291589299e-06, "loss": 0.81458035, "memory(GiB)": 146.85, "step": 38250, "train_speed(iter/s)": 0.202511 }, { "acc": 0.76452308, "epoch": 0.892634771577305, "grad_norm": 6.4375, "learning_rate": 6.1029727958644144e-06, "loss": 0.84412842, "memory(GiB)": 146.85, "step": 38260, "train_speed(iter/s)": 0.202539 }, { "acc": 0.76660986, "epoch": 0.8928680791495939, "grad_norm": 4.03125, "learning_rate": 6.1011301426904845e-06, "loss": 0.85418377, "memory(GiB)": 146.85, "step": 38270, "train_speed(iter/s)": 0.202566 }, { "acc": 0.78668671, "epoch": 0.8931013867218828, "grad_norm": 5.5, "learning_rate": 6.0992873323305465e-06, "loss": 0.78222876, "memory(GiB)": 146.85, "step": 38280, "train_speed(iter/s)": 0.202596 }, { "acc": 0.77651958, "epoch": 0.8933346942941717, "grad_norm": 5.03125, "learning_rate": 6.097444365047662e-06, "loss": 0.8063797, "memory(GiB)": 146.85, "step": 38290, "train_speed(iter/s)": 0.202623 }, { "acc": 0.76897268, "epoch": 0.8935680018664606, "grad_norm": 5.53125, "learning_rate": 6.095601241104911e-06, "loss": 0.84677076, "memory(GiB)": 146.85, "step": 38300, "train_speed(iter/s)": 0.202652 }, { "acc": 0.76138968, "epoch": 0.8938013094387495, "grad_norm": 7.53125, "learning_rate": 6.093757960765404e-06, "loss": 0.86825428, "memory(GiB)": 146.85, "step": 38310, "train_speed(iter/s)": 0.202681 }, { "acc": 0.77033072, "epoch": 0.8940346170110384, "grad_norm": 5.0, "learning_rate": 6.091914524292264e-06, "loss": 0.845263, "memory(GiB)": 146.85, "step": 38320, "train_speed(iter/s)": 0.20271 }, { "acc": 0.77076721, "epoch": 0.8942679245833273, "grad_norm": 7.8125, "learning_rate": 6.090070931948643e-06, "loss": 0.82378149, "memory(GiB)": 146.85, "step": 38330, "train_speed(iter/s)": 0.202738 }, { "acc": 0.77629185, "epoch": 0.8945012321556162, "grad_norm": 5.875, "learning_rate": 6.088227183997715e-06, "loss": 0.81273451, "memory(GiB)": 146.85, "step": 38340, "train_speed(iter/s)": 0.202765 }, { "acc": 0.76594772, "epoch": 0.8947345397279051, "grad_norm": 4.71875, "learning_rate": 6.08638328070267e-06, "loss": 0.83623638, "memory(GiB)": 146.85, "step": 38350, "train_speed(iter/s)": 0.202792 }, { "acc": 0.75623264, "epoch": 0.894967847300194, "grad_norm": 6.59375, "learning_rate": 6.084539222326728e-06, "loss": 0.90360785, "memory(GiB)": 146.85, "step": 38360, "train_speed(iter/s)": 0.20282 }, { "acc": 0.78467512, "epoch": 0.8952011548724829, "grad_norm": 6.03125, "learning_rate": 6.082695009133126e-06, "loss": 0.78002815, "memory(GiB)": 146.85, "step": 38370, "train_speed(iter/s)": 0.202847 }, { "acc": 0.76686568, "epoch": 0.8954344624447718, "grad_norm": 6.46875, "learning_rate": 6.080850641385129e-06, "loss": 0.8448307, "memory(GiB)": 146.85, "step": 38380, "train_speed(iter/s)": 0.202873 }, { "acc": 0.75776038, "epoch": 0.8956677700170607, "grad_norm": 4.8125, "learning_rate": 6.079006119346015e-06, "loss": 0.89329071, "memory(GiB)": 146.85, "step": 38390, "train_speed(iter/s)": 0.202902 }, { "acc": 0.77639208, "epoch": 0.8959010775893496, "grad_norm": 5.09375, "learning_rate": 6.0771614432790915e-06, "loss": 0.78370237, "memory(GiB)": 146.85, "step": 38400, "train_speed(iter/s)": 0.202928 }, { "acc": 0.77839222, "epoch": 0.8961343851616383, "grad_norm": 4.46875, "learning_rate": 6.075316613447684e-06, "loss": 0.80747833, "memory(GiB)": 146.85, "step": 38410, "train_speed(iter/s)": 0.202954 }, { "acc": 0.77338109, "epoch": 0.8963676927339272, "grad_norm": 6.03125, "learning_rate": 6.073471630115142e-06, "loss": 0.80097942, "memory(GiB)": 146.85, "step": 38420, "train_speed(iter/s)": 0.202982 }, { "acc": 0.76554976, "epoch": 0.8966010003062161, "grad_norm": 6.375, "learning_rate": 6.071626493544838e-06, "loss": 0.8325284, "memory(GiB)": 146.85, "step": 38430, "train_speed(iter/s)": 0.203009 }, { "acc": 0.77715769, "epoch": 0.896834307878505, "grad_norm": 4.90625, "learning_rate": 6.0697812040001625e-06, "loss": 0.80027943, "memory(GiB)": 146.85, "step": 38440, "train_speed(iter/s)": 0.203038 }, { "acc": 0.78042707, "epoch": 0.8970676154507939, "grad_norm": 7.09375, "learning_rate": 6.067935761744531e-06, "loss": 0.78589077, "memory(GiB)": 146.85, "step": 38450, "train_speed(iter/s)": 0.203065 }, { "acc": 0.74760275, "epoch": 0.8973009230230828, "grad_norm": 9.0625, "learning_rate": 6.066090167041381e-06, "loss": 0.90006866, "memory(GiB)": 146.85, "step": 38460, "train_speed(iter/s)": 0.203091 }, { "acc": 0.76728697, "epoch": 0.8975342305953717, "grad_norm": 6.0, "learning_rate": 6.0642444201541686e-06, "loss": 0.85084267, "memory(GiB)": 146.85, "step": 38470, "train_speed(iter/s)": 0.203119 }, { "acc": 0.77758236, "epoch": 0.8977675381676606, "grad_norm": 5.53125, "learning_rate": 6.062398521346374e-06, "loss": 0.79090834, "memory(GiB)": 146.85, "step": 38480, "train_speed(iter/s)": 0.203145 }, { "acc": 0.76656499, "epoch": 0.8980008457399495, "grad_norm": 6.125, "learning_rate": 6.060552470881498e-06, "loss": 0.83355865, "memory(GiB)": 146.85, "step": 38490, "train_speed(iter/s)": 0.203173 }, { "acc": 0.76201072, "epoch": 0.8982341533122384, "grad_norm": 4.21875, "learning_rate": 6.0587062690230654e-06, "loss": 0.84971781, "memory(GiB)": 146.85, "step": 38500, "train_speed(iter/s)": 0.203201 }, { "epoch": 0.8982341533122384, "eval_acc": 0.7348858821745149, "eval_loss": 0.8347864747047424, "eval_runtime": 1263.1633, "eval_samples_per_second": 28.493, "eval_steps_per_second": 14.247, "step": 38500 }, { "acc": 0.75970116, "epoch": 0.8984674608845273, "grad_norm": 6.53125, "learning_rate": 6.056859916034621e-06, "loss": 0.87236042, "memory(GiB)": 146.85, "step": 38510, "train_speed(iter/s)": 0.201855 }, { "acc": 0.7613492, "epoch": 0.8987007684568162, "grad_norm": 5.4375, "learning_rate": 6.055013412179732e-06, "loss": 0.87599478, "memory(GiB)": 146.85, "step": 38520, "train_speed(iter/s)": 0.201882 }, { "acc": 0.77253079, "epoch": 0.8989340760291051, "grad_norm": 6.4375, "learning_rate": 6.053166757721984e-06, "loss": 0.82082052, "memory(GiB)": 146.85, "step": 38530, "train_speed(iter/s)": 0.201909 }, { "acc": 0.77595739, "epoch": 0.899167383601394, "grad_norm": 5.875, "learning_rate": 6.051319952924987e-06, "loss": 0.80461044, "memory(GiB)": 146.85, "step": 38540, "train_speed(iter/s)": 0.201936 }, { "acc": 0.78109369, "epoch": 0.8994006911736829, "grad_norm": 8.125, "learning_rate": 6.049472998052371e-06, "loss": 0.78507872, "memory(GiB)": 146.85, "step": 38550, "train_speed(iter/s)": 0.201965 }, { "acc": 0.79469652, "epoch": 0.8996339987459718, "grad_norm": 9.25, "learning_rate": 6.047625893367791e-06, "loss": 0.73804913, "memory(GiB)": 146.85, "step": 38560, "train_speed(iter/s)": 0.201991 }, { "acc": 0.75910344, "epoch": 0.8998673063182607, "grad_norm": 10.1875, "learning_rate": 6.0457786391349195e-06, "loss": 0.85849304, "memory(GiB)": 146.85, "step": 38570, "train_speed(iter/s)": 0.202018 }, { "acc": 0.76364498, "epoch": 0.9001006138905496, "grad_norm": 44.0, "learning_rate": 6.0439312356174495e-06, "loss": 0.87708378, "memory(GiB)": 146.85, "step": 38580, "train_speed(iter/s)": 0.202045 }, { "acc": 0.75657558, "epoch": 0.9003339214628385, "grad_norm": 5.625, "learning_rate": 6.042083683079099e-06, "loss": 0.87597332, "memory(GiB)": 146.85, "step": 38590, "train_speed(iter/s)": 0.202071 }, { "acc": 0.78606443, "epoch": 0.9005672290351274, "grad_norm": 7.1875, "learning_rate": 6.0402359817836065e-06, "loss": 0.78787889, "memory(GiB)": 146.85, "step": 38600, "train_speed(iter/s)": 0.202098 }, { "acc": 0.76973152, "epoch": 0.9008005366074162, "grad_norm": 6.4375, "learning_rate": 6.038388131994729e-06, "loss": 0.82457561, "memory(GiB)": 146.85, "step": 38610, "train_speed(iter/s)": 0.202123 }, { "acc": 0.76959443, "epoch": 0.9010338441797051, "grad_norm": 7.40625, "learning_rate": 6.036540133976247e-06, "loss": 0.8282546, "memory(GiB)": 146.85, "step": 38620, "train_speed(iter/s)": 0.20215 }, { "acc": 0.75457964, "epoch": 0.901267151751994, "grad_norm": 5.625, "learning_rate": 6.034691987991963e-06, "loss": 0.87466469, "memory(GiB)": 146.85, "step": 38630, "train_speed(iter/s)": 0.202177 }, { "acc": 0.78235922, "epoch": 0.9015004593242829, "grad_norm": 5.3125, "learning_rate": 6.032843694305698e-06, "loss": 0.77273932, "memory(GiB)": 146.85, "step": 38640, "train_speed(iter/s)": 0.202203 }, { "acc": 0.76781778, "epoch": 0.9017337668965718, "grad_norm": 6.40625, "learning_rate": 6.0309952531812955e-06, "loss": 0.82000256, "memory(GiB)": 146.85, "step": 38650, "train_speed(iter/s)": 0.20223 }, { "acc": 0.76997166, "epoch": 0.9019670744688607, "grad_norm": 6.5625, "learning_rate": 6.029146664882619e-06, "loss": 0.83069944, "memory(GiB)": 146.85, "step": 38660, "train_speed(iter/s)": 0.202258 }, { "acc": 0.75424805, "epoch": 0.9022003820411496, "grad_norm": 7.8125, "learning_rate": 6.027297929673557e-06, "loss": 0.90522079, "memory(GiB)": 146.85, "step": 38670, "train_speed(iter/s)": 0.202285 }, { "acc": 0.77296782, "epoch": 0.9024336896134385, "grad_norm": 6.625, "learning_rate": 6.025449047818012e-06, "loss": 0.8158721, "memory(GiB)": 146.85, "step": 38680, "train_speed(iter/s)": 0.202312 }, { "acc": 0.77501383, "epoch": 0.9026669971857274, "grad_norm": 11.6875, "learning_rate": 6.0236000195799164e-06, "loss": 0.80679064, "memory(GiB)": 146.85, "step": 38690, "train_speed(iter/s)": 0.20234 }, { "acc": 0.78812399, "epoch": 0.9029003047580163, "grad_norm": 6.90625, "learning_rate": 6.0217508452232135e-06, "loss": 0.75358562, "memory(GiB)": 146.85, "step": 38700, "train_speed(iter/s)": 0.202365 }, { "acc": 0.76229782, "epoch": 0.9031336123303052, "grad_norm": 5.59375, "learning_rate": 6.019901525011873e-06, "loss": 0.8660284, "memory(GiB)": 146.85, "step": 38710, "train_speed(iter/s)": 0.202392 }, { "acc": 0.77274861, "epoch": 0.9033669199025941, "grad_norm": 4.96875, "learning_rate": 6.018052059209887e-06, "loss": 0.81943178, "memory(GiB)": 146.85, "step": 38720, "train_speed(iter/s)": 0.202419 }, { "acc": 0.76393929, "epoch": 0.903600227474883, "grad_norm": 5.8125, "learning_rate": 6.016202448081266e-06, "loss": 0.85924358, "memory(GiB)": 146.85, "step": 38730, "train_speed(iter/s)": 0.202445 }, { "acc": 0.7732513, "epoch": 0.9038335350471719, "grad_norm": 6.46875, "learning_rate": 6.014352691890041e-06, "loss": 0.8186718, "memory(GiB)": 146.85, "step": 38740, "train_speed(iter/s)": 0.202471 }, { "acc": 0.77900968, "epoch": 0.9040668426194608, "grad_norm": 6.0625, "learning_rate": 6.012502790900263e-06, "loss": 0.78385925, "memory(GiB)": 146.85, "step": 38750, "train_speed(iter/s)": 0.202499 }, { "acc": 0.762358, "epoch": 0.9043001501917497, "grad_norm": 4.9375, "learning_rate": 6.010652745376006e-06, "loss": 0.8556633, "memory(GiB)": 146.85, "step": 38760, "train_speed(iter/s)": 0.202527 }, { "acc": 0.75445137, "epoch": 0.9045334577640386, "grad_norm": 6.15625, "learning_rate": 6.008802555581364e-06, "loss": 0.87814779, "memory(GiB)": 146.85, "step": 38770, "train_speed(iter/s)": 0.202555 }, { "acc": 0.75289316, "epoch": 0.9047667653363275, "grad_norm": 7.4375, "learning_rate": 6.006952221780447e-06, "loss": 0.90262012, "memory(GiB)": 146.85, "step": 38780, "train_speed(iter/s)": 0.202584 }, { "acc": 0.78669343, "epoch": 0.9050000729086164, "grad_norm": 5.71875, "learning_rate": 6.005101744237396e-06, "loss": 0.76820955, "memory(GiB)": 146.85, "step": 38790, "train_speed(iter/s)": 0.202612 }, { "acc": 0.77637801, "epoch": 0.9052333804809052, "grad_norm": 7.03125, "learning_rate": 6.003251123216362e-06, "loss": 0.80556574, "memory(GiB)": 146.85, "step": 38800, "train_speed(iter/s)": 0.202638 }, { "acc": 0.78234882, "epoch": 0.9054666880531941, "grad_norm": 6.53125, "learning_rate": 6.001400358981522e-06, "loss": 0.78258495, "memory(GiB)": 146.85, "step": 38810, "train_speed(iter/s)": 0.202663 }, { "acc": 0.75998015, "epoch": 0.905699995625483, "grad_norm": 8.375, "learning_rate": 5.999549451797073e-06, "loss": 0.89165039, "memory(GiB)": 146.85, "step": 38820, "train_speed(iter/s)": 0.202691 }, { "acc": 0.7681417, "epoch": 0.9059333031977719, "grad_norm": 7.8125, "learning_rate": 5.997698401927228e-06, "loss": 0.83764801, "memory(GiB)": 146.85, "step": 38830, "train_speed(iter/s)": 0.202717 }, { "acc": 0.7551559, "epoch": 0.9061666107700608, "grad_norm": 7.90625, "learning_rate": 5.995847209636227e-06, "loss": 0.8769392, "memory(GiB)": 146.85, "step": 38840, "train_speed(iter/s)": 0.202745 }, { "acc": 0.77731366, "epoch": 0.9063999183423497, "grad_norm": 5.21875, "learning_rate": 5.993995875188324e-06, "loss": 0.79970899, "memory(GiB)": 146.85, "step": 38850, "train_speed(iter/s)": 0.202773 }, { "acc": 0.75631971, "epoch": 0.9066332259146386, "grad_norm": 5.15625, "learning_rate": 5.992144398847801e-06, "loss": 0.89241772, "memory(GiB)": 146.85, "step": 38860, "train_speed(iter/s)": 0.2028 }, { "acc": 0.77476664, "epoch": 0.9068665334869275, "grad_norm": 15.0, "learning_rate": 5.990292780878952e-06, "loss": 0.801688, "memory(GiB)": 146.85, "step": 38870, "train_speed(iter/s)": 0.202826 }, { "acc": 0.7512785, "epoch": 0.9070998410592164, "grad_norm": 6.84375, "learning_rate": 5.988441021546097e-06, "loss": 0.89643955, "memory(GiB)": 146.85, "step": 38880, "train_speed(iter/s)": 0.202854 }, { "acc": 0.76396742, "epoch": 0.9073331486315053, "grad_norm": 7.15625, "learning_rate": 5.986589121113574e-06, "loss": 0.86632099, "memory(GiB)": 146.85, "step": 38890, "train_speed(iter/s)": 0.202883 }, { "acc": 0.77178726, "epoch": 0.9075664562037942, "grad_norm": 5.65625, "learning_rate": 5.9847370798457395e-06, "loss": 0.83588676, "memory(GiB)": 146.85, "step": 38900, "train_speed(iter/s)": 0.20291 }, { "acc": 0.76915131, "epoch": 0.9077997637760831, "grad_norm": 4.90625, "learning_rate": 5.982884898006973e-06, "loss": 0.8183322, "memory(GiB)": 146.85, "step": 38910, "train_speed(iter/s)": 0.202937 }, { "acc": 0.76691313, "epoch": 0.908033071348372, "grad_norm": 4.65625, "learning_rate": 5.981032575861674e-06, "loss": 0.8380578, "memory(GiB)": 146.85, "step": 38920, "train_speed(iter/s)": 0.202964 }, { "acc": 0.77426949, "epoch": 0.9082663789206609, "grad_norm": 6.3125, "learning_rate": 5.979180113674258e-06, "loss": 0.81592741, "memory(GiB)": 146.85, "step": 38930, "train_speed(iter/s)": 0.202993 }, { "acc": 0.7810462, "epoch": 0.9084996864929498, "grad_norm": 6.21875, "learning_rate": 5.9773275117091655e-06, "loss": 0.78249865, "memory(GiB)": 146.85, "step": 38940, "train_speed(iter/s)": 0.203019 }, { "acc": 0.77316523, "epoch": 0.9087329940652387, "grad_norm": 5.84375, "learning_rate": 5.975474770230856e-06, "loss": 0.82059298, "memory(GiB)": 146.85, "step": 38950, "train_speed(iter/s)": 0.203045 }, { "acc": 0.7665, "epoch": 0.9089663016375276, "grad_norm": 5.09375, "learning_rate": 5.973621889503804e-06, "loss": 0.84913454, "memory(GiB)": 146.85, "step": 38960, "train_speed(iter/s)": 0.203072 }, { "acc": 0.75746002, "epoch": 0.9091996092098165, "grad_norm": 10.4375, "learning_rate": 5.9717688697925134e-06, "loss": 0.87770491, "memory(GiB)": 146.85, "step": 38970, "train_speed(iter/s)": 0.203099 }, { "acc": 0.76984544, "epoch": 0.9094329167821054, "grad_norm": 4.96875, "learning_rate": 5.969915711361497e-06, "loss": 0.8256978, "memory(GiB)": 146.85, "step": 38980, "train_speed(iter/s)": 0.203125 }, { "acc": 0.76707163, "epoch": 0.9096662243543941, "grad_norm": 5.3125, "learning_rate": 5.968062414475294e-06, "loss": 0.8461813, "memory(GiB)": 146.85, "step": 38990, "train_speed(iter/s)": 0.203151 }, { "acc": 0.76353416, "epoch": 0.909899531926683, "grad_norm": 5.375, "learning_rate": 5.966208979398462e-06, "loss": 0.84567833, "memory(GiB)": 146.85, "step": 39000, "train_speed(iter/s)": 0.203179 }, { "epoch": 0.909899531926683, "eval_acc": 0.7349839786511089, "eval_loss": 0.8346878886222839, "eval_runtime": 1262.3337, "eval_samples_per_second": 28.511, "eval_steps_per_second": 14.256, "step": 39000 }, { "acc": 0.7675509, "epoch": 0.910132839498972, "grad_norm": 5.84375, "learning_rate": 5.964355406395581e-06, "loss": 0.82274065, "memory(GiB)": 146.85, "step": 39010, "train_speed(iter/s)": 0.201852 }, { "acc": 0.77224808, "epoch": 0.9103661470712608, "grad_norm": 7.34375, "learning_rate": 5.962501695731245e-06, "loss": 0.81548853, "memory(GiB)": 146.85, "step": 39020, "train_speed(iter/s)": 0.201878 }, { "acc": 0.75423431, "epoch": 0.9105994546435497, "grad_norm": 8.4375, "learning_rate": 5.9606478476700714e-06, "loss": 0.88758583, "memory(GiB)": 146.85, "step": 39030, "train_speed(iter/s)": 0.201905 }, { "acc": 0.77082801, "epoch": 0.9108327622158386, "grad_norm": 6.78125, "learning_rate": 5.958793862476699e-06, "loss": 0.84845209, "memory(GiB)": 146.85, "step": 39040, "train_speed(iter/s)": 0.201932 }, { "acc": 0.7621253, "epoch": 0.9110660697881275, "grad_norm": 6.75, "learning_rate": 5.956939740415778e-06, "loss": 0.85999775, "memory(GiB)": 146.85, "step": 39050, "train_speed(iter/s)": 0.201958 }, { "acc": 0.76303191, "epoch": 0.9112993773604164, "grad_norm": 5.78125, "learning_rate": 5.9550854817519875e-06, "loss": 0.82503672, "memory(GiB)": 146.85, "step": 39060, "train_speed(iter/s)": 0.201985 }, { "acc": 0.78208771, "epoch": 0.9115326849327053, "grad_norm": 4.75, "learning_rate": 5.953231086750022e-06, "loss": 0.8033185, "memory(GiB)": 146.85, "step": 39070, "train_speed(iter/s)": 0.20201 }, { "acc": 0.76531477, "epoch": 0.9117659925049942, "grad_norm": 11.625, "learning_rate": 5.951376555674596e-06, "loss": 0.84774752, "memory(GiB)": 146.85, "step": 39080, "train_speed(iter/s)": 0.202037 }, { "acc": 0.76197786, "epoch": 0.9119993000772831, "grad_norm": 7.0, "learning_rate": 5.949521888790444e-06, "loss": 0.85626135, "memory(GiB)": 146.85, "step": 39090, "train_speed(iter/s)": 0.202063 }, { "acc": 0.75627751, "epoch": 0.912232607649572, "grad_norm": 7.28125, "learning_rate": 5.947667086362318e-06, "loss": 0.88786411, "memory(GiB)": 146.85, "step": 39100, "train_speed(iter/s)": 0.202092 }, { "acc": 0.76634159, "epoch": 0.9124659152218609, "grad_norm": 5.28125, "learning_rate": 5.945812148654991e-06, "loss": 0.84295959, "memory(GiB)": 146.85, "step": 39110, "train_speed(iter/s)": 0.202119 }, { "acc": 0.75715876, "epoch": 0.9126992227941498, "grad_norm": 8.9375, "learning_rate": 5.943957075933253e-06, "loss": 0.8815465, "memory(GiB)": 146.85, "step": 39120, "train_speed(iter/s)": 0.202146 }, { "acc": 0.77853236, "epoch": 0.9129325303664387, "grad_norm": 18.5, "learning_rate": 5.9421018684619165e-06, "loss": 0.77765436, "memory(GiB)": 146.85, "step": 39130, "train_speed(iter/s)": 0.202173 }, { "acc": 0.76497393, "epoch": 0.9131658379387276, "grad_norm": 6.625, "learning_rate": 5.940246526505814e-06, "loss": 0.85807304, "memory(GiB)": 146.85, "step": 39140, "train_speed(iter/s)": 0.2022 }, { "acc": 0.7535635, "epoch": 0.9133991455110165, "grad_norm": 7.4375, "learning_rate": 5.9383910503297915e-06, "loss": 0.92686825, "memory(GiB)": 146.85, "step": 39150, "train_speed(iter/s)": 0.202227 }, { "acc": 0.76748648, "epoch": 0.9136324530833054, "grad_norm": 4.90625, "learning_rate": 5.9365354401987195e-06, "loss": 0.84128475, "memory(GiB)": 146.85, "step": 39160, "train_speed(iter/s)": 0.202255 }, { "acc": 0.77858167, "epoch": 0.9138657606555943, "grad_norm": 7.59375, "learning_rate": 5.934679696377486e-06, "loss": 0.81103725, "memory(GiB)": 146.85, "step": 39170, "train_speed(iter/s)": 0.202282 }, { "acc": 0.76028986, "epoch": 0.9140990682278831, "grad_norm": 13.125, "learning_rate": 5.932823819130997e-06, "loss": 0.89393368, "memory(GiB)": 146.85, "step": 39180, "train_speed(iter/s)": 0.202309 }, { "acc": 0.77796755, "epoch": 0.914332375800172, "grad_norm": 6.3125, "learning_rate": 5.930967808724178e-06, "loss": 0.80624714, "memory(GiB)": 146.85, "step": 39190, "train_speed(iter/s)": 0.202336 }, { "acc": 0.79494667, "epoch": 0.9145656833724609, "grad_norm": 5.3125, "learning_rate": 5.929111665421976e-06, "loss": 0.73053131, "memory(GiB)": 146.85, "step": 39200, "train_speed(iter/s)": 0.202362 }, { "acc": 0.75212808, "epoch": 0.9147989909447498, "grad_norm": 8.875, "learning_rate": 5.927255389489354e-06, "loss": 0.88056393, "memory(GiB)": 146.85, "step": 39210, "train_speed(iter/s)": 0.202389 }, { "acc": 0.77176046, "epoch": 0.9150322985170387, "grad_norm": 7.8125, "learning_rate": 5.925398981191293e-06, "loss": 0.804809, "memory(GiB)": 146.85, "step": 39220, "train_speed(iter/s)": 0.202417 }, { "acc": 0.77409744, "epoch": 0.9152656060893276, "grad_norm": 4.8125, "learning_rate": 5.9235424407927965e-06, "loss": 0.82465858, "memory(GiB)": 146.85, "step": 39230, "train_speed(iter/s)": 0.202443 }, { "acc": 0.74998827, "epoch": 0.9154989136616165, "grad_norm": 6.8125, "learning_rate": 5.9216857685588855e-06, "loss": 0.88926544, "memory(GiB)": 146.85, "step": 39240, "train_speed(iter/s)": 0.202469 }, { "acc": 0.78663044, "epoch": 0.9157322212339054, "grad_norm": 10.5625, "learning_rate": 5.919828964754599e-06, "loss": 0.7744998, "memory(GiB)": 146.85, "step": 39250, "train_speed(iter/s)": 0.202496 }, { "acc": 0.76492562, "epoch": 0.9159655288061943, "grad_norm": 7.5, "learning_rate": 5.917972029644995e-06, "loss": 0.87401352, "memory(GiB)": 146.85, "step": 39260, "train_speed(iter/s)": 0.202524 }, { "acc": 0.76952744, "epoch": 0.9161988363784832, "grad_norm": 6.4375, "learning_rate": 5.91611496349515e-06, "loss": 0.8526969, "memory(GiB)": 146.85, "step": 39270, "train_speed(iter/s)": 0.20255 }, { "acc": 0.76301498, "epoch": 0.9164321439507721, "grad_norm": 6.6875, "learning_rate": 5.91425776657016e-06, "loss": 0.87166405, "memory(GiB)": 146.85, "step": 39280, "train_speed(iter/s)": 0.202575 }, { "acc": 0.76052985, "epoch": 0.916665451523061, "grad_norm": 5.84375, "learning_rate": 5.912400439135139e-06, "loss": 0.88216105, "memory(GiB)": 146.85, "step": 39290, "train_speed(iter/s)": 0.202604 }, { "acc": 0.76066074, "epoch": 0.9168987590953499, "grad_norm": 6.09375, "learning_rate": 5.9105429814552204e-06, "loss": 0.88707314, "memory(GiB)": 146.85, "step": 39300, "train_speed(iter/s)": 0.202632 }, { "acc": 0.79889703, "epoch": 0.9171320666676388, "grad_norm": 5.15625, "learning_rate": 5.908685393795557e-06, "loss": 0.70902166, "memory(GiB)": 146.85, "step": 39310, "train_speed(iter/s)": 0.202659 }, { "acc": 0.79534807, "epoch": 0.9173653742399277, "grad_norm": 8.875, "learning_rate": 5.9068276764213175e-06, "loss": 0.7448926, "memory(GiB)": 146.85, "step": 39320, "train_speed(iter/s)": 0.202688 }, { "acc": 0.77234559, "epoch": 0.9175986818122166, "grad_norm": 6.90625, "learning_rate": 5.90496982959769e-06, "loss": 0.83110647, "memory(GiB)": 146.85, "step": 39330, "train_speed(iter/s)": 0.202714 }, { "acc": 0.7626451, "epoch": 0.9178319893845055, "grad_norm": 10.25, "learning_rate": 5.903111853589881e-06, "loss": 0.83955717, "memory(GiB)": 146.85, "step": 39340, "train_speed(iter/s)": 0.20274 }, { "acc": 0.77587318, "epoch": 0.9180652969567944, "grad_norm": 7.0625, "learning_rate": 5.9012537486631185e-06, "loss": 0.81230478, "memory(GiB)": 146.85, "step": 39350, "train_speed(iter/s)": 0.202768 }, { "acc": 0.77052689, "epoch": 0.9182986045290833, "grad_norm": 4.625, "learning_rate": 5.899395515082644e-06, "loss": 0.84131031, "memory(GiB)": 146.85, "step": 39360, "train_speed(iter/s)": 0.202789 }, { "acc": 0.77391729, "epoch": 0.9185319121013722, "grad_norm": 6.28125, "learning_rate": 5.897537153113724e-06, "loss": 0.79849172, "memory(GiB)": 146.85, "step": 39370, "train_speed(iter/s)": 0.202816 }, { "acc": 0.7537631, "epoch": 0.918765219673661, "grad_norm": 8.9375, "learning_rate": 5.895678663021634e-06, "loss": 0.87179565, "memory(GiB)": 146.85, "step": 39380, "train_speed(iter/s)": 0.202842 }, { "acc": 0.75994492, "epoch": 0.9189985272459499, "grad_norm": 5.75, "learning_rate": 5.893820045071675e-06, "loss": 0.84072075, "memory(GiB)": 146.85, "step": 39390, "train_speed(iter/s)": 0.20287 }, { "acc": 0.75429225, "epoch": 0.9192318348182388, "grad_norm": 6.0625, "learning_rate": 5.891961299529165e-06, "loss": 0.89951591, "memory(GiB)": 146.85, "step": 39400, "train_speed(iter/s)": 0.202897 }, { "acc": 0.756565, "epoch": 0.9194651423905277, "grad_norm": 11.3125, "learning_rate": 5.890102426659438e-06, "loss": 0.89620533, "memory(GiB)": 146.85, "step": 39410, "train_speed(iter/s)": 0.202923 }, { "acc": 0.78424816, "epoch": 0.9196984499628166, "grad_norm": 6.5625, "learning_rate": 5.888243426727847e-06, "loss": 0.77990208, "memory(GiB)": 146.85, "step": 39420, "train_speed(iter/s)": 0.202949 }, { "acc": 0.76844358, "epoch": 0.9199317575351055, "grad_norm": 7.03125, "learning_rate": 5.886384299999767e-06, "loss": 0.83981075, "memory(GiB)": 146.85, "step": 39430, "train_speed(iter/s)": 0.202976 }, { "acc": 0.76949043, "epoch": 0.9201650651073944, "grad_norm": 5.5, "learning_rate": 5.884525046740586e-06, "loss": 0.84235926, "memory(GiB)": 146.85, "step": 39440, "train_speed(iter/s)": 0.202996 }, { "acc": 0.76512723, "epoch": 0.9203983726796833, "grad_norm": 5.0625, "learning_rate": 5.882665667215709e-06, "loss": 0.85974054, "memory(GiB)": 146.85, "step": 39450, "train_speed(iter/s)": 0.203023 }, { "acc": 0.75820079, "epoch": 0.9206316802519722, "grad_norm": 5.0, "learning_rate": 5.880806161690567e-06, "loss": 0.86139374, "memory(GiB)": 146.85, "step": 39460, "train_speed(iter/s)": 0.203051 }, { "acc": 0.77838526, "epoch": 0.9208649878242611, "grad_norm": 5.375, "learning_rate": 5.878946530430599e-06, "loss": 0.78823204, "memory(GiB)": 146.85, "step": 39470, "train_speed(iter/s)": 0.203077 }, { "acc": 0.77604084, "epoch": 0.92109829539655, "grad_norm": 6.53125, "learning_rate": 5.877086773701271e-06, "loss": 0.8318923, "memory(GiB)": 146.85, "step": 39480, "train_speed(iter/s)": 0.203101 }, { "acc": 0.77086182, "epoch": 0.9213316029688389, "grad_norm": 7.28125, "learning_rate": 5.87522689176806e-06, "loss": 0.84251051, "memory(GiB)": 146.85, "step": 39490, "train_speed(iter/s)": 0.203126 }, { "acc": 0.75624304, "epoch": 0.9215649105411278, "grad_norm": 20.125, "learning_rate": 5.873366884896464e-06, "loss": 0.90593586, "memory(GiB)": 146.85, "step": 39500, "train_speed(iter/s)": 0.203152 }, { "epoch": 0.9215649105411278, "eval_acc": 0.7349217002959029, "eval_loss": 0.8345761299133301, "eval_runtime": 1262.8638, "eval_samples_per_second": 28.5, "eval_steps_per_second": 14.25, "step": 39500 }, { "acc": 0.77606373, "epoch": 0.9217982181134167, "grad_norm": 6.1875, "learning_rate": 5.871506753352e-06, "loss": 0.81913071, "memory(GiB)": 146.85, "step": 39510, "train_speed(iter/s)": 0.20184 }, { "acc": 0.77181306, "epoch": 0.9220315256857056, "grad_norm": 6.1875, "learning_rate": 5.869646497400199e-06, "loss": 0.83348227, "memory(GiB)": 146.85, "step": 39520, "train_speed(iter/s)": 0.201865 }, { "acc": 0.76507521, "epoch": 0.9222648332579945, "grad_norm": 6.1875, "learning_rate": 5.867786117306614e-06, "loss": 0.82907104, "memory(GiB)": 146.85, "step": 39530, "train_speed(iter/s)": 0.20189 }, { "acc": 0.784866, "epoch": 0.9224981408302834, "grad_norm": 12.5625, "learning_rate": 5.865925613336814e-06, "loss": 0.76955271, "memory(GiB)": 146.85, "step": 39540, "train_speed(iter/s)": 0.201918 }, { "acc": 0.78137083, "epoch": 0.9227314484025723, "grad_norm": 5.9375, "learning_rate": 5.864064985756382e-06, "loss": 0.79345675, "memory(GiB)": 146.85, "step": 39550, "train_speed(iter/s)": 0.201944 }, { "acc": 0.76771641, "epoch": 0.9229647559748612, "grad_norm": 5.34375, "learning_rate": 5.862204234830925e-06, "loss": 0.85262566, "memory(GiB)": 146.85, "step": 39560, "train_speed(iter/s)": 0.20197 }, { "acc": 0.77624984, "epoch": 0.92319806354715, "grad_norm": 6.03125, "learning_rate": 5.860343360826063e-06, "loss": 0.80240736, "memory(GiB)": 146.85, "step": 39570, "train_speed(iter/s)": 0.201997 }, { "acc": 0.7534668, "epoch": 0.9234313711194388, "grad_norm": 6.5625, "learning_rate": 5.858482364007438e-06, "loss": 0.89943542, "memory(GiB)": 146.85, "step": 39580, "train_speed(iter/s)": 0.202025 }, { "acc": 0.77811289, "epoch": 0.9236646786917277, "grad_norm": 4.625, "learning_rate": 5.856621244640704e-06, "loss": 0.80016041, "memory(GiB)": 146.85, "step": 39590, "train_speed(iter/s)": 0.20205 }, { "acc": 0.76305203, "epoch": 0.9238979862640166, "grad_norm": 6.6875, "learning_rate": 5.8547600029915366e-06, "loss": 0.86027775, "memory(GiB)": 146.85, "step": 39600, "train_speed(iter/s)": 0.202078 }, { "acc": 0.77195287, "epoch": 0.9241312938363055, "grad_norm": 4.96875, "learning_rate": 5.852898639325627e-06, "loss": 0.8309967, "memory(GiB)": 146.85, "step": 39610, "train_speed(iter/s)": 0.202105 }, { "acc": 0.76952076, "epoch": 0.9243646014085944, "grad_norm": 6.6875, "learning_rate": 5.851037153908684e-06, "loss": 0.82292118, "memory(GiB)": 146.85, "step": 39620, "train_speed(iter/s)": 0.202132 }, { "acc": 0.76826696, "epoch": 0.9245979089808833, "grad_norm": 5.96875, "learning_rate": 5.849175547006433e-06, "loss": 0.86258068, "memory(GiB)": 146.85, "step": 39630, "train_speed(iter/s)": 0.202158 }, { "acc": 0.76060019, "epoch": 0.9248312165531722, "grad_norm": 5.875, "learning_rate": 5.8473138188846216e-06, "loss": 0.88299456, "memory(GiB)": 146.85, "step": 39640, "train_speed(iter/s)": 0.202184 }, { "acc": 0.76728258, "epoch": 0.9250645241254611, "grad_norm": 6.15625, "learning_rate": 5.845451969809009e-06, "loss": 0.83118668, "memory(GiB)": 146.85, "step": 39650, "train_speed(iter/s)": 0.202212 }, { "acc": 0.79294872, "epoch": 0.92529783169775, "grad_norm": 6.25, "learning_rate": 5.843590000045372e-06, "loss": 0.73196487, "memory(GiB)": 146.85, "step": 39660, "train_speed(iter/s)": 0.202239 }, { "acc": 0.76522956, "epoch": 0.9255311392700389, "grad_norm": 4.53125, "learning_rate": 5.841727909859508e-06, "loss": 0.83901768, "memory(GiB)": 146.85, "step": 39670, "train_speed(iter/s)": 0.202266 }, { "acc": 0.75989633, "epoch": 0.9257644468423278, "grad_norm": 4.71875, "learning_rate": 5.83986569951723e-06, "loss": 0.86379471, "memory(GiB)": 146.85, "step": 39680, "train_speed(iter/s)": 0.202293 }, { "acc": 0.77182226, "epoch": 0.9259977544146167, "grad_norm": 5.84375, "learning_rate": 5.838003369284366e-06, "loss": 0.81743755, "memory(GiB)": 146.85, "step": 39690, "train_speed(iter/s)": 0.20232 }, { "acc": 0.75143824, "epoch": 0.9262310619869056, "grad_norm": 5.53125, "learning_rate": 5.836140919426765e-06, "loss": 0.90095015, "memory(GiB)": 146.85, "step": 39700, "train_speed(iter/s)": 0.202346 }, { "acc": 0.76700768, "epoch": 0.9264643695591945, "grad_norm": 5.03125, "learning_rate": 5.834278350210292e-06, "loss": 0.84240513, "memory(GiB)": 146.85, "step": 39710, "train_speed(iter/s)": 0.202373 }, { "acc": 0.76647167, "epoch": 0.9266976771314834, "grad_norm": 8.8125, "learning_rate": 5.832415661900826e-06, "loss": 0.84225559, "memory(GiB)": 146.85, "step": 39720, "train_speed(iter/s)": 0.202397 }, { "acc": 0.77392035, "epoch": 0.9269309847037723, "grad_norm": 5.5625, "learning_rate": 5.830552854764265e-06, "loss": 0.81516552, "memory(GiB)": 146.85, "step": 39730, "train_speed(iter/s)": 0.202424 }, { "acc": 0.76718721, "epoch": 0.9271642922760612, "grad_norm": 7.84375, "learning_rate": 5.828689929066526e-06, "loss": 0.84219847, "memory(GiB)": 146.85, "step": 39740, "train_speed(iter/s)": 0.20245 }, { "acc": 0.77618122, "epoch": 0.9273975998483501, "grad_norm": 6.40625, "learning_rate": 5.826826885073541e-06, "loss": 0.80854244, "memory(GiB)": 146.85, "step": 39750, "train_speed(iter/s)": 0.202476 }, { "acc": 0.76439171, "epoch": 0.9276309074206389, "grad_norm": 6.40625, "learning_rate": 5.824963723051258e-06, "loss": 0.8460001, "memory(GiB)": 146.85, "step": 39760, "train_speed(iter/s)": 0.202504 }, { "acc": 0.75975204, "epoch": 0.9278642149929278, "grad_norm": 6.21875, "learning_rate": 5.823100443265643e-06, "loss": 0.87356758, "memory(GiB)": 146.85, "step": 39770, "train_speed(iter/s)": 0.202532 }, { "acc": 0.74452553, "epoch": 0.9280975225652167, "grad_norm": 9.5, "learning_rate": 5.821237045982679e-06, "loss": 0.93069744, "memory(GiB)": 146.85, "step": 39780, "train_speed(iter/s)": 0.202559 }, { "acc": 0.7596323, "epoch": 0.9283308301375056, "grad_norm": 5.75, "learning_rate": 5.819373531468364e-06, "loss": 0.86987305, "memory(GiB)": 146.85, "step": 39790, "train_speed(iter/s)": 0.202587 }, { "acc": 0.76424141, "epoch": 0.9285641377097945, "grad_norm": 6.03125, "learning_rate": 5.817509899988717e-06, "loss": 0.83402643, "memory(GiB)": 146.85, "step": 39800, "train_speed(iter/s)": 0.202615 }, { "acc": 0.7697423, "epoch": 0.9287974452820834, "grad_norm": 5.6875, "learning_rate": 5.8156461518097695e-06, "loss": 0.83730049, "memory(GiB)": 146.85, "step": 39810, "train_speed(iter/s)": 0.202641 }, { "acc": 0.77487946, "epoch": 0.9290307528543723, "grad_norm": 7.0, "learning_rate": 5.813782287197569e-06, "loss": 0.816222, "memory(GiB)": 146.85, "step": 39820, "train_speed(iter/s)": 0.20267 }, { "acc": 0.75314579, "epoch": 0.9292640604266612, "grad_norm": 8.25, "learning_rate": 5.8119183064181864e-06, "loss": 0.86598263, "memory(GiB)": 146.85, "step": 39830, "train_speed(iter/s)": 0.202698 }, { "acc": 0.76025419, "epoch": 0.9294973679989501, "grad_norm": 5.5625, "learning_rate": 5.810054209737699e-06, "loss": 0.83338852, "memory(GiB)": 146.85, "step": 39840, "train_speed(iter/s)": 0.202724 }, { "acc": 0.77169752, "epoch": 0.929730675571239, "grad_norm": 5.8125, "learning_rate": 5.8081899974222076e-06, "loss": 0.82351465, "memory(GiB)": 146.85, "step": 39850, "train_speed(iter/s)": 0.202751 }, { "acc": 0.77204561, "epoch": 0.9299639831435279, "grad_norm": 4.4375, "learning_rate": 5.80632566973783e-06, "loss": 0.8352293, "memory(GiB)": 146.85, "step": 39860, "train_speed(iter/s)": 0.202778 }, { "acc": 0.78529234, "epoch": 0.9301972907158168, "grad_norm": 5.0625, "learning_rate": 5.804461226950697e-06, "loss": 0.75873003, "memory(GiB)": 146.85, "step": 39870, "train_speed(iter/s)": 0.202805 }, { "acc": 0.73621883, "epoch": 0.9304305982881057, "grad_norm": 6.84375, "learning_rate": 5.80259666932696e-06, "loss": 0.94032249, "memory(GiB)": 146.85, "step": 39880, "train_speed(iter/s)": 0.202831 }, { "acc": 0.76161823, "epoch": 0.9306639058603946, "grad_norm": 5.5, "learning_rate": 5.800731997132779e-06, "loss": 0.8574913, "memory(GiB)": 146.85, "step": 39890, "train_speed(iter/s)": 0.202857 }, { "acc": 0.75908208, "epoch": 0.9308972134326835, "grad_norm": 7.46875, "learning_rate": 5.7988672106343395e-06, "loss": 0.89242363, "memory(GiB)": 146.85, "step": 39900, "train_speed(iter/s)": 0.202881 }, { "acc": 0.77650137, "epoch": 0.9311305210049724, "grad_norm": 5.15625, "learning_rate": 5.797002310097836e-06, "loss": 0.79820113, "memory(GiB)": 146.85, "step": 39910, "train_speed(iter/s)": 0.202907 }, { "acc": 0.76492949, "epoch": 0.9313638285772613, "grad_norm": 6.78125, "learning_rate": 5.795137295789486e-06, "loss": 0.85303383, "memory(GiB)": 146.85, "step": 39920, "train_speed(iter/s)": 0.202933 }, { "acc": 0.767869, "epoch": 0.9315971361495502, "grad_norm": 6.65625, "learning_rate": 5.7932721679755164e-06, "loss": 0.83927212, "memory(GiB)": 146.85, "step": 39930, "train_speed(iter/s)": 0.202959 }, { "acc": 0.75990105, "epoch": 0.9318304437218391, "grad_norm": 5.46875, "learning_rate": 5.791406926922176e-06, "loss": 0.87577877, "memory(GiB)": 146.85, "step": 39940, "train_speed(iter/s)": 0.202984 }, { "acc": 0.78177586, "epoch": 0.9320637512941279, "grad_norm": 9.125, "learning_rate": 5.789541572895727e-06, "loss": 0.78908548, "memory(GiB)": 146.85, "step": 39950, "train_speed(iter/s)": 0.20301 }, { "acc": 0.78436365, "epoch": 0.9322970588664168, "grad_norm": 5.8125, "learning_rate": 5.787676106162449e-06, "loss": 0.78315215, "memory(GiB)": 146.85, "step": 39960, "train_speed(iter/s)": 0.203036 }, { "acc": 0.77697597, "epoch": 0.9325303664387057, "grad_norm": 5.78125, "learning_rate": 5.785810526988633e-06, "loss": 0.81889591, "memory(GiB)": 146.85, "step": 39970, "train_speed(iter/s)": 0.203062 }, { "acc": 0.76890898, "epoch": 0.9327636740109946, "grad_norm": 5.1875, "learning_rate": 5.783944835640594e-06, "loss": 0.8268096, "memory(GiB)": 146.85, "step": 39980, "train_speed(iter/s)": 0.203089 }, { "acc": 0.7768373, "epoch": 0.9329969815832835, "grad_norm": 7.375, "learning_rate": 5.7820790323846566e-06, "loss": 0.79889517, "memory(GiB)": 146.85, "step": 39990, "train_speed(iter/s)": 0.203114 }, { "acc": 0.77683601, "epoch": 0.9332302891555724, "grad_norm": 7.75, "learning_rate": 5.780213117487167e-06, "loss": 0.78550539, "memory(GiB)": 146.85, "step": 40000, "train_speed(iter/s)": 0.203142 }, { "epoch": 0.9332302891555724, "eval_acc": 0.7350586804087783, "eval_loss": 0.8345077037811279, "eval_runtime": 1263.0928, "eval_samples_per_second": 28.494, "eval_steps_per_second": 14.248, "step": 40000 }, { "acc": 0.76426716, "epoch": 0.9334635967278613, "grad_norm": 6.46875, "learning_rate": 5.778347091214479e-06, "loss": 0.84505501, "memory(GiB)": 146.85, "step": 40010, "train_speed(iter/s)": 0.201848 }, { "acc": 0.77210712, "epoch": 0.9336969043001502, "grad_norm": 5.25, "learning_rate": 5.77648095383297e-06, "loss": 0.81654444, "memory(GiB)": 146.85, "step": 40020, "train_speed(iter/s)": 0.201875 }, { "acc": 0.73951969, "epoch": 0.9339302118724391, "grad_norm": 7.6875, "learning_rate": 5.774614705609032e-06, "loss": 0.95846786, "memory(GiB)": 146.85, "step": 40030, "train_speed(iter/s)": 0.201901 }, { "acc": 0.75816975, "epoch": 0.934163519444728, "grad_norm": 6.4375, "learning_rate": 5.7727483468090686e-06, "loss": 0.87248631, "memory(GiB)": 146.85, "step": 40040, "train_speed(iter/s)": 0.201929 }, { "acc": 0.76317949, "epoch": 0.9343968270170169, "grad_norm": 5.34375, "learning_rate": 5.770881877699502e-06, "loss": 0.86371384, "memory(GiB)": 146.85, "step": 40050, "train_speed(iter/s)": 0.201954 }, { "acc": 0.76525884, "epoch": 0.9346301345893058, "grad_norm": 5.5, "learning_rate": 5.769015298546774e-06, "loss": 0.86429806, "memory(GiB)": 146.85, "step": 40060, "train_speed(iter/s)": 0.201979 }, { "acc": 0.77121162, "epoch": 0.9348634421615947, "grad_norm": 6.40625, "learning_rate": 5.7671486096173336e-06, "loss": 0.82614269, "memory(GiB)": 146.85, "step": 40070, "train_speed(iter/s)": 0.202005 }, { "acc": 0.75312915, "epoch": 0.9350967497338836, "grad_norm": 8.25, "learning_rate": 5.765281811177652e-06, "loss": 0.91864395, "memory(GiB)": 146.85, "step": 40080, "train_speed(iter/s)": 0.202032 }, { "acc": 0.77462292, "epoch": 0.9353300573061725, "grad_norm": 8.0625, "learning_rate": 5.763414903494216e-06, "loss": 0.80621033, "memory(GiB)": 146.85, "step": 40090, "train_speed(iter/s)": 0.202059 }, { "acc": 0.75789804, "epoch": 0.9355633648784614, "grad_norm": 5.65625, "learning_rate": 5.761547886833523e-06, "loss": 0.8938633, "memory(GiB)": 146.85, "step": 40100, "train_speed(iter/s)": 0.202086 }, { "acc": 0.79608498, "epoch": 0.9357966724507503, "grad_norm": 6.46875, "learning_rate": 5.759680761462091e-06, "loss": 0.72719617, "memory(GiB)": 146.85, "step": 40110, "train_speed(iter/s)": 0.202113 }, { "acc": 0.77557716, "epoch": 0.9360299800230392, "grad_norm": 5.28125, "learning_rate": 5.757813527646449e-06, "loss": 0.81121569, "memory(GiB)": 146.85, "step": 40120, "train_speed(iter/s)": 0.202139 }, { "acc": 0.75938282, "epoch": 0.9362632875953281, "grad_norm": 4.75, "learning_rate": 5.755946185653148e-06, "loss": 0.88282127, "memory(GiB)": 146.85, "step": 40130, "train_speed(iter/s)": 0.202163 }, { "acc": 0.78604479, "epoch": 0.936496595167617, "grad_norm": 5.0625, "learning_rate": 5.7540787357487485e-06, "loss": 0.7422514, "memory(GiB)": 146.85, "step": 40140, "train_speed(iter/s)": 0.202189 }, { "acc": 0.78074269, "epoch": 0.9367299027399058, "grad_norm": 6.09375, "learning_rate": 5.752211178199828e-06, "loss": 0.77197924, "memory(GiB)": 146.85, "step": 40150, "train_speed(iter/s)": 0.202214 }, { "acc": 0.77504449, "epoch": 0.9369632103121946, "grad_norm": 3.375, "learning_rate": 5.7503435132729805e-06, "loss": 0.81155329, "memory(GiB)": 146.85, "step": 40160, "train_speed(iter/s)": 0.20224 }, { "acc": 0.76467695, "epoch": 0.9371965178844835, "grad_norm": 5.25, "learning_rate": 5.7484757412348146e-06, "loss": 0.84097843, "memory(GiB)": 146.85, "step": 40170, "train_speed(iter/s)": 0.202266 }, { "acc": 0.76652241, "epoch": 0.9374298254567724, "grad_norm": 4.8125, "learning_rate": 5.746607862351955e-06, "loss": 0.85581923, "memory(GiB)": 146.85, "step": 40180, "train_speed(iter/s)": 0.20229 }, { "acc": 0.74475312, "epoch": 0.9376631330290613, "grad_norm": 5.34375, "learning_rate": 5.744739876891038e-06, "loss": 0.93076229, "memory(GiB)": 146.85, "step": 40190, "train_speed(iter/s)": 0.202316 }, { "acc": 0.76890383, "epoch": 0.9378964406013502, "grad_norm": 7.0, "learning_rate": 5.742871785118721e-06, "loss": 0.82948809, "memory(GiB)": 146.85, "step": 40200, "train_speed(iter/s)": 0.202341 }, { "acc": 0.79526205, "epoch": 0.9381297481736391, "grad_norm": 6.53125, "learning_rate": 5.741003587301673e-06, "loss": 0.73552303, "memory(GiB)": 146.85, "step": 40210, "train_speed(iter/s)": 0.202365 }, { "acc": 0.77026224, "epoch": 0.938363055745928, "grad_norm": 5.625, "learning_rate": 5.739135283706576e-06, "loss": 0.80831099, "memory(GiB)": 146.85, "step": 40220, "train_speed(iter/s)": 0.202391 }, { "acc": 0.76156125, "epoch": 0.9385963633182169, "grad_norm": 11.25, "learning_rate": 5.737266874600134e-06, "loss": 0.87495499, "memory(GiB)": 146.85, "step": 40230, "train_speed(iter/s)": 0.202417 }, { "acc": 0.74690375, "epoch": 0.9388296708905058, "grad_norm": 5.46875, "learning_rate": 5.735398360249059e-06, "loss": 0.90000725, "memory(GiB)": 146.85, "step": 40240, "train_speed(iter/s)": 0.202442 }, { "acc": 0.75985203, "epoch": 0.9390629784627947, "grad_norm": 5.28125, "learning_rate": 5.733529740920083e-06, "loss": 0.87258577, "memory(GiB)": 146.85, "step": 40250, "train_speed(iter/s)": 0.20247 }, { "acc": 0.76269197, "epoch": 0.9392962860350836, "grad_norm": 6.71875, "learning_rate": 5.731661016879948e-06, "loss": 0.85806065, "memory(GiB)": 146.85, "step": 40260, "train_speed(iter/s)": 0.202495 }, { "acc": 0.76502762, "epoch": 0.9395295936073725, "grad_norm": 4.78125, "learning_rate": 5.729792188395415e-06, "loss": 0.87360754, "memory(GiB)": 146.85, "step": 40270, "train_speed(iter/s)": 0.202519 }, { "acc": 0.75723481, "epoch": 0.9397629011796614, "grad_norm": 5.71875, "learning_rate": 5.7279232557332595e-06, "loss": 0.87182026, "memory(GiB)": 146.85, "step": 40280, "train_speed(iter/s)": 0.202544 }, { "acc": 0.77919178, "epoch": 0.9399962087519503, "grad_norm": 6.0, "learning_rate": 5.726054219160273e-06, "loss": 0.80217247, "memory(GiB)": 146.85, "step": 40290, "train_speed(iter/s)": 0.20257 }, { "acc": 0.76131048, "epoch": 0.9402295163242392, "grad_norm": 6.15625, "learning_rate": 5.7241850789432555e-06, "loss": 0.87087593, "memory(GiB)": 146.85, "step": 40300, "train_speed(iter/s)": 0.202596 }, { "acc": 0.76880655, "epoch": 0.9404628238965281, "grad_norm": 10.4375, "learning_rate": 5.722315835349029e-06, "loss": 0.8536644, "memory(GiB)": 146.85, "step": 40310, "train_speed(iter/s)": 0.202624 }, { "acc": 0.76798449, "epoch": 0.940696131468817, "grad_norm": 6.03125, "learning_rate": 5.7204464886444265e-06, "loss": 0.83634319, "memory(GiB)": 146.85, "step": 40320, "train_speed(iter/s)": 0.202649 }, { "acc": 0.77525043, "epoch": 0.9409294390411059, "grad_norm": 4.84375, "learning_rate": 5.718577039096297e-06, "loss": 0.81705418, "memory(GiB)": 146.85, "step": 40330, "train_speed(iter/s)": 0.202674 }, { "acc": 0.75897069, "epoch": 0.9411627466133947, "grad_norm": 5.84375, "learning_rate": 5.7167074869715045e-06, "loss": 0.86775265, "memory(GiB)": 146.85, "step": 40340, "train_speed(iter/s)": 0.202697 }, { "acc": 0.76806278, "epoch": 0.9413960541856836, "grad_norm": 8.5, "learning_rate": 5.714837832536926e-06, "loss": 0.82835445, "memory(GiB)": 146.85, "step": 40350, "train_speed(iter/s)": 0.202724 }, { "acc": 0.76433344, "epoch": 0.9416293617579725, "grad_norm": 6.625, "learning_rate": 5.712968076059454e-06, "loss": 0.86697807, "memory(GiB)": 146.85, "step": 40360, "train_speed(iter/s)": 0.202749 }, { "acc": 0.78628836, "epoch": 0.9418626693302614, "grad_norm": 5.09375, "learning_rate": 5.711098217805997e-06, "loss": 0.76739521, "memory(GiB)": 146.85, "step": 40370, "train_speed(iter/s)": 0.202774 }, { "acc": 0.77075644, "epoch": 0.9420959769025503, "grad_norm": 5.5, "learning_rate": 5.709228258043476e-06, "loss": 0.81600399, "memory(GiB)": 146.85, "step": 40380, "train_speed(iter/s)": 0.2028 }, { "acc": 0.77602282, "epoch": 0.9423292844748392, "grad_norm": 5.03125, "learning_rate": 5.707358197038827e-06, "loss": 0.80324802, "memory(GiB)": 146.85, "step": 40390, "train_speed(iter/s)": 0.202825 }, { "acc": 0.74663925, "epoch": 0.9425625920471281, "grad_norm": 5.78125, "learning_rate": 5.7054880350590015e-06, "loss": 0.93901882, "memory(GiB)": 146.85, "step": 40400, "train_speed(iter/s)": 0.202851 }, { "acc": 0.76735601, "epoch": 0.942795899619417, "grad_norm": 7.375, "learning_rate": 5.703617772370963e-06, "loss": 0.84188232, "memory(GiB)": 146.85, "step": 40410, "train_speed(iter/s)": 0.202876 }, { "acc": 0.75100937, "epoch": 0.9430292071917059, "grad_norm": 5.75, "learning_rate": 5.701747409241691e-06, "loss": 0.90833664, "memory(GiB)": 146.85, "step": 40420, "train_speed(iter/s)": 0.202904 }, { "acc": 0.77400541, "epoch": 0.9432625147639948, "grad_norm": 5.53125, "learning_rate": 5.699876945938182e-06, "loss": 0.82409258, "memory(GiB)": 146.85, "step": 40430, "train_speed(iter/s)": 0.202933 }, { "acc": 0.75598321, "epoch": 0.9434958223362837, "grad_norm": 5.59375, "learning_rate": 5.698006382727441e-06, "loss": 0.88970175, "memory(GiB)": 146.85, "step": 40440, "train_speed(iter/s)": 0.202959 }, { "acc": 0.74864187, "epoch": 0.9437291299085726, "grad_norm": 5.09375, "learning_rate": 5.696135719876492e-06, "loss": 0.91293869, "memory(GiB)": 146.85, "step": 40450, "train_speed(iter/s)": 0.202987 }, { "acc": 0.76714592, "epoch": 0.9439624374808615, "grad_norm": 5.0, "learning_rate": 5.694264957652373e-06, "loss": 0.82724104, "memory(GiB)": 146.85, "step": 40460, "train_speed(iter/s)": 0.203012 }, { "acc": 0.78075228, "epoch": 0.9441957450531504, "grad_norm": 6.5, "learning_rate": 5.692394096322131e-06, "loss": 0.80575037, "memory(GiB)": 146.85, "step": 40470, "train_speed(iter/s)": 0.203038 }, { "acc": 0.76540294, "epoch": 0.9444290526254393, "grad_norm": 5.125, "learning_rate": 5.690523136152834e-06, "loss": 0.84594517, "memory(GiB)": 146.85, "step": 40480, "train_speed(iter/s)": 0.203064 }, { "acc": 0.74121733, "epoch": 0.9446623601977282, "grad_norm": 8.6875, "learning_rate": 5.688652077411558e-06, "loss": 0.9167079, "memory(GiB)": 146.85, "step": 40490, "train_speed(iter/s)": 0.20309 }, { "acc": 0.76702595, "epoch": 0.9448956677700171, "grad_norm": 6.46875, "learning_rate": 5.6867809203654004e-06, "loss": 0.84274502, "memory(GiB)": 146.85, "step": 40500, "train_speed(iter/s)": 0.203116 }, { "epoch": 0.9448956677700171, "eval_acc": 0.7350081800844791, "eval_loss": 0.8343724608421326, "eval_runtime": 1263.4575, "eval_samples_per_second": 28.486, "eval_steps_per_second": 14.243, "step": 40500 }, { "acc": 0.76879339, "epoch": 0.945128975342306, "grad_norm": 5.375, "learning_rate": 5.684909665281465e-06, "loss": 0.83076572, "memory(GiB)": 146.85, "step": 40510, "train_speed(iter/s)": 0.201838 }, { "acc": 0.77703342, "epoch": 0.9453622829145949, "grad_norm": 4.875, "learning_rate": 5.683038312426873e-06, "loss": 0.79323826, "memory(GiB)": 146.85, "step": 40520, "train_speed(iter/s)": 0.201863 }, { "acc": 0.76318407, "epoch": 0.9455955904868837, "grad_norm": 6.53125, "learning_rate": 5.681166862068761e-06, "loss": 0.85659294, "memory(GiB)": 146.85, "step": 40530, "train_speed(iter/s)": 0.201889 }, { "acc": 0.76754179, "epoch": 0.9458288980591726, "grad_norm": 5.21875, "learning_rate": 5.679295314474278e-06, "loss": 0.8564827, "memory(GiB)": 146.85, "step": 40540, "train_speed(iter/s)": 0.201914 }, { "acc": 0.76044474, "epoch": 0.9460622056314615, "grad_norm": 5.5625, "learning_rate": 5.677423669910584e-06, "loss": 0.87136173, "memory(GiB)": 146.85, "step": 40550, "train_speed(iter/s)": 0.20194 }, { "acc": 0.78014479, "epoch": 0.9462955132037504, "grad_norm": 5.0625, "learning_rate": 5.67555192864486e-06, "loss": 0.79344311, "memory(GiB)": 146.85, "step": 40560, "train_speed(iter/s)": 0.201966 }, { "acc": 0.75469098, "epoch": 0.9465288207760393, "grad_norm": 4.59375, "learning_rate": 5.673680090944294e-06, "loss": 0.88895769, "memory(GiB)": 146.85, "step": 40570, "train_speed(iter/s)": 0.201993 }, { "acc": 0.7741993, "epoch": 0.9467621283483282, "grad_norm": 11.0, "learning_rate": 5.671808157076091e-06, "loss": 0.81307583, "memory(GiB)": 146.85, "step": 40580, "train_speed(iter/s)": 0.202014 }, { "acc": 0.75000648, "epoch": 0.9469954359206171, "grad_norm": 6.125, "learning_rate": 5.669936127307468e-06, "loss": 0.91796036, "memory(GiB)": 146.85, "step": 40590, "train_speed(iter/s)": 0.202039 }, { "acc": 0.76478934, "epoch": 0.947228743492906, "grad_norm": 7.21875, "learning_rate": 5.668064001905658e-06, "loss": 0.85637264, "memory(GiB)": 146.85, "step": 40600, "train_speed(iter/s)": 0.202065 }, { "acc": 0.76286554, "epoch": 0.9474620510651949, "grad_norm": 5.3125, "learning_rate": 5.666191781137905e-06, "loss": 0.8558691, "memory(GiB)": 146.85, "step": 40610, "train_speed(iter/s)": 0.202089 }, { "acc": 0.77602291, "epoch": 0.9476953586374838, "grad_norm": 4.71875, "learning_rate": 5.66431946527147e-06, "loss": 0.82535172, "memory(GiB)": 146.85, "step": 40620, "train_speed(iter/s)": 0.202114 }, { "acc": 0.77202206, "epoch": 0.9479286662097727, "grad_norm": 6.40625, "learning_rate": 5.662447054573624e-06, "loss": 0.81684761, "memory(GiB)": 146.85, "step": 40630, "train_speed(iter/s)": 0.20214 }, { "acc": 0.77196636, "epoch": 0.9481619737820616, "grad_norm": 5.0625, "learning_rate": 5.660574549311653e-06, "loss": 0.82266045, "memory(GiB)": 146.85, "step": 40640, "train_speed(iter/s)": 0.202166 }, { "acc": 0.77030272, "epoch": 0.9483952813543505, "grad_norm": 5.75, "learning_rate": 5.658701949752856e-06, "loss": 0.81909962, "memory(GiB)": 146.85, "step": 40650, "train_speed(iter/s)": 0.202189 }, { "acc": 0.74134116, "epoch": 0.9486285889266394, "grad_norm": 6.0, "learning_rate": 5.656829256164549e-06, "loss": 0.96631203, "memory(GiB)": 146.85, "step": 40660, "train_speed(iter/s)": 0.202214 }, { "acc": 0.76086178, "epoch": 0.9488618964989283, "grad_norm": 8.5625, "learning_rate": 5.6549564688140555e-06, "loss": 0.86997395, "memory(GiB)": 146.85, "step": 40670, "train_speed(iter/s)": 0.202242 }, { "acc": 0.78814526, "epoch": 0.9490952040712172, "grad_norm": 4.875, "learning_rate": 5.653083587968716e-06, "loss": 0.76020193, "memory(GiB)": 146.85, "step": 40680, "train_speed(iter/s)": 0.202268 }, { "acc": 0.7603097, "epoch": 0.9493285116435061, "grad_norm": 7.59375, "learning_rate": 5.651210613895885e-06, "loss": 0.86122742, "memory(GiB)": 146.85, "step": 40690, "train_speed(iter/s)": 0.202296 }, { "acc": 0.76155176, "epoch": 0.949561819215795, "grad_norm": 6.0625, "learning_rate": 5.649337546862927e-06, "loss": 0.86499615, "memory(GiB)": 146.85, "step": 40700, "train_speed(iter/s)": 0.202321 }, { "acc": 0.77271838, "epoch": 0.9497951267880839, "grad_norm": 4.5625, "learning_rate": 5.647464387137224e-06, "loss": 0.81805744, "memory(GiB)": 146.85, "step": 40710, "train_speed(iter/s)": 0.202346 }, { "acc": 0.7574667, "epoch": 0.9500284343603727, "grad_norm": 5.53125, "learning_rate": 5.645591134986166e-06, "loss": 0.87971783, "memory(GiB)": 146.85, "step": 40720, "train_speed(iter/s)": 0.202371 }, { "acc": 0.76335607, "epoch": 0.9502617419326616, "grad_norm": 9.625, "learning_rate": 5.643717790677162e-06, "loss": 0.87412224, "memory(GiB)": 146.85, "step": 40730, "train_speed(iter/s)": 0.202395 }, { "acc": 0.75022221, "epoch": 0.9504950495049505, "grad_norm": 6.4375, "learning_rate": 5.641844354477631e-06, "loss": 0.91665878, "memory(GiB)": 146.85, "step": 40740, "train_speed(iter/s)": 0.20242 }, { "acc": 0.75101633, "epoch": 0.9507283570772393, "grad_norm": 5.21875, "learning_rate": 5.639970826655005e-06, "loss": 0.89709568, "memory(GiB)": 146.85, "step": 40750, "train_speed(iter/s)": 0.202447 }, { "acc": 0.77892671, "epoch": 0.9509616646495282, "grad_norm": 4.53125, "learning_rate": 5.63809720747673e-06, "loss": 0.79566307, "memory(GiB)": 146.85, "step": 40760, "train_speed(iter/s)": 0.20247 }, { "acc": 0.77345943, "epoch": 0.9511949722218171, "grad_norm": 7.5, "learning_rate": 5.636223497210261e-06, "loss": 0.7961132, "memory(GiB)": 146.85, "step": 40770, "train_speed(iter/s)": 0.202497 }, { "acc": 0.76129465, "epoch": 0.951428279794106, "grad_norm": 6.75, "learning_rate": 5.634349696123075e-06, "loss": 0.87621574, "memory(GiB)": 146.85, "step": 40780, "train_speed(iter/s)": 0.202522 }, { "acc": 0.78088217, "epoch": 0.951661587366395, "grad_norm": 18.25, "learning_rate": 5.6324758044826535e-06, "loss": 0.7921072, "memory(GiB)": 146.85, "step": 40790, "train_speed(iter/s)": 0.202544 }, { "acc": 0.78042965, "epoch": 0.9518948949386838, "grad_norm": 5.25, "learning_rate": 5.6306018225564955e-06, "loss": 0.79977417, "memory(GiB)": 146.85, "step": 40800, "train_speed(iter/s)": 0.202568 }, { "acc": 0.75889955, "epoch": 0.9521282025109727, "grad_norm": 10.5, "learning_rate": 5.6287277506121084e-06, "loss": 0.87272005, "memory(GiB)": 146.85, "step": 40810, "train_speed(iter/s)": 0.202595 }, { "acc": 0.75821152, "epoch": 0.9523615100832616, "grad_norm": 5.4375, "learning_rate": 5.626853588917021e-06, "loss": 0.86497602, "memory(GiB)": 146.85, "step": 40820, "train_speed(iter/s)": 0.20262 }, { "acc": 0.77513609, "epoch": 0.9525948176555505, "grad_norm": 5.0, "learning_rate": 5.624979337738763e-06, "loss": 0.80202837, "memory(GiB)": 146.85, "step": 40830, "train_speed(iter/s)": 0.202646 }, { "acc": 0.77589493, "epoch": 0.9528281252278394, "grad_norm": 5.46875, "learning_rate": 5.623104997344886e-06, "loss": 0.81551495, "memory(GiB)": 146.85, "step": 40840, "train_speed(iter/s)": 0.202672 }, { "acc": 0.793435, "epoch": 0.9530614328001283, "grad_norm": 7.625, "learning_rate": 5.621230568002952e-06, "loss": 0.75108919, "memory(GiB)": 146.85, "step": 40850, "train_speed(iter/s)": 0.202697 }, { "acc": 0.76541109, "epoch": 0.9532947403724172, "grad_norm": 6.875, "learning_rate": 5.619356049980536e-06, "loss": 0.84592733, "memory(GiB)": 146.85, "step": 40860, "train_speed(iter/s)": 0.202726 }, { "acc": 0.75376549, "epoch": 0.9535280479447061, "grad_norm": 6.15625, "learning_rate": 5.617481443545223e-06, "loss": 0.89939823, "memory(GiB)": 146.85, "step": 40870, "train_speed(iter/s)": 0.202754 }, { "acc": 0.77550421, "epoch": 0.953761355516995, "grad_norm": 5.25, "learning_rate": 5.615606748964613e-06, "loss": 0.82564449, "memory(GiB)": 146.85, "step": 40880, "train_speed(iter/s)": 0.202779 }, { "acc": 0.76897359, "epoch": 0.9539946630892839, "grad_norm": 8.25, "learning_rate": 5.613731966506321e-06, "loss": 0.83825207, "memory(GiB)": 146.85, "step": 40890, "train_speed(iter/s)": 0.202804 }, { "acc": 0.76135731, "epoch": 0.9542279706615728, "grad_norm": 6.75, "learning_rate": 5.611857096437966e-06, "loss": 0.85582018, "memory(GiB)": 146.85, "step": 40900, "train_speed(iter/s)": 0.20283 }, { "acc": 0.76692095, "epoch": 0.9544612782338617, "grad_norm": 7.5625, "learning_rate": 5.60998213902719e-06, "loss": 0.87113705, "memory(GiB)": 146.85, "step": 40910, "train_speed(iter/s)": 0.202855 }, { "acc": 0.7739974, "epoch": 0.9546945858061505, "grad_norm": 6.4375, "learning_rate": 5.60810709454164e-06, "loss": 0.8285737, "memory(GiB)": 146.85, "step": 40920, "train_speed(iter/s)": 0.202881 }, { "acc": 0.76305838, "epoch": 0.9549278933784394, "grad_norm": 8.5, "learning_rate": 5.606231963248978e-06, "loss": 0.85952911, "memory(GiB)": 146.85, "step": 40930, "train_speed(iter/s)": 0.202907 }, { "acc": 0.76125917, "epoch": 0.9551612009507283, "grad_norm": 4.9375, "learning_rate": 5.60435674541688e-06, "loss": 0.86668005, "memory(GiB)": 146.85, "step": 40940, "train_speed(iter/s)": 0.202931 }, { "acc": 0.76277037, "epoch": 0.9553945085230172, "grad_norm": 5.4375, "learning_rate": 5.602481441313032e-06, "loss": 0.8552887, "memory(GiB)": 146.85, "step": 40950, "train_speed(iter/s)": 0.202957 }, { "acc": 0.76722813, "epoch": 0.9556278160953061, "grad_norm": 9.625, "learning_rate": 5.6006060512051355e-06, "loss": 0.85493565, "memory(GiB)": 146.85, "step": 40960, "train_speed(iter/s)": 0.202981 }, { "acc": 0.77087679, "epoch": 0.955861123667595, "grad_norm": 5.25, "learning_rate": 5.598730575360898e-06, "loss": 0.82372246, "memory(GiB)": 146.85, "step": 40970, "train_speed(iter/s)": 0.203007 }, { "acc": 0.77105575, "epoch": 0.9560944312398839, "grad_norm": 5.78125, "learning_rate": 5.596855014048045e-06, "loss": 0.82025394, "memory(GiB)": 146.85, "step": 40980, "train_speed(iter/s)": 0.203033 }, { "acc": 0.76818609, "epoch": 0.9563277388121728, "grad_norm": 5.96875, "learning_rate": 5.594979367534311e-06, "loss": 0.86420727, "memory(GiB)": 146.85, "step": 40990, "train_speed(iter/s)": 0.20306 }, { "acc": 0.7725831, "epoch": 0.9565610463844617, "grad_norm": 5.59375, "learning_rate": 5.593103636087446e-06, "loss": 0.81275692, "memory(GiB)": 146.85, "step": 41000, "train_speed(iter/s)": 0.203086 }, { "epoch": 0.9565610463844617, "eval_acc": 0.7349822038793284, "eval_loss": 0.8343979716300964, "eval_runtime": 1263.7796, "eval_samples_per_second": 28.479, "eval_steps_per_second": 14.24, "step": 41000 }, { "acc": 0.77463846, "epoch": 0.9567943539567506, "grad_norm": 5.8125, "learning_rate": 5.591227819975209e-06, "loss": 0.80327339, "memory(GiB)": 146.85, "step": 41010, "train_speed(iter/s)": 0.201823 }, { "acc": 0.78184681, "epoch": 0.9570276615290395, "grad_norm": 6.78125, "learning_rate": 5.589351919465373e-06, "loss": 0.80083857, "memory(GiB)": 146.85, "step": 41020, "train_speed(iter/s)": 0.201847 }, { "acc": 0.76846457, "epoch": 0.9572609691013284, "grad_norm": 4.96875, "learning_rate": 5.587475934825721e-06, "loss": 0.84973316, "memory(GiB)": 146.85, "step": 41030, "train_speed(iter/s)": 0.201872 }, { "acc": 0.78798761, "epoch": 0.9574942766736173, "grad_norm": 4.875, "learning_rate": 5.585599866324052e-06, "loss": 0.74825821, "memory(GiB)": 146.85, "step": 41040, "train_speed(iter/s)": 0.201896 }, { "acc": 0.77331343, "epoch": 0.9577275842459062, "grad_norm": 5.75, "learning_rate": 5.583723714228169e-06, "loss": 0.81054382, "memory(GiB)": 146.85, "step": 41050, "train_speed(iter/s)": 0.201921 }, { "acc": 0.75995092, "epoch": 0.9579608918181951, "grad_norm": 5.25, "learning_rate": 5.581847478805898e-06, "loss": 0.87824345, "memory(GiB)": 146.85, "step": 41060, "train_speed(iter/s)": 0.201945 }, { "acc": 0.76433544, "epoch": 0.958194199390484, "grad_norm": 4.625, "learning_rate": 5.579971160325066e-06, "loss": 0.85493202, "memory(GiB)": 146.85, "step": 41070, "train_speed(iter/s)": 0.201971 }, { "acc": 0.78005247, "epoch": 0.9584275069627729, "grad_norm": 5.8125, "learning_rate": 5.578094759053521e-06, "loss": 0.80216808, "memory(GiB)": 146.85, "step": 41080, "train_speed(iter/s)": 0.201998 }, { "acc": 0.7557415, "epoch": 0.9586608145350618, "grad_norm": 6.5625, "learning_rate": 5.576218275259116e-06, "loss": 0.90051384, "memory(GiB)": 146.85, "step": 41090, "train_speed(iter/s)": 0.202024 }, { "acc": 0.7912075, "epoch": 0.9588941221073507, "grad_norm": 6.78125, "learning_rate": 5.574341709209721e-06, "loss": 0.7509913, "memory(GiB)": 146.85, "step": 41100, "train_speed(iter/s)": 0.20205 }, { "acc": 0.78250074, "epoch": 0.9591274296796395, "grad_norm": 4.59375, "learning_rate": 5.572465061173215e-06, "loss": 0.79123855, "memory(GiB)": 146.85, "step": 41110, "train_speed(iter/s)": 0.202077 }, { "acc": 0.7715601, "epoch": 0.9593607372519284, "grad_norm": 5.875, "learning_rate": 5.5705883314174845e-06, "loss": 0.81324301, "memory(GiB)": 146.85, "step": 41120, "train_speed(iter/s)": 0.202103 }, { "acc": 0.77603507, "epoch": 0.9595940448242173, "grad_norm": 5.34375, "learning_rate": 5.568711520210437e-06, "loss": 0.81781864, "memory(GiB)": 146.85, "step": 41130, "train_speed(iter/s)": 0.202129 }, { "acc": 0.75903053, "epoch": 0.9598273523965062, "grad_norm": 5.90625, "learning_rate": 5.566834627819986e-06, "loss": 0.87466373, "memory(GiB)": 146.85, "step": 41140, "train_speed(iter/s)": 0.202155 }, { "acc": 0.77237949, "epoch": 0.9600606599687951, "grad_norm": 7.71875, "learning_rate": 5.564957654514055e-06, "loss": 0.84418812, "memory(GiB)": 146.85, "step": 41150, "train_speed(iter/s)": 0.20218 }, { "acc": 0.7625782, "epoch": 0.960293967541084, "grad_norm": 3.859375, "learning_rate": 5.563080600560584e-06, "loss": 0.84532719, "memory(GiB)": 146.85, "step": 41160, "train_speed(iter/s)": 0.202209 }, { "acc": 0.77680664, "epoch": 0.9605272751133729, "grad_norm": 5.4375, "learning_rate": 5.5612034662275205e-06, "loss": 0.81075726, "memory(GiB)": 146.85, "step": 41170, "train_speed(iter/s)": 0.202234 }, { "acc": 0.75513387, "epoch": 0.9607605826856618, "grad_norm": 4.46875, "learning_rate": 5.559326251782825e-06, "loss": 0.87801552, "memory(GiB)": 146.85, "step": 41180, "train_speed(iter/s)": 0.20226 }, { "acc": 0.76070414, "epoch": 0.9609938902579507, "grad_norm": 6.21875, "learning_rate": 5.55744895749447e-06, "loss": 0.87253017, "memory(GiB)": 146.85, "step": 41190, "train_speed(iter/s)": 0.202285 }, { "acc": 0.76914644, "epoch": 0.9612271978302396, "grad_norm": 6.09375, "learning_rate": 5.555571583630439e-06, "loss": 0.83227539, "memory(GiB)": 146.85, "step": 41200, "train_speed(iter/s)": 0.20231 }, { "acc": 0.77785153, "epoch": 0.9614605054025285, "grad_norm": 6.3125, "learning_rate": 5.553694130458725e-06, "loss": 0.80200777, "memory(GiB)": 146.85, "step": 41210, "train_speed(iter/s)": 0.202336 }, { "acc": 0.75367651, "epoch": 0.9616938129748174, "grad_norm": 5.875, "learning_rate": 5.551816598247334e-06, "loss": 0.90671749, "memory(GiB)": 146.85, "step": 41220, "train_speed(iter/s)": 0.20236 }, { "acc": 0.7913641, "epoch": 0.9619271205471063, "grad_norm": 5.8125, "learning_rate": 5.549938987264284e-06, "loss": 0.74341764, "memory(GiB)": 146.85, "step": 41230, "train_speed(iter/s)": 0.202384 }, { "acc": 0.79213161, "epoch": 0.9621604281193952, "grad_norm": 6.4375, "learning_rate": 5.548061297777604e-06, "loss": 0.7500237, "memory(GiB)": 146.85, "step": 41240, "train_speed(iter/s)": 0.202409 }, { "acc": 0.77722778, "epoch": 0.9623937356916841, "grad_norm": 5.03125, "learning_rate": 5.546183530055334e-06, "loss": 0.79778366, "memory(GiB)": 146.85, "step": 41250, "train_speed(iter/s)": 0.202434 }, { "acc": 0.74577351, "epoch": 0.962627043263973, "grad_norm": 5.5625, "learning_rate": 5.544305684365522e-06, "loss": 0.93410463, "memory(GiB)": 146.85, "step": 41260, "train_speed(iter/s)": 0.202461 }, { "acc": 0.76189752, "epoch": 0.9628603508362619, "grad_norm": 6.5625, "learning_rate": 5.542427760976232e-06, "loss": 0.85853777, "memory(GiB)": 146.85, "step": 41270, "train_speed(iter/s)": 0.202487 }, { "acc": 0.7638546, "epoch": 0.9630936584085508, "grad_norm": 7.9375, "learning_rate": 5.540549760155537e-06, "loss": 0.89324217, "memory(GiB)": 146.85, "step": 41280, "train_speed(iter/s)": 0.202512 }, { "acc": 0.77508364, "epoch": 0.9633269659808397, "grad_norm": 4.03125, "learning_rate": 5.53867168217152e-06, "loss": 0.80854826, "memory(GiB)": 146.85, "step": 41290, "train_speed(iter/s)": 0.202533 }, { "acc": 0.76533318, "epoch": 0.9635602735531285, "grad_norm": 5.125, "learning_rate": 5.536793527292278e-06, "loss": 0.84807873, "memory(GiB)": 146.85, "step": 41300, "train_speed(iter/s)": 0.202558 }, { "acc": 0.75640626, "epoch": 0.9637935811254174, "grad_norm": 6.71875, "learning_rate": 5.5349152957859155e-06, "loss": 0.8727808, "memory(GiB)": 146.85, "step": 41310, "train_speed(iter/s)": 0.202582 }, { "acc": 0.76438446, "epoch": 0.9640268886977063, "grad_norm": 4.75, "learning_rate": 5.53303698792055e-06, "loss": 0.86975384, "memory(GiB)": 146.85, "step": 41320, "train_speed(iter/s)": 0.202608 }, { "acc": 0.7579072, "epoch": 0.9642601962699952, "grad_norm": 8.4375, "learning_rate": 5.531158603964309e-06, "loss": 0.87389364, "memory(GiB)": 146.85, "step": 41330, "train_speed(iter/s)": 0.202634 }, { "acc": 0.76368313, "epoch": 0.964493503842284, "grad_norm": 5.59375, "learning_rate": 5.529280144185331e-06, "loss": 0.85823898, "memory(GiB)": 146.85, "step": 41340, "train_speed(iter/s)": 0.202659 }, { "acc": 0.76249714, "epoch": 0.964726811414573, "grad_norm": 7.5625, "learning_rate": 5.5274016088517676e-06, "loss": 0.85808716, "memory(GiB)": 146.85, "step": 41350, "train_speed(iter/s)": 0.202684 }, { "acc": 0.76950188, "epoch": 0.9649601189868618, "grad_norm": 5.71875, "learning_rate": 5.525522998231777e-06, "loss": 0.83581066, "memory(GiB)": 146.85, "step": 41360, "train_speed(iter/s)": 0.20271 }, { "acc": 0.78802104, "epoch": 0.9651934265591507, "grad_norm": 5.15625, "learning_rate": 5.523644312593533e-06, "loss": 0.73156481, "memory(GiB)": 146.85, "step": 41370, "train_speed(iter/s)": 0.202735 }, { "acc": 0.77203388, "epoch": 0.9654267341314396, "grad_norm": 4.3125, "learning_rate": 5.521765552205213e-06, "loss": 0.80846348, "memory(GiB)": 146.85, "step": 41380, "train_speed(iter/s)": 0.202762 }, { "acc": 0.76575894, "epoch": 0.9656600417037285, "grad_norm": 5.5, "learning_rate": 5.519886717335012e-06, "loss": 0.85318937, "memory(GiB)": 146.85, "step": 41390, "train_speed(iter/s)": 0.202786 }, { "acc": 0.74892769, "epoch": 0.9658933492760174, "grad_norm": 5.96875, "learning_rate": 5.518007808251135e-06, "loss": 0.90718517, "memory(GiB)": 146.85, "step": 41400, "train_speed(iter/s)": 0.202812 }, { "acc": 0.78149261, "epoch": 0.9661266568483063, "grad_norm": 8.1875, "learning_rate": 5.516128825221792e-06, "loss": 0.76758795, "memory(GiB)": 146.85, "step": 41410, "train_speed(iter/s)": 0.202838 }, { "acc": 0.75409966, "epoch": 0.9663599644205952, "grad_norm": 5.3125, "learning_rate": 5.514249768515209e-06, "loss": 0.89690409, "memory(GiB)": 146.85, "step": 41420, "train_speed(iter/s)": 0.202862 }, { "acc": 0.76686411, "epoch": 0.9665932719928841, "grad_norm": 8.625, "learning_rate": 5.512370638399622e-06, "loss": 0.86925259, "memory(GiB)": 146.85, "step": 41430, "train_speed(iter/s)": 0.202886 }, { "acc": 0.76910534, "epoch": 0.966826579565173, "grad_norm": 10.25, "learning_rate": 5.510491435143275e-06, "loss": 0.81182756, "memory(GiB)": 146.85, "step": 41440, "train_speed(iter/s)": 0.202909 }, { "acc": 0.7776391, "epoch": 0.9670598871374619, "grad_norm": 7.75, "learning_rate": 5.508612159014424e-06, "loss": 0.80778675, "memory(GiB)": 146.85, "step": 41450, "train_speed(iter/s)": 0.202936 }, { "acc": 0.7659112, "epoch": 0.9672931947097508, "grad_norm": 6.59375, "learning_rate": 5.506732810281335e-06, "loss": 0.8531641, "memory(GiB)": 146.85, "step": 41460, "train_speed(iter/s)": 0.202962 }, { "acc": 0.75308437, "epoch": 0.9675265022820397, "grad_norm": 8.4375, "learning_rate": 5.504853389212285e-06, "loss": 0.90871124, "memory(GiB)": 146.85, "step": 41470, "train_speed(iter/s)": 0.202987 }, { "acc": 0.78673964, "epoch": 0.9677598098543286, "grad_norm": 6.21875, "learning_rate": 5.502973896075559e-06, "loss": 0.76346426, "memory(GiB)": 146.85, "step": 41480, "train_speed(iter/s)": 0.203012 }, { "acc": 0.76796374, "epoch": 0.9679931174266175, "grad_norm": 6.59375, "learning_rate": 5.501094331139457e-06, "loss": 0.82095699, "memory(GiB)": 146.85, "step": 41490, "train_speed(iter/s)": 0.203038 }, { "acc": 0.76164951, "epoch": 0.9682264249989063, "grad_norm": 5.28125, "learning_rate": 5.499214694672283e-06, "loss": 0.85442295, "memory(GiB)": 146.85, "step": 41500, "train_speed(iter/s)": 0.203062 }, { "epoch": 0.9682264249989063, "eval_acc": 0.7350752987263592, "eval_loss": 0.8342850208282471, "eval_runtime": 1264.5428, "eval_samples_per_second": 28.462, "eval_steps_per_second": 14.231, "step": 41500 }, { "acc": 0.76850939, "epoch": 0.9684597325711952, "grad_norm": 4.9375, "learning_rate": 5.497334986942358e-06, "loss": 0.83257713, "memory(GiB)": 146.85, "step": 41510, "train_speed(iter/s)": 0.201813 }, { "acc": 0.75029321, "epoch": 0.9686930401434841, "grad_norm": 5.375, "learning_rate": 5.495455208218008e-06, "loss": 0.90280247, "memory(GiB)": 146.85, "step": 41520, "train_speed(iter/s)": 0.201838 }, { "acc": 0.75492835, "epoch": 0.968926347715773, "grad_norm": 6.6875, "learning_rate": 5.493575358767571e-06, "loss": 0.88161411, "memory(GiB)": 146.85, "step": 41530, "train_speed(iter/s)": 0.201863 }, { "acc": 0.78265538, "epoch": 0.9691596552880619, "grad_norm": 5.5625, "learning_rate": 5.491695438859394e-06, "loss": 0.79777527, "memory(GiB)": 146.85, "step": 41540, "train_speed(iter/s)": 0.201888 }, { "acc": 0.76806545, "epoch": 0.9693929628603508, "grad_norm": 6.15625, "learning_rate": 5.489815448761837e-06, "loss": 0.83888817, "memory(GiB)": 146.85, "step": 41550, "train_speed(iter/s)": 0.201915 }, { "acc": 0.79108734, "epoch": 0.9696262704326397, "grad_norm": 5.78125, "learning_rate": 5.487935388743266e-06, "loss": 0.7496654, "memory(GiB)": 146.85, "step": 41560, "train_speed(iter/s)": 0.201939 }, { "acc": 0.76397147, "epoch": 0.9698595780049286, "grad_norm": 11.375, "learning_rate": 5.486055259072059e-06, "loss": 0.84092026, "memory(GiB)": 146.85, "step": 41570, "train_speed(iter/s)": 0.201965 }, { "acc": 0.75412369, "epoch": 0.9700928855772175, "grad_norm": 6.0625, "learning_rate": 5.484175060016607e-06, "loss": 0.86975746, "memory(GiB)": 146.85, "step": 41580, "train_speed(iter/s)": 0.20199 }, { "acc": 0.78792439, "epoch": 0.9703261931495064, "grad_norm": 7.03125, "learning_rate": 5.482294791845305e-06, "loss": 0.74036889, "memory(GiB)": 146.85, "step": 41590, "train_speed(iter/s)": 0.202015 }, { "acc": 0.7832056, "epoch": 0.9705595007217953, "grad_norm": 5.125, "learning_rate": 5.480414454826563e-06, "loss": 0.77135987, "memory(GiB)": 146.85, "step": 41600, "train_speed(iter/s)": 0.202038 }, { "acc": 0.76343384, "epoch": 0.9707928082940842, "grad_norm": 5.53125, "learning_rate": 5.478534049228794e-06, "loss": 0.85995426, "memory(GiB)": 146.85, "step": 41610, "train_speed(iter/s)": 0.202064 }, { "acc": 0.77008839, "epoch": 0.9710261158663731, "grad_norm": 5.78125, "learning_rate": 5.476653575320432e-06, "loss": 0.81860237, "memory(GiB)": 146.85, "step": 41620, "train_speed(iter/s)": 0.202089 }, { "acc": 0.76442561, "epoch": 0.971259423438662, "grad_norm": 9.875, "learning_rate": 5.474773033369908e-06, "loss": 0.85700989, "memory(GiB)": 146.85, "step": 41630, "train_speed(iter/s)": 0.202112 }, { "acc": 0.79297767, "epoch": 0.9714927310109509, "grad_norm": 7.25, "learning_rate": 5.472892423645673e-06, "loss": 0.75309258, "memory(GiB)": 146.85, "step": 41640, "train_speed(iter/s)": 0.202138 }, { "acc": 0.78030109, "epoch": 0.9717260385832398, "grad_norm": 8.8125, "learning_rate": 5.47101174641618e-06, "loss": 0.78893614, "memory(GiB)": 146.85, "step": 41650, "train_speed(iter/s)": 0.202163 }, { "acc": 0.76028423, "epoch": 0.9719593461555287, "grad_norm": 6.625, "learning_rate": 5.469131001949899e-06, "loss": 0.89322453, "memory(GiB)": 146.85, "step": 41660, "train_speed(iter/s)": 0.202189 }, { "acc": 0.77747183, "epoch": 0.9721926537278176, "grad_norm": 7.875, "learning_rate": 5.467250190515303e-06, "loss": 0.8205121, "memory(GiB)": 146.85, "step": 41670, "train_speed(iter/s)": 0.202214 }, { "acc": 0.78137574, "epoch": 0.9724259613001065, "grad_norm": 6.96875, "learning_rate": 5.465369312380879e-06, "loss": 0.79352617, "memory(GiB)": 146.85, "step": 41680, "train_speed(iter/s)": 0.20224 }, { "acc": 0.77266665, "epoch": 0.9726592688723953, "grad_norm": 6.625, "learning_rate": 5.463488367815119e-06, "loss": 0.85282888, "memory(GiB)": 146.85, "step": 41690, "train_speed(iter/s)": 0.202265 }, { "acc": 0.77782087, "epoch": 0.9728925764446842, "grad_norm": 5.40625, "learning_rate": 5.46160735708653e-06, "loss": 0.81585226, "memory(GiB)": 146.85, "step": 41700, "train_speed(iter/s)": 0.202289 }, { "acc": 0.77884874, "epoch": 0.9731258840169731, "grad_norm": 6.75, "learning_rate": 5.459726280463625e-06, "loss": 0.80125637, "memory(GiB)": 146.85, "step": 41710, "train_speed(iter/s)": 0.202314 }, { "acc": 0.7937582, "epoch": 0.973359191589262, "grad_norm": 5.5625, "learning_rate": 5.4578451382149275e-06, "loss": 0.7342762, "memory(GiB)": 146.85, "step": 41720, "train_speed(iter/s)": 0.202338 }, { "acc": 0.77932258, "epoch": 0.9735924991615509, "grad_norm": 10.1875, "learning_rate": 5.455963930608969e-06, "loss": 0.78439808, "memory(GiB)": 146.85, "step": 41730, "train_speed(iter/s)": 0.202363 }, { "acc": 0.77118759, "epoch": 0.9738258067338398, "grad_norm": 5.75, "learning_rate": 5.454082657914292e-06, "loss": 0.82466145, "memory(GiB)": 146.85, "step": 41740, "train_speed(iter/s)": 0.202387 }, { "acc": 0.77372656, "epoch": 0.9740591143061287, "grad_norm": 8.5625, "learning_rate": 5.452201320399447e-06, "loss": 0.8068984, "memory(GiB)": 146.85, "step": 41750, "train_speed(iter/s)": 0.202411 }, { "acc": 0.78595514, "epoch": 0.9742924218784176, "grad_norm": 7.59375, "learning_rate": 5.450319918332995e-06, "loss": 0.75981498, "memory(GiB)": 146.85, "step": 41760, "train_speed(iter/s)": 0.202436 }, { "acc": 0.7740561, "epoch": 0.9745257294507065, "grad_norm": 5.96875, "learning_rate": 5.448438451983507e-06, "loss": 0.82321033, "memory(GiB)": 146.85, "step": 41770, "train_speed(iter/s)": 0.202462 }, { "acc": 0.78715658, "epoch": 0.9747590370229954, "grad_norm": 5.4375, "learning_rate": 5.4465569216195576e-06, "loss": 0.78344431, "memory(GiB)": 146.85, "step": 41780, "train_speed(iter/s)": 0.202487 }, { "acc": 0.76964197, "epoch": 0.9749923445952843, "grad_norm": 5.8125, "learning_rate": 5.444675327509738e-06, "loss": 0.81264076, "memory(GiB)": 146.85, "step": 41790, "train_speed(iter/s)": 0.202512 }, { "acc": 0.77464685, "epoch": 0.9752256521675732, "grad_norm": 8.875, "learning_rate": 5.4427936699226455e-06, "loss": 0.79836607, "memory(GiB)": 146.85, "step": 41800, "train_speed(iter/s)": 0.202536 }, { "acc": 0.77787471, "epoch": 0.9754589597398621, "grad_norm": 6.9375, "learning_rate": 5.440911949126885e-06, "loss": 0.81084003, "memory(GiB)": 146.85, "step": 41810, "train_speed(iter/s)": 0.202561 }, { "acc": 0.78694754, "epoch": 0.975692267312151, "grad_norm": 10.5625, "learning_rate": 5.4390301653910726e-06, "loss": 0.74757605, "memory(GiB)": 146.85, "step": 41820, "train_speed(iter/s)": 0.202586 }, { "acc": 0.76876125, "epoch": 0.9759255748844399, "grad_norm": 5.34375, "learning_rate": 5.4371483189838315e-06, "loss": 0.83725672, "memory(GiB)": 146.85, "step": 41830, "train_speed(iter/s)": 0.202612 }, { "acc": 0.74538679, "epoch": 0.9761588824567288, "grad_norm": 7.6875, "learning_rate": 5.435266410173794e-06, "loss": 0.90994329, "memory(GiB)": 146.85, "step": 41840, "train_speed(iter/s)": 0.202637 }, { "acc": 0.74019022, "epoch": 0.9763921900290177, "grad_norm": 5.90625, "learning_rate": 5.433384439229603e-06, "loss": 0.93858433, "memory(GiB)": 146.85, "step": 41850, "train_speed(iter/s)": 0.202662 }, { "acc": 0.77913074, "epoch": 0.9766254976013066, "grad_norm": 6.21875, "learning_rate": 5.431502406419908e-06, "loss": 0.82655201, "memory(GiB)": 146.85, "step": 41860, "train_speed(iter/s)": 0.202686 }, { "acc": 0.78760228, "epoch": 0.9768588051735955, "grad_norm": 9.75, "learning_rate": 5.429620312013372e-06, "loss": 0.7640677, "memory(GiB)": 146.85, "step": 41870, "train_speed(iter/s)": 0.202712 }, { "acc": 0.76670876, "epoch": 0.9770921127458843, "grad_norm": 7.71875, "learning_rate": 5.427738156278662e-06, "loss": 0.85035973, "memory(GiB)": 146.85, "step": 41880, "train_speed(iter/s)": 0.202738 }, { "acc": 0.76901255, "epoch": 0.9773254203181732, "grad_norm": 4.5625, "learning_rate": 5.4258559394844515e-06, "loss": 0.82234755, "memory(GiB)": 146.85, "step": 41890, "train_speed(iter/s)": 0.202764 }, { "acc": 0.7419385, "epoch": 0.977558727890462, "grad_norm": 11.0, "learning_rate": 5.423973661899431e-06, "loss": 0.93633137, "memory(GiB)": 146.85, "step": 41900, "train_speed(iter/s)": 0.20279 }, { "acc": 0.77764921, "epoch": 0.977792035462751, "grad_norm": 7.0, "learning_rate": 5.4220913237922936e-06, "loss": 0.78978491, "memory(GiB)": 146.85, "step": 41910, "train_speed(iter/s)": 0.202814 }, { "acc": 0.76253948, "epoch": 0.9780253430350399, "grad_norm": 24.25, "learning_rate": 5.4202089254317415e-06, "loss": 0.8930316, "memory(GiB)": 146.85, "step": 41920, "train_speed(iter/s)": 0.202839 }, { "acc": 0.75747943, "epoch": 0.9782586506073288, "grad_norm": 5.5, "learning_rate": 5.418326467086488e-06, "loss": 0.88021317, "memory(GiB)": 146.85, "step": 41930, "train_speed(iter/s)": 0.202864 }, { "acc": 0.73700428, "epoch": 0.9784919581796176, "grad_norm": 5.6875, "learning_rate": 5.416443949025253e-06, "loss": 0.95929623, "memory(GiB)": 146.85, "step": 41940, "train_speed(iter/s)": 0.20289 }, { "acc": 0.75944109, "epoch": 0.9787252657519065, "grad_norm": 6.28125, "learning_rate": 5.414561371516764e-06, "loss": 0.89794159, "memory(GiB)": 146.85, "step": 41950, "train_speed(iter/s)": 0.202914 }, { "acc": 0.78869295, "epoch": 0.9789585733241954, "grad_norm": 5.5625, "learning_rate": 5.41267873482976e-06, "loss": 0.76782894, "memory(GiB)": 146.85, "step": 41960, "train_speed(iter/s)": 0.202937 }, { "acc": 0.76911068, "epoch": 0.9791918808964843, "grad_norm": 5.9375, "learning_rate": 5.410796039232989e-06, "loss": 0.81421261, "memory(GiB)": 146.85, "step": 41970, "train_speed(iter/s)": 0.202961 }, { "acc": 0.77965732, "epoch": 0.9794251884687732, "grad_norm": 6.6875, "learning_rate": 5.4089132849952e-06, "loss": 0.81585827, "memory(GiB)": 146.85, "step": 41980, "train_speed(iter/s)": 0.202985 }, { "acc": 0.77741079, "epoch": 0.9796584960410621, "grad_norm": 4.71875, "learning_rate": 5.407030472385158e-06, "loss": 0.7857789, "memory(GiB)": 146.85, "step": 41990, "train_speed(iter/s)": 0.203009 }, { "acc": 0.77037249, "epoch": 0.979891803613351, "grad_norm": 9.3125, "learning_rate": 5.4051476016716365e-06, "loss": 0.82318439, "memory(GiB)": 146.85, "step": 42000, "train_speed(iter/s)": 0.203035 }, { "epoch": 0.979891803613351, "eval_acc": 0.7350502905785433, "eval_loss": 0.8342345356941223, "eval_runtime": 1263.7978, "eval_samples_per_second": 28.478, "eval_steps_per_second": 14.24, "step": 42000 }, { "acc": 0.79304676, "epoch": 0.9801251111856399, "grad_norm": 5.53125, "learning_rate": 5.4032646731234115e-06, "loss": 0.74371376, "memory(GiB)": 146.85, "step": 42010, "train_speed(iter/s)": 0.201801 }, { "acc": 0.79099517, "epoch": 0.9803584187579288, "grad_norm": 6.5625, "learning_rate": 5.401381687009271e-06, "loss": 0.75646496, "memory(GiB)": 146.85, "step": 42020, "train_speed(iter/s)": 0.201827 }, { "acc": 0.77155228, "epoch": 0.9805917263302177, "grad_norm": 6.34375, "learning_rate": 5.399498643598011e-06, "loss": 0.82130165, "memory(GiB)": 146.85, "step": 42030, "train_speed(iter/s)": 0.201853 }, { "acc": 0.77557516, "epoch": 0.9808250339025066, "grad_norm": 6.34375, "learning_rate": 5.3976155431584375e-06, "loss": 0.80665817, "memory(GiB)": 146.85, "step": 42040, "train_speed(iter/s)": 0.201877 }, { "acc": 0.79300051, "epoch": 0.9810583414747955, "grad_norm": 6.34375, "learning_rate": 5.3957323859593604e-06, "loss": 0.74045367, "memory(GiB)": 146.85, "step": 42050, "train_speed(iter/s)": 0.201903 }, { "acc": 0.76051526, "epoch": 0.9812916490470844, "grad_norm": 6.53125, "learning_rate": 5.3938491722695996e-06, "loss": 0.86525126, "memory(GiB)": 146.85, "step": 42060, "train_speed(iter/s)": 0.201928 }, { "acc": 0.77385149, "epoch": 0.9815249566193732, "grad_norm": 15.0, "learning_rate": 5.391965902357983e-06, "loss": 0.83429775, "memory(GiB)": 146.85, "step": 42070, "train_speed(iter/s)": 0.201955 }, { "acc": 0.78118372, "epoch": 0.9817582641916621, "grad_norm": 6.3125, "learning_rate": 5.390082576493348e-06, "loss": 0.75570602, "memory(GiB)": 146.85, "step": 42080, "train_speed(iter/s)": 0.20198 }, { "acc": 0.76961064, "epoch": 0.981991571763951, "grad_norm": 6.875, "learning_rate": 5.388199194944539e-06, "loss": 0.81490707, "memory(GiB)": 146.85, "step": 42090, "train_speed(iter/s)": 0.202008 }, { "acc": 0.77684274, "epoch": 0.9822248793362399, "grad_norm": 5.40625, "learning_rate": 5.3863157579804075e-06, "loss": 0.80352173, "memory(GiB)": 146.85, "step": 42100, "train_speed(iter/s)": 0.202032 }, { "acc": 0.75882893, "epoch": 0.9824581869085288, "grad_norm": 6.5625, "learning_rate": 5.384432265869815e-06, "loss": 0.88333492, "memory(GiB)": 146.85, "step": 42110, "train_speed(iter/s)": 0.202057 }, { "acc": 0.77351456, "epoch": 0.9826914944808177, "grad_norm": 5.9375, "learning_rate": 5.382548718881627e-06, "loss": 0.82765541, "memory(GiB)": 146.85, "step": 42120, "train_speed(iter/s)": 0.202084 }, { "acc": 0.77885485, "epoch": 0.9829248020531066, "grad_norm": 6.0, "learning_rate": 5.380665117284721e-06, "loss": 0.80152836, "memory(GiB)": 146.85, "step": 42130, "train_speed(iter/s)": 0.202109 }, { "acc": 0.76749001, "epoch": 0.9831581096253955, "grad_norm": 5.3125, "learning_rate": 5.378781461347979e-06, "loss": 0.84406185, "memory(GiB)": 146.85, "step": 42140, "train_speed(iter/s)": 0.202134 }, { "acc": 0.76671195, "epoch": 0.9833914171976844, "grad_norm": 4.4375, "learning_rate": 5.376897751340294e-06, "loss": 0.87486477, "memory(GiB)": 146.85, "step": 42150, "train_speed(iter/s)": 0.202161 }, { "acc": 0.77769756, "epoch": 0.9836247247699733, "grad_norm": 6.28125, "learning_rate": 5.375013987530565e-06, "loss": 0.79158592, "memory(GiB)": 146.85, "step": 42160, "train_speed(iter/s)": 0.202188 }, { "acc": 0.75720644, "epoch": 0.9838580323422622, "grad_norm": 6.5625, "learning_rate": 5.3731301701876985e-06, "loss": 0.87791109, "memory(GiB)": 146.85, "step": 42170, "train_speed(iter/s)": 0.202214 }, { "acc": 0.77101374, "epoch": 0.9840913399145511, "grad_norm": 5.53125, "learning_rate": 5.371246299580608e-06, "loss": 0.82796421, "memory(GiB)": 146.85, "step": 42180, "train_speed(iter/s)": 0.202241 }, { "acc": 0.7759099, "epoch": 0.98432464748684, "grad_norm": 4.5625, "learning_rate": 5.3693623759782165e-06, "loss": 0.81007042, "memory(GiB)": 146.85, "step": 42190, "train_speed(iter/s)": 0.202267 }, { "acc": 0.75756006, "epoch": 0.9845579550591289, "grad_norm": 5.28125, "learning_rate": 5.367478399649453e-06, "loss": 0.88520021, "memory(GiB)": 146.85, "step": 42200, "train_speed(iter/s)": 0.202293 }, { "acc": 0.75170927, "epoch": 0.9847912626314178, "grad_norm": 9.0, "learning_rate": 5.365594370863254e-06, "loss": 0.89316978, "memory(GiB)": 146.85, "step": 42210, "train_speed(iter/s)": 0.202318 }, { "acc": 0.76776953, "epoch": 0.9850245702037067, "grad_norm": 7.375, "learning_rate": 5.363710289888564e-06, "loss": 0.84392643, "memory(GiB)": 146.85, "step": 42220, "train_speed(iter/s)": 0.202345 }, { "acc": 0.76380329, "epoch": 0.9852578777759956, "grad_norm": 5.53125, "learning_rate": 5.361826156994338e-06, "loss": 0.86166515, "memory(GiB)": 146.85, "step": 42230, "train_speed(iter/s)": 0.202371 }, { "acc": 0.77980299, "epoch": 0.9854911853482845, "grad_norm": 7.125, "learning_rate": 5.359941972449532e-06, "loss": 0.80089149, "memory(GiB)": 146.85, "step": 42240, "train_speed(iter/s)": 0.202395 }, { "acc": 0.77414103, "epoch": 0.9857244929205734, "grad_norm": 17.375, "learning_rate": 5.358057736523114e-06, "loss": 0.82716923, "memory(GiB)": 146.85, "step": 42250, "train_speed(iter/s)": 0.202421 }, { "acc": 0.76927471, "epoch": 0.9859578004928623, "grad_norm": 4.75, "learning_rate": 5.356173449484059e-06, "loss": 0.8477499, "memory(GiB)": 146.85, "step": 42260, "train_speed(iter/s)": 0.202445 }, { "acc": 0.77313156, "epoch": 0.9861911080651511, "grad_norm": 6.90625, "learning_rate": 5.3542891116013465e-06, "loss": 0.81512127, "memory(GiB)": 146.85, "step": 42270, "train_speed(iter/s)": 0.20247 }, { "acc": 0.77674561, "epoch": 0.98642441563744, "grad_norm": 6.96875, "learning_rate": 5.352404723143968e-06, "loss": 0.80933437, "memory(GiB)": 146.85, "step": 42280, "train_speed(iter/s)": 0.202494 }, { "acc": 0.78157797, "epoch": 0.9866577232097289, "grad_norm": 4.5, "learning_rate": 5.350520284380916e-06, "loss": 0.7971859, "memory(GiB)": 146.85, "step": 42290, "train_speed(iter/s)": 0.20252 }, { "acc": 0.76100512, "epoch": 0.9868910307820178, "grad_norm": 19.75, "learning_rate": 5.3486357955811945e-06, "loss": 0.89437256, "memory(GiB)": 146.85, "step": 42300, "train_speed(iter/s)": 0.202544 }, { "acc": 0.75436926, "epoch": 0.9871243383543067, "grad_norm": 6.21875, "learning_rate": 5.346751257013815e-06, "loss": 0.88417444, "memory(GiB)": 146.85, "step": 42310, "train_speed(iter/s)": 0.202569 }, { "acc": 0.76330199, "epoch": 0.9873576459265956, "grad_norm": 7.8125, "learning_rate": 5.344866668947794e-06, "loss": 0.8365387, "memory(GiB)": 146.85, "step": 42320, "train_speed(iter/s)": 0.202595 }, { "acc": 0.76811113, "epoch": 0.9875909534988845, "grad_norm": 5.8125, "learning_rate": 5.342982031652159e-06, "loss": 0.82834139, "memory(GiB)": 146.85, "step": 42330, "train_speed(iter/s)": 0.202621 }, { "acc": 0.76846867, "epoch": 0.9878242610711734, "grad_norm": 7.5, "learning_rate": 5.341097345395937e-06, "loss": 0.8321455, "memory(GiB)": 146.85, "step": 42340, "train_speed(iter/s)": 0.202645 }, { "acc": 0.76858196, "epoch": 0.9880575686434623, "grad_norm": 8.0625, "learning_rate": 5.339212610448167e-06, "loss": 0.82284317, "memory(GiB)": 146.85, "step": 42350, "train_speed(iter/s)": 0.202669 }, { "acc": 0.75678115, "epoch": 0.9882908762157512, "grad_norm": 6.15625, "learning_rate": 5.3373278270778965e-06, "loss": 0.89752789, "memory(GiB)": 146.85, "step": 42360, "train_speed(iter/s)": 0.202693 }, { "acc": 0.76957498, "epoch": 0.9885241837880401, "grad_norm": 6.53125, "learning_rate": 5.3354429955541755e-06, "loss": 0.84740868, "memory(GiB)": 146.85, "step": 42370, "train_speed(iter/s)": 0.20272 }, { "acc": 0.75167208, "epoch": 0.988757491360329, "grad_norm": 7.8125, "learning_rate": 5.333558116146063e-06, "loss": 0.90653305, "memory(GiB)": 146.85, "step": 42380, "train_speed(iter/s)": 0.202746 }, { "acc": 0.78133278, "epoch": 0.9889907989326179, "grad_norm": 5.09375, "learning_rate": 5.33167318912263e-06, "loss": 0.78815203, "memory(GiB)": 146.85, "step": 42390, "train_speed(iter/s)": 0.202769 }, { "acc": 0.7861352, "epoch": 0.9892241065049068, "grad_norm": 11.3125, "learning_rate": 5.329788214752944e-06, "loss": 0.76699166, "memory(GiB)": 146.85, "step": 42400, "train_speed(iter/s)": 0.202795 }, { "acc": 0.77182093, "epoch": 0.9894574140771957, "grad_norm": 5.6875, "learning_rate": 5.327903193306087e-06, "loss": 0.81069126, "memory(GiB)": 146.85, "step": 42410, "train_speed(iter/s)": 0.20282 }, { "acc": 0.74673018, "epoch": 0.9896907216494846, "grad_norm": 4.84375, "learning_rate": 5.326018125051142e-06, "loss": 0.90465021, "memory(GiB)": 146.85, "step": 42420, "train_speed(iter/s)": 0.202844 }, { "acc": 0.76963024, "epoch": 0.9899240292217735, "grad_norm": 5.84375, "learning_rate": 5.324133010257206e-06, "loss": 0.81718426, "memory(GiB)": 146.85, "step": 42430, "train_speed(iter/s)": 0.202868 }, { "acc": 0.78747649, "epoch": 0.9901573367940624, "grad_norm": 6.03125, "learning_rate": 5.3222478491933775e-06, "loss": 0.76919718, "memory(GiB)": 146.85, "step": 42440, "train_speed(iter/s)": 0.202892 }, { "acc": 0.7655582, "epoch": 0.9903906443663513, "grad_norm": 7.90625, "learning_rate": 5.320362642128761e-06, "loss": 0.85436954, "memory(GiB)": 146.85, "step": 42450, "train_speed(iter/s)": 0.202919 }, { "acc": 0.78522739, "epoch": 0.9906239519386401, "grad_norm": 6.5625, "learning_rate": 5.318477389332471e-06, "loss": 0.76466851, "memory(GiB)": 146.85, "step": 42460, "train_speed(iter/s)": 0.202945 }, { "acc": 0.77136359, "epoch": 0.990857259510929, "grad_norm": 6.34375, "learning_rate": 5.316592091073626e-06, "loss": 0.82749691, "memory(GiB)": 146.85, "step": 42470, "train_speed(iter/s)": 0.202968 }, { "acc": 0.77691717, "epoch": 0.9910905670832179, "grad_norm": 5.8125, "learning_rate": 5.314706747621352e-06, "loss": 0.79079647, "memory(GiB)": 146.85, "step": 42480, "train_speed(iter/s)": 0.202991 }, { "acc": 0.76659031, "epoch": 0.9913238746555068, "grad_norm": 6.21875, "learning_rate": 5.312821359244781e-06, "loss": 0.85491762, "memory(GiB)": 146.85, "step": 42490, "train_speed(iter/s)": 0.203018 }, { "acc": 0.77897854, "epoch": 0.9915571822277957, "grad_norm": 4.90625, "learning_rate": 5.310935926213052e-06, "loss": 0.79307537, "memory(GiB)": 146.85, "step": 42500, "train_speed(iter/s)": 0.203043 }, { "epoch": 0.9915571822277957, "eval_acc": 0.7350856246712638, "eval_loss": 0.8342196345329285, "eval_runtime": 1264.9615, "eval_samples_per_second": 28.452, "eval_steps_per_second": 14.227, "step": 42500 }, { "acc": 0.74740429, "epoch": 0.9917904898000846, "grad_norm": 5.84375, "learning_rate": 5.309050448795311e-06, "loss": 0.92279253, "memory(GiB)": 146.85, "step": 42510, "train_speed(iter/s)": 0.201825 }, { "acc": 0.76644993, "epoch": 0.9920237973723735, "grad_norm": 5.625, "learning_rate": 5.307164927260706e-06, "loss": 0.82636089, "memory(GiB)": 146.85, "step": 42520, "train_speed(iter/s)": 0.201849 }, { "acc": 0.77798042, "epoch": 0.9922571049446623, "grad_norm": 6.3125, "learning_rate": 5.305279361878398e-06, "loss": 0.80739994, "memory(GiB)": 146.85, "step": 42530, "train_speed(iter/s)": 0.201875 }, { "acc": 0.7744658, "epoch": 0.9924904125169512, "grad_norm": 4.8125, "learning_rate": 5.30339375291755e-06, "loss": 0.81212473, "memory(GiB)": 146.85, "step": 42540, "train_speed(iter/s)": 0.201901 }, { "acc": 0.75312023, "epoch": 0.9927237200892401, "grad_norm": 7.09375, "learning_rate": 5.3015081006473315e-06, "loss": 0.87775688, "memory(GiB)": 146.85, "step": 42550, "train_speed(iter/s)": 0.201924 }, { "acc": 0.76577749, "epoch": 0.992957027661529, "grad_norm": 4.5, "learning_rate": 5.299622405336919e-06, "loss": 0.84794836, "memory(GiB)": 146.85, "step": 42560, "train_speed(iter/s)": 0.201948 }, { "acc": 0.76237688, "epoch": 0.993190335233818, "grad_norm": 7.34375, "learning_rate": 5.297736667255497e-06, "loss": 0.86210899, "memory(GiB)": 146.85, "step": 42570, "train_speed(iter/s)": 0.201973 }, { "acc": 0.79137697, "epoch": 0.9934236428061068, "grad_norm": 6.625, "learning_rate": 5.2958508866722506e-06, "loss": 0.73400631, "memory(GiB)": 146.85, "step": 42580, "train_speed(iter/s)": 0.201996 }, { "acc": 0.76074829, "epoch": 0.9936569503783957, "grad_norm": 5.09375, "learning_rate": 5.293965063856375e-06, "loss": 0.87553158, "memory(GiB)": 146.85, "step": 42590, "train_speed(iter/s)": 0.202019 }, { "acc": 0.75579443, "epoch": 0.9938902579506846, "grad_norm": 5.3125, "learning_rate": 5.292079199077073e-06, "loss": 0.88669062, "memory(GiB)": 146.85, "step": 42600, "train_speed(iter/s)": 0.202044 }, { "acc": 0.76391835, "epoch": 0.9941235655229735, "grad_norm": 6.1875, "learning_rate": 5.290193292603551e-06, "loss": 0.84605522, "memory(GiB)": 146.85, "step": 42610, "train_speed(iter/s)": 0.202068 }, { "acc": 0.77237349, "epoch": 0.9943568730952624, "grad_norm": 6.9375, "learning_rate": 5.2883073447050205e-06, "loss": 0.85261784, "memory(GiB)": 146.85, "step": 42620, "train_speed(iter/s)": 0.202093 }, { "acc": 0.77593832, "epoch": 0.9945901806675513, "grad_norm": 6.8125, "learning_rate": 5.2864213556507e-06, "loss": 0.81686268, "memory(GiB)": 146.85, "step": 42630, "train_speed(iter/s)": 0.202118 }, { "acc": 0.76713772, "epoch": 0.9948234882398402, "grad_norm": 8.0, "learning_rate": 5.2845353257098146e-06, "loss": 0.83380098, "memory(GiB)": 146.85, "step": 42640, "train_speed(iter/s)": 0.202143 }, { "acc": 0.75946646, "epoch": 0.995056795812129, "grad_norm": 10.4375, "learning_rate": 5.282649255151593e-06, "loss": 0.87980042, "memory(GiB)": 146.85, "step": 42650, "train_speed(iter/s)": 0.202167 }, { "acc": 0.76475697, "epoch": 0.9952901033844179, "grad_norm": 5.9375, "learning_rate": 5.280763144245272e-06, "loss": 0.87103176, "memory(GiB)": 146.85, "step": 42660, "train_speed(iter/s)": 0.202189 }, { "acc": 0.77876897, "epoch": 0.9955234109567068, "grad_norm": 6.09375, "learning_rate": 5.2788769932600944e-06, "loss": 0.78195391, "memory(GiB)": 146.85, "step": 42670, "train_speed(iter/s)": 0.202213 }, { "acc": 0.76091385, "epoch": 0.9957567185289957, "grad_norm": 8.25, "learning_rate": 5.276990802465309e-06, "loss": 0.87133751, "memory(GiB)": 146.85, "step": 42680, "train_speed(iter/s)": 0.202237 }, { "acc": 0.76979465, "epoch": 0.9959900261012846, "grad_norm": 7.1875, "learning_rate": 5.275104572130167e-06, "loss": 0.81508818, "memory(GiB)": 146.85, "step": 42690, "train_speed(iter/s)": 0.202263 }, { "acc": 0.76570783, "epoch": 0.9962233336735735, "grad_norm": 5.125, "learning_rate": 5.273218302523925e-06, "loss": 0.83456144, "memory(GiB)": 146.85, "step": 42700, "train_speed(iter/s)": 0.202287 }, { "acc": 0.75970249, "epoch": 0.9964566412458624, "grad_norm": 6.1875, "learning_rate": 5.2713319939158494e-06, "loss": 0.85853777, "memory(GiB)": 146.85, "step": 42710, "train_speed(iter/s)": 0.202312 }, { "acc": 0.76425867, "epoch": 0.9966899488181513, "grad_norm": 4.78125, "learning_rate": 5.2694456465752104e-06, "loss": 0.84755325, "memory(GiB)": 146.85, "step": 42720, "train_speed(iter/s)": 0.202335 }, { "acc": 0.7661067, "epoch": 0.9969232563904402, "grad_norm": 5.84375, "learning_rate": 5.267559260771285e-06, "loss": 0.8266264, "memory(GiB)": 146.85, "step": 42730, "train_speed(iter/s)": 0.202361 }, { "acc": 0.77440033, "epoch": 0.9971565639627291, "grad_norm": 4.71875, "learning_rate": 5.265672836773353e-06, "loss": 0.82872925, "memory(GiB)": 146.85, "step": 42740, "train_speed(iter/s)": 0.202385 }, { "acc": 0.75733433, "epoch": 0.997389871535018, "grad_norm": 8.0, "learning_rate": 5.2637863748507e-06, "loss": 0.89635563, "memory(GiB)": 146.85, "step": 42750, "train_speed(iter/s)": 0.202408 }, { "acc": 0.77492108, "epoch": 0.9976231791073069, "grad_norm": 5.5625, "learning_rate": 5.261899875272619e-06, "loss": 0.82115765, "memory(GiB)": 146.85, "step": 42760, "train_speed(iter/s)": 0.202431 }, { "acc": 0.74216313, "epoch": 0.9978564866795958, "grad_norm": 7.25, "learning_rate": 5.260013338308408e-06, "loss": 0.94691372, "memory(GiB)": 146.85, "step": 42770, "train_speed(iter/s)": 0.202456 }, { "acc": 0.77298188, "epoch": 0.9980897942518847, "grad_norm": 6.21875, "learning_rate": 5.258126764227366e-06, "loss": 0.80149088, "memory(GiB)": 146.85, "step": 42780, "train_speed(iter/s)": 0.20248 }, { "acc": 0.77139697, "epoch": 0.9983231018241736, "grad_norm": 5.40625, "learning_rate": 5.256240153298804e-06, "loss": 0.83971195, "memory(GiB)": 146.85, "step": 42790, "train_speed(iter/s)": 0.202503 }, { "acc": 0.77042894, "epoch": 0.9985564093964625, "grad_norm": 8.625, "learning_rate": 5.254353505792036e-06, "loss": 0.84297667, "memory(GiB)": 146.85, "step": 42800, "train_speed(iter/s)": 0.202528 }, { "acc": 0.77785368, "epoch": 0.9987897169687514, "grad_norm": 4.5, "learning_rate": 5.252466821976377e-06, "loss": 0.78628273, "memory(GiB)": 146.85, "step": 42810, "train_speed(iter/s)": 0.202553 }, { "acc": 0.76967249, "epoch": 0.9990230245410403, "grad_norm": 5.09375, "learning_rate": 5.250580102121153e-06, "loss": 0.85903788, "memory(GiB)": 146.85, "step": 42820, "train_speed(iter/s)": 0.202577 }, { "acc": 0.77603655, "epoch": 0.9992563321133292, "grad_norm": 4.84375, "learning_rate": 5.248693346495694e-06, "loss": 0.79483457, "memory(GiB)": 146.85, "step": 42830, "train_speed(iter/s)": 0.202601 }, { "acc": 0.7878788, "epoch": 0.999489639685618, "grad_norm": 5.59375, "learning_rate": 5.2468065553693306e-06, "loss": 0.77299385, "memory(GiB)": 146.85, "step": 42840, "train_speed(iter/s)": 0.202624 }, { "acc": 0.77078924, "epoch": 0.9997229472579069, "grad_norm": 4.53125, "learning_rate": 5.244919729011403e-06, "loss": 0.84942894, "memory(GiB)": 146.85, "step": 42850, "train_speed(iter/s)": 0.202648 }, { "acc": 0.7601594, "epoch": 0.9999562548301958, "grad_norm": 6.4375, "learning_rate": 5.243032867691257e-06, "loss": 0.89184542, "memory(GiB)": 146.85, "step": 42860, "train_speed(iter/s)": 0.202669 }, { "acc": 0.77479343, "epoch": 1.0001895624024848, "grad_norm": 5.90625, "learning_rate": 5.241145971678238e-06, "loss": 0.84690819, "memory(GiB)": 146.85, "step": 42870, "train_speed(iter/s)": 0.202691 }, { "acc": 0.77034283, "epoch": 1.0004228699747737, "grad_norm": 5.5625, "learning_rate": 5.239259041241701e-06, "loss": 0.83939457, "memory(GiB)": 146.85, "step": 42880, "train_speed(iter/s)": 0.202716 }, { "acc": 0.77744513, "epoch": 1.0006561775470626, "grad_norm": 5.0, "learning_rate": 5.237372076651006e-06, "loss": 0.77625914, "memory(GiB)": 146.85, "step": 42890, "train_speed(iter/s)": 0.20274 }, { "acc": 0.775562, "epoch": 1.0008894851193515, "grad_norm": 5.84375, "learning_rate": 5.2354850781755175e-06, "loss": 0.80578156, "memory(GiB)": 146.85, "step": 42900, "train_speed(iter/s)": 0.202763 }, { "acc": 0.79066343, "epoch": 1.0011227926916404, "grad_norm": 4.1875, "learning_rate": 5.233598046084602e-06, "loss": 0.7595161, "memory(GiB)": 146.85, "step": 42910, "train_speed(iter/s)": 0.202789 }, { "acc": 0.76628675, "epoch": 1.0013561002639293, "grad_norm": 4.84375, "learning_rate": 5.231710980647632e-06, "loss": 0.85740776, "memory(GiB)": 146.85, "step": 42920, "train_speed(iter/s)": 0.202814 }, { "acc": 0.7830986, "epoch": 1.0015894078362182, "grad_norm": 4.9375, "learning_rate": 5.229823882133987e-06, "loss": 0.78767452, "memory(GiB)": 146.85, "step": 42930, "train_speed(iter/s)": 0.202838 }, { "acc": 0.75460744, "epoch": 1.0018227154085069, "grad_norm": 8.625, "learning_rate": 5.22793675081305e-06, "loss": 0.89739933, "memory(GiB)": 146.85, "step": 42940, "train_speed(iter/s)": 0.202865 }, { "acc": 0.77448912, "epoch": 1.0020560229807958, "grad_norm": 8.0625, "learning_rate": 5.226049586954207e-06, "loss": 0.80343647, "memory(GiB)": 146.85, "step": 42950, "train_speed(iter/s)": 0.202889 }, { "acc": 0.75036073, "epoch": 1.0022893305530847, "grad_norm": 6.84375, "learning_rate": 5.2241623908268524e-06, "loss": 0.91737595, "memory(GiB)": 146.85, "step": 42960, "train_speed(iter/s)": 0.202915 }, { "acc": 0.76906662, "epoch": 1.0025226381253736, "grad_norm": 4.78125, "learning_rate": 5.222275162700382e-06, "loss": 0.81884937, "memory(GiB)": 146.85, "step": 42970, "train_speed(iter/s)": 0.202938 }, { "acc": 0.77532082, "epoch": 1.0027559456976625, "grad_norm": 5.59375, "learning_rate": 5.2203879028441975e-06, "loss": 0.81495304, "memory(GiB)": 146.85, "step": 42980, "train_speed(iter/s)": 0.202963 }, { "acc": 0.76245427, "epoch": 1.0029892532699514, "grad_norm": 5.71875, "learning_rate": 5.218500611527701e-06, "loss": 0.85907278, "memory(GiB)": 146.85, "step": 42990, "train_speed(iter/s)": 0.202987 }, { "acc": 0.75585804, "epoch": 1.0032225608422403, "grad_norm": 5.5, "learning_rate": 5.216613289020307e-06, "loss": 0.90456028, "memory(GiB)": 146.85, "step": 43000, "train_speed(iter/s)": 0.203012 }, { "epoch": 1.0032225608422403, "eval_acc": 0.7351078899899645, "eval_loss": 0.8342560529708862, "eval_runtime": 1262.8862, "eval_samples_per_second": 28.499, "eval_steps_per_second": 14.25, "step": 43000 }, { "acc": 0.763274, "epoch": 1.0034558684145292, "grad_norm": 4.875, "learning_rate": 5.214725935591429e-06, "loss": 0.86529617, "memory(GiB)": 146.85, "step": 43010, "train_speed(iter/s)": 0.201809 }, { "acc": 0.77442613, "epoch": 1.003689175986818, "grad_norm": 7.25, "learning_rate": 5.2128385515104865e-06, "loss": 0.79229941, "memory(GiB)": 146.85, "step": 43020, "train_speed(iter/s)": 0.201832 }, { "acc": 0.77314572, "epoch": 1.003922483559107, "grad_norm": 5.96875, "learning_rate": 5.210951137046903e-06, "loss": 0.8158679, "memory(GiB)": 146.85, "step": 43030, "train_speed(iter/s)": 0.201856 }, { "acc": 0.7690835, "epoch": 1.0041557911313959, "grad_norm": 6.28125, "learning_rate": 5.209063692470104e-06, "loss": 0.83936758, "memory(GiB)": 146.85, "step": 43040, "train_speed(iter/s)": 0.20188 }, { "acc": 0.73614769, "epoch": 1.0043890987036848, "grad_norm": 6.21875, "learning_rate": 5.207176218049526e-06, "loss": 0.95314646, "memory(GiB)": 146.85, "step": 43050, "train_speed(iter/s)": 0.201905 }, { "acc": 0.76142244, "epoch": 1.0046224062759737, "grad_norm": 7.0625, "learning_rate": 5.205288714054602e-06, "loss": 0.85775928, "memory(GiB)": 146.85, "step": 43060, "train_speed(iter/s)": 0.201929 }, { "acc": 0.76880307, "epoch": 1.0048557138482626, "grad_norm": 5.96875, "learning_rate": 5.203401180754772e-06, "loss": 0.80616302, "memory(GiB)": 146.85, "step": 43070, "train_speed(iter/s)": 0.201953 }, { "acc": 0.76376657, "epoch": 1.0050890214205515, "grad_norm": 5.28125, "learning_rate": 5.201513618419486e-06, "loss": 0.82846823, "memory(GiB)": 146.85, "step": 43080, "train_speed(iter/s)": 0.201977 }, { "acc": 0.7835917, "epoch": 1.0053223289928404, "grad_norm": 7.0625, "learning_rate": 5.199626027318188e-06, "loss": 0.77191753, "memory(GiB)": 146.85, "step": 43090, "train_speed(iter/s)": 0.202002 }, { "acc": 0.77477331, "epoch": 1.0055556365651293, "grad_norm": 6.0, "learning_rate": 5.197738407720331e-06, "loss": 0.79531918, "memory(GiB)": 146.85, "step": 43100, "train_speed(iter/s)": 0.202027 }, { "acc": 0.79113998, "epoch": 1.0057889441374182, "grad_norm": 6.0625, "learning_rate": 5.195850759895374e-06, "loss": 0.77267733, "memory(GiB)": 146.85, "step": 43110, "train_speed(iter/s)": 0.20205 }, { "acc": 0.78115225, "epoch": 1.006022251709707, "grad_norm": 6.28125, "learning_rate": 5.193963084112781e-06, "loss": 0.78105011, "memory(GiB)": 146.85, "step": 43120, "train_speed(iter/s)": 0.202075 }, { "acc": 0.78650894, "epoch": 1.006255559281996, "grad_norm": 6.59375, "learning_rate": 5.192075380642011e-06, "loss": 0.7592967, "memory(GiB)": 146.85, "step": 43130, "train_speed(iter/s)": 0.202098 }, { "acc": 0.76814203, "epoch": 1.0064888668542848, "grad_norm": 6.78125, "learning_rate": 5.190187649752538e-06, "loss": 0.83026867, "memory(GiB)": 146.85, "step": 43140, "train_speed(iter/s)": 0.202124 }, { "acc": 0.78148327, "epoch": 1.0067221744265737, "grad_norm": 7.21875, "learning_rate": 5.1882998917138324e-06, "loss": 0.76571088, "memory(GiB)": 146.85, "step": 43150, "train_speed(iter/s)": 0.202148 }, { "acc": 0.77027521, "epoch": 1.0069554819988626, "grad_norm": 5.1875, "learning_rate": 5.186412106795371e-06, "loss": 0.82578678, "memory(GiB)": 146.85, "step": 43160, "train_speed(iter/s)": 0.202172 }, { "acc": 0.77664433, "epoch": 1.0071887895711515, "grad_norm": 5.40625, "learning_rate": 5.1845242952666365e-06, "loss": 0.80411177, "memory(GiB)": 146.85, "step": 43170, "train_speed(iter/s)": 0.202196 }, { "acc": 0.77595587, "epoch": 1.0074220971434404, "grad_norm": 4.6875, "learning_rate": 5.1826364573971125e-06, "loss": 0.80143204, "memory(GiB)": 146.85, "step": 43180, "train_speed(iter/s)": 0.202218 }, { "acc": 0.77765265, "epoch": 1.0076554047157293, "grad_norm": 5.0625, "learning_rate": 5.180748593456289e-06, "loss": 0.81327124, "memory(GiB)": 146.85, "step": 43190, "train_speed(iter/s)": 0.202241 }, { "acc": 0.76768694, "epoch": 1.0078887122880182, "grad_norm": 4.46875, "learning_rate": 5.178860703713654e-06, "loss": 0.85175667, "memory(GiB)": 146.85, "step": 43200, "train_speed(iter/s)": 0.202266 }, { "acc": 0.77664089, "epoch": 1.0081220198603071, "grad_norm": 6.25, "learning_rate": 5.176972788438705e-06, "loss": 0.8098712, "memory(GiB)": 146.85, "step": 43210, "train_speed(iter/s)": 0.20229 }, { "acc": 0.76323442, "epoch": 1.008355327432596, "grad_norm": 6.125, "learning_rate": 5.175084847900943e-06, "loss": 0.84657001, "memory(GiB)": 146.85, "step": 43220, "train_speed(iter/s)": 0.202316 }, { "acc": 0.77069197, "epoch": 1.008588635004885, "grad_norm": 5.375, "learning_rate": 5.17319688236987e-06, "loss": 0.83288631, "memory(GiB)": 146.85, "step": 43230, "train_speed(iter/s)": 0.202342 }, { "acc": 0.7637044, "epoch": 1.0088219425771738, "grad_norm": 5.84375, "learning_rate": 5.171308892114991e-06, "loss": 0.88265848, "memory(GiB)": 146.85, "step": 43240, "train_speed(iter/s)": 0.202365 }, { "acc": 0.77459707, "epoch": 1.0090552501494627, "grad_norm": 11.0625, "learning_rate": 5.16942087740582e-06, "loss": 0.78823905, "memory(GiB)": 146.85, "step": 43250, "train_speed(iter/s)": 0.202386 }, { "acc": 0.74776268, "epoch": 1.0092885577217516, "grad_norm": 5.1875, "learning_rate": 5.167532838511866e-06, "loss": 0.90610294, "memory(GiB)": 146.85, "step": 43260, "train_speed(iter/s)": 0.202411 }, { "acc": 0.76043139, "epoch": 1.0095218652940405, "grad_norm": 6.125, "learning_rate": 5.16564477570265e-06, "loss": 0.86283216, "memory(GiB)": 146.85, "step": 43270, "train_speed(iter/s)": 0.202436 }, { "acc": 0.77895937, "epoch": 1.0097551728663294, "grad_norm": 6.09375, "learning_rate": 5.163756689247687e-06, "loss": 0.79463282, "memory(GiB)": 146.85, "step": 43280, "train_speed(iter/s)": 0.202459 }, { "acc": 0.76004834, "epoch": 1.0099884804386183, "grad_norm": 6.9375, "learning_rate": 5.1618685794165066e-06, "loss": 0.86768379, "memory(GiB)": 146.85, "step": 43290, "train_speed(iter/s)": 0.202484 }, { "acc": 0.75561504, "epoch": 1.0102217880109072, "grad_norm": 5.5625, "learning_rate": 5.159980446478633e-06, "loss": 0.90660591, "memory(GiB)": 146.85, "step": 43300, "train_speed(iter/s)": 0.202509 }, { "acc": 0.77633491, "epoch": 1.0104550955831961, "grad_norm": 4.71875, "learning_rate": 5.158092290703597e-06, "loss": 0.79639406, "memory(GiB)": 146.85, "step": 43310, "train_speed(iter/s)": 0.202532 }, { "acc": 0.77002201, "epoch": 1.010688403155485, "grad_norm": 6.0, "learning_rate": 5.156204112360933e-06, "loss": 0.82090425, "memory(GiB)": 146.85, "step": 43320, "train_speed(iter/s)": 0.202556 }, { "acc": 0.78131151, "epoch": 1.0109217107277737, "grad_norm": 4.53125, "learning_rate": 5.154315911720176e-06, "loss": 0.79243221, "memory(GiB)": 146.85, "step": 43330, "train_speed(iter/s)": 0.20258 }, { "acc": 0.78043408, "epoch": 1.0111550183000626, "grad_norm": 6.0625, "learning_rate": 5.152427689050869e-06, "loss": 0.8141221, "memory(GiB)": 146.85, "step": 43340, "train_speed(iter/s)": 0.202604 }, { "acc": 0.76782613, "epoch": 1.0113883258723515, "grad_norm": 6.625, "learning_rate": 5.150539444622552e-06, "loss": 0.84960737, "memory(GiB)": 146.85, "step": 43350, "train_speed(iter/s)": 0.20263 }, { "acc": 0.78140826, "epoch": 1.0116216334446404, "grad_norm": 4.96875, "learning_rate": 5.148651178704775e-06, "loss": 0.81172667, "memory(GiB)": 146.85, "step": 43360, "train_speed(iter/s)": 0.202655 }, { "acc": 0.76727514, "epoch": 1.0118549410169293, "grad_norm": 8.0625, "learning_rate": 5.146762891567084e-06, "loss": 0.85402832, "memory(GiB)": 146.85, "step": 43370, "train_speed(iter/s)": 0.20268 }, { "acc": 0.78471832, "epoch": 1.0120882485892182, "grad_norm": 5.4375, "learning_rate": 5.144874583479034e-06, "loss": 0.76695824, "memory(GiB)": 146.85, "step": 43380, "train_speed(iter/s)": 0.202705 }, { "acc": 0.77188606, "epoch": 1.012321556161507, "grad_norm": 6.8125, "learning_rate": 5.142986254710177e-06, "loss": 0.82424679, "memory(GiB)": 146.85, "step": 43390, "train_speed(iter/s)": 0.20273 }, { "acc": 0.77655811, "epoch": 1.012554863733796, "grad_norm": 7.3125, "learning_rate": 5.141097905530077e-06, "loss": 0.80112209, "memory(GiB)": 146.85, "step": 43400, "train_speed(iter/s)": 0.202753 }, { "acc": 0.77730417, "epoch": 1.012788171306085, "grad_norm": 6.0625, "learning_rate": 5.139209536208289e-06, "loss": 0.79740028, "memory(GiB)": 146.85, "step": 43410, "train_speed(iter/s)": 0.202777 }, { "acc": 0.76906929, "epoch": 1.0130214788783738, "grad_norm": 5.65625, "learning_rate": 5.1373211470143814e-06, "loss": 0.82430763, "memory(GiB)": 146.85, "step": 43420, "train_speed(iter/s)": 0.202799 }, { "acc": 0.77107887, "epoch": 1.0132547864506627, "grad_norm": 5.0, "learning_rate": 5.13543273821792e-06, "loss": 0.83154545, "memory(GiB)": 146.85, "step": 43430, "train_speed(iter/s)": 0.202822 }, { "acc": 0.76009459, "epoch": 1.0134880940229516, "grad_norm": 5.0, "learning_rate": 5.133544310088474e-06, "loss": 0.87767601, "memory(GiB)": 146.85, "step": 43440, "train_speed(iter/s)": 0.202846 }, { "acc": 0.76659665, "epoch": 1.0137214015952405, "grad_norm": 6.21875, "learning_rate": 5.131655862895617e-06, "loss": 0.84489632, "memory(GiB)": 146.85, "step": 43450, "train_speed(iter/s)": 0.202871 }, { "acc": 0.78512101, "epoch": 1.0139547091675294, "grad_norm": 5.84375, "learning_rate": 5.129767396908923e-06, "loss": 0.77260714, "memory(GiB)": 146.85, "step": 43460, "train_speed(iter/s)": 0.202895 }, { "acc": 0.77200689, "epoch": 1.0141880167398183, "grad_norm": 6.125, "learning_rate": 5.1278789123979736e-06, "loss": 0.8318841, "memory(GiB)": 146.85, "step": 43470, "train_speed(iter/s)": 0.20292 }, { "acc": 0.78591695, "epoch": 1.0144213243121072, "grad_norm": 6.71875, "learning_rate": 5.125990409632344e-06, "loss": 0.79668665, "memory(GiB)": 146.85, "step": 43480, "train_speed(iter/s)": 0.202944 }, { "acc": 0.78981314, "epoch": 1.014654631884396, "grad_norm": 5.40625, "learning_rate": 5.1241018888816205e-06, "loss": 0.76531739, "memory(GiB)": 146.85, "step": 43490, "train_speed(iter/s)": 0.202968 }, { "acc": 0.77621956, "epoch": 1.014887939456685, "grad_norm": 4.96875, "learning_rate": 5.122213350415389e-06, "loss": 0.78938389, "memory(GiB)": 146.85, "step": 43500, "train_speed(iter/s)": 0.202991 }, { "epoch": 1.014887939456685, "eval_acc": 0.7351529046560331, "eval_loss": 0.8341851830482483, "eval_runtime": 1262.5547, "eval_samples_per_second": 28.506, "eval_steps_per_second": 14.254, "step": 43500 }, { "acc": 0.77314425, "epoch": 1.0151212470289739, "grad_norm": 7.03125, "learning_rate": 5.1203247945032365e-06, "loss": 0.81065636, "memory(GiB)": 146.85, "step": 43510, "train_speed(iter/s)": 0.201803 }, { "acc": 0.77398539, "epoch": 1.0153545546012628, "grad_norm": 7.0625, "learning_rate": 5.118436221414754e-06, "loss": 0.81466293, "memory(GiB)": 146.85, "step": 43520, "train_speed(iter/s)": 0.201826 }, { "acc": 0.75737419, "epoch": 1.0155878621735517, "grad_norm": 10.0, "learning_rate": 5.116547631419536e-06, "loss": 0.88428965, "memory(GiB)": 146.85, "step": 43530, "train_speed(iter/s)": 0.201851 }, { "acc": 0.77672596, "epoch": 1.0158211697458406, "grad_norm": 5.0, "learning_rate": 5.114659024787179e-06, "loss": 0.77761879, "memory(GiB)": 146.85, "step": 43540, "train_speed(iter/s)": 0.201874 }, { "acc": 0.77496719, "epoch": 1.0160544773181295, "grad_norm": 6.1875, "learning_rate": 5.112770401787278e-06, "loss": 0.8233799, "memory(GiB)": 146.85, "step": 43550, "train_speed(iter/s)": 0.201899 }, { "acc": 0.76987143, "epoch": 1.0162877848904184, "grad_norm": 5.125, "learning_rate": 5.110881762689435e-06, "loss": 0.84732246, "memory(GiB)": 146.85, "step": 43560, "train_speed(iter/s)": 0.201924 }, { "acc": 0.77240353, "epoch": 1.0165210924627073, "grad_norm": 5.4375, "learning_rate": 5.1089931077632514e-06, "loss": 0.81364002, "memory(GiB)": 146.85, "step": 43570, "train_speed(iter/s)": 0.201948 }, { "acc": 0.75940781, "epoch": 1.0167544000349962, "grad_norm": 6.125, "learning_rate": 5.1071044372783355e-06, "loss": 0.86164455, "memory(GiB)": 146.85, "step": 43580, "train_speed(iter/s)": 0.201972 }, { "acc": 0.76941757, "epoch": 1.016987707607285, "grad_norm": 7.25, "learning_rate": 5.10521575150429e-06, "loss": 0.8206378, "memory(GiB)": 146.85, "step": 43590, "train_speed(iter/s)": 0.201997 }, { "acc": 0.77368555, "epoch": 1.017221015179574, "grad_norm": 5.5, "learning_rate": 5.103327050710729e-06, "loss": 0.8199295, "memory(GiB)": 146.85, "step": 43600, "train_speed(iter/s)": 0.202021 }, { "acc": 0.79934459, "epoch": 1.0174543227518629, "grad_norm": 5.5625, "learning_rate": 5.10143833516726e-06, "loss": 0.72160072, "memory(GiB)": 146.85, "step": 43610, "train_speed(iter/s)": 0.202045 }, { "acc": 0.7822938, "epoch": 1.0176876303241518, "grad_norm": 6.21875, "learning_rate": 5.099549605143499e-06, "loss": 0.79984331, "memory(GiB)": 146.85, "step": 43620, "train_speed(iter/s)": 0.202069 }, { "acc": 0.77438293, "epoch": 1.0179209378964407, "grad_norm": 10.0625, "learning_rate": 5.0976608609090606e-06, "loss": 0.80100155, "memory(GiB)": 146.85, "step": 43630, "train_speed(iter/s)": 0.202094 }, { "acc": 0.77763648, "epoch": 1.0181542454687296, "grad_norm": 6.46875, "learning_rate": 5.095772102733561e-06, "loss": 0.8078105, "memory(GiB)": 146.85, "step": 43640, "train_speed(iter/s)": 0.202118 }, { "acc": 0.77579784, "epoch": 1.0183875530410185, "grad_norm": 5.28125, "learning_rate": 5.093883330886623e-06, "loss": 0.80251427, "memory(GiB)": 146.85, "step": 43650, "train_speed(iter/s)": 0.202143 }, { "acc": 0.79155049, "epoch": 1.0186208606133074, "grad_norm": 7.0, "learning_rate": 5.091994545637867e-06, "loss": 0.74920616, "memory(GiB)": 146.85, "step": 43660, "train_speed(iter/s)": 0.202167 }, { "acc": 0.79484272, "epoch": 1.0188541681855963, "grad_norm": 5.75, "learning_rate": 5.090105747256916e-06, "loss": 0.73041935, "memory(GiB)": 146.85, "step": 43670, "train_speed(iter/s)": 0.202192 }, { "acc": 0.78123922, "epoch": 1.0190874757578852, "grad_norm": 4.71875, "learning_rate": 5.088216936013396e-06, "loss": 0.79872165, "memory(GiB)": 146.85, "step": 43680, "train_speed(iter/s)": 0.202214 }, { "acc": 0.76830459, "epoch": 1.019320783330174, "grad_norm": 8.0625, "learning_rate": 5.086328112176934e-06, "loss": 0.83753986, "memory(GiB)": 146.85, "step": 43690, "train_speed(iter/s)": 0.202237 }, { "acc": 0.75318651, "epoch": 1.019554090902463, "grad_norm": 6.125, "learning_rate": 5.084439276017159e-06, "loss": 0.90451717, "memory(GiB)": 146.85, "step": 43700, "train_speed(iter/s)": 0.20226 }, { "acc": 0.77358508, "epoch": 1.0197873984747516, "grad_norm": 7.65625, "learning_rate": 5.082550427803702e-06, "loss": 0.80145102, "memory(GiB)": 146.85, "step": 43710, "train_speed(iter/s)": 0.202282 }, { "acc": 0.7988863, "epoch": 1.0200207060470405, "grad_norm": 6.78125, "learning_rate": 5.080661567806195e-06, "loss": 0.71119533, "memory(GiB)": 146.85, "step": 43720, "train_speed(iter/s)": 0.202306 }, { "acc": 0.7441927, "epoch": 1.0202540136193294, "grad_norm": 6.3125, "learning_rate": 5.078772696294273e-06, "loss": 0.91385212, "memory(GiB)": 146.85, "step": 43730, "train_speed(iter/s)": 0.202329 }, { "acc": 0.78541441, "epoch": 1.0204873211916183, "grad_norm": 6.0, "learning_rate": 5.076883813537571e-06, "loss": 0.7754158, "memory(GiB)": 146.85, "step": 43740, "train_speed(iter/s)": 0.202353 }, { "acc": 0.76732278, "epoch": 1.0207206287639072, "grad_norm": 7.0, "learning_rate": 5.074994919805727e-06, "loss": 0.84893265, "memory(GiB)": 146.85, "step": 43750, "train_speed(iter/s)": 0.202378 }, { "acc": 0.75506659, "epoch": 1.0209539363361961, "grad_norm": 5.46875, "learning_rate": 5.073106015368381e-06, "loss": 0.9038312, "memory(GiB)": 146.85, "step": 43760, "train_speed(iter/s)": 0.202403 }, { "acc": 0.7831953, "epoch": 1.021187243908485, "grad_norm": 5.78125, "learning_rate": 5.071217100495172e-06, "loss": 0.77417383, "memory(GiB)": 146.85, "step": 43770, "train_speed(iter/s)": 0.202428 }, { "acc": 0.77479897, "epoch": 1.021420551480774, "grad_norm": 6.71875, "learning_rate": 5.069328175455742e-06, "loss": 0.79651661, "memory(GiB)": 146.85, "step": 43780, "train_speed(iter/s)": 0.202452 }, { "acc": 0.77992926, "epoch": 1.0216538590530628, "grad_norm": 6.96875, "learning_rate": 5.067439240519735e-06, "loss": 0.79422455, "memory(GiB)": 146.85, "step": 43790, "train_speed(iter/s)": 0.202477 }, { "acc": 0.77505245, "epoch": 1.0218871666253517, "grad_norm": 5.8125, "learning_rate": 5.065550295956796e-06, "loss": 0.81135931, "memory(GiB)": 146.85, "step": 43800, "train_speed(iter/s)": 0.2025 }, { "acc": 0.75834475, "epoch": 1.0221204741976406, "grad_norm": 5.5625, "learning_rate": 5.063661342036571e-06, "loss": 0.84107094, "memory(GiB)": 146.85, "step": 43810, "train_speed(iter/s)": 0.202523 }, { "acc": 0.76649089, "epoch": 1.0223537817699295, "grad_norm": 6.125, "learning_rate": 5.061772379028709e-06, "loss": 0.86454134, "memory(GiB)": 146.85, "step": 43820, "train_speed(iter/s)": 0.202547 }, { "acc": 0.7870203, "epoch": 1.0225870893422184, "grad_norm": 4.375, "learning_rate": 5.059883407202858e-06, "loss": 0.78075867, "memory(GiB)": 146.85, "step": 43830, "train_speed(iter/s)": 0.202572 }, { "acc": 0.76279225, "epoch": 1.0228203969145073, "grad_norm": 6.03125, "learning_rate": 5.057994426828669e-06, "loss": 0.88252697, "memory(GiB)": 146.85, "step": 43840, "train_speed(iter/s)": 0.202596 }, { "acc": 0.76994467, "epoch": 1.0230537044867962, "grad_norm": 5.96875, "learning_rate": 5.05610543817579e-06, "loss": 0.83455143, "memory(GiB)": 146.85, "step": 43850, "train_speed(iter/s)": 0.202622 }, { "acc": 0.7879096, "epoch": 1.0232870120590851, "grad_norm": 7.21875, "learning_rate": 5.054216441513876e-06, "loss": 0.74170156, "memory(GiB)": 146.85, "step": 43860, "train_speed(iter/s)": 0.202645 }, { "acc": 0.78155804, "epoch": 1.023520319631374, "grad_norm": 5.1875, "learning_rate": 5.052327437112582e-06, "loss": 0.76864309, "memory(GiB)": 146.85, "step": 43870, "train_speed(iter/s)": 0.202671 }, { "acc": 0.76172791, "epoch": 1.023753627203663, "grad_norm": 6.1875, "learning_rate": 5.050438425241562e-06, "loss": 0.86766567, "memory(GiB)": 146.85, "step": 43880, "train_speed(iter/s)": 0.202694 }, { "acc": 0.77293143, "epoch": 1.0239869347759518, "grad_norm": 6.28125, "learning_rate": 5.0485494061704695e-06, "loss": 0.82862225, "memory(GiB)": 146.85, "step": 43890, "train_speed(iter/s)": 0.20272 }, { "acc": 0.7672471, "epoch": 1.0242202423482407, "grad_norm": 6.78125, "learning_rate": 5.0466603801689655e-06, "loss": 0.84194336, "memory(GiB)": 146.85, "step": 43900, "train_speed(iter/s)": 0.202744 }, { "acc": 0.79665308, "epoch": 1.0244535499205296, "grad_norm": 4.9375, "learning_rate": 5.044771347506705e-06, "loss": 0.72636251, "memory(GiB)": 146.85, "step": 43910, "train_speed(iter/s)": 0.202766 }, { "acc": 0.7764535, "epoch": 1.0246868574928185, "grad_norm": 5.65625, "learning_rate": 5.0428823084533475e-06, "loss": 0.79091449, "memory(GiB)": 146.85, "step": 43920, "train_speed(iter/s)": 0.20279 }, { "acc": 0.76463261, "epoch": 1.0249201650651074, "grad_norm": 5.75, "learning_rate": 5.040993263278552e-06, "loss": 0.86701269, "memory(GiB)": 146.85, "step": 43930, "train_speed(iter/s)": 0.202812 }, { "acc": 0.76179886, "epoch": 1.0251534726373963, "grad_norm": 6.34375, "learning_rate": 5.0391042122519815e-06, "loss": 0.88429203, "memory(GiB)": 146.85, "step": 43940, "train_speed(iter/s)": 0.202836 }, { "acc": 0.76681828, "epoch": 1.0253867802096852, "grad_norm": 5.46875, "learning_rate": 5.037215155643296e-06, "loss": 0.8264019, "memory(GiB)": 146.85, "step": 43950, "train_speed(iter/s)": 0.20286 }, { "acc": 0.77985663, "epoch": 1.025620087781974, "grad_norm": 6.59375, "learning_rate": 5.035326093722157e-06, "loss": 0.78932076, "memory(GiB)": 146.85, "step": 43960, "train_speed(iter/s)": 0.202884 }, { "acc": 0.78355989, "epoch": 1.025853395354263, "grad_norm": 5.3125, "learning_rate": 5.033437026758228e-06, "loss": 0.78051805, "memory(GiB)": 146.85, "step": 43970, "train_speed(iter/s)": 0.202908 }, { "acc": 0.78858528, "epoch": 1.026086702926552, "grad_norm": 5.71875, "learning_rate": 5.0315479550211746e-06, "loss": 0.75477114, "memory(GiB)": 146.85, "step": 43980, "train_speed(iter/s)": 0.202933 }, { "acc": 0.77580481, "epoch": 1.0263200104988408, "grad_norm": 7.03125, "learning_rate": 5.029658878780659e-06, "loss": 0.81395245, "memory(GiB)": 146.85, "step": 43990, "train_speed(iter/s)": 0.202957 }, { "acc": 0.78009377, "epoch": 1.0265533180711297, "grad_norm": 6.40625, "learning_rate": 5.0277697983063476e-06, "loss": 0.78800297, "memory(GiB)": 146.85, "step": 44000, "train_speed(iter/s)": 0.20298 }, { "epoch": 1.0265533180711297, "eval_acc": 0.7350259278022839, "eval_loss": 0.8342590928077698, "eval_runtime": 1263.7299, "eval_samples_per_second": 28.48, "eval_steps_per_second": 14.24, "step": 44000 }, { "acc": 0.77329693, "epoch": 1.0267866256434186, "grad_norm": 8.125, "learning_rate": 5.025880713867904e-06, "loss": 0.82545176, "memory(GiB)": 146.85, "step": 44010, "train_speed(iter/s)": 0.201803 }, { "acc": 0.79429884, "epoch": 1.0270199332157075, "grad_norm": 5.9375, "learning_rate": 5.023991625734998e-06, "loss": 0.75631909, "memory(GiB)": 146.85, "step": 44020, "train_speed(iter/s)": 0.201827 }, { "acc": 0.75515709, "epoch": 1.0272532407879964, "grad_norm": 5.59375, "learning_rate": 5.022102534177293e-06, "loss": 0.86756105, "memory(GiB)": 146.85, "step": 44030, "train_speed(iter/s)": 0.20185 }, { "acc": 0.78160276, "epoch": 1.0274865483602853, "grad_norm": 5.3125, "learning_rate": 5.020213439464458e-06, "loss": 0.77976904, "memory(GiB)": 146.85, "step": 44040, "train_speed(iter/s)": 0.201871 }, { "acc": 0.77402606, "epoch": 1.0277198559325742, "grad_norm": 7.6875, "learning_rate": 5.018324341866161e-06, "loss": 0.81598177, "memory(GiB)": 146.85, "step": 44050, "train_speed(iter/s)": 0.201896 }, { "acc": 0.78077269, "epoch": 1.027953163504863, "grad_norm": 6.1875, "learning_rate": 5.01643524165207e-06, "loss": 0.76026106, "memory(GiB)": 146.85, "step": 44060, "train_speed(iter/s)": 0.201919 }, { "acc": 0.77804012, "epoch": 1.028186471077152, "grad_norm": 6.96875, "learning_rate": 5.014546139091851e-06, "loss": 0.8130168, "memory(GiB)": 146.85, "step": 44070, "train_speed(iter/s)": 0.201945 }, { "acc": 0.76915932, "epoch": 1.028419778649441, "grad_norm": 6.1875, "learning_rate": 5.012657034455176e-06, "loss": 0.85895348, "memory(GiB)": 146.85, "step": 44080, "train_speed(iter/s)": 0.201968 }, { "acc": 0.76214957, "epoch": 1.0286530862217296, "grad_norm": 5.96875, "learning_rate": 5.010767928011713e-06, "loss": 0.85185966, "memory(GiB)": 146.85, "step": 44090, "train_speed(iter/s)": 0.201992 }, { "acc": 0.76913805, "epoch": 1.0288863937940185, "grad_norm": 5.4375, "learning_rate": 5.008878820031131e-06, "loss": 0.82675171, "memory(GiB)": 146.85, "step": 44100, "train_speed(iter/s)": 0.202017 }, { "acc": 0.77211838, "epoch": 1.0291197013663074, "grad_norm": 6.40625, "learning_rate": 5.006989710783101e-06, "loss": 0.82405853, "memory(GiB)": 146.85, "step": 44110, "train_speed(iter/s)": 0.202042 }, { "acc": 0.77747879, "epoch": 1.0293530089385963, "grad_norm": 7.34375, "learning_rate": 5.005100600537292e-06, "loss": 0.79159555, "memory(GiB)": 146.85, "step": 44120, "train_speed(iter/s)": 0.202067 }, { "acc": 0.76889443, "epoch": 1.0295863165108852, "grad_norm": 5.53125, "learning_rate": 5.003211489563373e-06, "loss": 0.82548885, "memory(GiB)": 146.85, "step": 44130, "train_speed(iter/s)": 0.202092 }, { "acc": 0.77039394, "epoch": 1.029819624083174, "grad_norm": 3.84375, "learning_rate": 5.001322378131015e-06, "loss": 0.8394845, "memory(GiB)": 146.85, "step": 44140, "train_speed(iter/s)": 0.202115 }, { "acc": 0.77622881, "epoch": 1.030052931655463, "grad_norm": 5.03125, "learning_rate": 4.9994332665098885e-06, "loss": 0.81605034, "memory(GiB)": 146.85, "step": 44150, "train_speed(iter/s)": 0.202139 }, { "acc": 0.74947562, "epoch": 1.0302862392277519, "grad_norm": 5.0625, "learning_rate": 4.997544154969661e-06, "loss": 0.91669846, "memory(GiB)": 146.85, "step": 44160, "train_speed(iter/s)": 0.202162 }, { "acc": 0.77912297, "epoch": 1.0305195468000408, "grad_norm": 4.875, "learning_rate": 4.995655043780006e-06, "loss": 0.78838434, "memory(GiB)": 146.85, "step": 44170, "train_speed(iter/s)": 0.202185 }, { "acc": 0.77773871, "epoch": 1.0307528543723297, "grad_norm": 8.375, "learning_rate": 4.993765933210592e-06, "loss": 0.81201715, "memory(GiB)": 146.85, "step": 44180, "train_speed(iter/s)": 0.202208 }, { "acc": 0.77178321, "epoch": 1.0309861619446186, "grad_norm": 5.3125, "learning_rate": 4.991876823531089e-06, "loss": 0.8212389, "memory(GiB)": 146.85, "step": 44190, "train_speed(iter/s)": 0.202231 }, { "acc": 0.75890913, "epoch": 1.0312194695169075, "grad_norm": 9.5, "learning_rate": 4.989987715011168e-06, "loss": 0.87066402, "memory(GiB)": 146.85, "step": 44200, "train_speed(iter/s)": 0.202254 }, { "acc": 0.78845406, "epoch": 1.0314527770891964, "grad_norm": 4.875, "learning_rate": 4.988098607920497e-06, "loss": 0.75614586, "memory(GiB)": 146.85, "step": 44210, "train_speed(iter/s)": 0.202279 }, { "acc": 0.76269288, "epoch": 1.0316860846614853, "grad_norm": 6.28125, "learning_rate": 4.986209502528746e-06, "loss": 0.84936705, "memory(GiB)": 146.85, "step": 44220, "train_speed(iter/s)": 0.202303 }, { "acc": 0.78124018, "epoch": 1.0319193922337742, "grad_norm": 4.96875, "learning_rate": 4.984320399105585e-06, "loss": 0.78717561, "memory(GiB)": 146.85, "step": 44230, "train_speed(iter/s)": 0.202326 }, { "acc": 0.77639952, "epoch": 1.032152699806063, "grad_norm": 4.4375, "learning_rate": 4.982431297920682e-06, "loss": 0.79171686, "memory(GiB)": 146.85, "step": 44240, "train_speed(iter/s)": 0.20235 }, { "acc": 0.7711462, "epoch": 1.032386007378352, "grad_norm": 7.46875, "learning_rate": 4.980542199243709e-06, "loss": 0.833251, "memory(GiB)": 146.85, "step": 44250, "train_speed(iter/s)": 0.202375 }, { "acc": 0.78774829, "epoch": 1.0326193149506409, "grad_norm": 8.125, "learning_rate": 4.978653103344328e-06, "loss": 0.77102232, "memory(GiB)": 146.85, "step": 44260, "train_speed(iter/s)": 0.202399 }, { "acc": 0.76663551, "epoch": 1.0328526225229298, "grad_norm": 12.6875, "learning_rate": 4.976764010492211e-06, "loss": 0.85114069, "memory(GiB)": 146.85, "step": 44270, "train_speed(iter/s)": 0.202422 }, { "acc": 0.77251348, "epoch": 1.0330859300952187, "grad_norm": 4.96875, "learning_rate": 4.974874920957025e-06, "loss": 0.82101374, "memory(GiB)": 146.85, "step": 44280, "train_speed(iter/s)": 0.202445 }, { "acc": 0.78931313, "epoch": 1.0333192376675076, "grad_norm": 8.0625, "learning_rate": 4.972985835008437e-06, "loss": 0.76537514, "memory(GiB)": 146.85, "step": 44290, "train_speed(iter/s)": 0.202468 }, { "acc": 0.77930632, "epoch": 1.0335525452397964, "grad_norm": 4.78125, "learning_rate": 4.971096752916113e-06, "loss": 0.80029364, "memory(GiB)": 146.85, "step": 44300, "train_speed(iter/s)": 0.202491 }, { "acc": 0.7794229, "epoch": 1.0337858528120853, "grad_norm": 5.65625, "learning_rate": 4.969207674949719e-06, "loss": 0.79528255, "memory(GiB)": 146.85, "step": 44310, "train_speed(iter/s)": 0.202515 }, { "acc": 0.77463112, "epoch": 1.0340191603843742, "grad_norm": 6.34375, "learning_rate": 4.96731860137892e-06, "loss": 0.81293583, "memory(GiB)": 146.85, "step": 44320, "train_speed(iter/s)": 0.202539 }, { "acc": 0.77360716, "epoch": 1.0342524679566631, "grad_norm": 6.53125, "learning_rate": 4.965429532473383e-06, "loss": 0.82128067, "memory(GiB)": 146.85, "step": 44330, "train_speed(iter/s)": 0.202562 }, { "acc": 0.78999949, "epoch": 1.034485775528952, "grad_norm": 8.4375, "learning_rate": 4.963540468502768e-06, "loss": 0.73391094, "memory(GiB)": 146.85, "step": 44340, "train_speed(iter/s)": 0.202586 }, { "acc": 0.75832496, "epoch": 1.034719083101241, "grad_norm": 6.25, "learning_rate": 4.961651409736741e-06, "loss": 0.8749939, "memory(GiB)": 146.85, "step": 44350, "train_speed(iter/s)": 0.202609 }, { "acc": 0.76441422, "epoch": 1.0349523906735298, "grad_norm": 5.25, "learning_rate": 4.959762356444964e-06, "loss": 0.85119743, "memory(GiB)": 146.85, "step": 44360, "train_speed(iter/s)": 0.202633 }, { "acc": 0.75923433, "epoch": 1.0351856982458187, "grad_norm": 6.65625, "learning_rate": 4.957873308897102e-06, "loss": 0.89618969, "memory(GiB)": 146.85, "step": 44370, "train_speed(iter/s)": 0.202657 }, { "acc": 0.76288209, "epoch": 1.0354190058181076, "grad_norm": 5.28125, "learning_rate": 4.95598426736281e-06, "loss": 0.8677681, "memory(GiB)": 146.85, "step": 44380, "train_speed(iter/s)": 0.202681 }, { "acc": 0.77355604, "epoch": 1.0356523133903965, "grad_norm": 4.75, "learning_rate": 4.954095232111751e-06, "loss": 0.81760464, "memory(GiB)": 146.85, "step": 44390, "train_speed(iter/s)": 0.202704 }, { "acc": 0.77838202, "epoch": 1.0358856209626854, "grad_norm": 5.125, "learning_rate": 4.9522062034135845e-06, "loss": 0.78511724, "memory(GiB)": 146.85, "step": 44400, "train_speed(iter/s)": 0.202727 }, { "acc": 0.77499886, "epoch": 1.0361189285349743, "grad_norm": 7.625, "learning_rate": 4.9503171815379695e-06, "loss": 0.83074865, "memory(GiB)": 146.85, "step": 44410, "train_speed(iter/s)": 0.202751 }, { "acc": 0.79128094, "epoch": 1.0363522361072632, "grad_norm": 6.59375, "learning_rate": 4.948428166754561e-06, "loss": 0.73558502, "memory(GiB)": 146.85, "step": 44420, "train_speed(iter/s)": 0.202775 }, { "acc": 0.75806122, "epoch": 1.0365855436795521, "grad_norm": 9.5, "learning_rate": 4.946539159333017e-06, "loss": 0.8863884, "memory(GiB)": 146.85, "step": 44430, "train_speed(iter/s)": 0.202798 }, { "acc": 0.77096276, "epoch": 1.036818851251841, "grad_norm": 4.78125, "learning_rate": 4.944650159542993e-06, "loss": 0.821984, "memory(GiB)": 146.85, "step": 44440, "train_speed(iter/s)": 0.202821 }, { "acc": 0.78029099, "epoch": 1.03705215882413, "grad_norm": 5.0625, "learning_rate": 4.942761167654142e-06, "loss": 0.78590918, "memory(GiB)": 146.85, "step": 44450, "train_speed(iter/s)": 0.202845 }, { "acc": 0.77999668, "epoch": 1.0372854663964188, "grad_norm": 7.375, "learning_rate": 4.940872183936118e-06, "loss": 0.79216881, "memory(GiB)": 146.85, "step": 44460, "train_speed(iter/s)": 0.202869 }, { "acc": 0.76958227, "epoch": 1.0375187739687077, "grad_norm": 5.71875, "learning_rate": 4.938983208658574e-06, "loss": 0.82844009, "memory(GiB)": 146.85, "step": 44470, "train_speed(iter/s)": 0.202892 }, { "acc": 0.76502914, "epoch": 1.0377520815409964, "grad_norm": 5.78125, "learning_rate": 4.937094242091158e-06, "loss": 0.8450902, "memory(GiB)": 146.85, "step": 44480, "train_speed(iter/s)": 0.202916 }, { "acc": 0.758254, "epoch": 1.0379853891132853, "grad_norm": 6.40625, "learning_rate": 4.935205284503522e-06, "loss": 0.8721776, "memory(GiB)": 146.85, "step": 44490, "train_speed(iter/s)": 0.20294 }, { "acc": 0.78718271, "epoch": 1.0382186966855742, "grad_norm": 7.03125, "learning_rate": 4.933316336165311e-06, "loss": 0.76614799, "memory(GiB)": 146.85, "step": 44500, "train_speed(iter/s)": 0.202963 }, { "epoch": 1.0382186966855742, "eval_acc": 0.7350811070703681, "eval_loss": 0.8342785835266113, "eval_runtime": 1263.3596, "eval_samples_per_second": 28.488, "eval_steps_per_second": 14.245, "step": 44500 }, { "acc": 0.77162404, "epoch": 1.038452004257863, "grad_norm": 7.09375, "learning_rate": 4.931427397346174e-06, "loss": 0.80907192, "memory(GiB)": 146.85, "step": 44510, "train_speed(iter/s)": 0.2018 }, { "acc": 0.7528574, "epoch": 1.038685311830152, "grad_norm": 8.9375, "learning_rate": 4.929538468315756e-06, "loss": 0.89988995, "memory(GiB)": 146.85, "step": 44520, "train_speed(iter/s)": 0.201823 }, { "acc": 0.75105257, "epoch": 1.038918619402441, "grad_norm": 5.6875, "learning_rate": 4.927649549343701e-06, "loss": 0.90184383, "memory(GiB)": 146.85, "step": 44530, "train_speed(iter/s)": 0.201845 }, { "acc": 0.76821961, "epoch": 1.0391519269747298, "grad_norm": 4.34375, "learning_rate": 4.9257606406996525e-06, "loss": 0.81837959, "memory(GiB)": 146.85, "step": 44540, "train_speed(iter/s)": 0.20187 }, { "acc": 0.77501631, "epoch": 1.0393852345470187, "grad_norm": 7.15625, "learning_rate": 4.923871742653251e-06, "loss": 0.81273088, "memory(GiB)": 146.85, "step": 44550, "train_speed(iter/s)": 0.201893 }, { "acc": 0.76110039, "epoch": 1.0396185421193076, "grad_norm": 5.125, "learning_rate": 4.921982855474136e-06, "loss": 0.87299328, "memory(GiB)": 146.85, "step": 44560, "train_speed(iter/s)": 0.201917 }, { "acc": 0.77302666, "epoch": 1.0398518496915965, "grad_norm": 4.84375, "learning_rate": 4.9200939794319444e-06, "loss": 0.83813858, "memory(GiB)": 146.85, "step": 44570, "train_speed(iter/s)": 0.201939 }, { "acc": 0.78122907, "epoch": 1.0400851572638854, "grad_norm": 6.34375, "learning_rate": 4.918205114796315e-06, "loss": 0.80088253, "memory(GiB)": 146.85, "step": 44580, "train_speed(iter/s)": 0.201962 }, { "acc": 0.76488495, "epoch": 1.0403184648361743, "grad_norm": 6.90625, "learning_rate": 4.916316261836882e-06, "loss": 0.85123291, "memory(GiB)": 146.85, "step": 44590, "train_speed(iter/s)": 0.201986 }, { "acc": 0.77452803, "epoch": 1.0405517724084632, "grad_norm": 5.53125, "learning_rate": 4.91442742082328e-06, "loss": 0.80372305, "memory(GiB)": 146.85, "step": 44600, "train_speed(iter/s)": 0.202008 }, { "acc": 0.78402786, "epoch": 1.040785079980752, "grad_norm": 6.0625, "learning_rate": 4.912538592025137e-06, "loss": 0.7651319, "memory(GiB)": 146.85, "step": 44610, "train_speed(iter/s)": 0.202032 }, { "acc": 0.77343397, "epoch": 1.041018387553041, "grad_norm": 5.5, "learning_rate": 4.910649775712084e-06, "loss": 0.823454, "memory(GiB)": 146.85, "step": 44620, "train_speed(iter/s)": 0.202055 }, { "acc": 0.76696243, "epoch": 1.04125169512533, "grad_norm": 6.5625, "learning_rate": 4.908760972153751e-06, "loss": 0.85213194, "memory(GiB)": 146.85, "step": 44630, "train_speed(iter/s)": 0.202079 }, { "acc": 0.77030811, "epoch": 1.0414850026976188, "grad_norm": 5.21875, "learning_rate": 4.9068721816197615e-06, "loss": 0.85186367, "memory(GiB)": 146.85, "step": 44640, "train_speed(iter/s)": 0.202101 }, { "acc": 0.78552227, "epoch": 1.0417183102699077, "grad_norm": 7.84375, "learning_rate": 4.904983404379741e-06, "loss": 0.77506571, "memory(GiB)": 146.85, "step": 44650, "train_speed(iter/s)": 0.202125 }, { "acc": 0.7883357, "epoch": 1.0419516178421966, "grad_norm": 4.8125, "learning_rate": 4.903094640703312e-06, "loss": 0.7719048, "memory(GiB)": 146.85, "step": 44660, "train_speed(iter/s)": 0.20215 }, { "acc": 0.78168688, "epoch": 1.0421849254144855, "grad_norm": 6.59375, "learning_rate": 4.901205890860095e-06, "loss": 0.76300621, "memory(GiB)": 146.85, "step": 44670, "train_speed(iter/s)": 0.202173 }, { "acc": 0.77951674, "epoch": 1.0424182329867744, "grad_norm": 5.84375, "learning_rate": 4.899317155119708e-06, "loss": 0.79144344, "memory(GiB)": 146.85, "step": 44680, "train_speed(iter/s)": 0.202197 }, { "acc": 0.7658967, "epoch": 1.0426515405590633, "grad_norm": 5.15625, "learning_rate": 4.89742843375177e-06, "loss": 0.85568752, "memory(GiB)": 146.85, "step": 44690, "train_speed(iter/s)": 0.20222 }, { "acc": 0.76518545, "epoch": 1.0428848481313522, "grad_norm": 5.25, "learning_rate": 4.895539727025891e-06, "loss": 0.84334679, "memory(GiB)": 146.85, "step": 44700, "train_speed(iter/s)": 0.202244 }, { "acc": 0.7759861, "epoch": 1.043118155703641, "grad_norm": 4.75, "learning_rate": 4.8936510352116886e-06, "loss": 0.79182692, "memory(GiB)": 146.85, "step": 44710, "train_speed(iter/s)": 0.202268 }, { "acc": 0.75499878, "epoch": 1.04335146327593, "grad_norm": 8.4375, "learning_rate": 4.891762358578767e-06, "loss": 0.87169609, "memory(GiB)": 146.85, "step": 44720, "train_speed(iter/s)": 0.202293 }, { "acc": 0.76265516, "epoch": 1.0435847708482189, "grad_norm": 6.15625, "learning_rate": 4.889873697396738e-06, "loss": 0.8675333, "memory(GiB)": 146.85, "step": 44730, "train_speed(iter/s)": 0.202316 }, { "acc": 0.76423411, "epoch": 1.0438180784205078, "grad_norm": 5.25, "learning_rate": 4.887985051935206e-06, "loss": 0.865065, "memory(GiB)": 146.85, "step": 44740, "train_speed(iter/s)": 0.202339 }, { "acc": 0.76058111, "epoch": 1.0440513859927967, "grad_norm": 6.09375, "learning_rate": 4.8860964224637756e-06, "loss": 0.87458725, "memory(GiB)": 146.85, "step": 44750, "train_speed(iter/s)": 0.202363 }, { "acc": 0.77040291, "epoch": 1.0442846935650856, "grad_norm": 4.5625, "learning_rate": 4.884207809252049e-06, "loss": 0.82514143, "memory(GiB)": 146.85, "step": 44760, "train_speed(iter/s)": 0.202387 }, { "acc": 0.78636799, "epoch": 1.0445180011373745, "grad_norm": 4.9375, "learning_rate": 4.882319212569623e-06, "loss": 0.77382393, "memory(GiB)": 146.85, "step": 44770, "train_speed(iter/s)": 0.202409 }, { "acc": 0.76000242, "epoch": 1.0447513087096634, "grad_norm": 6.34375, "learning_rate": 4.880430632686096e-06, "loss": 0.84873676, "memory(GiB)": 146.85, "step": 44780, "train_speed(iter/s)": 0.202432 }, { "acc": 0.75962181, "epoch": 1.0449846162819523, "grad_norm": 4.46875, "learning_rate": 4.87854206987106e-06, "loss": 0.86507549, "memory(GiB)": 146.85, "step": 44790, "train_speed(iter/s)": 0.202454 }, { "acc": 0.77422829, "epoch": 1.0452179238542412, "grad_norm": 6.46875, "learning_rate": 4.876653524394109e-06, "loss": 0.82604179, "memory(GiB)": 146.85, "step": 44800, "train_speed(iter/s)": 0.202477 }, { "acc": 0.76466093, "epoch": 1.04545123142653, "grad_norm": 5.71875, "learning_rate": 4.874764996524831e-06, "loss": 0.86216316, "memory(GiB)": 146.85, "step": 44810, "train_speed(iter/s)": 0.202501 }, { "acc": 0.77390881, "epoch": 1.045684538998819, "grad_norm": 6.34375, "learning_rate": 4.872876486532814e-06, "loss": 0.80724916, "memory(GiB)": 146.85, "step": 44820, "train_speed(iter/s)": 0.202526 }, { "acc": 0.77889662, "epoch": 1.0459178465711079, "grad_norm": 4.8125, "learning_rate": 4.870987994687644e-06, "loss": 0.79095716, "memory(GiB)": 146.85, "step": 44830, "train_speed(iter/s)": 0.202548 }, { "acc": 0.76679168, "epoch": 1.0461511541433968, "grad_norm": 7.15625, "learning_rate": 4.869099521258897e-06, "loss": 0.83461933, "memory(GiB)": 146.85, "step": 44840, "train_speed(iter/s)": 0.202571 }, { "acc": 0.77347746, "epoch": 1.0463844617156857, "grad_norm": 5.71875, "learning_rate": 4.867211066516157e-06, "loss": 0.82208719, "memory(GiB)": 146.85, "step": 44850, "train_speed(iter/s)": 0.202596 }, { "acc": 0.78504734, "epoch": 1.0466177692879746, "grad_norm": 5.1875, "learning_rate": 4.865322630728998e-06, "loss": 0.76834736, "memory(GiB)": 146.85, "step": 44860, "train_speed(iter/s)": 0.20262 }, { "acc": 0.77472029, "epoch": 1.0468510768602632, "grad_norm": 5.65625, "learning_rate": 4.863434214166994e-06, "loss": 0.80153389, "memory(GiB)": 146.85, "step": 44870, "train_speed(iter/s)": 0.202645 }, { "acc": 0.78099418, "epoch": 1.0470843844325521, "grad_norm": 4.53125, "learning_rate": 4.8615458170997166e-06, "loss": 0.80185795, "memory(GiB)": 146.85, "step": 44880, "train_speed(iter/s)": 0.202668 }, { "acc": 0.78055086, "epoch": 1.047317692004841, "grad_norm": 6.53125, "learning_rate": 4.8596574397967324e-06, "loss": 0.77937989, "memory(GiB)": 146.85, "step": 44890, "train_speed(iter/s)": 0.202691 }, { "acc": 0.75424228, "epoch": 1.04755099957713, "grad_norm": 6.03125, "learning_rate": 4.857769082527609e-06, "loss": 0.87933121, "memory(GiB)": 146.85, "step": 44900, "train_speed(iter/s)": 0.202714 }, { "acc": 0.75324812, "epoch": 1.0477843071494188, "grad_norm": 5.46875, "learning_rate": 4.855880745561909e-06, "loss": 0.88726673, "memory(GiB)": 146.85, "step": 44910, "train_speed(iter/s)": 0.202737 }, { "acc": 0.75582433, "epoch": 1.0480176147217077, "grad_norm": 4.875, "learning_rate": 4.853992429169189e-06, "loss": 0.88867321, "memory(GiB)": 146.85, "step": 44920, "train_speed(iter/s)": 0.20276 }, { "acc": 0.76293945, "epoch": 1.0482509222939966, "grad_norm": 7.5, "learning_rate": 4.852104133619008e-06, "loss": 0.86322069, "memory(GiB)": 146.85, "step": 44930, "train_speed(iter/s)": 0.202783 }, { "acc": 0.76653099, "epoch": 1.0484842298662855, "grad_norm": 8.5625, "learning_rate": 4.85021585918092e-06, "loss": 0.82724934, "memory(GiB)": 146.85, "step": 44940, "train_speed(iter/s)": 0.202807 }, { "acc": 0.78976212, "epoch": 1.0487175374385744, "grad_norm": 6.46875, "learning_rate": 4.848327606124473e-06, "loss": 0.75558767, "memory(GiB)": 146.85, "step": 44950, "train_speed(iter/s)": 0.20283 }, { "acc": 0.78020658, "epoch": 1.0489508450108633, "grad_norm": 5.21875, "learning_rate": 4.846439374719217e-06, "loss": 0.8028904, "memory(GiB)": 146.85, "step": 44960, "train_speed(iter/s)": 0.202852 }, { "acc": 0.77314577, "epoch": 1.0491841525831522, "grad_norm": 5.46875, "learning_rate": 4.844551165234694e-06, "loss": 0.81522789, "memory(GiB)": 146.85, "step": 44970, "train_speed(iter/s)": 0.202875 }, { "acc": 0.77901726, "epoch": 1.0494174601554411, "grad_norm": 5.84375, "learning_rate": 4.842662977940448e-06, "loss": 0.78876677, "memory(GiB)": 146.85, "step": 44980, "train_speed(iter/s)": 0.202898 }, { "acc": 0.7648201, "epoch": 1.04965076772773, "grad_norm": 5.1875, "learning_rate": 4.8407748131060175e-06, "loss": 0.83683262, "memory(GiB)": 146.85, "step": 44990, "train_speed(iter/s)": 0.202921 }, { "acc": 0.76898303, "epoch": 1.049884075300019, "grad_norm": 4.71875, "learning_rate": 4.838886671000934e-06, "loss": 0.82650032, "memory(GiB)": 146.85, "step": 45000, "train_speed(iter/s)": 0.202941 }, { "epoch": 1.049884075300019, "eval_acc": 0.7351232175644323, "eval_loss": 0.8342551589012146, "eval_runtime": 1263.521, "eval_samples_per_second": 28.485, "eval_steps_per_second": 14.243, "step": 45000 }, { "acc": 0.76591787, "epoch": 1.0501173828723078, "grad_norm": 7.59375, "learning_rate": 4.8369985518947336e-06, "loss": 0.86757374, "memory(GiB)": 146.85, "step": 45010, "train_speed(iter/s)": 0.20179 }, { "acc": 0.77129245, "epoch": 1.0503506904445967, "grad_norm": 6.03125, "learning_rate": 4.83511045605694e-06, "loss": 0.84492817, "memory(GiB)": 146.85, "step": 45020, "train_speed(iter/s)": 0.201812 }, { "acc": 0.75980659, "epoch": 1.0505839980168856, "grad_norm": 6.5, "learning_rate": 4.8332223837570824e-06, "loss": 0.85361919, "memory(GiB)": 146.85, "step": 45030, "train_speed(iter/s)": 0.201836 }, { "acc": 0.78096895, "epoch": 1.0508173055891745, "grad_norm": 6.09375, "learning_rate": 4.831334335264682e-06, "loss": 0.79429159, "memory(GiB)": 146.85, "step": 45040, "train_speed(iter/s)": 0.20186 }, { "acc": 0.7662343, "epoch": 1.0510506131614634, "grad_norm": 6.4375, "learning_rate": 4.829446310849256e-06, "loss": 0.84193211, "memory(GiB)": 146.85, "step": 45050, "train_speed(iter/s)": 0.201883 }, { "acc": 0.76798086, "epoch": 1.0512839207337523, "grad_norm": 5.40625, "learning_rate": 4.827558310780319e-06, "loss": 0.84052353, "memory(GiB)": 146.85, "step": 45060, "train_speed(iter/s)": 0.201906 }, { "acc": 0.7802978, "epoch": 1.0515172283060412, "grad_norm": 6.125, "learning_rate": 4.825670335327383e-06, "loss": 0.7774354, "memory(GiB)": 146.85, "step": 45070, "train_speed(iter/s)": 0.201929 }, { "acc": 0.78689861, "epoch": 1.0517505358783301, "grad_norm": 11.375, "learning_rate": 4.823782384759955e-06, "loss": 0.76345701, "memory(GiB)": 146.85, "step": 45080, "train_speed(iter/s)": 0.201951 }, { "acc": 0.7699544, "epoch": 1.051983843450619, "grad_norm": 4.6875, "learning_rate": 4.821894459347542e-06, "loss": 0.82761135, "memory(GiB)": 146.85, "step": 45090, "train_speed(iter/s)": 0.201974 }, { "acc": 0.78059616, "epoch": 1.052217151022908, "grad_norm": 6.53125, "learning_rate": 4.820006559359642e-06, "loss": 0.79403286, "memory(GiB)": 146.85, "step": 45100, "train_speed(iter/s)": 0.201995 }, { "acc": 0.77263212, "epoch": 1.0524504585951968, "grad_norm": 7.5, "learning_rate": 4.818118685065754e-06, "loss": 0.82440634, "memory(GiB)": 146.85, "step": 45110, "train_speed(iter/s)": 0.202018 }, { "acc": 0.77697763, "epoch": 1.0526837661674857, "grad_norm": 12.5, "learning_rate": 4.8162308367353705e-06, "loss": 0.82175312, "memory(GiB)": 146.85, "step": 45120, "train_speed(iter/s)": 0.202041 }, { "acc": 0.76947927, "epoch": 1.0529170737397746, "grad_norm": 5.875, "learning_rate": 4.814343014637982e-06, "loss": 0.83662767, "memory(GiB)": 146.85, "step": 45130, "train_speed(iter/s)": 0.202065 }, { "acc": 0.77608733, "epoch": 1.0531503813120635, "grad_norm": 5.65625, "learning_rate": 4.812455219043074e-06, "loss": 0.80224438, "memory(GiB)": 146.85, "step": 45140, "train_speed(iter/s)": 0.202089 }, { "acc": 0.77443542, "epoch": 1.0533836888843524, "grad_norm": 6.8125, "learning_rate": 4.810567450220128e-06, "loss": 0.80430746, "memory(GiB)": 146.85, "step": 45150, "train_speed(iter/s)": 0.202113 }, { "acc": 0.78226423, "epoch": 1.0536169964566413, "grad_norm": 7.375, "learning_rate": 4.808679708438624e-06, "loss": 0.78874955, "memory(GiB)": 146.85, "step": 45160, "train_speed(iter/s)": 0.202135 }, { "acc": 0.76126108, "epoch": 1.0538503040289302, "grad_norm": 5.28125, "learning_rate": 4.806791993968039e-06, "loss": 0.864851, "memory(GiB)": 146.85, "step": 45170, "train_speed(iter/s)": 0.202157 }, { "acc": 0.77425637, "epoch": 1.054083611601219, "grad_norm": 7.375, "learning_rate": 4.804904307077838e-06, "loss": 0.79819064, "memory(GiB)": 146.85, "step": 45180, "train_speed(iter/s)": 0.202178 }, { "acc": 0.76247158, "epoch": 1.054316919173508, "grad_norm": 7.15625, "learning_rate": 4.80301664803749e-06, "loss": 0.8631403, "memory(GiB)": 146.85, "step": 45190, "train_speed(iter/s)": 0.202202 }, { "acc": 0.78667421, "epoch": 1.054550226745797, "grad_norm": 6.28125, "learning_rate": 4.80112901711646e-06, "loss": 0.78081274, "memory(GiB)": 146.85, "step": 45200, "train_speed(iter/s)": 0.202227 }, { "acc": 0.76169443, "epoch": 1.0547835343180858, "grad_norm": 8.5625, "learning_rate": 4.799241414584204e-06, "loss": 0.87445507, "memory(GiB)": 146.85, "step": 45210, "train_speed(iter/s)": 0.20225 }, { "acc": 0.78098598, "epoch": 1.0550168418903747, "grad_norm": 4.9375, "learning_rate": 4.797353840710178e-06, "loss": 0.78209891, "memory(GiB)": 146.85, "step": 45220, "train_speed(iter/s)": 0.202274 }, { "acc": 0.76852036, "epoch": 1.0552501494626636, "grad_norm": 5.5, "learning_rate": 4.795466295763832e-06, "loss": 0.83971214, "memory(GiB)": 146.85, "step": 45230, "train_speed(iter/s)": 0.202297 }, { "acc": 0.75752134, "epoch": 1.0554834570349523, "grad_norm": 5.34375, "learning_rate": 4.793578780014612e-06, "loss": 0.88083477, "memory(GiB)": 146.85, "step": 45240, "train_speed(iter/s)": 0.20232 }, { "acc": 0.75792656, "epoch": 1.0557167646072414, "grad_norm": 5.875, "learning_rate": 4.791691293731962e-06, "loss": 0.86407251, "memory(GiB)": 146.85, "step": 45250, "train_speed(iter/s)": 0.202343 }, { "acc": 0.78439798, "epoch": 1.05595007217953, "grad_norm": 5.78125, "learning_rate": 4.78980383718532e-06, "loss": 0.76215339, "memory(GiB)": 146.85, "step": 45260, "train_speed(iter/s)": 0.202367 }, { "acc": 0.76960773, "epoch": 1.056183379751819, "grad_norm": 5.25, "learning_rate": 4.787916410644119e-06, "loss": 0.80575542, "memory(GiB)": 146.85, "step": 45270, "train_speed(iter/s)": 0.202389 }, { "acc": 0.77248497, "epoch": 1.0564166873241079, "grad_norm": 5.5, "learning_rate": 4.786029014377789e-06, "loss": 0.81225901, "memory(GiB)": 146.85, "step": 45280, "train_speed(iter/s)": 0.202411 }, { "acc": 0.75535679, "epoch": 1.0566499948963968, "grad_norm": 5.75, "learning_rate": 4.784141648655756e-06, "loss": 0.90506268, "memory(GiB)": 146.85, "step": 45290, "train_speed(iter/s)": 0.202436 }, { "acc": 0.78133883, "epoch": 1.0568833024686857, "grad_norm": 5.4375, "learning_rate": 4.782254313747438e-06, "loss": 0.77007685, "memory(GiB)": 146.85, "step": 45300, "train_speed(iter/s)": 0.20246 }, { "acc": 0.760748, "epoch": 1.0571166100409746, "grad_norm": 6.1875, "learning_rate": 4.780367009922253e-06, "loss": 0.8731638, "memory(GiB)": 146.85, "step": 45310, "train_speed(iter/s)": 0.202483 }, { "acc": 0.78135881, "epoch": 1.0573499176132635, "grad_norm": 4.75, "learning_rate": 4.778479737449614e-06, "loss": 0.79151292, "memory(GiB)": 146.85, "step": 45320, "train_speed(iter/s)": 0.202507 }, { "acc": 0.79502687, "epoch": 1.0575832251855524, "grad_norm": 6.34375, "learning_rate": 4.7765924965989286e-06, "loss": 0.72482891, "memory(GiB)": 146.85, "step": 45330, "train_speed(iter/s)": 0.20253 }, { "acc": 0.79588642, "epoch": 1.0578165327578413, "grad_norm": 6.09375, "learning_rate": 4.7747052876396e-06, "loss": 0.73362589, "memory(GiB)": 146.85, "step": 45340, "train_speed(iter/s)": 0.202552 }, { "acc": 0.76037588, "epoch": 1.0580498403301302, "grad_norm": 5.90625, "learning_rate": 4.772818110841025e-06, "loss": 0.86198864, "memory(GiB)": 146.85, "step": 45350, "train_speed(iter/s)": 0.202574 }, { "acc": 0.78701954, "epoch": 1.058283147902419, "grad_norm": 5.59375, "learning_rate": 4.7709309664726e-06, "loss": 0.7610939, "memory(GiB)": 146.85, "step": 45360, "train_speed(iter/s)": 0.202598 }, { "acc": 0.76704617, "epoch": 1.058516455474708, "grad_norm": 9.3125, "learning_rate": 4.769043854803712e-06, "loss": 0.8296402, "memory(GiB)": 146.85, "step": 45370, "train_speed(iter/s)": 0.202622 }, { "acc": 0.75898442, "epoch": 1.0587497630469969, "grad_norm": 5.875, "learning_rate": 4.767156776103746e-06, "loss": 0.87566595, "memory(GiB)": 146.85, "step": 45380, "train_speed(iter/s)": 0.202646 }, { "acc": 0.77678924, "epoch": 1.0589830706192858, "grad_norm": 5.71875, "learning_rate": 4.765269730642083e-06, "loss": 0.79444351, "memory(GiB)": 146.85, "step": 45390, "train_speed(iter/s)": 0.202667 }, { "acc": 0.78092165, "epoch": 1.0592163781915747, "grad_norm": 4.625, "learning_rate": 4.7633827186881e-06, "loss": 0.77619624, "memory(GiB)": 146.85, "step": 45400, "train_speed(iter/s)": 0.20269 }, { "acc": 0.76637177, "epoch": 1.0594496857638636, "grad_norm": 7.25, "learning_rate": 4.7614957405111635e-06, "loss": 0.8312397, "memory(GiB)": 146.85, "step": 45410, "train_speed(iter/s)": 0.202713 }, { "acc": 0.77795687, "epoch": 1.0596829933361525, "grad_norm": 6.1875, "learning_rate": 4.759608796380642e-06, "loss": 0.80397911, "memory(GiB)": 146.85, "step": 45420, "train_speed(iter/s)": 0.202735 }, { "acc": 0.7341445, "epoch": 1.0599163009084414, "grad_norm": 5.0, "learning_rate": 4.757721886565893e-06, "loss": 0.95101261, "memory(GiB)": 146.85, "step": 45430, "train_speed(iter/s)": 0.202759 }, { "acc": 0.76221576, "epoch": 1.0601496084807303, "grad_norm": 10.0, "learning_rate": 4.755835011336274e-06, "loss": 0.87475471, "memory(GiB)": 146.85, "step": 45440, "train_speed(iter/s)": 0.202781 }, { "acc": 0.77139292, "epoch": 1.0603829160530192, "grad_norm": 7.03125, "learning_rate": 4.753948170961137e-06, "loss": 0.8200367, "memory(GiB)": 146.85, "step": 45450, "train_speed(iter/s)": 0.202804 }, { "acc": 0.7882678, "epoch": 1.060616223625308, "grad_norm": 6.9375, "learning_rate": 4.752061365709827e-06, "loss": 0.74654026, "memory(GiB)": 146.85, "step": 45460, "train_speed(iter/s)": 0.202828 }, { "acc": 0.78378644, "epoch": 1.060849531197597, "grad_norm": 5.0, "learning_rate": 4.750174595851685e-06, "loss": 0.78551769, "memory(GiB)": 146.85, "step": 45470, "train_speed(iter/s)": 0.202852 }, { "acc": 0.76068316, "epoch": 1.0610828387698858, "grad_norm": 5.40625, "learning_rate": 4.748287861656047e-06, "loss": 0.87385139, "memory(GiB)": 146.85, "step": 45480, "train_speed(iter/s)": 0.202874 }, { "acc": 0.77255068, "epoch": 1.0613161463421747, "grad_norm": 6.625, "learning_rate": 4.746401163392244e-06, "loss": 0.83322678, "memory(GiB)": 146.85, "step": 45490, "train_speed(iter/s)": 0.202898 }, { "acc": 0.77044153, "epoch": 1.0615494539144636, "grad_norm": 6.25, "learning_rate": 4.744514501329601e-06, "loss": 0.81641579, "memory(GiB)": 146.85, "step": 45500, "train_speed(iter/s)": 0.202922 }, { "epoch": 1.0615494539144636, "eval_acc": 0.735114505048419, "eval_loss": 0.8342726230621338, "eval_runtime": 1264.237, "eval_samples_per_second": 28.469, "eval_steps_per_second": 14.235, "step": 45500 }, { "acc": 0.77505522, "epoch": 1.0617827614867525, "grad_norm": 5.84375, "learning_rate": 4.74262787573744e-06, "loss": 0.8118578, "memory(GiB)": 146.85, "step": 45510, "train_speed(iter/s)": 0.201784 }, { "acc": 0.77702608, "epoch": 1.0620160690590414, "grad_norm": 7.59375, "learning_rate": 4.7407412868850734e-06, "loss": 0.81309042, "memory(GiB)": 146.85, "step": 45520, "train_speed(iter/s)": 0.201806 }, { "acc": 0.79039626, "epoch": 1.0622493766313303, "grad_norm": 5.65625, "learning_rate": 4.738854735041813e-06, "loss": 0.74820251, "memory(GiB)": 146.85, "step": 45530, "train_speed(iter/s)": 0.201827 }, { "acc": 0.76583538, "epoch": 1.0624826842036192, "grad_norm": 7.625, "learning_rate": 4.736968220476963e-06, "loss": 0.84892645, "memory(GiB)": 146.85, "step": 45540, "train_speed(iter/s)": 0.201848 }, { "acc": 0.76576843, "epoch": 1.0627159917759081, "grad_norm": 5.5625, "learning_rate": 4.735081743459823e-06, "loss": 0.85578604, "memory(GiB)": 146.85, "step": 45550, "train_speed(iter/s)": 0.201871 }, { "acc": 0.78218203, "epoch": 1.062949299348197, "grad_norm": 7.5625, "learning_rate": 4.733195304259689e-06, "loss": 0.77176266, "memory(GiB)": 146.85, "step": 45560, "train_speed(iter/s)": 0.201895 }, { "acc": 0.77734594, "epoch": 1.063182606920486, "grad_norm": 6.0625, "learning_rate": 4.731308903145846e-06, "loss": 0.78835325, "memory(GiB)": 146.85, "step": 45570, "train_speed(iter/s)": 0.201919 }, { "acc": 0.77738895, "epoch": 1.0634159144927748, "grad_norm": 6.6875, "learning_rate": 4.729422540387579e-06, "loss": 0.79052706, "memory(GiB)": 146.85, "step": 45580, "train_speed(iter/s)": 0.201942 }, { "acc": 0.77080345, "epoch": 1.0636492220650637, "grad_norm": 8.3125, "learning_rate": 4.727536216254166e-06, "loss": 0.83894444, "memory(GiB)": 146.85, "step": 45590, "train_speed(iter/s)": 0.201965 }, { "acc": 0.78804512, "epoch": 1.0638825296373526, "grad_norm": 4.6875, "learning_rate": 4.725649931014879e-06, "loss": 0.76008296, "memory(GiB)": 146.85, "step": 45600, "train_speed(iter/s)": 0.201988 }, { "acc": 0.76914301, "epoch": 1.0641158372096415, "grad_norm": 5.5625, "learning_rate": 4.723763684938985e-06, "loss": 0.82806282, "memory(GiB)": 146.85, "step": 45610, "train_speed(iter/s)": 0.20201 }, { "acc": 0.79075432, "epoch": 1.0643491447819304, "grad_norm": 9.5625, "learning_rate": 4.721877478295745e-06, "loss": 0.76575642, "memory(GiB)": 146.85, "step": 45620, "train_speed(iter/s)": 0.202033 }, { "acc": 0.75019488, "epoch": 1.064582452354219, "grad_norm": 6.6875, "learning_rate": 4.719991311354415e-06, "loss": 0.90429993, "memory(GiB)": 146.85, "step": 45630, "train_speed(iter/s)": 0.202056 }, { "acc": 0.78480792, "epoch": 1.0648157599265082, "grad_norm": 3.921875, "learning_rate": 4.718105184384243e-06, "loss": 0.77231259, "memory(GiB)": 146.85, "step": 45640, "train_speed(iter/s)": 0.202079 }, { "acc": 0.77810774, "epoch": 1.065049067498797, "grad_norm": 7.5625, "learning_rate": 4.7162190976544735e-06, "loss": 0.82032814, "memory(GiB)": 146.85, "step": 45650, "train_speed(iter/s)": 0.202103 }, { "acc": 0.77681627, "epoch": 1.0652823750710858, "grad_norm": 7.625, "learning_rate": 4.7143330514343446e-06, "loss": 0.79912605, "memory(GiB)": 146.85, "step": 45660, "train_speed(iter/s)": 0.202127 }, { "acc": 0.79399171, "epoch": 1.0655156826433747, "grad_norm": 5.78125, "learning_rate": 4.712447045993091e-06, "loss": 0.74573898, "memory(GiB)": 146.85, "step": 45670, "train_speed(iter/s)": 0.202152 }, { "acc": 0.76620202, "epoch": 1.0657489902156636, "grad_norm": 5.3125, "learning_rate": 4.710561081599937e-06, "loss": 0.85073414, "memory(GiB)": 146.85, "step": 45680, "train_speed(iter/s)": 0.202175 }, { "acc": 0.7564765, "epoch": 1.0659822977879525, "grad_norm": 5.65625, "learning_rate": 4.708675158524105e-06, "loss": 0.87724762, "memory(GiB)": 146.85, "step": 45690, "train_speed(iter/s)": 0.202198 }, { "acc": 0.7736577, "epoch": 1.0662156053602414, "grad_norm": 6.28125, "learning_rate": 4.706789277034811e-06, "loss": 0.83307533, "memory(GiB)": 146.85, "step": 45700, "train_speed(iter/s)": 0.202221 }, { "acc": 0.78277054, "epoch": 1.0664489129325303, "grad_norm": 5.3125, "learning_rate": 4.704903437401261e-06, "loss": 0.77735848, "memory(GiB)": 146.85, "step": 45710, "train_speed(iter/s)": 0.202244 }, { "acc": 0.78391008, "epoch": 1.0666822205048192, "grad_norm": 5.09375, "learning_rate": 4.703017639892659e-06, "loss": 0.76511688, "memory(GiB)": 146.85, "step": 45720, "train_speed(iter/s)": 0.202268 }, { "acc": 0.75578098, "epoch": 1.066915528077108, "grad_norm": 6.21875, "learning_rate": 4.701131884778204e-06, "loss": 0.86010094, "memory(GiB)": 146.85, "step": 45730, "train_speed(iter/s)": 0.202291 }, { "acc": 0.76692505, "epoch": 1.067148835649397, "grad_norm": 5.71875, "learning_rate": 4.699246172327087e-06, "loss": 0.8230463, "memory(GiB)": 146.85, "step": 45740, "train_speed(iter/s)": 0.202313 }, { "acc": 0.76969395, "epoch": 1.067382143221686, "grad_norm": 20.0, "learning_rate": 4.697360502808488e-06, "loss": 0.83777514, "memory(GiB)": 146.85, "step": 45750, "train_speed(iter/s)": 0.202337 }, { "acc": 0.78805599, "epoch": 1.0676154507939748, "grad_norm": 6.75, "learning_rate": 4.695474876491592e-06, "loss": 0.77601781, "memory(GiB)": 146.85, "step": 45760, "train_speed(iter/s)": 0.20236 }, { "acc": 0.76907129, "epoch": 1.0678487583662637, "grad_norm": 6.125, "learning_rate": 4.6935892936455664e-06, "loss": 0.83271551, "memory(GiB)": 146.85, "step": 45770, "train_speed(iter/s)": 0.202384 }, { "acc": 0.78475161, "epoch": 1.0680820659385526, "grad_norm": 6.5625, "learning_rate": 4.691703754539583e-06, "loss": 0.77926497, "memory(GiB)": 146.85, "step": 45780, "train_speed(iter/s)": 0.202406 }, { "acc": 0.77388039, "epoch": 1.0683153735108415, "grad_norm": 6.34375, "learning_rate": 4.689818259442797e-06, "loss": 0.82609501, "memory(GiB)": 146.85, "step": 45790, "train_speed(iter/s)": 0.202431 }, { "acc": 0.76939859, "epoch": 1.0685486810831304, "grad_norm": 6.4375, "learning_rate": 4.687932808624365e-06, "loss": 0.83745403, "memory(GiB)": 146.85, "step": 45800, "train_speed(iter/s)": 0.202453 }, { "acc": 0.7728673, "epoch": 1.0687819886554193, "grad_norm": 5.6875, "learning_rate": 4.686047402353433e-06, "loss": 0.81675415, "memory(GiB)": 146.85, "step": 45810, "train_speed(iter/s)": 0.202475 }, { "acc": 0.79280968, "epoch": 1.0690152962277082, "grad_norm": 5.375, "learning_rate": 4.684162040899144e-06, "loss": 0.7459116, "memory(GiB)": 146.85, "step": 45820, "train_speed(iter/s)": 0.202497 }, { "acc": 0.75406685, "epoch": 1.069248603799997, "grad_norm": 8.25, "learning_rate": 4.682276724530633e-06, "loss": 0.87660933, "memory(GiB)": 146.85, "step": 45830, "train_speed(iter/s)": 0.20252 }, { "acc": 0.76002917, "epoch": 1.069481911372286, "grad_norm": 6.75, "learning_rate": 4.680391453517026e-06, "loss": 0.87248163, "memory(GiB)": 146.85, "step": 45840, "train_speed(iter/s)": 0.202543 }, { "acc": 0.78770475, "epoch": 1.0697152189445749, "grad_norm": 10.9375, "learning_rate": 4.678506228127447e-06, "loss": 0.7505013, "memory(GiB)": 146.85, "step": 45850, "train_speed(iter/s)": 0.202565 }, { "acc": 0.77395411, "epoch": 1.0699485265168638, "grad_norm": 9.0, "learning_rate": 4.67662104863101e-06, "loss": 0.81669159, "memory(GiB)": 146.85, "step": 45860, "train_speed(iter/s)": 0.202588 }, { "acc": 0.76048994, "epoch": 1.0701818340891527, "grad_norm": 5.8125, "learning_rate": 4.674735915296824e-06, "loss": 0.84855928, "memory(GiB)": 146.85, "step": 45870, "train_speed(iter/s)": 0.202612 }, { "acc": 0.76864223, "epoch": 1.0704151416614416, "grad_norm": 7.75, "learning_rate": 4.672850828393992e-06, "loss": 0.82694702, "memory(GiB)": 146.85, "step": 45880, "train_speed(iter/s)": 0.202634 }, { "acc": 0.7826335, "epoch": 1.0706484492337305, "grad_norm": 5.5, "learning_rate": 4.670965788191609e-06, "loss": 0.78336468, "memory(GiB)": 146.85, "step": 45890, "train_speed(iter/s)": 0.202657 }, { "acc": 0.75663548, "epoch": 1.0708817568060194, "grad_norm": 5.46875, "learning_rate": 4.669080794958764e-06, "loss": 0.89130421, "memory(GiB)": 146.85, "step": 45900, "train_speed(iter/s)": 0.20268 }, { "acc": 0.77340479, "epoch": 1.0711150643783083, "grad_norm": 4.90625, "learning_rate": 4.6671958489645394e-06, "loss": 0.81936035, "memory(GiB)": 146.85, "step": 45910, "train_speed(iter/s)": 0.202703 }, { "acc": 0.76682692, "epoch": 1.0713483719505972, "grad_norm": 7.0625, "learning_rate": 4.665310950478011e-06, "loss": 0.84819832, "memory(GiB)": 146.85, "step": 45920, "train_speed(iter/s)": 0.202725 }, { "acc": 0.77395658, "epoch": 1.071581679522886, "grad_norm": 6.5, "learning_rate": 4.663426099768247e-06, "loss": 0.79053164, "memory(GiB)": 146.85, "step": 45930, "train_speed(iter/s)": 0.202748 }, { "acc": 0.78021679, "epoch": 1.071814987095175, "grad_norm": 6.375, "learning_rate": 4.661541297104309e-06, "loss": 0.79922423, "memory(GiB)": 146.85, "step": 45940, "train_speed(iter/s)": 0.20277 }, { "acc": 0.77258115, "epoch": 1.0720482946674639, "grad_norm": 7.65625, "learning_rate": 4.659656542755253e-06, "loss": 0.81868629, "memory(GiB)": 146.85, "step": 45950, "train_speed(iter/s)": 0.202793 }, { "acc": 0.7758543, "epoch": 1.0722816022397528, "grad_norm": 7.0, "learning_rate": 4.657771836990127e-06, "loss": 0.79683056, "memory(GiB)": 146.85, "step": 45960, "train_speed(iter/s)": 0.202816 }, { "acc": 0.77470608, "epoch": 1.0725149098120417, "grad_norm": 7.5625, "learning_rate": 4.655887180077973e-06, "loss": 0.80642796, "memory(GiB)": 146.85, "step": 45970, "train_speed(iter/s)": 0.202839 }, { "acc": 0.77835646, "epoch": 1.0727482173843306, "grad_norm": 4.59375, "learning_rate": 4.654002572287822e-06, "loss": 0.80886593, "memory(GiB)": 146.85, "step": 45980, "train_speed(iter/s)": 0.20286 }, { "acc": 0.77891054, "epoch": 1.0729815249566195, "grad_norm": 5.78125, "learning_rate": 4.652118013888704e-06, "loss": 0.79475851, "memory(GiB)": 146.85, "step": 45990, "train_speed(iter/s)": 0.202883 }, { "acc": 0.75409913, "epoch": 1.0732148325289084, "grad_norm": 7.40625, "learning_rate": 4.650233505149639e-06, "loss": 0.90177345, "memory(GiB)": 146.85, "step": 46000, "train_speed(iter/s)": 0.202906 }, { "epoch": 1.0732148325289084, "eval_acc": 0.7351185386206474, "eval_loss": 0.8342730402946472, "eval_runtime": 1263.8647, "eval_samples_per_second": 28.477, "eval_steps_per_second": 14.239, "step": 46000 }, { "acc": 0.76445303, "epoch": 1.0734481401011973, "grad_norm": 4.96875, "learning_rate": 4.648349046339639e-06, "loss": 0.84066982, "memory(GiB)": 146.85, "step": 46010, "train_speed(iter/s)": 0.201782 }, { "acc": 0.7795372, "epoch": 1.073681447673486, "grad_norm": 5.28125, "learning_rate": 4.64646463772771e-06, "loss": 0.78699789, "memory(GiB)": 146.85, "step": 46020, "train_speed(iter/s)": 0.201804 }, { "acc": 0.79339676, "epoch": 1.0739147552457748, "grad_norm": 5.8125, "learning_rate": 4.6445802795828515e-06, "loss": 0.73901038, "memory(GiB)": 146.85, "step": 46030, "train_speed(iter/s)": 0.201828 }, { "acc": 0.78729029, "epoch": 1.0741480628180637, "grad_norm": 6.6875, "learning_rate": 4.642695972174055e-06, "loss": 0.76920104, "memory(GiB)": 146.85, "step": 46040, "train_speed(iter/s)": 0.20185 }, { "acc": 0.76127958, "epoch": 1.0743813703903526, "grad_norm": 6.15625, "learning_rate": 4.640811715770305e-06, "loss": 0.85668287, "memory(GiB)": 146.85, "step": 46050, "train_speed(iter/s)": 0.201871 }, { "acc": 0.76707354, "epoch": 1.0746146779626415, "grad_norm": 5.4375, "learning_rate": 4.638927510640578e-06, "loss": 0.84976969, "memory(GiB)": 146.85, "step": 46060, "train_speed(iter/s)": 0.201893 }, { "acc": 0.78345504, "epoch": 1.0748479855349304, "grad_norm": 6.1875, "learning_rate": 4.637043357053844e-06, "loss": 0.76769781, "memory(GiB)": 146.85, "step": 46070, "train_speed(iter/s)": 0.201915 }, { "acc": 0.76250429, "epoch": 1.0750812931072193, "grad_norm": 6.90625, "learning_rate": 4.635159255279066e-06, "loss": 0.85240726, "memory(GiB)": 146.85, "step": 46080, "train_speed(iter/s)": 0.201937 }, { "acc": 0.78800907, "epoch": 1.0753146006795082, "grad_norm": 8.3125, "learning_rate": 4.633275205585198e-06, "loss": 0.74775758, "memory(GiB)": 146.85, "step": 46090, "train_speed(iter/s)": 0.201959 }, { "acc": 0.75288391, "epoch": 1.0755479082517971, "grad_norm": 5.84375, "learning_rate": 4.631391208241187e-06, "loss": 0.9072319, "memory(GiB)": 146.85, "step": 46100, "train_speed(iter/s)": 0.201982 }, { "acc": 0.78955197, "epoch": 1.075781215824086, "grad_norm": 4.875, "learning_rate": 4.6295072635159744e-06, "loss": 0.75119467, "memory(GiB)": 146.85, "step": 46110, "train_speed(iter/s)": 0.202005 }, { "acc": 0.79227581, "epoch": 1.076014523396375, "grad_norm": 4.375, "learning_rate": 4.627623371678492e-06, "loss": 0.74528217, "memory(GiB)": 146.85, "step": 46120, "train_speed(iter/s)": 0.202028 }, { "acc": 0.76495056, "epoch": 1.0762478309686638, "grad_norm": 5.0, "learning_rate": 4.625739532997665e-06, "loss": 0.84585667, "memory(GiB)": 146.85, "step": 46130, "train_speed(iter/s)": 0.202052 }, { "acc": 0.77738194, "epoch": 1.0764811385409527, "grad_norm": 6.34375, "learning_rate": 4.623855747742412e-06, "loss": 0.79663029, "memory(GiB)": 146.85, "step": 46140, "train_speed(iter/s)": 0.202075 }, { "acc": 0.7757164, "epoch": 1.0767144461132416, "grad_norm": 5.09375, "learning_rate": 4.62197201618164e-06, "loss": 0.790695, "memory(GiB)": 146.85, "step": 46150, "train_speed(iter/s)": 0.202097 }, { "acc": 0.75897698, "epoch": 1.0769477536855305, "grad_norm": 5.21875, "learning_rate": 4.620088338584254e-06, "loss": 0.8802557, "memory(GiB)": 146.85, "step": 46160, "train_speed(iter/s)": 0.202119 }, { "acc": 0.73953605, "epoch": 1.0771810612578194, "grad_norm": 5.21875, "learning_rate": 4.618204715219147e-06, "loss": 0.94416447, "memory(GiB)": 146.85, "step": 46170, "train_speed(iter/s)": 0.202142 }, { "acc": 0.77572503, "epoch": 1.0774143688301083, "grad_norm": 6.1875, "learning_rate": 4.616321146355206e-06, "loss": 0.82584476, "memory(GiB)": 146.85, "step": 46180, "train_speed(iter/s)": 0.202165 }, { "acc": 0.76823077, "epoch": 1.0776476764023972, "grad_norm": 6.1875, "learning_rate": 4.614437632261311e-06, "loss": 0.83852921, "memory(GiB)": 146.85, "step": 46190, "train_speed(iter/s)": 0.202188 }, { "acc": 0.75300283, "epoch": 1.0778809839746861, "grad_norm": 9.5625, "learning_rate": 4.6125541732063315e-06, "loss": 0.903053, "memory(GiB)": 146.85, "step": 46200, "train_speed(iter/s)": 0.20221 }, { "acc": 0.77362919, "epoch": 1.078114291546975, "grad_norm": 5.5, "learning_rate": 4.6106707694591324e-06, "loss": 0.81713572, "memory(GiB)": 146.85, "step": 46210, "train_speed(iter/s)": 0.202232 }, { "acc": 0.77700553, "epoch": 1.078347599119264, "grad_norm": 6.40625, "learning_rate": 4.608787421288566e-06, "loss": 0.79847412, "memory(GiB)": 146.85, "step": 46220, "train_speed(iter/s)": 0.202255 }, { "acc": 0.76304941, "epoch": 1.0785809066915528, "grad_norm": 9.875, "learning_rate": 4.606904128963482e-06, "loss": 0.85855026, "memory(GiB)": 146.85, "step": 46230, "train_speed(iter/s)": 0.202277 }, { "acc": 0.78181858, "epoch": 1.0788142142638417, "grad_norm": 5.84375, "learning_rate": 4.605020892752718e-06, "loss": 0.77969236, "memory(GiB)": 146.85, "step": 46240, "train_speed(iter/s)": 0.2023 }, { "acc": 0.7791337, "epoch": 1.0790475218361306, "grad_norm": 5.75, "learning_rate": 4.603137712925108e-06, "loss": 0.79643998, "memory(GiB)": 146.85, "step": 46250, "train_speed(iter/s)": 0.202324 }, { "acc": 0.76047506, "epoch": 1.0792808294084195, "grad_norm": 4.9375, "learning_rate": 4.601254589749474e-06, "loss": 0.88265877, "memory(GiB)": 146.85, "step": 46260, "train_speed(iter/s)": 0.202346 }, { "acc": 0.77622762, "epoch": 1.0795141369807084, "grad_norm": 5.15625, "learning_rate": 4.599371523494632e-06, "loss": 0.79760637, "memory(GiB)": 146.85, "step": 46270, "train_speed(iter/s)": 0.202369 }, { "acc": 0.79836493, "epoch": 1.0797474445529973, "grad_norm": 3.984375, "learning_rate": 4.597488514429388e-06, "loss": 0.73880239, "memory(GiB)": 146.85, "step": 46280, "train_speed(iter/s)": 0.202392 }, { "acc": 0.78718739, "epoch": 1.0799807521252862, "grad_norm": 4.46875, "learning_rate": 4.595605562822542e-06, "loss": 0.74480476, "memory(GiB)": 146.85, "step": 46290, "train_speed(iter/s)": 0.202416 }, { "acc": 0.77419367, "epoch": 1.0802140596975751, "grad_norm": 7.0625, "learning_rate": 4.593722668942884e-06, "loss": 0.82528763, "memory(GiB)": 146.85, "step": 46300, "train_speed(iter/s)": 0.202439 }, { "acc": 0.77444296, "epoch": 1.080447367269864, "grad_norm": 6.375, "learning_rate": 4.5918398330592e-06, "loss": 0.79903688, "memory(GiB)": 146.85, "step": 46310, "train_speed(iter/s)": 0.202462 }, { "acc": 0.75693264, "epoch": 1.080680674842153, "grad_norm": 8.75, "learning_rate": 4.589957055440259e-06, "loss": 0.86695976, "memory(GiB)": 146.85, "step": 46320, "train_speed(iter/s)": 0.202485 }, { "acc": 0.76545906, "epoch": 1.0809139824144418, "grad_norm": 6.1875, "learning_rate": 4.588074336354828e-06, "loss": 0.86123972, "memory(GiB)": 146.85, "step": 46330, "train_speed(iter/s)": 0.202507 }, { "acc": 0.76874595, "epoch": 1.0811472899867307, "grad_norm": 6.40625, "learning_rate": 4.586191676071666e-06, "loss": 0.83643112, "memory(GiB)": 146.85, "step": 46340, "train_speed(iter/s)": 0.20253 }, { "acc": 0.7711853, "epoch": 1.0813805975590196, "grad_norm": 5.28125, "learning_rate": 4.584309074859524e-06, "loss": 0.82294464, "memory(GiB)": 146.85, "step": 46350, "train_speed(iter/s)": 0.202553 }, { "acc": 0.78801003, "epoch": 1.0816139051313085, "grad_norm": 6.21875, "learning_rate": 4.5824265329871395e-06, "loss": 0.76550179, "memory(GiB)": 146.85, "step": 46360, "train_speed(iter/s)": 0.202575 }, { "acc": 0.77396183, "epoch": 1.0818472127035974, "grad_norm": 5.84375, "learning_rate": 4.580544050723246e-06, "loss": 0.83781986, "memory(GiB)": 146.85, "step": 46370, "train_speed(iter/s)": 0.202598 }, { "acc": 0.79517903, "epoch": 1.0820805202758863, "grad_norm": 4.8125, "learning_rate": 4.578661628336567e-06, "loss": 0.72782364, "memory(GiB)": 146.85, "step": 46380, "train_speed(iter/s)": 0.20262 }, { "acc": 0.76348696, "epoch": 1.082313827848175, "grad_norm": 6.90625, "learning_rate": 4.576779266095818e-06, "loss": 0.86238232, "memory(GiB)": 146.85, "step": 46390, "train_speed(iter/s)": 0.202642 }, { "acc": 0.75387368, "epoch": 1.082547135420464, "grad_norm": 6.65625, "learning_rate": 4.574896964269707e-06, "loss": 0.89800739, "memory(GiB)": 146.85, "step": 46400, "train_speed(iter/s)": 0.202666 }, { "acc": 0.77286119, "epoch": 1.0827804429927528, "grad_norm": 5.34375, "learning_rate": 4.573014723126931e-06, "loss": 0.82716885, "memory(GiB)": 146.85, "step": 46410, "train_speed(iter/s)": 0.202688 }, { "acc": 0.7636342, "epoch": 1.0830137505650417, "grad_norm": 5.875, "learning_rate": 4.571132542936179e-06, "loss": 0.85395021, "memory(GiB)": 146.85, "step": 46420, "train_speed(iter/s)": 0.202711 }, { "acc": 0.77754278, "epoch": 1.0832470581373306, "grad_norm": 7.34375, "learning_rate": 4.569250423966132e-06, "loss": 0.7973753, "memory(GiB)": 146.85, "step": 46430, "train_speed(iter/s)": 0.202732 }, { "acc": 0.75390358, "epoch": 1.0834803657096195, "grad_norm": 4.96875, "learning_rate": 4.567368366485462e-06, "loss": 0.90350456, "memory(GiB)": 146.85, "step": 46440, "train_speed(iter/s)": 0.202755 }, { "acc": 0.76248937, "epoch": 1.0837136732819084, "grad_norm": 6.9375, "learning_rate": 4.56548637076283e-06, "loss": 0.84808512, "memory(GiB)": 146.85, "step": 46450, "train_speed(iter/s)": 0.202778 }, { "acc": 0.80073624, "epoch": 1.0839469808541973, "grad_norm": 5.1875, "learning_rate": 4.563604437066894e-06, "loss": 0.73557119, "memory(GiB)": 146.85, "step": 46460, "train_speed(iter/s)": 0.2028 }, { "acc": 0.79884405, "epoch": 1.0841802884264862, "grad_norm": 4.90625, "learning_rate": 4.561722565666298e-06, "loss": 0.72826395, "memory(GiB)": 146.85, "step": 46470, "train_speed(iter/s)": 0.202823 }, { "acc": 0.75921268, "epoch": 1.084413595998775, "grad_norm": 4.96875, "learning_rate": 4.559840756829677e-06, "loss": 0.8611743, "memory(GiB)": 146.85, "step": 46480, "train_speed(iter/s)": 0.202846 }, { "acc": 0.75257316, "epoch": 1.084646903571064, "grad_norm": 5.53125, "learning_rate": 4.557959010825662e-06, "loss": 0.89326477, "memory(GiB)": 146.85, "step": 46490, "train_speed(iter/s)": 0.202869 }, { "acc": 0.76157398, "epoch": 1.0848802111433529, "grad_norm": 7.46875, "learning_rate": 4.5560773279228686e-06, "loss": 0.85227242, "memory(GiB)": 146.85, "step": 46500, "train_speed(iter/s)": 0.202891 }, { "epoch": 1.0848802111433529, "eval_acc": 0.7350912716723835, "eval_loss": 0.8343065977096558, "eval_runtime": 1264.1886, "eval_samples_per_second": 28.47, "eval_steps_per_second": 14.235, "step": 46500 }, { "acc": 0.77760019, "epoch": 1.0851135187156418, "grad_norm": 7.625, "learning_rate": 4.5541957083899075e-06, "loss": 0.83825283, "memory(GiB)": 146.85, "step": 46510, "train_speed(iter/s)": 0.201776 }, { "acc": 0.76131144, "epoch": 1.0853468262879307, "grad_norm": 7.375, "learning_rate": 4.55231415249538e-06, "loss": 0.86152458, "memory(GiB)": 146.85, "step": 46520, "train_speed(iter/s)": 0.201798 }, { "acc": 0.756394, "epoch": 1.0855801338602196, "grad_norm": 5.53125, "learning_rate": 4.550432660507877e-06, "loss": 0.89896221, "memory(GiB)": 146.85, "step": 46530, "train_speed(iter/s)": 0.201821 }, { "acc": 0.77655582, "epoch": 1.0858134414325085, "grad_norm": 9.0, "learning_rate": 4.548551232695983e-06, "loss": 0.80235071, "memory(GiB)": 146.85, "step": 46540, "train_speed(iter/s)": 0.201841 }, { "acc": 0.76150618, "epoch": 1.0860467490047974, "grad_norm": 5.09375, "learning_rate": 4.5466698693282675e-06, "loss": 0.86047573, "memory(GiB)": 146.85, "step": 46550, "train_speed(iter/s)": 0.201864 }, { "acc": 0.78337049, "epoch": 1.0862800565770863, "grad_norm": 5.6875, "learning_rate": 4.544788570673296e-06, "loss": 0.78947868, "memory(GiB)": 146.85, "step": 46560, "train_speed(iter/s)": 0.201888 }, { "acc": 0.77519979, "epoch": 1.0865133641493752, "grad_norm": 13.3125, "learning_rate": 4.542907336999625e-06, "loss": 0.80391264, "memory(GiB)": 146.85, "step": 46570, "train_speed(iter/s)": 0.201911 }, { "acc": 0.77142277, "epoch": 1.086746671721664, "grad_norm": 5.5, "learning_rate": 4.541026168575798e-06, "loss": 0.82485819, "memory(GiB)": 146.85, "step": 46580, "train_speed(iter/s)": 0.201935 }, { "acc": 0.78354998, "epoch": 1.086979979293953, "grad_norm": 4.21875, "learning_rate": 4.539145065670353e-06, "loss": 0.77589326, "memory(GiB)": 146.85, "step": 46590, "train_speed(iter/s)": 0.201958 }, { "acc": 0.78610592, "epoch": 1.0872132868662419, "grad_norm": 5.875, "learning_rate": 4.537264028551814e-06, "loss": 0.77043753, "memory(GiB)": 146.85, "step": 46600, "train_speed(iter/s)": 0.20198 }, { "acc": 0.78355446, "epoch": 1.0874465944385308, "grad_norm": 5.0625, "learning_rate": 4.535383057488702e-06, "loss": 0.77276373, "memory(GiB)": 146.85, "step": 46610, "train_speed(iter/s)": 0.202003 }, { "acc": 0.7564003, "epoch": 1.0876799020108197, "grad_norm": 5.15625, "learning_rate": 4.533502152749523e-06, "loss": 0.87230768, "memory(GiB)": 146.85, "step": 46620, "train_speed(iter/s)": 0.202025 }, { "acc": 0.78227816, "epoch": 1.0879132095831086, "grad_norm": 6.03125, "learning_rate": 4.531621314602777e-06, "loss": 0.7960608, "memory(GiB)": 146.85, "step": 46630, "train_speed(iter/s)": 0.202048 }, { "acc": 0.79964437, "epoch": 1.0881465171553975, "grad_norm": 8.5, "learning_rate": 4.529740543316952e-06, "loss": 0.72686071, "memory(GiB)": 146.85, "step": 46640, "train_speed(iter/s)": 0.202069 }, { "acc": 0.76338515, "epoch": 1.0883798247276864, "grad_norm": 4.59375, "learning_rate": 4.52785983916053e-06, "loss": 0.87091656, "memory(GiB)": 146.85, "step": 46650, "train_speed(iter/s)": 0.202092 }, { "acc": 0.76345739, "epoch": 1.0886131322999753, "grad_norm": 5.9375, "learning_rate": 4.525979202401976e-06, "loss": 0.84901667, "memory(GiB)": 146.85, "step": 46660, "train_speed(iter/s)": 0.202116 }, { "acc": 0.77298908, "epoch": 1.0888464398722641, "grad_norm": 6.46875, "learning_rate": 4.524098633309753e-06, "loss": 0.82216167, "memory(GiB)": 146.85, "step": 46670, "train_speed(iter/s)": 0.202138 }, { "acc": 0.78517995, "epoch": 1.089079747444553, "grad_norm": 14.375, "learning_rate": 4.522218132152313e-06, "loss": 0.7715199, "memory(GiB)": 146.85, "step": 46680, "train_speed(iter/s)": 0.20216 }, { "acc": 0.77037668, "epoch": 1.089313055016842, "grad_norm": 5.65625, "learning_rate": 4.520337699198095e-06, "loss": 0.84481668, "memory(GiB)": 146.85, "step": 46690, "train_speed(iter/s)": 0.202181 }, { "acc": 0.77442465, "epoch": 1.0895463625891308, "grad_norm": 5.75, "learning_rate": 4.5184573347155316e-06, "loss": 0.81327868, "memory(GiB)": 146.85, "step": 46700, "train_speed(iter/s)": 0.202204 }, { "acc": 0.7671432, "epoch": 1.0897796701614197, "grad_norm": 8.3125, "learning_rate": 4.516577038973044e-06, "loss": 0.84261074, "memory(GiB)": 146.85, "step": 46710, "train_speed(iter/s)": 0.202225 }, { "acc": 0.78217678, "epoch": 1.0900129777337086, "grad_norm": 6.03125, "learning_rate": 4.514696812239043e-06, "loss": 0.80802155, "memory(GiB)": 146.85, "step": 46720, "train_speed(iter/s)": 0.202248 }, { "acc": 0.77701054, "epoch": 1.0902462853059975, "grad_norm": 10.5625, "learning_rate": 4.512816654781931e-06, "loss": 0.7927659, "memory(GiB)": 146.85, "step": 46730, "train_speed(iter/s)": 0.202269 }, { "acc": 0.76492386, "epoch": 1.0904795928782864, "grad_norm": 7.71875, "learning_rate": 4.5109365668701e-06, "loss": 0.84012985, "memory(GiB)": 146.85, "step": 46740, "train_speed(iter/s)": 0.202291 }, { "acc": 0.7652966, "epoch": 1.0907129004505753, "grad_norm": 7.53125, "learning_rate": 4.5090565487719326e-06, "loss": 0.83487301, "memory(GiB)": 146.85, "step": 46750, "train_speed(iter/s)": 0.202314 }, { "acc": 0.77362804, "epoch": 1.0909462080228642, "grad_norm": 4.75, "learning_rate": 4.5071766007558e-06, "loss": 0.82314157, "memory(GiB)": 146.85, "step": 46760, "train_speed(iter/s)": 0.202338 }, { "acc": 0.75700455, "epoch": 1.0911795155951531, "grad_norm": 5.8125, "learning_rate": 4.505296723090066e-06, "loss": 0.87763691, "memory(GiB)": 146.85, "step": 46770, "train_speed(iter/s)": 0.20236 }, { "acc": 0.76121655, "epoch": 1.0914128231674418, "grad_norm": 4.84375, "learning_rate": 4.503416916043079e-06, "loss": 0.85789547, "memory(GiB)": 146.85, "step": 46780, "train_speed(iter/s)": 0.20238 }, { "acc": 0.78849115, "epoch": 1.091646130739731, "grad_norm": 5.21875, "learning_rate": 4.501537179883184e-06, "loss": 0.75206962, "memory(GiB)": 146.85, "step": 46790, "train_speed(iter/s)": 0.202402 }, { "acc": 0.75822668, "epoch": 1.0918794383120196, "grad_norm": 5.9375, "learning_rate": 4.499657514878711e-06, "loss": 0.85333633, "memory(GiB)": 146.85, "step": 46800, "train_speed(iter/s)": 0.202425 }, { "acc": 0.76962557, "epoch": 1.0921127458843085, "grad_norm": 7.75, "learning_rate": 4.497777921297983e-06, "loss": 0.84499502, "memory(GiB)": 146.85, "step": 46810, "train_speed(iter/s)": 0.202447 }, { "acc": 0.74706831, "epoch": 1.0923460534565974, "grad_norm": 8.25, "learning_rate": 4.49589839940931e-06, "loss": 0.92847042, "memory(GiB)": 146.85, "step": 46820, "train_speed(iter/s)": 0.202471 }, { "acc": 0.77295866, "epoch": 1.0925793610288863, "grad_norm": 5.59375, "learning_rate": 4.494018949480994e-06, "loss": 0.82387667, "memory(GiB)": 146.85, "step": 46830, "train_speed(iter/s)": 0.202493 }, { "acc": 0.76219211, "epoch": 1.0928126686011752, "grad_norm": 7.21875, "learning_rate": 4.492139571781328e-06, "loss": 0.8358017, "memory(GiB)": 146.85, "step": 46840, "train_speed(iter/s)": 0.202515 }, { "acc": 0.76220121, "epoch": 1.093045976173464, "grad_norm": 7.59375, "learning_rate": 4.490260266578589e-06, "loss": 0.85125036, "memory(GiB)": 146.85, "step": 46850, "train_speed(iter/s)": 0.202537 }, { "acc": 0.79182653, "epoch": 1.093279283745753, "grad_norm": 4.96875, "learning_rate": 4.4883810341410485e-06, "loss": 0.74354172, "memory(GiB)": 146.85, "step": 46860, "train_speed(iter/s)": 0.202561 }, { "acc": 0.76771431, "epoch": 1.093512591318042, "grad_norm": 5.40625, "learning_rate": 4.486501874736967e-06, "loss": 0.85793381, "memory(GiB)": 146.85, "step": 46870, "train_speed(iter/s)": 0.202584 }, { "acc": 0.78163137, "epoch": 1.0937458988903308, "grad_norm": 6.28125, "learning_rate": 4.484622788634596e-06, "loss": 0.77719498, "memory(GiB)": 146.85, "step": 46880, "train_speed(iter/s)": 0.202606 }, { "acc": 0.77222886, "epoch": 1.0939792064626197, "grad_norm": 5.78125, "learning_rate": 4.48274377610217e-06, "loss": 0.8366147, "memory(GiB)": 146.85, "step": 46890, "train_speed(iter/s)": 0.202627 }, { "acc": 0.77517977, "epoch": 1.0942125140349086, "grad_norm": 6.59375, "learning_rate": 4.480864837407919e-06, "loss": 0.80128021, "memory(GiB)": 146.85, "step": 46900, "train_speed(iter/s)": 0.202649 }, { "acc": 0.78349581, "epoch": 1.0944458216071975, "grad_norm": 5.46875, "learning_rate": 4.478985972820063e-06, "loss": 0.78614793, "memory(GiB)": 146.85, "step": 46910, "train_speed(iter/s)": 0.202669 }, { "acc": 0.77873745, "epoch": 1.0946791291794864, "grad_norm": 5.1875, "learning_rate": 4.477107182606807e-06, "loss": 0.80825834, "memory(GiB)": 146.85, "step": 46920, "train_speed(iter/s)": 0.20269 }, { "acc": 0.77216024, "epoch": 1.0949124367517753, "grad_norm": 6.78125, "learning_rate": 4.4752284670363495e-06, "loss": 0.8216238, "memory(GiB)": 146.85, "step": 46930, "train_speed(iter/s)": 0.202713 }, { "acc": 0.76305208, "epoch": 1.0951457443240642, "grad_norm": 6.03125, "learning_rate": 4.473349826376876e-06, "loss": 0.88864403, "memory(GiB)": 146.85, "step": 46940, "train_speed(iter/s)": 0.202735 }, { "acc": 0.75824938, "epoch": 1.095379051896353, "grad_norm": 8.625, "learning_rate": 4.471471260896561e-06, "loss": 0.90116119, "memory(GiB)": 146.85, "step": 46950, "train_speed(iter/s)": 0.202757 }, { "acc": 0.77512527, "epoch": 1.095612359468642, "grad_norm": 9.8125, "learning_rate": 4.46959277086357e-06, "loss": 0.8171463, "memory(GiB)": 146.85, "step": 46960, "train_speed(iter/s)": 0.20278 }, { "acc": 0.78948221, "epoch": 1.095845667040931, "grad_norm": 6.0625, "learning_rate": 4.467714356546057e-06, "loss": 0.74956064, "memory(GiB)": 146.85, "step": 46970, "train_speed(iter/s)": 0.202801 }, { "acc": 0.78156805, "epoch": 1.0960789746132198, "grad_norm": 5.03125, "learning_rate": 4.465836018212166e-06, "loss": 0.79024763, "memory(GiB)": 146.85, "step": 46980, "train_speed(iter/s)": 0.202823 }, { "acc": 0.75970736, "epoch": 1.0963122821855087, "grad_norm": 6.09375, "learning_rate": 4.463957756130028e-06, "loss": 0.85795069, "memory(GiB)": 146.85, "step": 46990, "train_speed(iter/s)": 0.202845 }, { "acc": 0.76657414, "epoch": 1.0965455897577976, "grad_norm": 7.6875, "learning_rate": 4.462079570567765e-06, "loss": 0.82349195, "memory(GiB)": 146.85, "step": 47000, "train_speed(iter/s)": 0.202868 }, { "epoch": 1.0965455897577976, "eval_acc": 0.7351059538752949, "eval_loss": 0.8342330455780029, "eval_runtime": 1265.6113, "eval_samples_per_second": 28.438, "eval_steps_per_second": 14.219, "step": 47000 }, { "acc": 0.75827842, "epoch": 1.0967788973300865, "grad_norm": 4.34375, "learning_rate": 4.460201461793486e-06, "loss": 0.8587719, "memory(GiB)": 146.85, "step": 47010, "train_speed(iter/s)": 0.201764 }, { "acc": 0.76548204, "epoch": 1.0970122049023754, "grad_norm": 6.59375, "learning_rate": 4.458323430075292e-06, "loss": 0.8493187, "memory(GiB)": 146.85, "step": 47020, "train_speed(iter/s)": 0.201786 }, { "acc": 0.76277947, "epoch": 1.0972455124746643, "grad_norm": 4.875, "learning_rate": 4.45644547568127e-06, "loss": 0.83517761, "memory(GiB)": 146.85, "step": 47030, "train_speed(iter/s)": 0.201808 }, { "acc": 0.76745791, "epoch": 1.0974788200469532, "grad_norm": 5.53125, "learning_rate": 4.4545675988795e-06, "loss": 0.85765324, "memory(GiB)": 146.85, "step": 47040, "train_speed(iter/s)": 0.201826 }, { "acc": 0.76599245, "epoch": 1.097712127619242, "grad_norm": 7.0625, "learning_rate": 4.452689799938045e-06, "loss": 0.85991821, "memory(GiB)": 146.85, "step": 47050, "train_speed(iter/s)": 0.201848 }, { "acc": 0.75888996, "epoch": 1.097945435191531, "grad_norm": 8.4375, "learning_rate": 4.450812079124964e-06, "loss": 0.8666893, "memory(GiB)": 146.85, "step": 47060, "train_speed(iter/s)": 0.20187 }, { "acc": 0.76841049, "epoch": 1.0981787427638199, "grad_norm": 4.3125, "learning_rate": 4.448934436708297e-06, "loss": 0.84283113, "memory(GiB)": 146.85, "step": 47070, "train_speed(iter/s)": 0.201892 }, { "acc": 0.77136769, "epoch": 1.0984120503361088, "grad_norm": 7.53125, "learning_rate": 4.44705687295608e-06, "loss": 0.82875633, "memory(GiB)": 146.85, "step": 47080, "train_speed(iter/s)": 0.201915 }, { "acc": 0.75134401, "epoch": 1.0986453579083977, "grad_norm": 8.8125, "learning_rate": 4.445179388136335e-06, "loss": 0.91328526, "memory(GiB)": 146.85, "step": 47090, "train_speed(iter/s)": 0.201937 }, { "acc": 0.77101564, "epoch": 1.0988786654806866, "grad_norm": 4.5, "learning_rate": 4.44330198251707e-06, "loss": 0.81855469, "memory(GiB)": 146.85, "step": 47100, "train_speed(iter/s)": 0.201959 }, { "acc": 0.75588093, "epoch": 1.0991119730529755, "grad_norm": 7.8125, "learning_rate": 4.441424656366287e-06, "loss": 0.89804401, "memory(GiB)": 146.85, "step": 47110, "train_speed(iter/s)": 0.201981 }, { "acc": 0.77680607, "epoch": 1.0993452806252644, "grad_norm": 6.3125, "learning_rate": 4.43954740995197e-06, "loss": 0.79600592, "memory(GiB)": 146.85, "step": 47120, "train_speed(iter/s)": 0.202002 }, { "acc": 0.79293156, "epoch": 1.0995785881975533, "grad_norm": 4.0625, "learning_rate": 4.437670243542097e-06, "loss": 0.73510566, "memory(GiB)": 146.85, "step": 47130, "train_speed(iter/s)": 0.202023 }, { "acc": 0.77574778, "epoch": 1.0998118957698422, "grad_norm": 5.4375, "learning_rate": 4.435793157404636e-06, "loss": 0.82291145, "memory(GiB)": 146.85, "step": 47140, "train_speed(iter/s)": 0.202045 }, { "acc": 0.77092848, "epoch": 1.100045203342131, "grad_norm": 6.15625, "learning_rate": 4.433916151807535e-06, "loss": 0.82544308, "memory(GiB)": 146.85, "step": 47150, "train_speed(iter/s)": 0.202067 }, { "acc": 0.7870038, "epoch": 1.10027851091442, "grad_norm": 6.0, "learning_rate": 4.43203922701874e-06, "loss": 0.7604394, "memory(GiB)": 146.85, "step": 47160, "train_speed(iter/s)": 0.20209 }, { "acc": 0.77253189, "epoch": 1.1005118184867086, "grad_norm": 6.28125, "learning_rate": 4.43016238330618e-06, "loss": 0.80666275, "memory(GiB)": 146.85, "step": 47170, "train_speed(iter/s)": 0.202111 }, { "acc": 0.78762493, "epoch": 1.1007451260589978, "grad_norm": 5.59375, "learning_rate": 4.428285620937774e-06, "loss": 0.74700489, "memory(GiB)": 146.85, "step": 47180, "train_speed(iter/s)": 0.202131 }, { "acc": 0.75392103, "epoch": 1.1009784336312864, "grad_norm": 5.375, "learning_rate": 4.4264089401814306e-06, "loss": 0.88965225, "memory(GiB)": 146.85, "step": 47190, "train_speed(iter/s)": 0.202152 }, { "acc": 0.7838099, "epoch": 1.1012117412035753, "grad_norm": 5.40625, "learning_rate": 4.4245323413050446e-06, "loss": 0.78972301, "memory(GiB)": 146.85, "step": 47200, "train_speed(iter/s)": 0.202174 }, { "acc": 0.77003174, "epoch": 1.1014450487758642, "grad_norm": 6.53125, "learning_rate": 4.422655824576499e-06, "loss": 0.82872181, "memory(GiB)": 146.85, "step": 47210, "train_speed(iter/s)": 0.202196 }, { "acc": 0.77589517, "epoch": 1.1016783563481531, "grad_norm": 6.3125, "learning_rate": 4.420779390263669e-06, "loss": 0.79527655, "memory(GiB)": 146.85, "step": 47220, "train_speed(iter/s)": 0.202219 }, { "acc": 0.76545053, "epoch": 1.101911663920442, "grad_norm": 5.25, "learning_rate": 4.4189030386344094e-06, "loss": 0.83906803, "memory(GiB)": 146.85, "step": 47230, "train_speed(iter/s)": 0.20224 }, { "acc": 0.75631351, "epoch": 1.102144971492731, "grad_norm": 4.96875, "learning_rate": 4.417026769956573e-06, "loss": 0.90623493, "memory(GiB)": 146.85, "step": 47240, "train_speed(iter/s)": 0.202261 }, { "acc": 0.78408074, "epoch": 1.1023782790650198, "grad_norm": 5.25, "learning_rate": 4.415150584497996e-06, "loss": 0.76452241, "memory(GiB)": 146.85, "step": 47250, "train_speed(iter/s)": 0.202283 }, { "acc": 0.76170683, "epoch": 1.1026115866373087, "grad_norm": 6.78125, "learning_rate": 4.413274482526503e-06, "loss": 0.86684809, "memory(GiB)": 146.85, "step": 47260, "train_speed(iter/s)": 0.202304 }, { "acc": 0.75942526, "epoch": 1.1028448942095976, "grad_norm": 6.125, "learning_rate": 4.4113984643099075e-06, "loss": 0.8930069, "memory(GiB)": 146.85, "step": 47270, "train_speed(iter/s)": 0.202326 }, { "acc": 0.7786231, "epoch": 1.1030782017818865, "grad_norm": 9.375, "learning_rate": 4.409522530116011e-06, "loss": 0.80248442, "memory(GiB)": 146.85, "step": 47280, "train_speed(iter/s)": 0.202349 }, { "acc": 0.77417974, "epoch": 1.1033115093541754, "grad_norm": 6.375, "learning_rate": 4.407646680212601e-06, "loss": 0.82556591, "memory(GiB)": 146.85, "step": 47290, "train_speed(iter/s)": 0.202371 }, { "acc": 0.77373676, "epoch": 1.1035448169264643, "grad_norm": 6.1875, "learning_rate": 4.405770914867455e-06, "loss": 0.79739261, "memory(GiB)": 146.85, "step": 47300, "train_speed(iter/s)": 0.202393 }, { "acc": 0.78629589, "epoch": 1.1037781244987532, "grad_norm": 5.53125, "learning_rate": 4.403895234348338e-06, "loss": 0.75432949, "memory(GiB)": 146.85, "step": 47310, "train_speed(iter/s)": 0.202416 }, { "acc": 0.76888952, "epoch": 1.1040114320710421, "grad_norm": 5.53125, "learning_rate": 4.402019638923003e-06, "loss": 0.85802879, "memory(GiB)": 146.85, "step": 47320, "train_speed(iter/s)": 0.202438 }, { "acc": 0.76228151, "epoch": 1.104244739643331, "grad_norm": 4.71875, "learning_rate": 4.400144128859192e-06, "loss": 0.87550364, "memory(GiB)": 146.85, "step": 47330, "train_speed(iter/s)": 0.202461 }, { "acc": 0.78019581, "epoch": 1.10447804721562, "grad_norm": 5.1875, "learning_rate": 4.3982687044246336e-06, "loss": 0.79276915, "memory(GiB)": 146.85, "step": 47340, "train_speed(iter/s)": 0.202482 }, { "acc": 0.77792006, "epoch": 1.1047113547879088, "grad_norm": 6.59375, "learning_rate": 4.396393365887041e-06, "loss": 0.8068882, "memory(GiB)": 146.85, "step": 47350, "train_speed(iter/s)": 0.202502 }, { "acc": 0.77639923, "epoch": 1.1049446623601977, "grad_norm": 7.46875, "learning_rate": 4.394518113514121e-06, "loss": 0.79934292, "memory(GiB)": 146.85, "step": 47360, "train_speed(iter/s)": 0.202525 }, { "acc": 0.78914738, "epoch": 1.1051779699324866, "grad_norm": 6.09375, "learning_rate": 4.392642947573563e-06, "loss": 0.75166197, "memory(GiB)": 146.85, "step": 47370, "train_speed(iter/s)": 0.202548 }, { "acc": 0.78531098, "epoch": 1.1054112775047755, "grad_norm": 8.1875, "learning_rate": 4.3907678683330486e-06, "loss": 0.76560197, "memory(GiB)": 146.85, "step": 47380, "train_speed(iter/s)": 0.202572 }, { "acc": 0.76891422, "epoch": 1.1056445850770644, "grad_norm": 4.625, "learning_rate": 4.388892876060243e-06, "loss": 0.83208981, "memory(GiB)": 146.85, "step": 47390, "train_speed(iter/s)": 0.202595 }, { "acc": 0.78349581, "epoch": 1.1058778926493533, "grad_norm": 5.625, "learning_rate": 4.387017971022803e-06, "loss": 0.77736988, "memory(GiB)": 146.85, "step": 47400, "train_speed(iter/s)": 0.202617 }, { "acc": 0.79620285, "epoch": 1.1061112002216422, "grad_norm": 8.75, "learning_rate": 4.385143153488369e-06, "loss": 0.73822799, "memory(GiB)": 146.85, "step": 47410, "train_speed(iter/s)": 0.20264 }, { "acc": 0.7685421, "epoch": 1.1063445077939311, "grad_norm": 5.46875, "learning_rate": 4.383268423724572e-06, "loss": 0.80850019, "memory(GiB)": 146.85, "step": 47420, "train_speed(iter/s)": 0.202662 }, { "acc": 0.74466205, "epoch": 1.10657781536622, "grad_norm": 6.03125, "learning_rate": 4.381393781999027e-06, "loss": 0.92997513, "memory(GiB)": 146.85, "step": 47430, "train_speed(iter/s)": 0.202684 }, { "acc": 0.7871295, "epoch": 1.106811122938509, "grad_norm": 5.0, "learning_rate": 4.379519228579342e-06, "loss": 0.75856218, "memory(GiB)": 146.85, "step": 47440, "train_speed(iter/s)": 0.202707 }, { "acc": 0.75111895, "epoch": 1.1070444305107978, "grad_norm": 5.78125, "learning_rate": 4.377644763733106e-06, "loss": 0.91428509, "memory(GiB)": 146.85, "step": 47450, "train_speed(iter/s)": 0.202729 }, { "acc": 0.77670212, "epoch": 1.1072777380830867, "grad_norm": 6.78125, "learning_rate": 4.375770387727899e-06, "loss": 0.80318012, "memory(GiB)": 146.85, "step": 47460, "train_speed(iter/s)": 0.202753 }, { "acc": 0.75863771, "epoch": 1.1075110456553756, "grad_norm": 6.25, "learning_rate": 4.373896100831288e-06, "loss": 0.9025157, "memory(GiB)": 146.85, "step": 47470, "train_speed(iter/s)": 0.202776 }, { "acc": 0.77386956, "epoch": 1.1077443532276645, "grad_norm": 8.5625, "learning_rate": 4.372021903310826e-06, "loss": 0.8226162, "memory(GiB)": 146.85, "step": 47480, "train_speed(iter/s)": 0.2028 }, { "acc": 0.77549715, "epoch": 1.1079776607999534, "grad_norm": 5.5625, "learning_rate": 4.370147795434054e-06, "loss": 0.80647688, "memory(GiB)": 146.85, "step": 47490, "train_speed(iter/s)": 0.202822 }, { "acc": 0.78244882, "epoch": 1.1082109683722423, "grad_norm": 6.59375, "learning_rate": 4.3682737774685035e-06, "loss": 0.78387566, "memory(GiB)": 146.85, "step": 47500, "train_speed(iter/s)": 0.202845 }, { "epoch": 1.1082109683722423, "eval_acc": 0.7351880774058644, "eval_loss": 0.8342263698577881, "eval_runtime": 1263.5018, "eval_samples_per_second": 28.485, "eval_steps_per_second": 14.243, "step": 47500 }, { "acc": 0.75842447, "epoch": 1.1084442759445312, "grad_norm": 5.59375, "learning_rate": 4.366399849681686e-06, "loss": 0.86695709, "memory(GiB)": 146.85, "step": 47510, "train_speed(iter/s)": 0.201754 }, { "acc": 0.77977762, "epoch": 1.10867758351682, "grad_norm": 6.5, "learning_rate": 4.364526012341107e-06, "loss": 0.78027377, "memory(GiB)": 146.85, "step": 47520, "train_speed(iter/s)": 0.201774 }, { "acc": 0.78188429, "epoch": 1.108910891089109, "grad_norm": 5.15625, "learning_rate": 4.362652265714254e-06, "loss": 0.76219225, "memory(GiB)": 146.85, "step": 47530, "train_speed(iter/s)": 0.201796 }, { "acc": 0.74995575, "epoch": 1.109144198661398, "grad_norm": 6.84375, "learning_rate": 4.360778610068605e-06, "loss": 0.92144899, "memory(GiB)": 146.85, "step": 47540, "train_speed(iter/s)": 0.201818 }, { "acc": 0.77445202, "epoch": 1.1093775062336868, "grad_norm": 5.65625, "learning_rate": 4.3589050456716254e-06, "loss": 0.79929876, "memory(GiB)": 146.85, "step": 47550, "train_speed(iter/s)": 0.20184 }, { "acc": 0.77928467, "epoch": 1.1096108138059755, "grad_norm": 6.34375, "learning_rate": 4.357031572790763e-06, "loss": 0.80060415, "memory(GiB)": 146.85, "step": 47560, "train_speed(iter/s)": 0.201863 }, { "acc": 0.76393294, "epoch": 1.1098441213782644, "grad_norm": 5.5625, "learning_rate": 4.355158191693458e-06, "loss": 0.85729151, "memory(GiB)": 146.85, "step": 47570, "train_speed(iter/s)": 0.201885 }, { "acc": 0.78235054, "epoch": 1.1100774289505533, "grad_norm": 5.25, "learning_rate": 4.353284902647133e-06, "loss": 0.79345236, "memory(GiB)": 146.85, "step": 47580, "train_speed(iter/s)": 0.201908 }, { "acc": 0.77752371, "epoch": 1.1103107365228422, "grad_norm": 5.8125, "learning_rate": 4.351411705919201e-06, "loss": 0.80304451, "memory(GiB)": 146.85, "step": 47590, "train_speed(iter/s)": 0.201929 }, { "acc": 0.79562101, "epoch": 1.110544044095131, "grad_norm": 6.6875, "learning_rate": 4.349538601777058e-06, "loss": 0.77215447, "memory(GiB)": 146.85, "step": 47600, "train_speed(iter/s)": 0.201951 }, { "acc": 0.76177444, "epoch": 1.11077735166742, "grad_norm": 6.375, "learning_rate": 4.347665590488091e-06, "loss": 0.86883945, "memory(GiB)": 146.85, "step": 47610, "train_speed(iter/s)": 0.201973 }, { "acc": 0.79507818, "epoch": 1.1110106592397089, "grad_norm": 5.59375, "learning_rate": 4.3457926723196716e-06, "loss": 0.71945324, "memory(GiB)": 146.85, "step": 47620, "train_speed(iter/s)": 0.201995 }, { "acc": 0.78331976, "epoch": 1.1112439668119978, "grad_norm": 6.125, "learning_rate": 4.343919847539157e-06, "loss": 0.79045949, "memory(GiB)": 146.85, "step": 47630, "train_speed(iter/s)": 0.202017 }, { "acc": 0.75759306, "epoch": 1.1114772743842867, "grad_norm": 4.5, "learning_rate": 4.342047116413897e-06, "loss": 0.89205151, "memory(GiB)": 146.85, "step": 47640, "train_speed(iter/s)": 0.20204 }, { "acc": 0.77704024, "epoch": 1.1117105819565756, "grad_norm": 4.125, "learning_rate": 4.340174479211217e-06, "loss": 0.80316944, "memory(GiB)": 146.85, "step": 47650, "train_speed(iter/s)": 0.202062 }, { "acc": 0.77069979, "epoch": 1.1119438895288645, "grad_norm": 6.21875, "learning_rate": 4.338301936198439e-06, "loss": 0.82528133, "memory(GiB)": 146.85, "step": 47660, "train_speed(iter/s)": 0.202085 }, { "acc": 0.76555657, "epoch": 1.1121771971011534, "grad_norm": 5.59375, "learning_rate": 4.336429487642867e-06, "loss": 0.86652946, "memory(GiB)": 146.85, "step": 47670, "train_speed(iter/s)": 0.202107 }, { "acc": 0.78941641, "epoch": 1.1124105046734423, "grad_norm": 6.3125, "learning_rate": 4.334557133811796e-06, "loss": 0.75633993, "memory(GiB)": 146.85, "step": 47680, "train_speed(iter/s)": 0.20213 }, { "acc": 0.77698145, "epoch": 1.1126438122457312, "grad_norm": 5.40625, "learning_rate": 4.332684874972498e-06, "loss": 0.80427866, "memory(GiB)": 146.85, "step": 47690, "train_speed(iter/s)": 0.202152 }, { "acc": 0.77090092, "epoch": 1.11287711981802, "grad_norm": 5.65625, "learning_rate": 4.330812711392241e-06, "loss": 0.84765263, "memory(GiB)": 146.85, "step": 47700, "train_speed(iter/s)": 0.202174 }, { "acc": 0.75020471, "epoch": 1.113110427390309, "grad_norm": 4.46875, "learning_rate": 4.328940643338274e-06, "loss": 0.88693848, "memory(GiB)": 146.85, "step": 47710, "train_speed(iter/s)": 0.202196 }, { "acc": 0.78521967, "epoch": 1.1133437349625979, "grad_norm": 5.09375, "learning_rate": 4.327068671077836e-06, "loss": 0.77285204, "memory(GiB)": 146.85, "step": 47720, "train_speed(iter/s)": 0.202215 }, { "acc": 0.78573198, "epoch": 1.1135770425348868, "grad_norm": 6.625, "learning_rate": 4.32519679487815e-06, "loss": 0.76746035, "memory(GiB)": 146.85, "step": 47730, "train_speed(iter/s)": 0.202237 }, { "acc": 0.78353057, "epoch": 1.1138103501071757, "grad_norm": 5.78125, "learning_rate": 4.323325015006425e-06, "loss": 0.77550778, "memory(GiB)": 146.85, "step": 47740, "train_speed(iter/s)": 0.202259 }, { "acc": 0.79130087, "epoch": 1.1140436576794646, "grad_norm": 5.53125, "learning_rate": 4.321453331729857e-06, "loss": 0.75973244, "memory(GiB)": 146.85, "step": 47750, "train_speed(iter/s)": 0.202281 }, { "acc": 0.76926622, "epoch": 1.1142769652517535, "grad_norm": 48.25, "learning_rate": 4.319581745315629e-06, "loss": 0.86730461, "memory(GiB)": 146.85, "step": 47760, "train_speed(iter/s)": 0.202302 }, { "acc": 0.77262135, "epoch": 1.1145102728240424, "grad_norm": 6.09375, "learning_rate": 4.317710256030911e-06, "loss": 0.82603149, "memory(GiB)": 146.85, "step": 47770, "train_speed(iter/s)": 0.202325 }, { "acc": 0.76706882, "epoch": 1.1147435803963313, "grad_norm": 7.03125, "learning_rate": 4.3158388641428536e-06, "loss": 0.8401228, "memory(GiB)": 146.85, "step": 47780, "train_speed(iter/s)": 0.202347 }, { "acc": 0.76496086, "epoch": 1.1149768879686202, "grad_norm": 6.65625, "learning_rate": 4.3139675699186e-06, "loss": 0.8438015, "memory(GiB)": 146.85, "step": 47790, "train_speed(iter/s)": 0.202369 }, { "acc": 0.75830221, "epoch": 1.115210195540909, "grad_norm": 5.59375, "learning_rate": 4.312096373625279e-06, "loss": 0.89270496, "memory(GiB)": 146.85, "step": 47800, "train_speed(iter/s)": 0.202389 }, { "acc": 0.75054326, "epoch": 1.115443503113198, "grad_norm": 8.9375, "learning_rate": 4.310225275529998e-06, "loss": 0.89807997, "memory(GiB)": 146.85, "step": 47810, "train_speed(iter/s)": 0.202411 }, { "acc": 0.75068226, "epoch": 1.1156768106854869, "grad_norm": 5.125, "learning_rate": 4.308354275899859e-06, "loss": 0.91133127, "memory(GiB)": 146.85, "step": 47820, "train_speed(iter/s)": 0.202433 }, { "acc": 0.77006979, "epoch": 1.1159101182577758, "grad_norm": 14.6875, "learning_rate": 4.306483375001946e-06, "loss": 0.84228821, "memory(GiB)": 146.85, "step": 47830, "train_speed(iter/s)": 0.202454 }, { "acc": 0.77723064, "epoch": 1.1161434258300647, "grad_norm": 4.65625, "learning_rate": 4.30461257310333e-06, "loss": 0.80090551, "memory(GiB)": 146.85, "step": 47840, "train_speed(iter/s)": 0.202477 }, { "acc": 0.7778759, "epoch": 1.1163767334023535, "grad_norm": 5.53125, "learning_rate": 4.302741870471069e-06, "loss": 0.77942281, "memory(GiB)": 146.85, "step": 47850, "train_speed(iter/s)": 0.202499 }, { "acc": 0.76664486, "epoch": 1.1166100409746424, "grad_norm": 8.375, "learning_rate": 4.3008712673722005e-06, "loss": 0.86221256, "memory(GiB)": 146.85, "step": 47860, "train_speed(iter/s)": 0.202522 }, { "acc": 0.77623024, "epoch": 1.1168433485469313, "grad_norm": 5.8125, "learning_rate": 4.299000764073757e-06, "loss": 0.79607015, "memory(GiB)": 146.85, "step": 47870, "train_speed(iter/s)": 0.202544 }, { "acc": 0.76134338, "epoch": 1.1170766561192202, "grad_norm": 5.21875, "learning_rate": 4.29713036084275e-06, "loss": 0.88599024, "memory(GiB)": 146.85, "step": 47880, "train_speed(iter/s)": 0.202567 }, { "acc": 0.77105122, "epoch": 1.1173099636915091, "grad_norm": 4.21875, "learning_rate": 4.29526005794618e-06, "loss": 0.82362709, "memory(GiB)": 146.85, "step": 47890, "train_speed(iter/s)": 0.202588 }, { "acc": 0.78915219, "epoch": 1.117543271263798, "grad_norm": 4.96875, "learning_rate": 4.2933898556510325e-06, "loss": 0.77366314, "memory(GiB)": 146.85, "step": 47900, "train_speed(iter/s)": 0.202609 }, { "acc": 0.76358423, "epoch": 1.117776578836087, "grad_norm": 6.28125, "learning_rate": 4.29151975422428e-06, "loss": 0.86374435, "memory(GiB)": 146.85, "step": 47910, "train_speed(iter/s)": 0.202631 }, { "acc": 0.78533058, "epoch": 1.1180098864083758, "grad_norm": 5.5625, "learning_rate": 4.289649753932874e-06, "loss": 0.77041426, "memory(GiB)": 146.85, "step": 47920, "train_speed(iter/s)": 0.202653 }, { "acc": 0.76965847, "epoch": 1.1182431939806645, "grad_norm": 5.0, "learning_rate": 4.28777985504376e-06, "loss": 0.82941408, "memory(GiB)": 146.85, "step": 47930, "train_speed(iter/s)": 0.202673 }, { "acc": 0.78132143, "epoch": 1.1184765015529536, "grad_norm": 5.28125, "learning_rate": 4.285910057823864e-06, "loss": 0.79782658, "memory(GiB)": 146.85, "step": 47940, "train_speed(iter/s)": 0.202694 }, { "acc": 0.7861392, "epoch": 1.1187098091252423, "grad_norm": 5.78125, "learning_rate": 4.284040362540101e-06, "loss": 0.7702601, "memory(GiB)": 146.85, "step": 47950, "train_speed(iter/s)": 0.202715 }, { "acc": 0.76491108, "epoch": 1.1189431166975312, "grad_norm": 5.28125, "learning_rate": 4.282170769459367e-06, "loss": 0.85888643, "memory(GiB)": 146.85, "step": 47960, "train_speed(iter/s)": 0.202737 }, { "acc": 0.76208735, "epoch": 1.1191764242698201, "grad_norm": 5.3125, "learning_rate": 4.2803012788485475e-06, "loss": 0.8695507, "memory(GiB)": 146.85, "step": 47970, "train_speed(iter/s)": 0.20276 }, { "acc": 0.77258663, "epoch": 1.119409731842109, "grad_norm": 5.8125, "learning_rate": 4.278431890974511e-06, "loss": 0.82737179, "memory(GiB)": 146.85, "step": 47980, "train_speed(iter/s)": 0.202782 }, { "acc": 0.76833682, "epoch": 1.119643039414398, "grad_norm": 5.9375, "learning_rate": 4.276562606104114e-06, "loss": 0.82221231, "memory(GiB)": 146.85, "step": 47990, "train_speed(iter/s)": 0.202803 }, { "acc": 0.76684685, "epoch": 1.1198763469866868, "grad_norm": 8.375, "learning_rate": 4.274693424504194e-06, "loss": 0.86059036, "memory(GiB)": 146.85, "step": 48000, "train_speed(iter/s)": 0.202825 }, { "epoch": 1.1198763469866868, "eval_acc": 0.7351675868589443, "eval_loss": 0.8341879844665527, "eval_runtime": 1263.2243, "eval_samples_per_second": 28.491, "eval_steps_per_second": 14.246, "step": 48000 }, { "acc": 0.77327037, "epoch": 1.1201096545589757, "grad_norm": 5.09375, "learning_rate": 4.272824346441576e-06, "loss": 0.82209759, "memory(GiB)": 146.85, "step": 48010, "train_speed(iter/s)": 0.201747 }, { "acc": 0.76315031, "epoch": 1.1203429621312646, "grad_norm": 5.8125, "learning_rate": 4.270955372183074e-06, "loss": 0.84115791, "memory(GiB)": 146.85, "step": 48020, "train_speed(iter/s)": 0.20177 }, { "acc": 0.7878212, "epoch": 1.1205762697035535, "grad_norm": 4.6875, "learning_rate": 4.269086501995478e-06, "loss": 0.75878277, "memory(GiB)": 146.85, "step": 48030, "train_speed(iter/s)": 0.201792 }, { "acc": 0.7787775, "epoch": 1.1208095772758424, "grad_norm": 6.90625, "learning_rate": 4.267217736145573e-06, "loss": 0.78848639, "memory(GiB)": 146.85, "step": 48040, "train_speed(iter/s)": 0.201813 }, { "acc": 0.76182876, "epoch": 1.1210428848481313, "grad_norm": 5.5625, "learning_rate": 4.265349074900123e-06, "loss": 0.86063824, "memory(GiB)": 146.85, "step": 48050, "train_speed(iter/s)": 0.201835 }, { "acc": 0.78460379, "epoch": 1.1212761924204202, "grad_norm": 5.53125, "learning_rate": 4.263480518525878e-06, "loss": 0.77461448, "memory(GiB)": 146.85, "step": 48060, "train_speed(iter/s)": 0.201858 }, { "acc": 0.77335744, "epoch": 1.121509499992709, "grad_norm": 5.28125, "learning_rate": 4.261612067289577e-06, "loss": 0.81010914, "memory(GiB)": 146.85, "step": 48070, "train_speed(iter/s)": 0.20188 }, { "acc": 0.77375546, "epoch": 1.121742807564998, "grad_norm": 5.3125, "learning_rate": 4.259743721457937e-06, "loss": 0.79694977, "memory(GiB)": 146.85, "step": 48080, "train_speed(iter/s)": 0.201902 }, { "acc": 0.76343832, "epoch": 1.121976115137287, "grad_norm": 5.25, "learning_rate": 4.257875481297667e-06, "loss": 0.86238384, "memory(GiB)": 146.85, "step": 48090, "train_speed(iter/s)": 0.201925 }, { "acc": 0.76843147, "epoch": 1.1222094227095758, "grad_norm": 5.90625, "learning_rate": 4.256007347075455e-06, "loss": 0.82185183, "memory(GiB)": 146.85, "step": 48100, "train_speed(iter/s)": 0.201948 }, { "acc": 0.74763145, "epoch": 1.1224427302818647, "grad_norm": 6.3125, "learning_rate": 4.254139319057979e-06, "loss": 0.92999201, "memory(GiB)": 146.85, "step": 48110, "train_speed(iter/s)": 0.201969 }, { "acc": 0.76553555, "epoch": 1.1226760378541536, "grad_norm": 5.09375, "learning_rate": 4.252271397511898e-06, "loss": 0.87832098, "memory(GiB)": 146.85, "step": 48120, "train_speed(iter/s)": 0.201991 }, { "acc": 0.76242743, "epoch": 1.1229093454264425, "grad_norm": 6.46875, "learning_rate": 4.2504035827038595e-06, "loss": 0.87127686, "memory(GiB)": 146.85, "step": 48130, "train_speed(iter/s)": 0.202013 }, { "acc": 0.76723614, "epoch": 1.1231426529987314, "grad_norm": 5.59375, "learning_rate": 4.248535874900491e-06, "loss": 0.84471321, "memory(GiB)": 146.85, "step": 48140, "train_speed(iter/s)": 0.202035 }, { "acc": 0.79745517, "epoch": 1.1233759605710203, "grad_norm": 5.0625, "learning_rate": 4.246668274368409e-06, "loss": 0.72981949, "memory(GiB)": 146.85, "step": 48150, "train_speed(iter/s)": 0.202057 }, { "acc": 0.76288776, "epoch": 1.1236092681433092, "grad_norm": 9.75, "learning_rate": 4.24480078137421e-06, "loss": 0.87140398, "memory(GiB)": 146.85, "step": 48160, "train_speed(iter/s)": 0.202078 }, { "acc": 0.76260953, "epoch": 1.123842575715598, "grad_norm": 6.5625, "learning_rate": 4.2429333961844805e-06, "loss": 0.85976963, "memory(GiB)": 146.85, "step": 48170, "train_speed(iter/s)": 0.2021 }, { "acc": 0.77397685, "epoch": 1.124075883287887, "grad_norm": 4.0, "learning_rate": 4.241066119065789e-06, "loss": 0.84804592, "memory(GiB)": 146.85, "step": 48180, "train_speed(iter/s)": 0.202119 }, { "acc": 0.76989431, "epoch": 1.1243091908601759, "grad_norm": 5.21875, "learning_rate": 4.239198950284688e-06, "loss": 0.83785324, "memory(GiB)": 146.85, "step": 48190, "train_speed(iter/s)": 0.20214 }, { "acc": 0.7631053, "epoch": 1.1245424984324648, "grad_norm": 7.4375, "learning_rate": 4.237331890107717e-06, "loss": 0.86919861, "memory(GiB)": 146.85, "step": 48200, "train_speed(iter/s)": 0.202159 }, { "acc": 0.77214413, "epoch": 1.1247758060047537, "grad_norm": 5.15625, "learning_rate": 4.2354649388013965e-06, "loss": 0.81717701, "memory(GiB)": 146.85, "step": 48210, "train_speed(iter/s)": 0.202181 }, { "acc": 0.76291223, "epoch": 1.1250091135770426, "grad_norm": 5.1875, "learning_rate": 4.233598096632234e-06, "loss": 0.84736996, "memory(GiB)": 146.85, "step": 48220, "train_speed(iter/s)": 0.202204 }, { "acc": 0.78089867, "epoch": 1.1252424211493315, "grad_norm": 7.5625, "learning_rate": 4.23173136386672e-06, "loss": 0.76700048, "memory(GiB)": 146.85, "step": 48230, "train_speed(iter/s)": 0.202225 }, { "acc": 0.7823853, "epoch": 1.1254757287216204, "grad_norm": 4.96875, "learning_rate": 4.2298647407713314e-06, "loss": 0.76192608, "memory(GiB)": 146.85, "step": 48240, "train_speed(iter/s)": 0.202249 }, { "acc": 0.76121893, "epoch": 1.1257090362939093, "grad_norm": 5.75, "learning_rate": 4.227998227612529e-06, "loss": 0.88008633, "memory(GiB)": 146.85, "step": 48250, "train_speed(iter/s)": 0.202271 }, { "acc": 0.77902217, "epoch": 1.1259423438661982, "grad_norm": 7.0, "learning_rate": 4.226131824656752e-06, "loss": 0.80265713, "memory(GiB)": 146.85, "step": 48260, "train_speed(iter/s)": 0.202293 }, { "acc": 0.77084169, "epoch": 1.126175651438487, "grad_norm": 5.96875, "learning_rate": 4.224265532170434e-06, "loss": 0.8095993, "memory(GiB)": 146.85, "step": 48270, "train_speed(iter/s)": 0.202314 }, { "acc": 0.76959877, "epoch": 1.126408959010776, "grad_norm": 7.53125, "learning_rate": 4.222399350419985e-06, "loss": 0.829212, "memory(GiB)": 146.85, "step": 48280, "train_speed(iter/s)": 0.202336 }, { "acc": 0.78710809, "epoch": 1.1266422665830649, "grad_norm": 5.03125, "learning_rate": 4.220533279671804e-06, "loss": 0.76747565, "memory(GiB)": 146.85, "step": 48290, "train_speed(iter/s)": 0.202358 }, { "acc": 0.78911958, "epoch": 1.1268755741553538, "grad_norm": 6.46875, "learning_rate": 4.21866732019227e-06, "loss": 0.74851618, "memory(GiB)": 146.85, "step": 48300, "train_speed(iter/s)": 0.20238 }, { "acc": 0.77596874, "epoch": 1.1271088817276427, "grad_norm": 6.40625, "learning_rate": 4.216801472247749e-06, "loss": 0.79742446, "memory(GiB)": 146.85, "step": 48310, "train_speed(iter/s)": 0.202401 }, { "acc": 0.76997323, "epoch": 1.1273421892999314, "grad_norm": 6.375, "learning_rate": 4.214935736104591e-06, "loss": 0.83124428, "memory(GiB)": 146.85, "step": 48320, "train_speed(iter/s)": 0.202424 }, { "acc": 0.76746964, "epoch": 1.1275754968722205, "grad_norm": 4.65625, "learning_rate": 4.213070112029127e-06, "loss": 0.83171978, "memory(GiB)": 146.85, "step": 48330, "train_speed(iter/s)": 0.202444 }, { "acc": 0.77163076, "epoch": 1.1278088044445091, "grad_norm": 5.3125, "learning_rate": 4.211204600287677e-06, "loss": 0.84597187, "memory(GiB)": 146.85, "step": 48340, "train_speed(iter/s)": 0.202466 }, { "acc": 0.78793755, "epoch": 1.128042112016798, "grad_norm": 5.53125, "learning_rate": 4.2093392011465425e-06, "loss": 0.74456205, "memory(GiB)": 146.85, "step": 48350, "train_speed(iter/s)": 0.202487 }, { "acc": 0.76029058, "epoch": 1.128275419589087, "grad_norm": 6.15625, "learning_rate": 4.207473914872006e-06, "loss": 0.86165133, "memory(GiB)": 146.85, "step": 48360, "train_speed(iter/s)": 0.202508 }, { "acc": 0.77400007, "epoch": 1.1285087271613758, "grad_norm": 6.5, "learning_rate": 4.20560874173034e-06, "loss": 0.80730686, "memory(GiB)": 146.85, "step": 48370, "train_speed(iter/s)": 0.20253 }, { "acc": 0.76626792, "epoch": 1.1287420347336647, "grad_norm": 5.0625, "learning_rate": 4.203743681987793e-06, "loss": 0.82915783, "memory(GiB)": 146.85, "step": 48380, "train_speed(iter/s)": 0.202551 }, { "acc": 0.7830493, "epoch": 1.1289753423059536, "grad_norm": 4.4375, "learning_rate": 4.2018787359106045e-06, "loss": 0.7884582, "memory(GiB)": 146.85, "step": 48390, "train_speed(iter/s)": 0.202573 }, { "acc": 0.78300257, "epoch": 1.1292086498782425, "grad_norm": 4.40625, "learning_rate": 4.200013903764994e-06, "loss": 0.78308973, "memory(GiB)": 146.85, "step": 48400, "train_speed(iter/s)": 0.202594 }, { "acc": 0.76421447, "epoch": 1.1294419574505314, "grad_norm": 6.25, "learning_rate": 4.198149185817167e-06, "loss": 0.84571056, "memory(GiB)": 146.85, "step": 48410, "train_speed(iter/s)": 0.202614 }, { "acc": 0.75189023, "epoch": 1.1296752650228203, "grad_norm": 7.0, "learning_rate": 4.19628458233331e-06, "loss": 0.90801401, "memory(GiB)": 146.85, "step": 48420, "train_speed(iter/s)": 0.202635 }, { "acc": 0.76325169, "epoch": 1.1299085725951092, "grad_norm": 5.90625, "learning_rate": 4.194420093579597e-06, "loss": 0.8489624, "memory(GiB)": 146.85, "step": 48430, "train_speed(iter/s)": 0.202657 }, { "acc": 0.78059645, "epoch": 1.1301418801673981, "grad_norm": 6.375, "learning_rate": 4.1925557198221805e-06, "loss": 0.78301945, "memory(GiB)": 146.85, "step": 48440, "train_speed(iter/s)": 0.202677 }, { "acc": 0.76214142, "epoch": 1.130375187739687, "grad_norm": 6.125, "learning_rate": 4.1906914613272e-06, "loss": 0.86999235, "memory(GiB)": 146.85, "step": 48450, "train_speed(iter/s)": 0.202699 }, { "acc": 0.76257839, "epoch": 1.130608495311976, "grad_norm": 5.53125, "learning_rate": 4.188827318360779e-06, "loss": 0.84773655, "memory(GiB)": 146.85, "step": 48460, "train_speed(iter/s)": 0.202721 }, { "acc": 0.7567915, "epoch": 1.1308418028842648, "grad_norm": 6.21875, "learning_rate": 4.186963291189022e-06, "loss": 0.8951088, "memory(GiB)": 146.85, "step": 48470, "train_speed(iter/s)": 0.202741 }, { "acc": 0.78167205, "epoch": 1.1310751104565537, "grad_norm": 5.03125, "learning_rate": 4.185099380078022e-06, "loss": 0.78908095, "memory(GiB)": 146.85, "step": 48480, "train_speed(iter/s)": 0.202762 }, { "acc": 0.77275462, "epoch": 1.1313084180288426, "grad_norm": 8.625, "learning_rate": 4.183235585293846e-06, "loss": 0.85458717, "memory(GiB)": 146.85, "step": 48490, "train_speed(iter/s)": 0.202786 }, { "acc": 0.78155551, "epoch": 1.1315417256011315, "grad_norm": 9.5, "learning_rate": 4.181371907102553e-06, "loss": 0.76678705, "memory(GiB)": 146.85, "step": 48500, "train_speed(iter/s)": 0.202809 }, { "epoch": 1.1315417256011315, "eval_acc": 0.7351603264289333, "eval_loss": 0.834158182144165, "eval_runtime": 1263.5592, "eval_samples_per_second": 28.484, "eval_steps_per_second": 14.242, "step": 48500 }, { "acc": 0.7746582, "epoch": 1.1317750331734204, "grad_norm": 5.75, "learning_rate": 4.179508345770184e-06, "loss": 0.81047058, "memory(GiB)": 146.85, "step": 48510, "train_speed(iter/s)": 0.201743 }, { "acc": 0.76802449, "epoch": 1.1320083407457093, "grad_norm": 4.90625, "learning_rate": 4.177644901562758e-06, "loss": 0.84689636, "memory(GiB)": 146.85, "step": 48520, "train_speed(iter/s)": 0.201765 }, { "acc": 0.79919052, "epoch": 1.1322416483179982, "grad_norm": 5.125, "learning_rate": 4.1757815747462845e-06, "loss": 0.71522851, "memory(GiB)": 146.85, "step": 48530, "train_speed(iter/s)": 0.201787 }, { "acc": 0.76307726, "epoch": 1.1324749558902871, "grad_norm": 6.25, "learning_rate": 4.173918365586751e-06, "loss": 0.84144897, "memory(GiB)": 146.85, "step": 48540, "train_speed(iter/s)": 0.201809 }, { "acc": 0.77744608, "epoch": 1.132708263462576, "grad_norm": 5.5, "learning_rate": 4.172055274350132e-06, "loss": 0.8138855, "memory(GiB)": 146.85, "step": 48550, "train_speed(iter/s)": 0.20183 }, { "acc": 0.77187958, "epoch": 1.132941571034865, "grad_norm": 4.71875, "learning_rate": 4.170192301302382e-06, "loss": 0.8304863, "memory(GiB)": 146.85, "step": 48560, "train_speed(iter/s)": 0.201851 }, { "acc": 0.77273312, "epoch": 1.1331748786071538, "grad_norm": 5.21875, "learning_rate": 4.168329446709439e-06, "loss": 0.81724758, "memory(GiB)": 146.85, "step": 48570, "train_speed(iter/s)": 0.201872 }, { "acc": 0.77385502, "epoch": 1.1334081861794427, "grad_norm": 6.09375, "learning_rate": 4.166466710837226e-06, "loss": 0.83091736, "memory(GiB)": 146.85, "step": 48580, "train_speed(iter/s)": 0.201893 }, { "acc": 0.77491684, "epoch": 1.1336414937517316, "grad_norm": 7.625, "learning_rate": 4.1646040939516485e-06, "loss": 0.80402641, "memory(GiB)": 146.85, "step": 48590, "train_speed(iter/s)": 0.201913 }, { "acc": 0.77262163, "epoch": 1.1338748013240205, "grad_norm": 7.5625, "learning_rate": 4.162741596318596e-06, "loss": 0.83972836, "memory(GiB)": 146.85, "step": 48600, "train_speed(iter/s)": 0.201934 }, { "acc": 0.77076969, "epoch": 1.1341081088963094, "grad_norm": 6.0625, "learning_rate": 4.160879218203935e-06, "loss": 0.82323179, "memory(GiB)": 146.85, "step": 48610, "train_speed(iter/s)": 0.201955 }, { "acc": 0.76118374, "epoch": 1.1343414164685983, "grad_norm": 4.28125, "learning_rate": 4.159016959873521e-06, "loss": 0.8524085, "memory(GiB)": 146.85, "step": 48620, "train_speed(iter/s)": 0.201976 }, { "acc": 0.76070623, "epoch": 1.1345747240408872, "grad_norm": 6.5625, "learning_rate": 4.1571548215931925e-06, "loss": 0.85616226, "memory(GiB)": 146.85, "step": 48630, "train_speed(iter/s)": 0.201998 }, { "acc": 0.76596837, "epoch": 1.1348080316131761, "grad_norm": 5.6875, "learning_rate": 4.155292803628768e-06, "loss": 0.86256752, "memory(GiB)": 146.85, "step": 48640, "train_speed(iter/s)": 0.20202 }, { "acc": 0.77304592, "epoch": 1.135041339185465, "grad_norm": 4.84375, "learning_rate": 4.153430906246052e-06, "loss": 0.80558243, "memory(GiB)": 146.85, "step": 48650, "train_speed(iter/s)": 0.202041 }, { "acc": 0.79830546, "epoch": 1.135274646757754, "grad_norm": 5.0625, "learning_rate": 4.151569129710827e-06, "loss": 0.7284708, "memory(GiB)": 146.85, "step": 48660, "train_speed(iter/s)": 0.202063 }, { "acc": 0.76869411, "epoch": 1.1355079543300428, "grad_norm": 5.4375, "learning_rate": 4.149707474288862e-06, "loss": 0.84821692, "memory(GiB)": 146.85, "step": 48670, "train_speed(iter/s)": 0.202085 }, { "acc": 0.7782918, "epoch": 1.1357412619023317, "grad_norm": 5.78125, "learning_rate": 4.147845940245908e-06, "loss": 0.80458899, "memory(GiB)": 146.85, "step": 48680, "train_speed(iter/s)": 0.202106 }, { "acc": 0.7489419, "epoch": 1.1359745694746204, "grad_norm": 6.5, "learning_rate": 4.145984527847699e-06, "loss": 0.90544224, "memory(GiB)": 146.85, "step": 48690, "train_speed(iter/s)": 0.202129 }, { "acc": 0.7704679, "epoch": 1.1362078770469095, "grad_norm": 6.78125, "learning_rate": 4.14412323735995e-06, "loss": 0.82698364, "memory(GiB)": 146.85, "step": 48700, "train_speed(iter/s)": 0.202151 }, { "acc": 0.77008309, "epoch": 1.1364411846191982, "grad_norm": 5.71875, "learning_rate": 4.142262069048362e-06, "loss": 0.83876734, "memory(GiB)": 146.85, "step": 48710, "train_speed(iter/s)": 0.202172 }, { "acc": 0.74910631, "epoch": 1.1366744921914873, "grad_norm": 5.28125, "learning_rate": 4.140401023178613e-06, "loss": 0.91267986, "memory(GiB)": 146.85, "step": 48720, "train_speed(iter/s)": 0.202194 }, { "acc": 0.77082014, "epoch": 1.136907799763776, "grad_norm": 6.59375, "learning_rate": 4.138540100016369e-06, "loss": 0.8168251, "memory(GiB)": 146.85, "step": 48730, "train_speed(iter/s)": 0.202216 }, { "acc": 0.77866106, "epoch": 1.1371411073360649, "grad_norm": 9.5, "learning_rate": 4.136679299827275e-06, "loss": 0.81779041, "memory(GiB)": 146.85, "step": 48740, "train_speed(iter/s)": 0.202238 }, { "acc": 0.7799396, "epoch": 1.1373744149083538, "grad_norm": 5.59375, "learning_rate": 4.134818622876959e-06, "loss": 0.80648899, "memory(GiB)": 146.85, "step": 48750, "train_speed(iter/s)": 0.202261 }, { "acc": 0.78956308, "epoch": 1.1376077224806427, "grad_norm": 4.25, "learning_rate": 4.132958069431034e-06, "loss": 0.75775189, "memory(GiB)": 146.85, "step": 48760, "train_speed(iter/s)": 0.202284 }, { "acc": 0.77059622, "epoch": 1.1378410300529316, "grad_norm": 4.78125, "learning_rate": 4.131097639755093e-06, "loss": 0.80958862, "memory(GiB)": 146.85, "step": 48770, "train_speed(iter/s)": 0.202303 }, { "acc": 0.76704731, "epoch": 1.1380743376252205, "grad_norm": 7.46875, "learning_rate": 4.129237334114712e-06, "loss": 0.85643339, "memory(GiB)": 146.85, "step": 48780, "train_speed(iter/s)": 0.202325 }, { "acc": 0.77849388, "epoch": 1.1383076451975094, "grad_norm": 6.21875, "learning_rate": 4.127377152775448e-06, "loss": 0.8103693, "memory(GiB)": 146.85, "step": 48790, "train_speed(iter/s)": 0.202346 }, { "acc": 0.78006315, "epoch": 1.1385409527697983, "grad_norm": 6.625, "learning_rate": 4.125517096002842e-06, "loss": 0.78734617, "memory(GiB)": 146.85, "step": 48800, "train_speed(iter/s)": 0.202368 }, { "acc": 0.7665864, "epoch": 1.1387742603420872, "grad_norm": 4.375, "learning_rate": 4.123657164062415e-06, "loss": 0.84209118, "memory(GiB)": 146.85, "step": 48810, "train_speed(iter/s)": 0.202389 }, { "acc": 0.75435648, "epoch": 1.139007567914376, "grad_norm": 5.25, "learning_rate": 4.121797357219678e-06, "loss": 0.90329313, "memory(GiB)": 146.85, "step": 48820, "train_speed(iter/s)": 0.202411 }, { "acc": 0.78365211, "epoch": 1.139240875486665, "grad_norm": 6.09375, "learning_rate": 4.119937675740109e-06, "loss": 0.76753654, "memory(GiB)": 146.85, "step": 48830, "train_speed(iter/s)": 0.202434 }, { "acc": 0.78032188, "epoch": 1.1394741830589539, "grad_norm": 6.375, "learning_rate": 4.118078119889182e-06, "loss": 0.77760792, "memory(GiB)": 146.85, "step": 48840, "train_speed(iter/s)": 0.202456 }, { "acc": 0.77732687, "epoch": 1.1397074906312428, "grad_norm": 5.46875, "learning_rate": 4.116218689932346e-06, "loss": 0.83253717, "memory(GiB)": 146.85, "step": 48850, "train_speed(iter/s)": 0.202477 }, { "acc": 0.76485758, "epoch": 1.1399407982035317, "grad_norm": 6.53125, "learning_rate": 4.114359386135038e-06, "loss": 0.85125389, "memory(GiB)": 146.85, "step": 48860, "train_speed(iter/s)": 0.2025 }, { "acc": 0.76786356, "epoch": 1.1401741057758206, "grad_norm": 5.96875, "learning_rate": 4.112500208762668e-06, "loss": 0.81641216, "memory(GiB)": 146.85, "step": 48870, "train_speed(iter/s)": 0.20252 }, { "acc": 0.78673258, "epoch": 1.1404074133481095, "grad_norm": 5.21875, "learning_rate": 4.110641158080636e-06, "loss": 0.79185905, "memory(GiB)": 146.85, "step": 48880, "train_speed(iter/s)": 0.202541 }, { "acc": 0.79044671, "epoch": 1.1406407209203984, "grad_norm": 7.25, "learning_rate": 4.108782234354321e-06, "loss": 0.75460768, "memory(GiB)": 146.85, "step": 48890, "train_speed(iter/s)": 0.202559 }, { "acc": 0.77340879, "epoch": 1.1408740284926873, "grad_norm": 5.71875, "learning_rate": 4.106923437849082e-06, "loss": 0.82447548, "memory(GiB)": 146.85, "step": 48900, "train_speed(iter/s)": 0.202582 }, { "acc": 0.7803709, "epoch": 1.1411073360649762, "grad_norm": 6.625, "learning_rate": 4.105064768830263e-06, "loss": 0.78099637, "memory(GiB)": 146.85, "step": 48910, "train_speed(iter/s)": 0.202603 }, { "acc": 0.75800047, "epoch": 1.141340643637265, "grad_norm": 5.71875, "learning_rate": 4.1032062275631894e-06, "loss": 0.8750782, "memory(GiB)": 146.85, "step": 48920, "train_speed(iter/s)": 0.202624 }, { "acc": 0.77079115, "epoch": 1.141573951209554, "grad_norm": 6.28125, "learning_rate": 4.101347814313166e-06, "loss": 0.82674999, "memory(GiB)": 146.85, "step": 48930, "train_speed(iter/s)": 0.202645 }, { "acc": 0.78686028, "epoch": 1.1418072587818429, "grad_norm": 5.25, "learning_rate": 4.099489529345483e-06, "loss": 0.76858187, "memory(GiB)": 146.85, "step": 48940, "train_speed(iter/s)": 0.202667 }, { "acc": 0.75791101, "epoch": 1.1420405663541318, "grad_norm": 6.25, "learning_rate": 4.097631372925405e-06, "loss": 0.88844652, "memory(GiB)": 146.85, "step": 48950, "train_speed(iter/s)": 0.202688 }, { "acc": 0.77191148, "epoch": 1.1422738739264207, "grad_norm": 5.5625, "learning_rate": 4.095773345318186e-06, "loss": 0.81973639, "memory(GiB)": 146.85, "step": 48960, "train_speed(iter/s)": 0.20271 }, { "acc": 0.77041669, "epoch": 1.1425071814987096, "grad_norm": 9.8125, "learning_rate": 4.0939154467890605e-06, "loss": 0.83082266, "memory(GiB)": 146.85, "step": 48970, "train_speed(iter/s)": 0.20273 }, { "acc": 0.79019752, "epoch": 1.1427404890709985, "grad_norm": 6.1875, "learning_rate": 4.0920576776032415e-06, "loss": 0.74156313, "memory(GiB)": 146.85, "step": 48980, "train_speed(iter/s)": 0.202752 }, { "acc": 0.75792403, "epoch": 1.1429737966432874, "grad_norm": 4.4375, "learning_rate": 4.090200038025926e-06, "loss": 0.88012285, "memory(GiB)": 146.85, "step": 48990, "train_speed(iter/s)": 0.202772 }, { "acc": 0.78965917, "epoch": 1.1432071042155763, "grad_norm": 4.875, "learning_rate": 4.08834252832229e-06, "loss": 0.74002047, "memory(GiB)": 146.85, "step": 49000, "train_speed(iter/s)": 0.202793 }, { "epoch": 1.1432071042155763, "eval_acc": 0.7351675868589443, "eval_loss": 0.8341988325119019, "eval_runtime": 1265.212, "eval_samples_per_second": 28.447, "eval_steps_per_second": 14.224, "step": 49000 }, { "acc": 0.78238726, "epoch": 1.1434404117878652, "grad_norm": 5.90625, "learning_rate": 4.086485148757493e-06, "loss": 0.78582888, "memory(GiB)": 146.85, "step": 49010, "train_speed(iter/s)": 0.201736 }, { "acc": 0.77280016, "epoch": 1.143673719360154, "grad_norm": 9.3125, "learning_rate": 4.084627899596676e-06, "loss": 0.81924925, "memory(GiB)": 146.85, "step": 49020, "train_speed(iter/s)": 0.201758 }, { "acc": 0.74172869, "epoch": 1.143907026932443, "grad_norm": 5.625, "learning_rate": 4.082770781104961e-06, "loss": 0.93987865, "memory(GiB)": 146.85, "step": 49030, "train_speed(iter/s)": 0.201777 }, { "acc": 0.77281165, "epoch": 1.1441403345047318, "grad_norm": 7.5, "learning_rate": 4.080913793547449e-06, "loss": 0.82264643, "memory(GiB)": 146.85, "step": 49040, "train_speed(iter/s)": 0.201797 }, { "acc": 0.78165007, "epoch": 1.1443736420770207, "grad_norm": 5.90625, "learning_rate": 4.079056937189229e-06, "loss": 0.78157759, "memory(GiB)": 146.85, "step": 49050, "train_speed(iter/s)": 0.201818 }, { "acc": 0.77326574, "epoch": 1.1446069496493096, "grad_norm": 8.5, "learning_rate": 4.077200212295361e-06, "loss": 0.80821838, "memory(GiB)": 146.85, "step": 49060, "train_speed(iter/s)": 0.201839 }, { "acc": 0.77507048, "epoch": 1.1448402572215985, "grad_norm": 5.5625, "learning_rate": 4.075343619130895e-06, "loss": 0.81711626, "memory(GiB)": 146.85, "step": 49070, "train_speed(iter/s)": 0.20186 }, { "acc": 0.77670898, "epoch": 1.1450735647938872, "grad_norm": 5.3125, "learning_rate": 4.0734871579608606e-06, "loss": 0.80692101, "memory(GiB)": 146.85, "step": 49080, "train_speed(iter/s)": 0.201877 }, { "acc": 0.76645532, "epoch": 1.1453068723661763, "grad_norm": 5.6875, "learning_rate": 4.071630829050263e-06, "loss": 0.84816866, "memory(GiB)": 146.85, "step": 49090, "train_speed(iter/s)": 0.201898 }, { "acc": 0.78240604, "epoch": 1.145540179938465, "grad_norm": 4.9375, "learning_rate": 4.069774632664095e-06, "loss": 0.77873316, "memory(GiB)": 146.85, "step": 49100, "train_speed(iter/s)": 0.201917 }, { "acc": 0.77294793, "epoch": 1.1457734875107541, "grad_norm": 7.03125, "learning_rate": 4.0679185690673285e-06, "loss": 0.80922785, "memory(GiB)": 146.85, "step": 49110, "train_speed(iter/s)": 0.201938 }, { "acc": 0.77921629, "epoch": 1.1460067950830428, "grad_norm": 8.5625, "learning_rate": 4.066062638524915e-06, "loss": 0.82725878, "memory(GiB)": 146.85, "step": 49120, "train_speed(iter/s)": 0.201959 }, { "acc": 0.77071447, "epoch": 1.1462401026553317, "grad_norm": 6.90625, "learning_rate": 4.064206841301789e-06, "loss": 0.83020353, "memory(GiB)": 146.85, "step": 49130, "train_speed(iter/s)": 0.201981 }, { "acc": 0.76776428, "epoch": 1.1464734102276206, "grad_norm": 4.8125, "learning_rate": 4.062351177662866e-06, "loss": 0.83814774, "memory(GiB)": 146.85, "step": 49140, "train_speed(iter/s)": 0.202003 }, { "acc": 0.79014835, "epoch": 1.1467067177999095, "grad_norm": 5.4375, "learning_rate": 4.060495647873038e-06, "loss": 0.75755568, "memory(GiB)": 146.85, "step": 49150, "train_speed(iter/s)": 0.202023 }, { "acc": 0.76913023, "epoch": 1.1469400253721984, "grad_norm": 6.25, "learning_rate": 4.058640252197184e-06, "loss": 0.8361763, "memory(GiB)": 146.85, "step": 49160, "train_speed(iter/s)": 0.202044 }, { "acc": 0.78012853, "epoch": 1.1471733329444873, "grad_norm": 5.0625, "learning_rate": 4.056784990900162e-06, "loss": 0.77208524, "memory(GiB)": 146.85, "step": 49170, "train_speed(iter/s)": 0.202065 }, { "acc": 0.77641382, "epoch": 1.1474066405167762, "grad_norm": 5.0625, "learning_rate": 4.054929864246807e-06, "loss": 0.80695667, "memory(GiB)": 146.85, "step": 49180, "train_speed(iter/s)": 0.202086 }, { "acc": 0.79084749, "epoch": 1.147639948089065, "grad_norm": 5.59375, "learning_rate": 4.053074872501939e-06, "loss": 0.75714693, "memory(GiB)": 146.85, "step": 49190, "train_speed(iter/s)": 0.202108 }, { "acc": 0.77255144, "epoch": 1.147873255661354, "grad_norm": 4.21875, "learning_rate": 4.051220015930358e-06, "loss": 0.81327734, "memory(GiB)": 146.85, "step": 49200, "train_speed(iter/s)": 0.20213 }, { "acc": 0.77912197, "epoch": 1.148106563233643, "grad_norm": 5.125, "learning_rate": 4.049365294796844e-06, "loss": 0.80468102, "memory(GiB)": 146.85, "step": 49210, "train_speed(iter/s)": 0.20215 }, { "acc": 0.75521336, "epoch": 1.1483398708059318, "grad_norm": 3.734375, "learning_rate": 4.047510709366159e-06, "loss": 0.88245182, "memory(GiB)": 146.85, "step": 49220, "train_speed(iter/s)": 0.202172 }, { "acc": 0.77343683, "epoch": 1.1485731783782207, "grad_norm": 5.3125, "learning_rate": 4.045656259903042e-06, "loss": 0.8032136, "memory(GiB)": 146.85, "step": 49230, "train_speed(iter/s)": 0.202194 }, { "acc": 0.76000624, "epoch": 1.1488064859505096, "grad_norm": 6.84375, "learning_rate": 4.043801946672217e-06, "loss": 0.89378548, "memory(GiB)": 146.85, "step": 49240, "train_speed(iter/s)": 0.202216 }, { "acc": 0.7769515, "epoch": 1.1490397935227985, "grad_norm": 6.03125, "learning_rate": 4.041947769938387e-06, "loss": 0.81838188, "memory(GiB)": 146.85, "step": 49250, "train_speed(iter/s)": 0.202237 }, { "acc": 0.75550566, "epoch": 1.1492731010950874, "grad_norm": 4.6875, "learning_rate": 4.040093729966234e-06, "loss": 0.88800974, "memory(GiB)": 146.85, "step": 49260, "train_speed(iter/s)": 0.202257 }, { "acc": 0.77085934, "epoch": 1.1495064086673763, "grad_norm": 7.84375, "learning_rate": 4.038239827020424e-06, "loss": 0.82978039, "memory(GiB)": 146.85, "step": 49270, "train_speed(iter/s)": 0.202279 }, { "acc": 0.77341442, "epoch": 1.1497397162396652, "grad_norm": 6.09375, "learning_rate": 4.036386061365598e-06, "loss": 0.83586159, "memory(GiB)": 146.85, "step": 49280, "train_speed(iter/s)": 0.2023 }, { "acc": 0.77856121, "epoch": 1.149973023811954, "grad_norm": 7.40625, "learning_rate": 4.034532433266382e-06, "loss": 0.80039883, "memory(GiB)": 146.85, "step": 49290, "train_speed(iter/s)": 0.202321 }, { "acc": 0.76958466, "epoch": 1.150206331384243, "grad_norm": 12.8125, "learning_rate": 4.032678942987382e-06, "loss": 0.84434996, "memory(GiB)": 146.85, "step": 49300, "train_speed(iter/s)": 0.202341 }, { "acc": 0.76118784, "epoch": 1.150439638956532, "grad_norm": 5.46875, "learning_rate": 4.030825590793179e-06, "loss": 0.85149298, "memory(GiB)": 146.85, "step": 49310, "train_speed(iter/s)": 0.202362 }, { "acc": 0.7728085, "epoch": 1.1506729465288208, "grad_norm": 6.9375, "learning_rate": 4.028972376948343e-06, "loss": 0.83273439, "memory(GiB)": 146.85, "step": 49320, "train_speed(iter/s)": 0.202381 }, { "acc": 0.77747178, "epoch": 1.1509062541011097, "grad_norm": 6.8125, "learning_rate": 4.027119301717417e-06, "loss": 0.77441254, "memory(GiB)": 146.85, "step": 49330, "train_speed(iter/s)": 0.202402 }, { "acc": 0.78036995, "epoch": 1.1511395616733986, "grad_norm": 5.71875, "learning_rate": 4.025266365364928e-06, "loss": 0.76959438, "memory(GiB)": 146.85, "step": 49340, "train_speed(iter/s)": 0.202421 }, { "acc": 0.7826869, "epoch": 1.1513728692456875, "grad_norm": 6.96875, "learning_rate": 4.0234135681553835e-06, "loss": 0.79887667, "memory(GiB)": 146.85, "step": 49350, "train_speed(iter/s)": 0.202443 }, { "acc": 0.77399216, "epoch": 1.1516061768179764, "grad_norm": 7.375, "learning_rate": 4.021560910353268e-06, "loss": 0.81838903, "memory(GiB)": 146.85, "step": 49360, "train_speed(iter/s)": 0.202462 }, { "acc": 0.78215246, "epoch": 1.1518394843902653, "grad_norm": 5.59375, "learning_rate": 4.019708392223048e-06, "loss": 0.78709269, "memory(GiB)": 146.85, "step": 49370, "train_speed(iter/s)": 0.202483 }, { "acc": 0.78577347, "epoch": 1.1520727919625542, "grad_norm": 5.90625, "learning_rate": 4.017856014029171e-06, "loss": 0.78504524, "memory(GiB)": 146.85, "step": 49380, "train_speed(iter/s)": 0.202505 }, { "acc": 0.76135492, "epoch": 1.152306099534843, "grad_norm": 5.46875, "learning_rate": 4.016003776036064e-06, "loss": 0.84697437, "memory(GiB)": 146.85, "step": 49390, "train_speed(iter/s)": 0.202525 }, { "acc": 0.79407911, "epoch": 1.152539407107132, "grad_norm": 5.78125, "learning_rate": 4.01415167850813e-06, "loss": 0.7355444, "memory(GiB)": 146.85, "step": 49400, "train_speed(iter/s)": 0.202546 }, { "acc": 0.77767096, "epoch": 1.1527727146794209, "grad_norm": 5.0625, "learning_rate": 4.012299721709757e-06, "loss": 0.79713035, "memory(GiB)": 146.85, "step": 49410, "train_speed(iter/s)": 0.202566 }, { "acc": 0.77698679, "epoch": 1.1530060222517098, "grad_norm": 8.1875, "learning_rate": 4.010447905905312e-06, "loss": 0.81027679, "memory(GiB)": 146.85, "step": 49420, "train_speed(iter/s)": 0.202584 }, { "acc": 0.75254154, "epoch": 1.1532393298239987, "grad_norm": 6.625, "learning_rate": 4.0085962313591416e-06, "loss": 0.88670578, "memory(GiB)": 146.85, "step": 49430, "train_speed(iter/s)": 0.202604 }, { "acc": 0.75156584, "epoch": 1.1534726373962876, "grad_norm": 4.875, "learning_rate": 4.006744698335572e-06, "loss": 0.89362335, "memory(GiB)": 146.85, "step": 49440, "train_speed(iter/s)": 0.202626 }, { "acc": 0.77384214, "epoch": 1.1537059449685765, "grad_norm": 5.375, "learning_rate": 4.004893307098907e-06, "loss": 0.82525406, "memory(GiB)": 146.85, "step": 49450, "train_speed(iter/s)": 0.202647 }, { "acc": 0.7937149, "epoch": 1.1539392525408654, "grad_norm": 6.0, "learning_rate": 4.003042057913434e-06, "loss": 0.74415808, "memory(GiB)": 146.85, "step": 49460, "train_speed(iter/s)": 0.202668 }, { "acc": 0.76567755, "epoch": 1.154172560113154, "grad_norm": 5.3125, "learning_rate": 4.001190951043416e-06, "loss": 0.857232, "memory(GiB)": 146.85, "step": 49470, "train_speed(iter/s)": 0.202689 }, { "acc": 0.78185697, "epoch": 1.1544058676854432, "grad_norm": 5.5, "learning_rate": 3.9993399867531e-06, "loss": 0.77868156, "memory(GiB)": 146.85, "step": 49480, "train_speed(iter/s)": 0.20271 }, { "acc": 0.79113498, "epoch": 1.1546391752577319, "grad_norm": 7.0, "learning_rate": 3.997489165306713e-06, "loss": 0.73854074, "memory(GiB)": 146.85, "step": 49490, "train_speed(iter/s)": 0.20273 }, { "acc": 0.76517477, "epoch": 1.154872482830021, "grad_norm": 6.28125, "learning_rate": 3.995638486968453e-06, "loss": 0.83324661, "memory(GiB)": 146.85, "step": 49500, "train_speed(iter/s)": 0.202753 }, { "epoch": 1.154872482830021, "eval_acc": 0.7351185386206474, "eval_loss": 0.8342490792274475, "eval_runtime": 1265.0952, "eval_samples_per_second": 28.449, "eval_steps_per_second": 14.225, "step": 49500 }, { "acc": 0.76396894, "epoch": 1.1551057904023097, "grad_norm": 5.375, "learning_rate": 3.99378795200251e-06, "loss": 0.88474712, "memory(GiB)": 146.85, "step": 49510, "train_speed(iter/s)": 0.201708 }, { "acc": 0.76810269, "epoch": 1.1553390979745986, "grad_norm": 5.96875, "learning_rate": 3.991937560673044e-06, "loss": 0.8222003, "memory(GiB)": 146.85, "step": 49520, "train_speed(iter/s)": 0.20173 }, { "acc": 0.77053566, "epoch": 1.1555724055468874, "grad_norm": 5.15625, "learning_rate": 3.990087313244197e-06, "loss": 0.82292881, "memory(GiB)": 146.85, "step": 49530, "train_speed(iter/s)": 0.201752 }, { "acc": 0.76206627, "epoch": 1.1558057131191763, "grad_norm": 7.78125, "learning_rate": 3.988237209980093e-06, "loss": 0.83780956, "memory(GiB)": 146.85, "step": 49540, "train_speed(iter/s)": 0.201774 }, { "acc": 0.77367601, "epoch": 1.1560390206914652, "grad_norm": 5.375, "learning_rate": 3.986387251144833e-06, "loss": 0.83019562, "memory(GiB)": 146.85, "step": 49550, "train_speed(iter/s)": 0.201794 }, { "acc": 0.76186857, "epoch": 1.1562723282637541, "grad_norm": 6.65625, "learning_rate": 3.9845374370024995e-06, "loss": 0.83242521, "memory(GiB)": 146.85, "step": 49560, "train_speed(iter/s)": 0.201816 }, { "acc": 0.77476535, "epoch": 1.156505635836043, "grad_norm": 5.90625, "learning_rate": 3.9826877678171515e-06, "loss": 0.82364321, "memory(GiB)": 146.85, "step": 49570, "train_speed(iter/s)": 0.201838 }, { "acc": 0.77366982, "epoch": 1.156738943408332, "grad_norm": 5.0, "learning_rate": 3.980838243852829e-06, "loss": 0.78847017, "memory(GiB)": 146.85, "step": 49580, "train_speed(iter/s)": 0.20186 }, { "acc": 0.77038593, "epoch": 1.1569722509806208, "grad_norm": 6.25, "learning_rate": 3.978988865373551e-06, "loss": 0.8412859, "memory(GiB)": 146.85, "step": 49590, "train_speed(iter/s)": 0.20188 }, { "acc": 0.78771009, "epoch": 1.1572055585529097, "grad_norm": 6.0625, "learning_rate": 3.977139632643316e-06, "loss": 0.78352909, "memory(GiB)": 146.85, "step": 49600, "train_speed(iter/s)": 0.201901 }, { "acc": 0.77288561, "epoch": 1.1574388661251986, "grad_norm": 7.8125, "learning_rate": 3.975290545926101e-06, "loss": 0.83032999, "memory(GiB)": 146.85, "step": 49610, "train_speed(iter/s)": 0.201921 }, { "acc": 0.75575519, "epoch": 1.1576721736974875, "grad_norm": 7.53125, "learning_rate": 3.973441605485864e-06, "loss": 0.89857864, "memory(GiB)": 146.85, "step": 49620, "train_speed(iter/s)": 0.201942 }, { "acc": 0.77702417, "epoch": 1.1579054812697764, "grad_norm": 6.3125, "learning_rate": 3.971592811586539e-06, "loss": 0.80176811, "memory(GiB)": 146.85, "step": 49630, "train_speed(iter/s)": 0.201962 }, { "acc": 0.77392502, "epoch": 1.1581387888420653, "grad_norm": 6.6875, "learning_rate": 3.969744164492041e-06, "loss": 0.82165594, "memory(GiB)": 146.85, "step": 49640, "train_speed(iter/s)": 0.201983 }, { "acc": 0.7631423, "epoch": 1.1583720964143542, "grad_norm": 5.65625, "learning_rate": 3.967895664466265e-06, "loss": 0.84254913, "memory(GiB)": 146.85, "step": 49650, "train_speed(iter/s)": 0.202005 }, { "acc": 0.75672321, "epoch": 1.1586054039866431, "grad_norm": 5.53125, "learning_rate": 3.966047311773083e-06, "loss": 0.89304581, "memory(GiB)": 146.85, "step": 49660, "train_speed(iter/s)": 0.202025 }, { "acc": 0.76614885, "epoch": 1.158838711558932, "grad_norm": 6.9375, "learning_rate": 3.964199106676345e-06, "loss": 0.8654192, "memory(GiB)": 146.85, "step": 49670, "train_speed(iter/s)": 0.202048 }, { "acc": 0.78937025, "epoch": 1.159072019131221, "grad_norm": 5.3125, "learning_rate": 3.962351049439885e-06, "loss": 0.76890659, "memory(GiB)": 146.85, "step": 49680, "train_speed(iter/s)": 0.202068 }, { "acc": 0.7644001, "epoch": 1.1593053267035098, "grad_norm": 6.15625, "learning_rate": 3.960503140327511e-06, "loss": 0.84654217, "memory(GiB)": 146.85, "step": 49690, "train_speed(iter/s)": 0.20209 }, { "acc": 0.74622493, "epoch": 1.1595386342757987, "grad_norm": 6.1875, "learning_rate": 3.958655379603011e-06, "loss": 0.93447943, "memory(GiB)": 146.85, "step": 49700, "train_speed(iter/s)": 0.202112 }, { "acc": 0.78476696, "epoch": 1.1597719418480876, "grad_norm": 6.28125, "learning_rate": 3.956807767530155e-06, "loss": 0.77230873, "memory(GiB)": 146.85, "step": 49710, "train_speed(iter/s)": 0.202133 }, { "acc": 0.75386648, "epoch": 1.1600052494203765, "grad_norm": 5.5625, "learning_rate": 3.954960304372686e-06, "loss": 0.90400629, "memory(GiB)": 146.85, "step": 49720, "train_speed(iter/s)": 0.202154 }, { "acc": 0.77578707, "epoch": 1.1602385569926654, "grad_norm": 4.875, "learning_rate": 3.95311299039433e-06, "loss": 0.81317015, "memory(GiB)": 146.85, "step": 49730, "train_speed(iter/s)": 0.202176 }, { "acc": 0.78016491, "epoch": 1.1604718645649543, "grad_norm": 4.15625, "learning_rate": 3.951265825858792e-06, "loss": 0.81017666, "memory(GiB)": 146.85, "step": 49740, "train_speed(iter/s)": 0.202197 }, { "acc": 0.77135706, "epoch": 1.1607051721372432, "grad_norm": 6.625, "learning_rate": 3.949418811029752e-06, "loss": 0.82975016, "memory(GiB)": 146.85, "step": 49750, "train_speed(iter/s)": 0.202216 }, { "acc": 0.7748395, "epoch": 1.1609384797095321, "grad_norm": 6.65625, "learning_rate": 3.94757194617087e-06, "loss": 0.80786743, "memory(GiB)": 146.85, "step": 49760, "train_speed(iter/s)": 0.202237 }, { "acc": 0.7779058, "epoch": 1.161171787281821, "grad_norm": 6.84375, "learning_rate": 3.945725231545787e-06, "loss": 0.80388346, "memory(GiB)": 146.85, "step": 49770, "train_speed(iter/s)": 0.202257 }, { "acc": 0.7703229, "epoch": 1.16140509485411, "grad_norm": 5.28125, "learning_rate": 3.943878667418122e-06, "loss": 0.83580961, "memory(GiB)": 146.85, "step": 49780, "train_speed(iter/s)": 0.202277 }, { "acc": 0.76243019, "epoch": 1.1616384024263988, "grad_norm": 4.1875, "learning_rate": 3.942032254051471e-06, "loss": 0.87772503, "memory(GiB)": 146.85, "step": 49790, "train_speed(iter/s)": 0.202299 }, { "acc": 0.777807, "epoch": 1.1618717099986877, "grad_norm": 5.15625, "learning_rate": 3.940185991709407e-06, "loss": 0.81917801, "memory(GiB)": 146.85, "step": 49800, "train_speed(iter/s)": 0.202319 }, { "acc": 0.79209533, "epoch": 1.1621050175709766, "grad_norm": 4.59375, "learning_rate": 3.938339880655485e-06, "loss": 0.76629076, "memory(GiB)": 146.85, "step": 49810, "train_speed(iter/s)": 0.202339 }, { "acc": 0.78689947, "epoch": 1.1623383251432655, "grad_norm": 6.28125, "learning_rate": 3.9364939211532365e-06, "loss": 0.76444526, "memory(GiB)": 146.85, "step": 49820, "train_speed(iter/s)": 0.202359 }, { "acc": 0.76469564, "epoch": 1.1625716327155544, "grad_norm": 5.375, "learning_rate": 3.934648113466172e-06, "loss": 0.85859299, "memory(GiB)": 146.85, "step": 49830, "train_speed(iter/s)": 0.202381 }, { "acc": 0.77371402, "epoch": 1.1628049402878433, "grad_norm": 6.3125, "learning_rate": 3.93280245785778e-06, "loss": 0.82290726, "memory(GiB)": 146.85, "step": 49840, "train_speed(iter/s)": 0.202402 }, { "acc": 0.77539568, "epoch": 1.1630382478601322, "grad_norm": 5.625, "learning_rate": 3.9309569545915285e-06, "loss": 0.79614267, "memory(GiB)": 146.85, "step": 49850, "train_speed(iter/s)": 0.202423 }, { "acc": 0.7719686, "epoch": 1.163271555432421, "grad_norm": 8.0, "learning_rate": 3.9291116039308605e-06, "loss": 0.83327131, "memory(GiB)": 146.85, "step": 49860, "train_speed(iter/s)": 0.202444 }, { "acc": 0.77543592, "epoch": 1.16350486300471, "grad_norm": 6.0625, "learning_rate": 3.9272664061392e-06, "loss": 0.80924673, "memory(GiB)": 146.85, "step": 49870, "train_speed(iter/s)": 0.202462 }, { "acc": 0.7643095, "epoch": 1.1637381705769987, "grad_norm": 6.09375, "learning_rate": 3.925421361479947e-06, "loss": 0.86487789, "memory(GiB)": 146.85, "step": 49880, "train_speed(iter/s)": 0.202481 }, { "acc": 0.75789528, "epoch": 1.1639714781492876, "grad_norm": 4.625, "learning_rate": 3.923576470216483e-06, "loss": 0.89659882, "memory(GiB)": 146.85, "step": 49890, "train_speed(iter/s)": 0.202502 }, { "acc": 0.7712616, "epoch": 1.1642047857215765, "grad_norm": 4.75, "learning_rate": 3.9217317326121655e-06, "loss": 0.83290997, "memory(GiB)": 146.85, "step": 49900, "train_speed(iter/s)": 0.202524 }, { "acc": 0.77256212, "epoch": 1.1644380932938654, "grad_norm": 7.03125, "learning_rate": 3.919887148930329e-06, "loss": 0.81212931, "memory(GiB)": 146.85, "step": 49910, "train_speed(iter/s)": 0.202545 }, { "acc": 0.77020369, "epoch": 1.1646714008661543, "grad_norm": 8.3125, "learning_rate": 3.918042719434288e-06, "loss": 0.8287384, "memory(GiB)": 146.85, "step": 49920, "train_speed(iter/s)": 0.202566 }, { "acc": 0.78074751, "epoch": 1.1649047084384432, "grad_norm": 6.34375, "learning_rate": 3.916198444387337e-06, "loss": 0.79582415, "memory(GiB)": 146.85, "step": 49930, "train_speed(iter/s)": 0.202586 }, { "acc": 0.7718729, "epoch": 1.165138016010732, "grad_norm": 5.65625, "learning_rate": 3.914354324052741e-06, "loss": 0.81184311, "memory(GiB)": 146.85, "step": 49940, "train_speed(iter/s)": 0.202606 }, { "acc": 0.77758541, "epoch": 1.165371323583021, "grad_norm": 4.65625, "learning_rate": 3.91251035869375e-06, "loss": 0.79779301, "memory(GiB)": 146.85, "step": 49950, "train_speed(iter/s)": 0.202627 }, { "acc": 0.75753222, "epoch": 1.1656046311553099, "grad_norm": 5.46875, "learning_rate": 3.91066654857359e-06, "loss": 0.89764471, "memory(GiB)": 146.85, "step": 49960, "train_speed(iter/s)": 0.202647 }, { "acc": 0.76481972, "epoch": 1.1658379387275988, "grad_norm": 6.40625, "learning_rate": 3.908822893955466e-06, "loss": 0.84974918, "memory(GiB)": 146.85, "step": 49970, "train_speed(iter/s)": 0.202669 }, { "acc": 0.78254623, "epoch": 1.1660712462998877, "grad_norm": 7.21875, "learning_rate": 3.9069793951025544e-06, "loss": 0.76704111, "memory(GiB)": 146.85, "step": 49980, "train_speed(iter/s)": 0.202691 }, { "acc": 0.78539872, "epoch": 1.1663045538721766, "grad_norm": 6.3125, "learning_rate": 3.9051360522780166e-06, "loss": 0.74286385, "memory(GiB)": 146.85, "step": 49990, "train_speed(iter/s)": 0.202712 }, { "acc": 0.76975632, "epoch": 1.1665378614444655, "grad_norm": 6.25, "learning_rate": 3.903292865744989e-06, "loss": 0.8291357, "memory(GiB)": 146.85, "step": 50000, "train_speed(iter/s)": 0.202733 }, { "epoch": 1.1665378614444655, "eval_acc": 0.7352395457874985, "eval_loss": 0.8342112302780151, "eval_runtime": 1263.2663, "eval_samples_per_second": 28.49, "eval_steps_per_second": 14.246, "step": 50000 }, { "acc": 0.7732831, "epoch": 1.1667711690167544, "grad_norm": 6.15625, "learning_rate": 3.901449835766588e-06, "loss": 0.8318779, "memory(GiB)": 146.85, "step": 50010, "train_speed(iter/s)": 0.201699 }, { "acc": 0.78292313, "epoch": 1.1670044765890433, "grad_norm": 8.625, "learning_rate": 3.899606962605902e-06, "loss": 0.76762428, "memory(GiB)": 146.85, "step": 50020, "train_speed(iter/s)": 0.20172 }, { "acc": 0.77403116, "epoch": 1.1672377841613322, "grad_norm": 6.28125, "learning_rate": 3.897764246526003e-06, "loss": 0.8059598, "memory(GiB)": 146.85, "step": 50030, "train_speed(iter/s)": 0.201743 }, { "acc": 0.76933322, "epoch": 1.167471091733621, "grad_norm": 6.1875, "learning_rate": 3.895921687789936e-06, "loss": 0.84013643, "memory(GiB)": 146.85, "step": 50040, "train_speed(iter/s)": 0.201764 }, { "acc": 0.7707778, "epoch": 1.16770439930591, "grad_norm": 5.65625, "learning_rate": 3.894079286660729e-06, "loss": 0.82886839, "memory(GiB)": 146.85, "step": 50050, "train_speed(iter/s)": 0.201786 }, { "acc": 0.77219143, "epoch": 1.1679377068781989, "grad_norm": 10.25, "learning_rate": 3.892237043401382e-06, "loss": 0.8054698, "memory(GiB)": 146.85, "step": 50060, "train_speed(iter/s)": 0.201808 }, { "acc": 0.78135958, "epoch": 1.1681710144504878, "grad_norm": 5.1875, "learning_rate": 3.890394958274877e-06, "loss": 0.77993641, "memory(GiB)": 146.85, "step": 50070, "train_speed(iter/s)": 0.201829 }, { "acc": 0.77404304, "epoch": 1.1684043220227767, "grad_norm": 7.65625, "learning_rate": 3.888553031544169e-06, "loss": 0.81121912, "memory(GiB)": 146.85, "step": 50080, "train_speed(iter/s)": 0.201851 }, { "acc": 0.77225199, "epoch": 1.1686376295950656, "grad_norm": 5.75, "learning_rate": 3.886711263472192e-06, "loss": 0.81709023, "memory(GiB)": 146.85, "step": 50090, "train_speed(iter/s)": 0.201873 }, { "acc": 0.76932859, "epoch": 1.1688709371673545, "grad_norm": 4.71875, "learning_rate": 3.884869654321859e-06, "loss": 0.8373106, "memory(GiB)": 146.85, "step": 50100, "train_speed(iter/s)": 0.201892 }, { "acc": 0.78177924, "epoch": 1.1691042447396434, "grad_norm": 5.53125, "learning_rate": 3.883028204356058e-06, "loss": 0.80050259, "memory(GiB)": 146.85, "step": 50110, "train_speed(iter/s)": 0.201913 }, { "acc": 0.77619085, "epoch": 1.1693375523119323, "grad_norm": 5.78125, "learning_rate": 3.881186913837657e-06, "loss": 0.79847207, "memory(GiB)": 146.85, "step": 50120, "train_speed(iter/s)": 0.201933 }, { "acc": 0.80246382, "epoch": 1.1695708598842212, "grad_norm": 5.3125, "learning_rate": 3.879345783029498e-06, "loss": 0.69431791, "memory(GiB)": 146.85, "step": 50130, "train_speed(iter/s)": 0.201953 }, { "acc": 0.7715868, "epoch": 1.16980416745651, "grad_norm": 7.0625, "learning_rate": 3.877504812194404e-06, "loss": 0.82872467, "memory(GiB)": 146.85, "step": 50140, "train_speed(iter/s)": 0.201974 }, { "acc": 0.77955976, "epoch": 1.170037475028799, "grad_norm": 6.25, "learning_rate": 3.875664001595172e-06, "loss": 0.78663673, "memory(GiB)": 146.85, "step": 50150, "train_speed(iter/s)": 0.201994 }, { "acc": 0.77398291, "epoch": 1.1702707826010879, "grad_norm": 6.0, "learning_rate": 3.873823351494576e-06, "loss": 0.80191708, "memory(GiB)": 146.85, "step": 50160, "train_speed(iter/s)": 0.202014 }, { "acc": 0.7482255, "epoch": 1.1705040901733768, "grad_norm": 6.28125, "learning_rate": 3.8719828621553715e-06, "loss": 0.90200071, "memory(GiB)": 146.85, "step": 50170, "train_speed(iter/s)": 0.202035 }, { "acc": 0.75793552, "epoch": 1.1707373977456657, "grad_norm": 6.21875, "learning_rate": 3.870142533840283e-06, "loss": 0.89205704, "memory(GiB)": 146.85, "step": 50180, "train_speed(iter/s)": 0.202056 }, { "acc": 0.75329657, "epoch": 1.1709707053179546, "grad_norm": 6.5, "learning_rate": 3.868302366812024e-06, "loss": 0.907693, "memory(GiB)": 146.85, "step": 50190, "train_speed(iter/s)": 0.202075 }, { "acc": 0.75211058, "epoch": 1.1712040128902435, "grad_norm": 6.09375, "learning_rate": 3.8664623613332705e-06, "loss": 0.89070387, "memory(GiB)": 146.85, "step": 50200, "train_speed(iter/s)": 0.202097 }, { "acc": 0.77827005, "epoch": 1.1714373204625323, "grad_norm": 4.9375, "learning_rate": 3.864622517666685e-06, "loss": 0.80716343, "memory(GiB)": 146.85, "step": 50210, "train_speed(iter/s)": 0.202117 }, { "acc": 0.78549442, "epoch": 1.1716706280348212, "grad_norm": 6.8125, "learning_rate": 3.862782836074906e-06, "loss": 0.75385532, "memory(GiB)": 146.85, "step": 50220, "train_speed(iter/s)": 0.202137 }, { "acc": 0.77948866, "epoch": 1.17190393560711, "grad_norm": 4.46875, "learning_rate": 3.860943316820548e-06, "loss": 0.78423023, "memory(GiB)": 146.85, "step": 50230, "train_speed(iter/s)": 0.202158 }, { "acc": 0.78778658, "epoch": 1.172137243179399, "grad_norm": 8.5, "learning_rate": 3.859103960166198e-06, "loss": 0.77195396, "memory(GiB)": 146.85, "step": 50240, "train_speed(iter/s)": 0.202178 }, { "acc": 0.77718711, "epoch": 1.1723705507516877, "grad_norm": 7.625, "learning_rate": 3.857264766374428e-06, "loss": 0.80802612, "memory(GiB)": 146.85, "step": 50250, "train_speed(iter/s)": 0.2022 }, { "acc": 0.77543554, "epoch": 1.1726038583239768, "grad_norm": 5.59375, "learning_rate": 3.855425735707779e-06, "loss": 0.79955645, "memory(GiB)": 146.85, "step": 50260, "train_speed(iter/s)": 0.202219 }, { "acc": 0.77110138, "epoch": 1.1728371658962655, "grad_norm": 5.34375, "learning_rate": 3.853586868428775e-06, "loss": 0.83160448, "memory(GiB)": 146.85, "step": 50270, "train_speed(iter/s)": 0.20224 }, { "acc": 0.77692766, "epoch": 1.1730704734685544, "grad_norm": 5.0625, "learning_rate": 3.851748164799914e-06, "loss": 0.81530485, "memory(GiB)": 146.85, "step": 50280, "train_speed(iter/s)": 0.20226 }, { "acc": 0.78514333, "epoch": 1.1733037810408433, "grad_norm": 5.90625, "learning_rate": 3.849909625083666e-06, "loss": 0.76534672, "memory(GiB)": 146.85, "step": 50290, "train_speed(iter/s)": 0.20228 }, { "acc": 0.78304901, "epoch": 1.1735370886131322, "grad_norm": 5.59375, "learning_rate": 3.848071249542486e-06, "loss": 0.76788511, "memory(GiB)": 146.85, "step": 50300, "train_speed(iter/s)": 0.202301 }, { "acc": 0.74151278, "epoch": 1.1737703961854211, "grad_norm": 4.90625, "learning_rate": 3.846233038438803e-06, "loss": 0.95167732, "memory(GiB)": 146.85, "step": 50310, "train_speed(iter/s)": 0.202323 }, { "acc": 0.76246233, "epoch": 1.17400370375771, "grad_norm": 5.28125, "learning_rate": 3.844394992035017e-06, "loss": 0.85736847, "memory(GiB)": 146.85, "step": 50320, "train_speed(iter/s)": 0.202345 }, { "acc": 0.77607136, "epoch": 1.174237011329999, "grad_norm": 6.4375, "learning_rate": 3.842557110593509e-06, "loss": 0.82033539, "memory(GiB)": 146.85, "step": 50330, "train_speed(iter/s)": 0.202365 }, { "acc": 0.77252574, "epoch": 1.1744703189022878, "grad_norm": 7.4375, "learning_rate": 3.840719394376638e-06, "loss": 0.82712326, "memory(GiB)": 146.85, "step": 50340, "train_speed(iter/s)": 0.202385 }, { "acc": 0.77747145, "epoch": 1.1747036264745767, "grad_norm": 5.75, "learning_rate": 3.838881843646736e-06, "loss": 0.80375872, "memory(GiB)": 146.85, "step": 50350, "train_speed(iter/s)": 0.202405 }, { "acc": 0.75421748, "epoch": 1.1749369340468656, "grad_norm": 4.5625, "learning_rate": 3.8370444586661135e-06, "loss": 0.89368477, "memory(GiB)": 146.85, "step": 50360, "train_speed(iter/s)": 0.202427 }, { "acc": 0.77247858, "epoch": 1.1751702416191545, "grad_norm": 6.375, "learning_rate": 3.835207239697057e-06, "loss": 0.7984437, "memory(GiB)": 146.85, "step": 50370, "train_speed(iter/s)": 0.202449 }, { "acc": 0.77412748, "epoch": 1.1754035491914434, "grad_norm": 5.03125, "learning_rate": 3.8333701870018296e-06, "loss": 0.80720901, "memory(GiB)": 146.85, "step": 50380, "train_speed(iter/s)": 0.202469 }, { "acc": 0.76704149, "epoch": 1.1756368567637323, "grad_norm": 5.875, "learning_rate": 3.831533300842667e-06, "loss": 0.85643282, "memory(GiB)": 146.85, "step": 50390, "train_speed(iter/s)": 0.202491 }, { "acc": 0.78507247, "epoch": 1.1758701643360212, "grad_norm": 6.40625, "learning_rate": 3.829696581481787e-06, "loss": 0.77448387, "memory(GiB)": 146.85, "step": 50400, "train_speed(iter/s)": 0.20251 }, { "acc": 0.76974792, "epoch": 1.17610347190831, "grad_norm": 5.75, "learning_rate": 3.827860029181382e-06, "loss": 0.85429649, "memory(GiB)": 146.85, "step": 50410, "train_speed(iter/s)": 0.202532 }, { "acc": 0.77608948, "epoch": 1.176336779480599, "grad_norm": 6.53125, "learning_rate": 3.826023644203617e-06, "loss": 0.80963612, "memory(GiB)": 146.85, "step": 50420, "train_speed(iter/s)": 0.202552 }, { "acc": 0.75800228, "epoch": 1.176570087052888, "grad_norm": 6.9375, "learning_rate": 3.824187426810635e-06, "loss": 0.88443871, "memory(GiB)": 146.85, "step": 50430, "train_speed(iter/s)": 0.202574 }, { "acc": 0.77980042, "epoch": 1.1768033946251768, "grad_norm": 5.84375, "learning_rate": 3.822351377264555e-06, "loss": 0.78946886, "memory(GiB)": 146.85, "step": 50440, "train_speed(iter/s)": 0.202595 }, { "acc": 0.77967839, "epoch": 1.1770367021974657, "grad_norm": 5.78125, "learning_rate": 3.820515495827476e-06, "loss": 0.79184961, "memory(GiB)": 146.85, "step": 50450, "train_speed(iter/s)": 0.202616 }, { "acc": 0.76444035, "epoch": 1.1772700097697546, "grad_norm": 5.15625, "learning_rate": 3.818679782761465e-06, "loss": 0.84260902, "memory(GiB)": 146.85, "step": 50460, "train_speed(iter/s)": 0.202636 }, { "acc": 0.75861082, "epoch": 1.1775033173420435, "grad_norm": 4.6875, "learning_rate": 3.816844238328573e-06, "loss": 0.86886463, "memory(GiB)": 146.85, "step": 50470, "train_speed(iter/s)": 0.202657 }, { "acc": 0.75553637, "epoch": 1.1777366249143324, "grad_norm": 5.71875, "learning_rate": 3.815008862790822e-06, "loss": 0.87062569, "memory(GiB)": 146.85, "step": 50480, "train_speed(iter/s)": 0.202678 }, { "acc": 0.7794383, "epoch": 1.1779699324866213, "grad_norm": 7.71875, "learning_rate": 3.813173656410211e-06, "loss": 0.82249985, "memory(GiB)": 146.85, "step": 50490, "train_speed(iter/s)": 0.202698 }, { "acc": 0.76381207, "epoch": 1.1782032400589102, "grad_norm": 5.40625, "learning_rate": 3.8113386194487177e-06, "loss": 0.84003344, "memory(GiB)": 146.85, "step": 50500, "train_speed(iter/s)": 0.202718 }, { "epoch": 1.1782032400589102, "eval_acc": 0.7350956279303902, "eval_loss": 0.8341567516326904, "eval_runtime": 1263.8903, "eval_samples_per_second": 28.476, "eval_steps_per_second": 14.239, "step": 50500 }, { "acc": 0.78242655, "epoch": 1.178436547631199, "grad_norm": 5.46875, "learning_rate": 3.80950375216829e-06, "loss": 0.77015142, "memory(GiB)": 146.85, "step": 50510, "train_speed(iter/s)": 0.201696 }, { "acc": 0.78174434, "epoch": 1.178669855203488, "grad_norm": 9.1875, "learning_rate": 3.807669054830855e-06, "loss": 0.7876626, "memory(GiB)": 146.85, "step": 50520, "train_speed(iter/s)": 0.201718 }, { "acc": 0.78991451, "epoch": 1.178903162775777, "grad_norm": 4.875, "learning_rate": 3.8058345276983165e-06, "loss": 0.77865629, "memory(GiB)": 146.85, "step": 50530, "train_speed(iter/s)": 0.201738 }, { "acc": 0.76763668, "epoch": 1.1791364703480658, "grad_norm": 5.375, "learning_rate": 3.8040001710325547e-06, "loss": 0.8571063, "memory(GiB)": 146.85, "step": 50540, "train_speed(iter/s)": 0.201758 }, { "acc": 0.7768075, "epoch": 1.1793697779203547, "grad_norm": 6.21875, "learning_rate": 3.8021659850954186e-06, "loss": 0.81487408, "memory(GiB)": 146.85, "step": 50550, "train_speed(iter/s)": 0.201779 }, { "acc": 0.77924547, "epoch": 1.1796030854926436, "grad_norm": 5.53125, "learning_rate": 3.8003319701487407e-06, "loss": 0.82268496, "memory(GiB)": 146.85, "step": 50560, "train_speed(iter/s)": 0.201799 }, { "acc": 0.77056999, "epoch": 1.1798363930649325, "grad_norm": 5.25, "learning_rate": 3.7984981264543247e-06, "loss": 0.8586235, "memory(GiB)": 146.85, "step": 50570, "train_speed(iter/s)": 0.201819 }, { "acc": 0.77775545, "epoch": 1.1800697006372214, "grad_norm": 4.9375, "learning_rate": 3.7966644542739538e-06, "loss": 0.79524536, "memory(GiB)": 146.85, "step": 50580, "train_speed(iter/s)": 0.201841 }, { "acc": 0.76000032, "epoch": 1.1803030082095103, "grad_norm": 5.25, "learning_rate": 3.794830953869381e-06, "loss": 0.87130594, "memory(GiB)": 146.85, "step": 50590, "train_speed(iter/s)": 0.201862 }, { "acc": 0.78412123, "epoch": 1.1805363157817992, "grad_norm": 4.78125, "learning_rate": 3.7929976255023398e-06, "loss": 0.76212378, "memory(GiB)": 146.85, "step": 50600, "train_speed(iter/s)": 0.201882 }, { "acc": 0.76360497, "epoch": 1.180769623354088, "grad_norm": 6.65625, "learning_rate": 3.7911644694345368e-06, "loss": 0.85141068, "memory(GiB)": 146.85, "step": 50610, "train_speed(iter/s)": 0.201904 }, { "acc": 0.77118006, "epoch": 1.1810029309263768, "grad_norm": 5.4375, "learning_rate": 3.789331485927654e-06, "loss": 0.82629719, "memory(GiB)": 146.85, "step": 50620, "train_speed(iter/s)": 0.201924 }, { "acc": 0.76132088, "epoch": 1.1812362384986659, "grad_norm": 5.3125, "learning_rate": 3.7874986752433506e-06, "loss": 0.8738121, "memory(GiB)": 146.85, "step": 50630, "train_speed(iter/s)": 0.201946 }, { "acc": 0.78145456, "epoch": 1.1814695460709546, "grad_norm": 6.125, "learning_rate": 3.78566603764326e-06, "loss": 0.79278564, "memory(GiB)": 146.85, "step": 50640, "train_speed(iter/s)": 0.201968 }, { "acc": 0.7502367, "epoch": 1.1817028536432437, "grad_norm": 5.0625, "learning_rate": 3.7838335733889895e-06, "loss": 0.89678259, "memory(GiB)": 146.85, "step": 50650, "train_speed(iter/s)": 0.201989 }, { "acc": 0.75692101, "epoch": 1.1819361612155324, "grad_norm": 6.84375, "learning_rate": 3.782001282742124e-06, "loss": 0.8773283, "memory(GiB)": 146.85, "step": 50660, "train_speed(iter/s)": 0.20201 }, { "acc": 0.76582665, "epoch": 1.1821694687878213, "grad_norm": 7.125, "learning_rate": 3.7801691659642196e-06, "loss": 0.86062994, "memory(GiB)": 146.85, "step": 50670, "train_speed(iter/s)": 0.202032 }, { "acc": 0.77644687, "epoch": 1.1824027763601102, "grad_norm": 8.8125, "learning_rate": 3.7783372233168127e-06, "loss": 0.78286333, "memory(GiB)": 146.85, "step": 50680, "train_speed(iter/s)": 0.202053 }, { "acc": 0.76964884, "epoch": 1.182636083932399, "grad_norm": 6.0625, "learning_rate": 3.776505455061412e-06, "loss": 0.83424244, "memory(GiB)": 146.85, "step": 50690, "train_speed(iter/s)": 0.202074 }, { "acc": 0.77608337, "epoch": 1.182869391504688, "grad_norm": 6.5, "learning_rate": 3.7746738614595022e-06, "loss": 0.80180607, "memory(GiB)": 146.85, "step": 50700, "train_speed(iter/s)": 0.202094 }, { "acc": 0.78194251, "epoch": 1.1831026990769768, "grad_norm": 6.625, "learning_rate": 3.772842442772543e-06, "loss": 0.75820017, "memory(GiB)": 146.85, "step": 50710, "train_speed(iter/s)": 0.202114 }, { "acc": 0.77464895, "epoch": 1.1833360066492657, "grad_norm": 8.625, "learning_rate": 3.7710111992619696e-06, "loss": 0.83825912, "memory(GiB)": 146.85, "step": 50720, "train_speed(iter/s)": 0.202134 }, { "acc": 0.77264075, "epoch": 1.1835693142215546, "grad_norm": 6.6875, "learning_rate": 3.7691801311891898e-06, "loss": 0.81140823, "memory(GiB)": 146.85, "step": 50730, "train_speed(iter/s)": 0.202153 }, { "acc": 0.76703749, "epoch": 1.1838026217938435, "grad_norm": 6.40625, "learning_rate": 3.767349238815588e-06, "loss": 0.83594761, "memory(GiB)": 146.85, "step": 50740, "train_speed(iter/s)": 0.202175 }, { "acc": 0.76962748, "epoch": 1.1840359293661324, "grad_norm": 6.46875, "learning_rate": 3.7655185224025247e-06, "loss": 0.82463112, "memory(GiB)": 146.85, "step": 50750, "train_speed(iter/s)": 0.202196 }, { "acc": 0.76552725, "epoch": 1.1842692369384213, "grad_norm": 6.40625, "learning_rate": 3.7636879822113338e-06, "loss": 0.84462366, "memory(GiB)": 146.85, "step": 50760, "train_speed(iter/s)": 0.202218 }, { "acc": 0.78713121, "epoch": 1.1845025445107102, "grad_norm": 8.75, "learning_rate": 3.761857618503326e-06, "loss": 0.7751214, "memory(GiB)": 146.85, "step": 50770, "train_speed(iter/s)": 0.202236 }, { "acc": 0.75648088, "epoch": 1.1847358520829991, "grad_norm": 6.84375, "learning_rate": 3.7600274315397816e-06, "loss": 0.88527851, "memory(GiB)": 146.85, "step": 50780, "train_speed(iter/s)": 0.202256 }, { "acc": 0.74855881, "epoch": 1.184969159655288, "grad_norm": 4.65625, "learning_rate": 3.758197421581961e-06, "loss": 0.94146843, "memory(GiB)": 146.85, "step": 50790, "train_speed(iter/s)": 0.202277 }, { "acc": 0.77101068, "epoch": 1.185202467227577, "grad_norm": 4.28125, "learning_rate": 3.756367588891099e-06, "loss": 0.82518711, "memory(GiB)": 146.85, "step": 50800, "train_speed(iter/s)": 0.202297 }, { "acc": 0.78042078, "epoch": 1.1854357747998658, "grad_norm": 6.6875, "learning_rate": 3.754537933728401e-06, "loss": 0.79728622, "memory(GiB)": 146.85, "step": 50810, "train_speed(iter/s)": 0.202317 }, { "acc": 0.78247008, "epoch": 1.1856690823721547, "grad_norm": 5.9375, "learning_rate": 3.7527084563550515e-06, "loss": 0.79244065, "memory(GiB)": 146.85, "step": 50820, "train_speed(iter/s)": 0.202339 }, { "acc": 0.77394972, "epoch": 1.1859023899444436, "grad_norm": 5.71875, "learning_rate": 3.750879157032207e-06, "loss": 0.81235924, "memory(GiB)": 146.85, "step": 50830, "train_speed(iter/s)": 0.20236 }, { "acc": 0.76377258, "epoch": 1.1861356975167325, "grad_norm": 10.8125, "learning_rate": 3.7490500360210003e-06, "loss": 0.85129347, "memory(GiB)": 146.85, "step": 50840, "train_speed(iter/s)": 0.202379 }, { "acc": 0.77144237, "epoch": 1.1863690050890214, "grad_norm": 7.53125, "learning_rate": 3.747221093582538e-06, "loss": 0.82928553, "memory(GiB)": 146.85, "step": 50850, "train_speed(iter/s)": 0.202398 }, { "acc": 0.75463099, "epoch": 1.1866023126613103, "grad_norm": 5.21875, "learning_rate": 3.7453923299779014e-06, "loss": 0.90866289, "memory(GiB)": 146.85, "step": 50860, "train_speed(iter/s)": 0.202419 }, { "acc": 0.78162708, "epoch": 1.1868356202335992, "grad_norm": 5.40625, "learning_rate": 3.743563745468144e-06, "loss": 0.77456303, "memory(GiB)": 146.85, "step": 50870, "train_speed(iter/s)": 0.202439 }, { "acc": 0.76468258, "epoch": 1.1870689278058881, "grad_norm": 4.875, "learning_rate": 3.7417353403142988e-06, "loss": 0.85528812, "memory(GiB)": 146.85, "step": 50880, "train_speed(iter/s)": 0.202461 }, { "acc": 0.78059254, "epoch": 1.187302235378177, "grad_norm": 6.125, "learning_rate": 3.7399071147773668e-06, "loss": 0.79479117, "memory(GiB)": 146.85, "step": 50890, "train_speed(iter/s)": 0.202481 }, { "acc": 0.75926723, "epoch": 1.187535542950466, "grad_norm": 4.375, "learning_rate": 3.7380790691183276e-06, "loss": 0.85019035, "memory(GiB)": 146.85, "step": 50900, "train_speed(iter/s)": 0.202502 }, { "acc": 0.75915532, "epoch": 1.1877688505227548, "grad_norm": 5.3125, "learning_rate": 3.7362512035981347e-06, "loss": 0.88091354, "memory(GiB)": 146.85, "step": 50910, "train_speed(iter/s)": 0.202524 }, { "acc": 0.77073002, "epoch": 1.1880021580950437, "grad_norm": 5.9375, "learning_rate": 3.7344235184777157e-06, "loss": 0.79829254, "memory(GiB)": 146.85, "step": 50920, "train_speed(iter/s)": 0.202544 }, { "acc": 0.7808485, "epoch": 1.1882354656673326, "grad_norm": 5.625, "learning_rate": 3.7325960140179717e-06, "loss": 0.7982851, "memory(GiB)": 146.85, "step": 50930, "train_speed(iter/s)": 0.202565 }, { "acc": 0.76777458, "epoch": 1.1884687732396215, "grad_norm": 4.6875, "learning_rate": 3.730768690479779e-06, "loss": 0.8450387, "memory(GiB)": 146.85, "step": 50940, "train_speed(iter/s)": 0.202585 }, { "acc": 0.76931038, "epoch": 1.1887020808119104, "grad_norm": 6.34375, "learning_rate": 3.7289415481239865e-06, "loss": 0.83306465, "memory(GiB)": 146.85, "step": 50950, "train_speed(iter/s)": 0.202607 }, { "acc": 0.77309389, "epoch": 1.1889353883841993, "grad_norm": 5.3125, "learning_rate": 3.727114587211419e-06, "loss": 0.80866203, "memory(GiB)": 146.85, "step": 50960, "train_speed(iter/s)": 0.202629 }, { "acc": 0.78052139, "epoch": 1.1891686959564882, "grad_norm": 5.8125, "learning_rate": 3.7252878080028744e-06, "loss": 0.78787518, "memory(GiB)": 146.85, "step": 50970, "train_speed(iter/s)": 0.20265 }, { "acc": 0.76643009, "epoch": 1.1894020035287771, "grad_norm": 4.84375, "learning_rate": 3.7234612107591246e-06, "loss": 0.85120411, "memory(GiB)": 146.85, "step": 50980, "train_speed(iter/s)": 0.20267 }, { "acc": 0.77199383, "epoch": 1.189635311101066, "grad_norm": 4.96875, "learning_rate": 3.721634795740918e-06, "loss": 0.8246295, "memory(GiB)": 146.85, "step": 50990, "train_speed(iter/s)": 0.20269 }, { "acc": 0.76533809, "epoch": 1.189868618673355, "grad_norm": 4.125, "learning_rate": 3.719808563208971e-06, "loss": 0.8361064, "memory(GiB)": 146.85, "step": 51000, "train_speed(iter/s)": 0.202711 }, { "epoch": 1.189868618673355, "eval_acc": 0.7351222495070975, "eval_loss": 0.8341910243034363, "eval_runtime": 1264.0864, "eval_samples_per_second": 28.472, "eval_steps_per_second": 14.236, "step": 51000 }, { "acc": 0.77396011, "epoch": 1.1901019262456436, "grad_norm": 6.8125, "learning_rate": 3.71798251342398e-06, "loss": 0.8313199, "memory(GiB)": 146.85, "step": 51010, "train_speed(iter/s)": 0.201698 }, { "acc": 0.78748569, "epoch": 1.1903352338179327, "grad_norm": 7.125, "learning_rate": 3.7161566466466137e-06, "loss": 0.75305142, "memory(GiB)": 146.85, "step": 51020, "train_speed(iter/s)": 0.201718 }, { "acc": 0.74942799, "epoch": 1.1905685413902214, "grad_norm": 5.375, "learning_rate": 3.714330963137512e-06, "loss": 0.93335896, "memory(GiB)": 146.85, "step": 51030, "train_speed(iter/s)": 0.201739 }, { "acc": 0.75499229, "epoch": 1.1908018489625105, "grad_norm": 6.8125, "learning_rate": 3.7125054631572915e-06, "loss": 0.90391521, "memory(GiB)": 146.85, "step": 51040, "train_speed(iter/s)": 0.201762 }, { "acc": 0.76399021, "epoch": 1.1910351565347992, "grad_norm": 6.5625, "learning_rate": 3.710680146966542e-06, "loss": 0.85220547, "memory(GiB)": 146.85, "step": 51050, "train_speed(iter/s)": 0.201781 }, { "acc": 0.75121598, "epoch": 1.191268464107088, "grad_norm": 8.6875, "learning_rate": 3.7088550148258277e-06, "loss": 0.90126781, "memory(GiB)": 146.85, "step": 51060, "train_speed(iter/s)": 0.201801 }, { "acc": 0.75748696, "epoch": 1.191501771679377, "grad_norm": 6.96875, "learning_rate": 3.707030066995685e-06, "loss": 0.88251629, "memory(GiB)": 146.85, "step": 51070, "train_speed(iter/s)": 0.201819 }, { "acc": 0.77859173, "epoch": 1.1917350792516659, "grad_norm": 5.375, "learning_rate": 3.705205303736625e-06, "loss": 0.802353, "memory(GiB)": 146.85, "step": 51080, "train_speed(iter/s)": 0.201838 }, { "acc": 0.75896568, "epoch": 1.1919683868239548, "grad_norm": 5.3125, "learning_rate": 3.7033807253091313e-06, "loss": 0.86267891, "memory(GiB)": 146.85, "step": 51090, "train_speed(iter/s)": 0.201859 }, { "acc": 0.77134581, "epoch": 1.1922016943962437, "grad_norm": 5.96875, "learning_rate": 3.7015563319736618e-06, "loss": 0.84330292, "memory(GiB)": 146.85, "step": 51100, "train_speed(iter/s)": 0.201881 }, { "acc": 0.78267832, "epoch": 1.1924350019685326, "grad_norm": 4.84375, "learning_rate": 3.6997321239906513e-06, "loss": 0.78401055, "memory(GiB)": 146.85, "step": 51110, "train_speed(iter/s)": 0.201902 }, { "acc": 0.77795677, "epoch": 1.1926683095408215, "grad_norm": 5.53125, "learning_rate": 3.6979081016204998e-06, "loss": 0.8053463, "memory(GiB)": 146.85, "step": 51120, "train_speed(iter/s)": 0.201923 }, { "acc": 0.78278055, "epoch": 1.1929016171131104, "grad_norm": 5.5625, "learning_rate": 3.6960842651235894e-06, "loss": 0.77537403, "memory(GiB)": 146.85, "step": 51130, "train_speed(iter/s)": 0.201944 }, { "acc": 0.76067228, "epoch": 1.1931349246853993, "grad_norm": 5.1875, "learning_rate": 3.6942606147602705e-06, "loss": 0.86650343, "memory(GiB)": 146.85, "step": 51140, "train_speed(iter/s)": 0.201965 }, { "acc": 0.79946365, "epoch": 1.1933682322576882, "grad_norm": 5.71875, "learning_rate": 3.6924371507908695e-06, "loss": 0.71805277, "memory(GiB)": 146.85, "step": 51150, "train_speed(iter/s)": 0.201985 }, { "acc": 0.7669981, "epoch": 1.193601539829977, "grad_norm": 7.0, "learning_rate": 3.690613873475687e-06, "loss": 0.83691216, "memory(GiB)": 146.85, "step": 51160, "train_speed(iter/s)": 0.202006 }, { "acc": 0.7953886, "epoch": 1.193834847402266, "grad_norm": 6.65625, "learning_rate": 3.6887907830749923e-06, "loss": 0.72996855, "memory(GiB)": 146.85, "step": 51170, "train_speed(iter/s)": 0.202026 }, { "acc": 0.78961711, "epoch": 1.1940681549745549, "grad_norm": 5.0, "learning_rate": 3.686967879849033e-06, "loss": 0.74603996, "memory(GiB)": 146.85, "step": 51180, "train_speed(iter/s)": 0.202046 }, { "acc": 0.77072268, "epoch": 1.1943014625468438, "grad_norm": 6.0, "learning_rate": 3.6851451640580264e-06, "loss": 0.84077129, "memory(GiB)": 146.85, "step": 51190, "train_speed(iter/s)": 0.202067 }, { "acc": 0.75437627, "epoch": 1.1945347701191327, "grad_norm": 10.0, "learning_rate": 3.6833226359621668e-06, "loss": 0.88859415, "memory(GiB)": 146.85, "step": 51200, "train_speed(iter/s)": 0.202088 }, { "acc": 0.7592474, "epoch": 1.1947680776914216, "grad_norm": 6.4375, "learning_rate": 3.6815002958216183e-06, "loss": 0.88297691, "memory(GiB)": 146.85, "step": 51210, "train_speed(iter/s)": 0.202108 }, { "acc": 0.78372421, "epoch": 1.1950013852637105, "grad_norm": 4.90625, "learning_rate": 3.67967814389652e-06, "loss": 0.76991892, "memory(GiB)": 146.85, "step": 51220, "train_speed(iter/s)": 0.202128 }, { "acc": 0.78185434, "epoch": 1.1952346928359994, "grad_norm": 4.8125, "learning_rate": 3.6778561804469825e-06, "loss": 0.78849592, "memory(GiB)": 146.85, "step": 51230, "train_speed(iter/s)": 0.202148 }, { "acc": 0.76781206, "epoch": 1.1954680004082883, "grad_norm": 6.15625, "learning_rate": 3.676034405733092e-06, "loss": 0.82253771, "memory(GiB)": 146.85, "step": 51240, "train_speed(iter/s)": 0.202169 }, { "acc": 0.7654108, "epoch": 1.1957013079805772, "grad_norm": 5.28125, "learning_rate": 3.6742128200149042e-06, "loss": 0.86093521, "memory(GiB)": 146.85, "step": 51250, "train_speed(iter/s)": 0.202189 }, { "acc": 0.79221544, "epoch": 1.195934615552866, "grad_norm": 5.21875, "learning_rate": 3.672391423552451e-06, "loss": 0.76711001, "memory(GiB)": 146.85, "step": 51260, "train_speed(iter/s)": 0.202209 }, { "acc": 0.75826073, "epoch": 1.196167923125155, "grad_norm": 5.78125, "learning_rate": 3.6705702166057366e-06, "loss": 0.89045687, "memory(GiB)": 146.85, "step": 51270, "train_speed(iter/s)": 0.20223 }, { "acc": 0.77297244, "epoch": 1.1964012306974439, "grad_norm": 5.4375, "learning_rate": 3.668749199434738e-06, "loss": 0.82173281, "memory(GiB)": 146.85, "step": 51280, "train_speed(iter/s)": 0.202249 }, { "acc": 0.77335086, "epoch": 1.1966345382697328, "grad_norm": 6.375, "learning_rate": 3.6669283722994054e-06, "loss": 0.82509499, "memory(GiB)": 146.85, "step": 51290, "train_speed(iter/s)": 0.202269 }, { "acc": 0.7820282, "epoch": 1.1968678458420217, "grad_norm": 5.28125, "learning_rate": 3.6651077354596586e-06, "loss": 0.79955559, "memory(GiB)": 146.85, "step": 51300, "train_speed(iter/s)": 0.20229 }, { "acc": 0.76851163, "epoch": 1.1971011534143106, "grad_norm": 6.0625, "learning_rate": 3.6632872891753956e-06, "loss": 0.82214136, "memory(GiB)": 146.85, "step": 51310, "train_speed(iter/s)": 0.20231 }, { "acc": 0.77890878, "epoch": 1.1973344609865995, "grad_norm": 5.8125, "learning_rate": 3.661467033706483e-06, "loss": 0.78753185, "memory(GiB)": 146.85, "step": 51320, "train_speed(iter/s)": 0.202332 }, { "acc": 0.75586872, "epoch": 1.1975677685588884, "grad_norm": 7.53125, "learning_rate": 3.6596469693127636e-06, "loss": 0.91812963, "memory(GiB)": 146.85, "step": 51330, "train_speed(iter/s)": 0.202353 }, { "acc": 0.79322696, "epoch": 1.1978010761311773, "grad_norm": 6.875, "learning_rate": 3.6578270962540506e-06, "loss": 0.74097395, "memory(GiB)": 146.85, "step": 51340, "train_speed(iter/s)": 0.202373 }, { "acc": 0.76391869, "epoch": 1.1980343837034662, "grad_norm": 8.3125, "learning_rate": 3.6560074147901287e-06, "loss": 0.86135025, "memory(GiB)": 146.85, "step": 51350, "train_speed(iter/s)": 0.202394 }, { "acc": 0.78991079, "epoch": 1.198267691275755, "grad_norm": 8.5625, "learning_rate": 3.654187925180758e-06, "loss": 0.75335159, "memory(GiB)": 146.85, "step": 51360, "train_speed(iter/s)": 0.202415 }, { "acc": 0.78008356, "epoch": 1.198500998848044, "grad_norm": 3.796875, "learning_rate": 3.65236862768567e-06, "loss": 0.79729328, "memory(GiB)": 146.85, "step": 51370, "train_speed(iter/s)": 0.202434 }, { "acc": 0.79708185, "epoch": 1.1987343064203329, "grad_norm": 5.90625, "learning_rate": 3.650549522564569e-06, "loss": 0.71570287, "memory(GiB)": 146.85, "step": 51380, "train_speed(iter/s)": 0.202455 }, { "acc": 0.77166228, "epoch": 1.1989676139926218, "grad_norm": 7.8125, "learning_rate": 3.648730610077131e-06, "loss": 0.82307472, "memory(GiB)": 146.85, "step": 51390, "train_speed(iter/s)": 0.202474 }, { "acc": 0.77303467, "epoch": 1.1992009215649104, "grad_norm": 5.375, "learning_rate": 3.646911890483006e-06, "loss": 0.84150524, "memory(GiB)": 146.85, "step": 51400, "train_speed(iter/s)": 0.202495 }, { "acc": 0.76757812, "epoch": 1.1994342291371995, "grad_norm": 5.1875, "learning_rate": 3.645093364041815e-06, "loss": 0.84812746, "memory(GiB)": 146.85, "step": 51410, "train_speed(iter/s)": 0.202516 }, { "acc": 0.76242485, "epoch": 1.1996675367094882, "grad_norm": 4.84375, "learning_rate": 3.6432750310131537e-06, "loss": 0.84945717, "memory(GiB)": 146.85, "step": 51420, "train_speed(iter/s)": 0.202536 }, { "acc": 0.76738882, "epoch": 1.1999008442817771, "grad_norm": 5.90625, "learning_rate": 3.6414568916565884e-06, "loss": 0.82944546, "memory(GiB)": 146.85, "step": 51430, "train_speed(iter/s)": 0.202556 }, { "acc": 0.76989183, "epoch": 1.200134151854066, "grad_norm": 5.4375, "learning_rate": 3.6396389462316558e-06, "loss": 0.84020195, "memory(GiB)": 146.85, "step": 51440, "train_speed(iter/s)": 0.202577 }, { "acc": 0.77402267, "epoch": 1.200367459426355, "grad_norm": 5.25, "learning_rate": 3.6378211949978693e-06, "loss": 0.8048357, "memory(GiB)": 146.85, "step": 51450, "train_speed(iter/s)": 0.202598 }, { "acc": 0.77169371, "epoch": 1.2006007669986438, "grad_norm": 5.5, "learning_rate": 3.6360036382147117e-06, "loss": 0.83554754, "memory(GiB)": 146.85, "step": 51460, "train_speed(iter/s)": 0.202619 }, { "acc": 0.78332891, "epoch": 1.2008340745709327, "grad_norm": 6.90625, "learning_rate": 3.634186276141638e-06, "loss": 0.77204418, "memory(GiB)": 146.85, "step": 51470, "train_speed(iter/s)": 0.202641 }, { "acc": 0.75625095, "epoch": 1.2010673821432216, "grad_norm": 7.3125, "learning_rate": 3.6323691090380756e-06, "loss": 0.87282553, "memory(GiB)": 146.85, "step": 51480, "train_speed(iter/s)": 0.202662 }, { "acc": 0.75988035, "epoch": 1.2013006897155105, "grad_norm": 5.8125, "learning_rate": 3.630552137163427e-06, "loss": 0.88680477, "memory(GiB)": 146.85, "step": 51490, "train_speed(iter/s)": 0.202681 }, { "acc": 0.783358, "epoch": 1.2015339972877994, "grad_norm": 5.71875, "learning_rate": 3.6287353607770613e-06, "loss": 0.77926369, "memory(GiB)": 146.85, "step": 51500, "train_speed(iter/s)": 0.202702 }, { "epoch": 1.2015339972877994, "eval_acc": 0.7351458055689112, "eval_loss": 0.834151029586792, "eval_runtime": 1262.66, "eval_samples_per_second": 28.504, "eval_steps_per_second": 14.252, "step": 51500 }, { "acc": 0.7731946, "epoch": 1.2017673048600883, "grad_norm": 6.09375, "learning_rate": 3.6269187801383267e-06, "loss": 0.81724195, "memory(GiB)": 146.85, "step": 51510, "train_speed(iter/s)": 0.2017 }, { "acc": 0.76135902, "epoch": 1.2020006124323772, "grad_norm": 7.53125, "learning_rate": 3.6251023955065356e-06, "loss": 0.86959858, "memory(GiB)": 146.85, "step": 51520, "train_speed(iter/s)": 0.201721 }, { "acc": 0.77618675, "epoch": 1.2022339200046661, "grad_norm": 4.90625, "learning_rate": 3.623286207140979e-06, "loss": 0.82121258, "memory(GiB)": 146.85, "step": 51530, "train_speed(iter/s)": 0.201741 }, { "acc": 0.7693697, "epoch": 1.202467227576955, "grad_norm": 6.25, "learning_rate": 3.6214702153009157e-06, "loss": 0.84339085, "memory(GiB)": 146.85, "step": 51540, "train_speed(iter/s)": 0.201761 }, { "acc": 0.7724658, "epoch": 1.202700535149244, "grad_norm": 9.0, "learning_rate": 3.6196544202455787e-06, "loss": 0.82317371, "memory(GiB)": 146.85, "step": 51550, "train_speed(iter/s)": 0.201781 }, { "acc": 0.77284636, "epoch": 1.2029338427215328, "grad_norm": 4.71875, "learning_rate": 3.617838822234175e-06, "loss": 0.83494778, "memory(GiB)": 146.85, "step": 51560, "train_speed(iter/s)": 0.201801 }, { "acc": 0.77272568, "epoch": 1.2031671502938217, "grad_norm": 4.9375, "learning_rate": 3.616023421525875e-06, "loss": 0.8270937, "memory(GiB)": 146.85, "step": 51570, "train_speed(iter/s)": 0.201822 }, { "acc": 0.76886587, "epoch": 1.2034004578661106, "grad_norm": 5.1875, "learning_rate": 3.61420821837983e-06, "loss": 0.83855762, "memory(GiB)": 146.85, "step": 51580, "train_speed(iter/s)": 0.201842 }, { "acc": 0.78193326, "epoch": 1.2036337654383995, "grad_norm": 3.875, "learning_rate": 3.61239321305516e-06, "loss": 0.77478194, "memory(GiB)": 146.85, "step": 51590, "train_speed(iter/s)": 0.201862 }, { "acc": 0.75355611, "epoch": 1.2038670730106884, "grad_norm": 8.8125, "learning_rate": 3.610578405810955e-06, "loss": 0.89257164, "memory(GiB)": 146.85, "step": 51600, "train_speed(iter/s)": 0.201882 }, { "acc": 0.77738862, "epoch": 1.2041003805829773, "grad_norm": 4.5, "learning_rate": 3.6087637969062783e-06, "loss": 0.81275005, "memory(GiB)": 146.85, "step": 51610, "train_speed(iter/s)": 0.201903 }, { "acc": 0.78326244, "epoch": 1.2043336881552662, "grad_norm": 6.375, "learning_rate": 3.606949386600166e-06, "loss": 0.78826675, "memory(GiB)": 146.85, "step": 51620, "train_speed(iter/s)": 0.201924 }, { "acc": 0.75870981, "epoch": 1.204566995727555, "grad_norm": 7.1875, "learning_rate": 3.605135175151624e-06, "loss": 0.88881989, "memory(GiB)": 146.85, "step": 51630, "train_speed(iter/s)": 0.201945 }, { "acc": 0.78206415, "epoch": 1.204800303299844, "grad_norm": 6.78125, "learning_rate": 3.6033211628196308e-06, "loss": 0.7626842, "memory(GiB)": 146.85, "step": 51640, "train_speed(iter/s)": 0.201965 }, { "acc": 0.75750027, "epoch": 1.205033610872133, "grad_norm": 5.84375, "learning_rate": 3.601507349863137e-06, "loss": 0.87601061, "memory(GiB)": 146.85, "step": 51650, "train_speed(iter/s)": 0.201985 }, { "acc": 0.76714573, "epoch": 1.2052669184444218, "grad_norm": 7.375, "learning_rate": 3.599693736541061e-06, "loss": 0.837115, "memory(GiB)": 146.85, "step": 51660, "train_speed(iter/s)": 0.202004 }, { "acc": 0.75826426, "epoch": 1.2055002260167107, "grad_norm": 6.65625, "learning_rate": 3.5978803231122977e-06, "loss": 0.90191545, "memory(GiB)": 146.85, "step": 51670, "train_speed(iter/s)": 0.202024 }, { "acc": 0.76391621, "epoch": 1.2057335335889996, "grad_norm": 4.875, "learning_rate": 3.596067109835713e-06, "loss": 0.84386101, "memory(GiB)": 146.85, "step": 51680, "train_speed(iter/s)": 0.202042 }, { "acc": 0.77304773, "epoch": 1.2059668411612885, "grad_norm": 4.65625, "learning_rate": 3.5942540969701386e-06, "loss": 0.81269083, "memory(GiB)": 146.85, "step": 51690, "train_speed(iter/s)": 0.202061 }, { "acc": 0.76480541, "epoch": 1.2062001487335774, "grad_norm": 6.5, "learning_rate": 3.592441284774383e-06, "loss": 0.83629131, "memory(GiB)": 146.85, "step": 51700, "train_speed(iter/s)": 0.202081 }, { "acc": 0.76559682, "epoch": 1.2064334563058663, "grad_norm": 5.25, "learning_rate": 3.5906286735072255e-06, "loss": 0.83764734, "memory(GiB)": 146.85, "step": 51710, "train_speed(iter/s)": 0.202099 }, { "acc": 0.76090918, "epoch": 1.2066667638781552, "grad_norm": 9.375, "learning_rate": 3.5888162634274154e-06, "loss": 0.84825974, "memory(GiB)": 146.85, "step": 51720, "train_speed(iter/s)": 0.202121 }, { "acc": 0.77577796, "epoch": 1.206900071450444, "grad_norm": 5.8125, "learning_rate": 3.5870040547936748e-06, "loss": 0.80779419, "memory(GiB)": 146.85, "step": 51730, "train_speed(iter/s)": 0.202142 }, { "acc": 0.75735855, "epoch": 1.207133379022733, "grad_norm": 5.96875, "learning_rate": 3.585192047864694e-06, "loss": 0.89662457, "memory(GiB)": 146.85, "step": 51740, "train_speed(iter/s)": 0.202163 }, { "acc": 0.76667719, "epoch": 1.2073666865950219, "grad_norm": 7.625, "learning_rate": 3.5833802428991373e-06, "loss": 0.84809704, "memory(GiB)": 146.85, "step": 51750, "train_speed(iter/s)": 0.202184 }, { "acc": 0.76876311, "epoch": 1.2075999941673108, "grad_norm": 6.0625, "learning_rate": 3.581568640155639e-06, "loss": 0.81898661, "memory(GiB)": 146.85, "step": 51760, "train_speed(iter/s)": 0.202205 }, { "acc": 0.76747084, "epoch": 1.2078333017395995, "grad_norm": 6.875, "learning_rate": 3.5797572398928053e-06, "loss": 0.84730473, "memory(GiB)": 146.85, "step": 51770, "train_speed(iter/s)": 0.202226 }, { "acc": 0.76435194, "epoch": 1.2080666093118886, "grad_norm": 6.125, "learning_rate": 3.5779460423692136e-06, "loss": 0.84718561, "memory(GiB)": 146.85, "step": 51780, "train_speed(iter/s)": 0.202247 }, { "acc": 0.7579318, "epoch": 1.2082999168841773, "grad_norm": 5.0, "learning_rate": 3.5761350478434133e-06, "loss": 0.88392, "memory(GiB)": 146.85, "step": 51790, "train_speed(iter/s)": 0.202267 }, { "acc": 0.77070875, "epoch": 1.2085332244564664, "grad_norm": 4.84375, "learning_rate": 3.5743242565739183e-06, "loss": 0.82411423, "memory(GiB)": 146.85, "step": 51800, "train_speed(iter/s)": 0.202287 }, { "acc": 0.76780386, "epoch": 1.208766532028755, "grad_norm": 7.59375, "learning_rate": 3.572513668819223e-06, "loss": 0.82400846, "memory(GiB)": 146.85, "step": 51810, "train_speed(iter/s)": 0.202306 }, { "acc": 0.7627656, "epoch": 1.208999839601044, "grad_norm": 7.65625, "learning_rate": 3.570703284837786e-06, "loss": 0.86987686, "memory(GiB)": 146.85, "step": 51820, "train_speed(iter/s)": 0.202325 }, { "acc": 0.76755009, "epoch": 1.2092331471733329, "grad_norm": 8.625, "learning_rate": 3.5688931048880397e-06, "loss": 0.83952122, "memory(GiB)": 146.85, "step": 51830, "train_speed(iter/s)": 0.202344 }, { "acc": 0.75672951, "epoch": 1.2094664547456218, "grad_norm": 6.5625, "learning_rate": 3.567083129228387e-06, "loss": 0.86297092, "memory(GiB)": 146.85, "step": 51840, "train_speed(iter/s)": 0.202364 }, { "acc": 0.76761851, "epoch": 1.2096997623179107, "grad_norm": 5.5625, "learning_rate": 3.5652733581172015e-06, "loss": 0.8455883, "memory(GiB)": 146.85, "step": 51850, "train_speed(iter/s)": 0.202384 }, { "acc": 0.76675472, "epoch": 1.2099330698901996, "grad_norm": 5.625, "learning_rate": 3.5634637918128267e-06, "loss": 0.83858023, "memory(GiB)": 146.85, "step": 51860, "train_speed(iter/s)": 0.202405 }, { "acc": 0.76654668, "epoch": 1.2101663774624885, "grad_norm": 4.84375, "learning_rate": 3.56165443057358e-06, "loss": 0.82820034, "memory(GiB)": 146.85, "step": 51870, "train_speed(iter/s)": 0.202424 }, { "acc": 0.78509636, "epoch": 1.2103996850347774, "grad_norm": 5.03125, "learning_rate": 3.5598452746577443e-06, "loss": 0.76307173, "memory(GiB)": 146.85, "step": 51880, "train_speed(iter/s)": 0.202445 }, { "acc": 0.77281685, "epoch": 1.2106329926070662, "grad_norm": 5.75, "learning_rate": 3.5580363243235773e-06, "loss": 0.81090488, "memory(GiB)": 146.85, "step": 51890, "train_speed(iter/s)": 0.202465 }, { "acc": 0.79191999, "epoch": 1.2108663001793551, "grad_norm": 4.78125, "learning_rate": 3.556227579829306e-06, "loss": 0.73416433, "memory(GiB)": 146.85, "step": 51900, "train_speed(iter/s)": 0.202485 }, { "acc": 0.76582108, "epoch": 1.211099607751644, "grad_norm": 5.625, "learning_rate": 3.5544190414331305e-06, "loss": 0.84261112, "memory(GiB)": 146.85, "step": 51910, "train_speed(iter/s)": 0.202505 }, { "acc": 0.77354321, "epoch": 1.211332915323933, "grad_norm": 5.40625, "learning_rate": 3.552610709393215e-06, "loss": 0.81788731, "memory(GiB)": 146.85, "step": 51920, "train_speed(iter/s)": 0.202527 }, { "acc": 0.75894985, "epoch": 1.2115662228962218, "grad_norm": 7.03125, "learning_rate": 3.5508025839676997e-06, "loss": 0.88219547, "memory(GiB)": 146.85, "step": 51930, "train_speed(iter/s)": 0.202548 }, { "acc": 0.7874835, "epoch": 1.2117995304685107, "grad_norm": 7.84375, "learning_rate": 3.5489946654146945e-06, "loss": 0.77628264, "memory(GiB)": 146.85, "step": 51940, "train_speed(iter/s)": 0.202569 }, { "acc": 0.77332716, "epoch": 1.2120328380407996, "grad_norm": 6.4375, "learning_rate": 3.547186953992281e-06, "loss": 0.81260071, "memory(GiB)": 146.85, "step": 51950, "train_speed(iter/s)": 0.202589 }, { "acc": 0.77266178, "epoch": 1.2122661456130885, "grad_norm": 8.6875, "learning_rate": 3.5453794499585057e-06, "loss": 0.80839033, "memory(GiB)": 146.85, "step": 51960, "train_speed(iter/s)": 0.20261 }, { "acc": 0.76482878, "epoch": 1.2124994531853774, "grad_norm": 4.9375, "learning_rate": 3.543572153571393e-06, "loss": 0.83725452, "memory(GiB)": 146.85, "step": 51970, "train_speed(iter/s)": 0.202629 }, { "acc": 0.77374458, "epoch": 1.2127327607576663, "grad_norm": 4.90625, "learning_rate": 3.541765065088931e-06, "loss": 0.81205845, "memory(GiB)": 146.85, "step": 51980, "train_speed(iter/s)": 0.202649 }, { "acc": 0.76786509, "epoch": 1.2129660683299552, "grad_norm": 7.71875, "learning_rate": 3.539958184769082e-06, "loss": 0.84158936, "memory(GiB)": 146.85, "step": 51990, "train_speed(iter/s)": 0.20267 }, { "acc": 0.78923521, "epoch": 1.2131993759022441, "grad_norm": 6.875, "learning_rate": 3.53815151286978e-06, "loss": 0.77972083, "memory(GiB)": 146.85, "step": 52000, "train_speed(iter/s)": 0.202692 }, { "epoch": 1.2131993759022441, "eval_acc": 0.7350928851012749, "eval_loss": 0.8341500163078308, "eval_runtime": 1263.9469, "eval_samples_per_second": 28.475, "eval_steps_per_second": 14.238, "step": 52000 }, { "acc": 0.76294403, "epoch": 1.213432683474533, "grad_norm": 9.625, "learning_rate": 3.536345049648924e-06, "loss": 0.85498629, "memory(GiB)": 146.85, "step": 52010, "train_speed(iter/s)": 0.201699 }, { "acc": 0.77892404, "epoch": 1.213665991046822, "grad_norm": 7.0, "learning_rate": 3.5345387953643872e-06, "loss": 0.80390615, "memory(GiB)": 146.85, "step": 52020, "train_speed(iter/s)": 0.201719 }, { "acc": 0.78202062, "epoch": 1.2138992986191108, "grad_norm": 6.09375, "learning_rate": 3.5327327502740114e-06, "loss": 0.79501572, "memory(GiB)": 146.85, "step": 52030, "train_speed(iter/s)": 0.201739 }, { "acc": 0.78324752, "epoch": 1.2141326061913997, "grad_norm": 6.40625, "learning_rate": 3.5309269146356097e-06, "loss": 0.77866039, "memory(GiB)": 146.85, "step": 52040, "train_speed(iter/s)": 0.201759 }, { "acc": 0.75336332, "epoch": 1.2143659137636886, "grad_norm": 5.9375, "learning_rate": 3.5291212887069624e-06, "loss": 0.90225544, "memory(GiB)": 146.85, "step": 52050, "train_speed(iter/s)": 0.201777 }, { "acc": 0.76436143, "epoch": 1.2145992213359775, "grad_norm": 8.9375, "learning_rate": 3.5273158727458253e-06, "loss": 0.8432765, "memory(GiB)": 146.85, "step": 52060, "train_speed(iter/s)": 0.201797 }, { "acc": 0.75676622, "epoch": 1.2148325289082664, "grad_norm": 5.625, "learning_rate": 3.5255106670099186e-06, "loss": 0.88601742, "memory(GiB)": 146.85, "step": 52070, "train_speed(iter/s)": 0.201817 }, { "acc": 0.77279124, "epoch": 1.2150658364805553, "grad_norm": 4.9375, "learning_rate": 3.5237056717569363e-06, "loss": 0.83730736, "memory(GiB)": 146.85, "step": 52080, "train_speed(iter/s)": 0.201834 }, { "acc": 0.77533393, "epoch": 1.2152991440528442, "grad_norm": 13.0625, "learning_rate": 3.5219008872445414e-06, "loss": 0.8002634, "memory(GiB)": 146.85, "step": 52090, "train_speed(iter/s)": 0.201854 }, { "acc": 0.7641552, "epoch": 1.2155324516251331, "grad_norm": 5.34375, "learning_rate": 3.5200963137303644e-06, "loss": 0.84990377, "memory(GiB)": 146.85, "step": 52100, "train_speed(iter/s)": 0.201873 }, { "acc": 0.76347895, "epoch": 1.215765759197422, "grad_norm": 4.15625, "learning_rate": 3.5182919514720087e-06, "loss": 0.84473619, "memory(GiB)": 146.85, "step": 52110, "train_speed(iter/s)": 0.201892 }, { "acc": 0.77328453, "epoch": 1.215999066769711, "grad_norm": 5.875, "learning_rate": 3.5164878007270464e-06, "loss": 0.82426205, "memory(GiB)": 146.85, "step": 52120, "train_speed(iter/s)": 0.201912 }, { "acc": 0.77108364, "epoch": 1.2162323743419998, "grad_norm": 7.4375, "learning_rate": 3.5146838617530197e-06, "loss": 0.82396164, "memory(GiB)": 146.85, "step": 52130, "train_speed(iter/s)": 0.201932 }, { "acc": 0.76817279, "epoch": 1.2164656819142887, "grad_norm": 6.6875, "learning_rate": 3.5128801348074426e-06, "loss": 0.83505831, "memory(GiB)": 146.85, "step": 52140, "train_speed(iter/s)": 0.201952 }, { "acc": 0.77047491, "epoch": 1.2166989894865776, "grad_norm": 7.75, "learning_rate": 3.511076620147792e-06, "loss": 0.83046913, "memory(GiB)": 146.85, "step": 52150, "train_speed(iter/s)": 0.201973 }, { "acc": 0.78926706, "epoch": 1.2169322970588663, "grad_norm": 4.1875, "learning_rate": 3.5092733180315206e-06, "loss": 0.77577963, "memory(GiB)": 146.85, "step": 52160, "train_speed(iter/s)": 0.201992 }, { "acc": 0.78175206, "epoch": 1.2171656046311554, "grad_norm": 4.3125, "learning_rate": 3.5074702287160523e-06, "loss": 0.77069716, "memory(GiB)": 146.85, "step": 52170, "train_speed(iter/s)": 0.202012 }, { "acc": 0.77346592, "epoch": 1.217398912203444, "grad_norm": 5.375, "learning_rate": 3.5056673524587733e-06, "loss": 0.84413118, "memory(GiB)": 146.85, "step": 52180, "train_speed(iter/s)": 0.202031 }, { "acc": 0.78412561, "epoch": 1.2176322197757332, "grad_norm": 4.5, "learning_rate": 3.503864689517046e-06, "loss": 0.76995568, "memory(GiB)": 146.85, "step": 52190, "train_speed(iter/s)": 0.202051 }, { "acc": 0.77056427, "epoch": 1.217865527348022, "grad_norm": 7.15625, "learning_rate": 3.5020622401481996e-06, "loss": 0.82346363, "memory(GiB)": 146.85, "step": 52200, "train_speed(iter/s)": 0.202071 }, { "acc": 0.7589201, "epoch": 1.2180988349203108, "grad_norm": 6.0625, "learning_rate": 3.500260004609533e-06, "loss": 0.86447334, "memory(GiB)": 146.85, "step": 52210, "train_speed(iter/s)": 0.202092 }, { "acc": 0.75976181, "epoch": 1.2183321424925997, "grad_norm": 5.0, "learning_rate": 3.4984579831583166e-06, "loss": 0.86661816, "memory(GiB)": 146.85, "step": 52220, "train_speed(iter/s)": 0.202113 }, { "acc": 0.77649055, "epoch": 1.2185654500648886, "grad_norm": 6.09375, "learning_rate": 3.4966561760517852e-06, "loss": 0.8323988, "memory(GiB)": 146.85, "step": 52230, "train_speed(iter/s)": 0.202134 }, { "acc": 0.78055267, "epoch": 1.2187987576371775, "grad_norm": 5.0, "learning_rate": 3.494854583547148e-06, "loss": 0.78928046, "memory(GiB)": 146.85, "step": 52240, "train_speed(iter/s)": 0.202153 }, { "acc": 0.79675789, "epoch": 1.2190320652094664, "grad_norm": 5.46875, "learning_rate": 3.4930532059015845e-06, "loss": 0.74164867, "memory(GiB)": 146.85, "step": 52250, "train_speed(iter/s)": 0.202173 }, { "acc": 0.7620235, "epoch": 1.2192653727817553, "grad_norm": 8.0, "learning_rate": 3.491252043372236e-06, "loss": 0.85065699, "memory(GiB)": 146.85, "step": 52260, "train_speed(iter/s)": 0.202193 }, { "acc": 0.75897632, "epoch": 1.2194986803540442, "grad_norm": 8.375, "learning_rate": 3.4894510962162194e-06, "loss": 0.87336617, "memory(GiB)": 146.85, "step": 52270, "train_speed(iter/s)": 0.202212 }, { "acc": 0.764818, "epoch": 1.219731987926333, "grad_norm": 7.96875, "learning_rate": 3.4876503646906203e-06, "loss": 0.84698057, "memory(GiB)": 146.85, "step": 52280, "train_speed(iter/s)": 0.20223 }, { "acc": 0.77502728, "epoch": 1.219965295498622, "grad_norm": 5.3125, "learning_rate": 3.4858498490524924e-06, "loss": 0.81349239, "memory(GiB)": 146.85, "step": 52290, "train_speed(iter/s)": 0.20225 }, { "acc": 0.75475116, "epoch": 1.2201986030709109, "grad_norm": 7.0, "learning_rate": 3.4840495495588593e-06, "loss": 0.90797195, "memory(GiB)": 146.85, "step": 52300, "train_speed(iter/s)": 0.202271 }, { "acc": 0.78326855, "epoch": 1.2204319106431998, "grad_norm": 4.3125, "learning_rate": 3.4822494664667117e-06, "loss": 0.78288345, "memory(GiB)": 146.85, "step": 52310, "train_speed(iter/s)": 0.202291 }, { "acc": 0.78666439, "epoch": 1.2206652182154887, "grad_norm": 5.84375, "learning_rate": 3.4804496000330124e-06, "loss": 0.75715799, "memory(GiB)": 146.85, "step": 52320, "train_speed(iter/s)": 0.20231 }, { "acc": 0.77658567, "epoch": 1.2208985257877776, "grad_norm": 9.0, "learning_rate": 3.478649950514691e-06, "loss": 0.82606525, "memory(GiB)": 146.85, "step": 52330, "train_speed(iter/s)": 0.20233 }, { "acc": 0.796838, "epoch": 1.2211318333600665, "grad_norm": 6.96875, "learning_rate": 3.4768505181686468e-06, "loss": 0.71504908, "memory(GiB)": 146.85, "step": 52340, "train_speed(iter/s)": 0.202347 }, { "acc": 0.77142668, "epoch": 1.2213651409323554, "grad_norm": 5.40625, "learning_rate": 3.4750513032517493e-06, "loss": 0.82355976, "memory(GiB)": 146.85, "step": 52350, "train_speed(iter/s)": 0.202369 }, { "acc": 0.77235928, "epoch": 1.2215984485046443, "grad_norm": 5.46875, "learning_rate": 3.473252306020837e-06, "loss": 0.825739, "memory(GiB)": 146.85, "step": 52360, "train_speed(iter/s)": 0.202389 }, { "acc": 0.76214023, "epoch": 1.2218317560769332, "grad_norm": 7.78125, "learning_rate": 3.471453526732712e-06, "loss": 0.85070057, "memory(GiB)": 146.85, "step": 52370, "train_speed(iter/s)": 0.20241 }, { "acc": 0.77140326, "epoch": 1.222065063649222, "grad_norm": 5.4375, "learning_rate": 3.4696549656441537e-06, "loss": 0.83244505, "memory(GiB)": 146.85, "step": 52380, "train_speed(iter/s)": 0.20243 }, { "acc": 0.77287989, "epoch": 1.222298371221511, "grad_norm": 8.375, "learning_rate": 3.467856623011903e-06, "loss": 0.81844816, "memory(GiB)": 146.85, "step": 52390, "train_speed(iter/s)": 0.20245 }, { "acc": 0.78394594, "epoch": 1.2225316787937999, "grad_norm": 5.1875, "learning_rate": 3.4660584990926748e-06, "loss": 0.76994739, "memory(GiB)": 146.85, "step": 52400, "train_speed(iter/s)": 0.202469 }, { "acc": 0.78399963, "epoch": 1.2227649863660888, "grad_norm": 5.6875, "learning_rate": 3.4642605941431494e-06, "loss": 0.78652477, "memory(GiB)": 146.85, "step": 52410, "train_speed(iter/s)": 0.202488 }, { "acc": 0.75711508, "epoch": 1.2229982939383777, "grad_norm": 5.90625, "learning_rate": 3.462462908419979e-06, "loss": 0.88251019, "memory(GiB)": 146.85, "step": 52420, "train_speed(iter/s)": 0.202507 }, { "acc": 0.76979942, "epoch": 1.2232316015106666, "grad_norm": 6.59375, "learning_rate": 3.4606654421797814e-06, "loss": 0.82017441, "memory(GiB)": 146.85, "step": 52430, "train_speed(iter/s)": 0.202525 }, { "acc": 0.7729579, "epoch": 1.2234649090829555, "grad_norm": 4.6875, "learning_rate": 3.458868195679146e-06, "loss": 0.82674732, "memory(GiB)": 146.85, "step": 52440, "train_speed(iter/s)": 0.202545 }, { "acc": 0.77847676, "epoch": 1.2236982166552444, "grad_norm": 6.53125, "learning_rate": 3.4570711691746262e-06, "loss": 0.8204565, "memory(GiB)": 146.85, "step": 52450, "train_speed(iter/s)": 0.202566 }, { "acc": 0.77991037, "epoch": 1.2239315242275333, "grad_norm": 5.53125, "learning_rate": 3.4552743629227494e-06, "loss": 0.80491562, "memory(GiB)": 146.85, "step": 52460, "train_speed(iter/s)": 0.202586 }, { "acc": 0.76759186, "epoch": 1.2241648317998222, "grad_norm": 7.9375, "learning_rate": 3.4534777771800083e-06, "loss": 0.82272797, "memory(GiB)": 146.85, "step": 52470, "train_speed(iter/s)": 0.202606 }, { "acc": 0.77200913, "epoch": 1.224398139372111, "grad_norm": 6.09375, "learning_rate": 3.4516814122028676e-06, "loss": 0.83715696, "memory(GiB)": 146.85, "step": 52480, "train_speed(iter/s)": 0.202627 }, { "acc": 0.78658581, "epoch": 1.2246314469444, "grad_norm": 8.8125, "learning_rate": 3.449885268247753e-06, "loss": 0.75408335, "memory(GiB)": 146.85, "step": 52490, "train_speed(iter/s)": 0.202647 }, { "acc": 0.76842971, "epoch": 1.2248647545166889, "grad_norm": 5.84375, "learning_rate": 3.448089345571066e-06, "loss": 0.83543873, "memory(GiB)": 146.85, "step": 52500, "train_speed(iter/s)": 0.202667 }, { "epoch": 1.2248647545166889, "eval_acc": 0.735149193769583, "eval_loss": 0.834112286567688, "eval_runtime": 1264.1905, "eval_samples_per_second": 28.47, "eval_steps_per_second": 14.235, "step": 52500 }, { "acc": 0.77021832, "epoch": 1.2250980620889778, "grad_norm": 4.34375, "learning_rate": 3.4462936444291744e-06, "loss": 0.8162653, "memory(GiB)": 146.85, "step": 52510, "train_speed(iter/s)": 0.201683 }, { "acc": 0.7881959, "epoch": 1.2253313696612667, "grad_norm": 5.28125, "learning_rate": 3.4444981650784147e-06, "loss": 0.74899731, "memory(GiB)": 146.85, "step": 52520, "train_speed(iter/s)": 0.201703 }, { "acc": 0.75969715, "epoch": 1.2255646772335556, "grad_norm": 5.84375, "learning_rate": 3.4427029077750895e-06, "loss": 0.86235905, "memory(GiB)": 146.85, "step": 52530, "train_speed(iter/s)": 0.201723 }, { "acc": 0.7794055, "epoch": 1.2257979848058445, "grad_norm": 4.46875, "learning_rate": 3.4409078727754707e-06, "loss": 0.78630972, "memory(GiB)": 146.85, "step": 52540, "train_speed(iter/s)": 0.201742 }, { "acc": 0.78808484, "epoch": 1.2260312923781331, "grad_norm": 6.3125, "learning_rate": 3.4391130603358013e-06, "loss": 0.74802799, "memory(GiB)": 146.85, "step": 52550, "train_speed(iter/s)": 0.201763 }, { "acc": 0.78713694, "epoch": 1.2262645999504223, "grad_norm": 9.1875, "learning_rate": 3.4373184707122886e-06, "loss": 0.77774706, "memory(GiB)": 146.85, "step": 52560, "train_speed(iter/s)": 0.201783 }, { "acc": 0.7831069, "epoch": 1.226497907522711, "grad_norm": 4.75, "learning_rate": 3.4355241041611096e-06, "loss": 0.78779888, "memory(GiB)": 146.85, "step": 52570, "train_speed(iter/s)": 0.201803 }, { "acc": 0.7851615, "epoch": 1.226731215095, "grad_norm": 5.1875, "learning_rate": 3.4337299609384122e-06, "loss": 0.78409839, "memory(GiB)": 146.85, "step": 52580, "train_speed(iter/s)": 0.201822 }, { "acc": 0.76864052, "epoch": 1.2269645226672887, "grad_norm": 5.78125, "learning_rate": 3.431936041300308e-06, "loss": 0.83551159, "memory(GiB)": 146.85, "step": 52590, "train_speed(iter/s)": 0.201842 }, { "acc": 0.78193073, "epoch": 1.2271978302395776, "grad_norm": 5.4375, "learning_rate": 3.4301423455028777e-06, "loss": 0.77788014, "memory(GiB)": 146.85, "step": 52600, "train_speed(iter/s)": 0.201863 }, { "acc": 0.76638165, "epoch": 1.2274311378118665, "grad_norm": 3.953125, "learning_rate": 3.4283488738021707e-06, "loss": 0.83701611, "memory(GiB)": 146.85, "step": 52610, "train_speed(iter/s)": 0.201882 }, { "acc": 0.79850626, "epoch": 1.2276644453841554, "grad_norm": 6.40625, "learning_rate": 3.4265556264542054e-06, "loss": 0.72191477, "memory(GiB)": 146.85, "step": 52620, "train_speed(iter/s)": 0.201902 }, { "acc": 0.76453872, "epoch": 1.2278977529564443, "grad_norm": 5.65625, "learning_rate": 3.424762603714967e-06, "loss": 0.86852894, "memory(GiB)": 146.85, "step": 52630, "train_speed(iter/s)": 0.201922 }, { "acc": 0.76114368, "epoch": 1.2281310605287332, "grad_norm": 4.28125, "learning_rate": 3.4229698058404106e-06, "loss": 0.87119951, "memory(GiB)": 146.85, "step": 52640, "train_speed(iter/s)": 0.201941 }, { "acc": 0.76344671, "epoch": 1.2283643681010221, "grad_norm": 5.875, "learning_rate": 3.4211772330864552e-06, "loss": 0.85837097, "memory(GiB)": 146.85, "step": 52650, "train_speed(iter/s)": 0.201961 }, { "acc": 0.76751413, "epoch": 1.228597675673311, "grad_norm": 7.25, "learning_rate": 3.4193848857089924e-06, "loss": 0.85126009, "memory(GiB)": 146.85, "step": 52660, "train_speed(iter/s)": 0.201982 }, { "acc": 0.77551003, "epoch": 1.2288309832456, "grad_norm": 5.125, "learning_rate": 3.4175927639638767e-06, "loss": 0.8102397, "memory(GiB)": 146.85, "step": 52670, "train_speed(iter/s)": 0.202002 }, { "acc": 0.77409954, "epoch": 1.2290642908178888, "grad_norm": 6.40625, "learning_rate": 3.4158008681069343e-06, "loss": 0.8317194, "memory(GiB)": 146.85, "step": 52680, "train_speed(iter/s)": 0.202021 }, { "acc": 0.79311428, "epoch": 1.2292975983901777, "grad_norm": 6.09375, "learning_rate": 3.4140091983939584e-06, "loss": 0.73895669, "memory(GiB)": 146.85, "step": 52690, "train_speed(iter/s)": 0.202039 }, { "acc": 0.76913948, "epoch": 1.2295309059624666, "grad_norm": 6.09375, "learning_rate": 3.4122177550807077e-06, "loss": 0.84187679, "memory(GiB)": 146.85, "step": 52700, "train_speed(iter/s)": 0.20206 }, { "acc": 0.77148428, "epoch": 1.2297642135347555, "grad_norm": 5.3125, "learning_rate": 3.410426538422914e-06, "loss": 0.82883615, "memory(GiB)": 146.85, "step": 52710, "train_speed(iter/s)": 0.20208 }, { "acc": 0.77532091, "epoch": 1.2299975211070444, "grad_norm": 5.40625, "learning_rate": 3.4086355486762678e-06, "loss": 0.79093642, "memory(GiB)": 146.85, "step": 52720, "train_speed(iter/s)": 0.202099 }, { "acc": 0.76098032, "epoch": 1.2302308286793333, "grad_norm": 8.125, "learning_rate": 3.406844786096435e-06, "loss": 0.88272142, "memory(GiB)": 146.85, "step": 52730, "train_speed(iter/s)": 0.202119 }, { "acc": 0.7846817, "epoch": 1.2304641362516222, "grad_norm": 5.125, "learning_rate": 3.405054250939047e-06, "loss": 0.77597084, "memory(GiB)": 146.85, "step": 52740, "train_speed(iter/s)": 0.202138 }, { "acc": 0.77835865, "epoch": 1.230697443823911, "grad_norm": 6.125, "learning_rate": 3.4032639434597003e-06, "loss": 0.80097065, "memory(GiB)": 146.85, "step": 52750, "train_speed(iter/s)": 0.202158 }, { "acc": 0.76799669, "epoch": 1.2309307513962, "grad_norm": 6.03125, "learning_rate": 3.4014738639139622e-06, "loss": 0.84409904, "memory(GiB)": 146.85, "step": 52760, "train_speed(iter/s)": 0.202176 }, { "acc": 0.78781376, "epoch": 1.231164058968489, "grad_norm": 10.125, "learning_rate": 3.399684012557365e-06, "loss": 0.74820051, "memory(GiB)": 146.85, "step": 52770, "train_speed(iter/s)": 0.202195 }, { "acc": 0.78284831, "epoch": 1.2313973665407778, "grad_norm": 5.53125, "learning_rate": 3.3978943896454107e-06, "loss": 0.77194114, "memory(GiB)": 146.85, "step": 52780, "train_speed(iter/s)": 0.202215 }, { "acc": 0.77800045, "epoch": 1.2316306741130667, "grad_norm": 5.5, "learning_rate": 3.396104995433567e-06, "loss": 0.8062274, "memory(GiB)": 146.85, "step": 52790, "train_speed(iter/s)": 0.202234 }, { "acc": 0.7899353, "epoch": 1.2318639816853556, "grad_norm": 5.90625, "learning_rate": 3.3943158301772695e-06, "loss": 0.76196146, "memory(GiB)": 146.85, "step": 52800, "train_speed(iter/s)": 0.202255 }, { "acc": 0.78126822, "epoch": 1.2320972892576445, "grad_norm": 4.09375, "learning_rate": 3.39252689413192e-06, "loss": 0.8051116, "memory(GiB)": 146.85, "step": 52810, "train_speed(iter/s)": 0.202272 }, { "acc": 0.76382732, "epoch": 1.2323305968299334, "grad_norm": 5.0, "learning_rate": 3.3907381875528916e-06, "loss": 0.85982838, "memory(GiB)": 146.85, "step": 52820, "train_speed(iter/s)": 0.202291 }, { "acc": 0.76054592, "epoch": 1.2325639044022223, "grad_norm": 6.625, "learning_rate": 3.388949710695517e-06, "loss": 0.86466722, "memory(GiB)": 146.85, "step": 52830, "train_speed(iter/s)": 0.20231 }, { "acc": 0.77413235, "epoch": 1.2327972119745112, "grad_norm": 8.875, "learning_rate": 3.387161463815104e-06, "loss": 0.81193943, "memory(GiB)": 146.85, "step": 52840, "train_speed(iter/s)": 0.202331 }, { "acc": 0.76620088, "epoch": 1.2330305195468, "grad_norm": 5.84375, "learning_rate": 3.3853734471669232e-06, "loss": 0.86475191, "memory(GiB)": 146.85, "step": 52850, "train_speed(iter/s)": 0.202351 }, { "acc": 0.77478571, "epoch": 1.233263827119089, "grad_norm": 6.0, "learning_rate": 3.3835856610062135e-06, "loss": 0.80274258, "memory(GiB)": 146.85, "step": 52860, "train_speed(iter/s)": 0.202371 }, { "acc": 0.76600904, "epoch": 1.233497134691378, "grad_norm": 10.4375, "learning_rate": 3.381798105588181e-06, "loss": 0.84471436, "memory(GiB)": 146.85, "step": 52870, "train_speed(iter/s)": 0.202391 }, { "acc": 0.76447897, "epoch": 1.2337304422636668, "grad_norm": 4.5625, "learning_rate": 3.3800107811680004e-06, "loss": 0.85363617, "memory(GiB)": 146.85, "step": 52880, "train_speed(iter/s)": 0.202411 }, { "acc": 0.7679472, "epoch": 1.2339637498359557, "grad_norm": 6.25, "learning_rate": 3.378223688000809e-06, "loss": 0.84352207, "memory(GiB)": 146.85, "step": 52890, "train_speed(iter/s)": 0.20243 }, { "acc": 0.78237433, "epoch": 1.2341970574082446, "grad_norm": 5.75, "learning_rate": 3.3764368263417146e-06, "loss": 0.77372065, "memory(GiB)": 146.85, "step": 52900, "train_speed(iter/s)": 0.202451 }, { "acc": 0.77183762, "epoch": 1.2344303649805335, "grad_norm": 4.90625, "learning_rate": 3.3746501964457916e-06, "loss": 0.81571312, "memory(GiB)": 146.85, "step": 52910, "train_speed(iter/s)": 0.202471 }, { "acc": 0.77752481, "epoch": 1.2346636725528224, "grad_norm": 5.5625, "learning_rate": 3.3728637985680814e-06, "loss": 0.80983, "memory(GiB)": 146.85, "step": 52920, "train_speed(iter/s)": 0.202492 }, { "acc": 0.77725639, "epoch": 1.2348969801251113, "grad_norm": 7.3125, "learning_rate": 3.371077632963592e-06, "loss": 0.78868165, "memory(GiB)": 146.85, "step": 52930, "train_speed(iter/s)": 0.202511 }, { "acc": 0.76785927, "epoch": 1.2351302876974, "grad_norm": 5.0625, "learning_rate": 3.3692916998872972e-06, "loss": 0.82834129, "memory(GiB)": 146.85, "step": 52940, "train_speed(iter/s)": 0.202531 }, { "acc": 0.7776124, "epoch": 1.235363595269689, "grad_norm": 6.125, "learning_rate": 3.367505999594138e-06, "loss": 0.79924755, "memory(GiB)": 146.85, "step": 52950, "train_speed(iter/s)": 0.202551 }, { "acc": 0.78238173, "epoch": 1.2355969028419778, "grad_norm": 5.96875, "learning_rate": 3.3657205323390234e-06, "loss": 0.78575335, "memory(GiB)": 146.85, "step": 52960, "train_speed(iter/s)": 0.202571 }, { "acc": 0.758289, "epoch": 1.2358302104142667, "grad_norm": 4.75, "learning_rate": 3.3639352983768276e-06, "loss": 0.85949373, "memory(GiB)": 146.85, "step": 52970, "train_speed(iter/s)": 0.202593 }, { "acc": 0.78400393, "epoch": 1.2360635179865556, "grad_norm": 24.0, "learning_rate": 3.3621502979623923e-06, "loss": 0.76790409, "memory(GiB)": 146.85, "step": 52980, "train_speed(iter/s)": 0.202613 }, { "acc": 0.77554359, "epoch": 1.2362968255588445, "grad_norm": 8.625, "learning_rate": 3.360365531350527e-06, "loss": 0.80532789, "memory(GiB)": 146.85, "step": 52990, "train_speed(iter/s)": 0.202633 }, { "acc": 0.78944521, "epoch": 1.2365301331311334, "grad_norm": 6.90625, "learning_rate": 3.358580998796005e-06, "loss": 0.76329374, "memory(GiB)": 146.85, "step": 53000, "train_speed(iter/s)": 0.202653 }, { "epoch": 1.2365301331311334, "eval_acc": 0.7351414493109045, "eval_loss": 0.8340766429901123, "eval_runtime": 1264.7226, "eval_samples_per_second": 28.458, "eval_steps_per_second": 14.229, "step": 53000 }, { "acc": 0.78784676, "epoch": 1.2367634407034223, "grad_norm": 6.28125, "learning_rate": 3.3567967005535696e-06, "loss": 0.78339796, "memory(GiB)": 146.85, "step": 53010, "train_speed(iter/s)": 0.201678 }, { "acc": 0.76128788, "epoch": 1.2369967482757112, "grad_norm": 5.9375, "learning_rate": 3.355012636877927e-06, "loss": 0.86769619, "memory(GiB)": 146.85, "step": 53020, "train_speed(iter/s)": 0.2017 }, { "acc": 0.78394904, "epoch": 1.237230055848, "grad_norm": 5.125, "learning_rate": 3.353228808023752e-06, "loss": 0.7594265, "memory(GiB)": 146.85, "step": 53030, "train_speed(iter/s)": 0.201719 }, { "acc": 0.77916117, "epoch": 1.237463363420289, "grad_norm": 4.71875, "learning_rate": 3.351445214245687e-06, "loss": 0.80081501, "memory(GiB)": 146.85, "step": 53040, "train_speed(iter/s)": 0.201739 }, { "acc": 0.78294168, "epoch": 1.2376966709925779, "grad_norm": 5.46875, "learning_rate": 3.3496618557983405e-06, "loss": 0.7694715, "memory(GiB)": 146.85, "step": 53050, "train_speed(iter/s)": 0.201759 }, { "acc": 0.78922291, "epoch": 1.2379299785648668, "grad_norm": 5.46875, "learning_rate": 3.347878732936283e-06, "loss": 0.75546889, "memory(GiB)": 146.85, "step": 53060, "train_speed(iter/s)": 0.201778 }, { "acc": 0.75561113, "epoch": 1.2381632861371556, "grad_norm": 5.03125, "learning_rate": 3.346095845914056e-06, "loss": 0.89398479, "memory(GiB)": 146.85, "step": 53070, "train_speed(iter/s)": 0.201799 }, { "acc": 0.77292871, "epoch": 1.2383965937094445, "grad_norm": 5.78125, "learning_rate": 3.3443131949861667e-06, "loss": 0.81336288, "memory(GiB)": 146.85, "step": 53080, "train_speed(iter/s)": 0.201819 }, { "acc": 0.77215662, "epoch": 1.2386299012817334, "grad_norm": 5.25, "learning_rate": 3.3425307804070896e-06, "loss": 0.81215267, "memory(GiB)": 146.85, "step": 53090, "train_speed(iter/s)": 0.201839 }, { "acc": 0.77536941, "epoch": 1.2388632088540223, "grad_norm": 6.46875, "learning_rate": 3.3407486024312596e-06, "loss": 0.80265417, "memory(GiB)": 146.85, "step": 53100, "train_speed(iter/s)": 0.20186 }, { "acc": 0.7861454, "epoch": 1.2390965164263112, "grad_norm": 6.71875, "learning_rate": 3.3389666613130856e-06, "loss": 0.76984448, "memory(GiB)": 146.85, "step": 53110, "train_speed(iter/s)": 0.20188 }, { "acc": 0.79287729, "epoch": 1.2393298239986001, "grad_norm": 4.46875, "learning_rate": 3.337184957306938e-06, "loss": 0.75857878, "memory(GiB)": 146.85, "step": 53120, "train_speed(iter/s)": 0.2019 }, { "acc": 0.76653109, "epoch": 1.239563131570889, "grad_norm": 6.625, "learning_rate": 3.3354034906671545e-06, "loss": 0.85420284, "memory(GiB)": 146.85, "step": 53130, "train_speed(iter/s)": 0.20192 }, { "acc": 0.78523722, "epoch": 1.239796439143178, "grad_norm": 5.65625, "learning_rate": 3.333622261648039e-06, "loss": 0.76102419, "memory(GiB)": 146.85, "step": 53140, "train_speed(iter/s)": 0.201939 }, { "acc": 0.79054103, "epoch": 1.2400297467154668, "grad_norm": 4.34375, "learning_rate": 3.3318412705038626e-06, "loss": 0.75044031, "memory(GiB)": 146.85, "step": 53150, "train_speed(iter/s)": 0.201958 }, { "acc": 0.76410127, "epoch": 1.2402630542877557, "grad_norm": 5.84375, "learning_rate": 3.330060517488861e-06, "loss": 0.85848598, "memory(GiB)": 146.85, "step": 53160, "train_speed(iter/s)": 0.201979 }, { "acc": 0.76792212, "epoch": 1.2404963618600446, "grad_norm": 6.78125, "learning_rate": 3.328280002857234e-06, "loss": 0.85650959, "memory(GiB)": 146.85, "step": 53170, "train_speed(iter/s)": 0.201999 }, { "acc": 0.76822076, "epoch": 1.2407296694323335, "grad_norm": 6.59375, "learning_rate": 3.3264997268631515e-06, "loss": 0.84947853, "memory(GiB)": 146.85, "step": 53180, "train_speed(iter/s)": 0.202019 }, { "acc": 0.77728419, "epoch": 1.2409629770046224, "grad_norm": 5.71875, "learning_rate": 3.324719689760746e-06, "loss": 0.81324921, "memory(GiB)": 146.85, "step": 53190, "train_speed(iter/s)": 0.202039 }, { "acc": 0.74915724, "epoch": 1.2411962845769113, "grad_norm": 11.25, "learning_rate": 3.3229398918041184e-06, "loss": 0.91622992, "memory(GiB)": 146.85, "step": 53200, "train_speed(iter/s)": 0.202059 }, { "acc": 0.78314896, "epoch": 1.2414295921492002, "grad_norm": 5.96875, "learning_rate": 3.321160333247334e-06, "loss": 0.78785071, "memory(GiB)": 146.85, "step": 53210, "train_speed(iter/s)": 0.202078 }, { "acc": 0.75772934, "epoch": 1.2416628997214891, "grad_norm": 5.8125, "learning_rate": 3.319381014344424e-06, "loss": 0.87384892, "memory(GiB)": 146.85, "step": 53220, "train_speed(iter/s)": 0.202097 }, { "acc": 0.78264542, "epoch": 1.241896207293778, "grad_norm": 6.40625, "learning_rate": 3.3176019353493873e-06, "loss": 0.77114725, "memory(GiB)": 146.85, "step": 53230, "train_speed(iter/s)": 0.202117 }, { "acc": 0.73832331, "epoch": 1.242129514866067, "grad_norm": 7.34375, "learning_rate": 3.315823096516184e-06, "loss": 0.96363182, "memory(GiB)": 146.85, "step": 53240, "train_speed(iter/s)": 0.202137 }, { "acc": 0.77827358, "epoch": 1.2423628224383558, "grad_norm": 7.0, "learning_rate": 3.314044498098745e-06, "loss": 0.78635254, "memory(GiB)": 146.85, "step": 53250, "train_speed(iter/s)": 0.202155 }, { "acc": 0.79461684, "epoch": 1.2425961300106447, "grad_norm": 5.71875, "learning_rate": 3.3122661403509643e-06, "loss": 0.7259737, "memory(GiB)": 146.85, "step": 53260, "train_speed(iter/s)": 0.202173 }, { "acc": 0.76879396, "epoch": 1.2428294375829336, "grad_norm": 5.625, "learning_rate": 3.3104880235267014e-06, "loss": 0.82705498, "memory(GiB)": 146.85, "step": 53270, "train_speed(iter/s)": 0.202193 }, { "acc": 0.75617552, "epoch": 1.2430627451552225, "grad_norm": 8.4375, "learning_rate": 3.3087101478797846e-06, "loss": 0.87891588, "memory(GiB)": 146.85, "step": 53280, "train_speed(iter/s)": 0.202212 }, { "acc": 0.7734499, "epoch": 1.2432960527275114, "grad_norm": 6.125, "learning_rate": 3.3069325136640007e-06, "loss": 0.83169127, "memory(GiB)": 146.85, "step": 53290, "train_speed(iter/s)": 0.202231 }, { "acc": 0.77115417, "epoch": 1.2435293602998003, "grad_norm": 6.34375, "learning_rate": 3.305155121133109e-06, "loss": 0.84065475, "memory(GiB)": 146.85, "step": 53300, "train_speed(iter/s)": 0.202251 }, { "acc": 0.76978312, "epoch": 1.243762667872089, "grad_norm": 6.96875, "learning_rate": 3.303377970540832e-06, "loss": 0.83196449, "memory(GiB)": 146.85, "step": 53310, "train_speed(iter/s)": 0.202271 }, { "acc": 0.76479902, "epoch": 1.2439959754443781, "grad_norm": 8.4375, "learning_rate": 3.3016010621408558e-06, "loss": 0.86299982, "memory(GiB)": 146.85, "step": 53320, "train_speed(iter/s)": 0.20229 }, { "acc": 0.78929644, "epoch": 1.2442292830166668, "grad_norm": 5.875, "learning_rate": 3.299824396186835e-06, "loss": 0.77098255, "memory(GiB)": 146.85, "step": 53330, "train_speed(iter/s)": 0.202308 }, { "acc": 0.79076824, "epoch": 1.244462590588956, "grad_norm": 4.28125, "learning_rate": 3.2980479729323867e-06, "loss": 0.75044756, "memory(GiB)": 146.85, "step": 53340, "train_speed(iter/s)": 0.202328 }, { "acc": 0.76722736, "epoch": 1.2446958981612446, "grad_norm": 7.6875, "learning_rate": 3.2962717926310966e-06, "loss": 0.83425121, "memory(GiB)": 146.85, "step": 53350, "train_speed(iter/s)": 0.202348 }, { "acc": 0.76628909, "epoch": 1.2449292057335335, "grad_norm": 4.625, "learning_rate": 3.2944958555365135e-06, "loss": 0.8266778, "memory(GiB)": 146.85, "step": 53360, "train_speed(iter/s)": 0.202368 }, { "acc": 0.77633314, "epoch": 1.2451625133058224, "grad_norm": 7.4375, "learning_rate": 3.292720161902152e-06, "loss": 0.79606762, "memory(GiB)": 146.85, "step": 53370, "train_speed(iter/s)": 0.202387 }, { "acc": 0.78187056, "epoch": 1.2453958208781113, "grad_norm": 6.40625, "learning_rate": 3.2909447119814907e-06, "loss": 0.79851375, "memory(GiB)": 146.85, "step": 53380, "train_speed(iter/s)": 0.202408 }, { "acc": 0.77362518, "epoch": 1.2456291284504002, "grad_norm": 6.09375, "learning_rate": 3.289169506027977e-06, "loss": 0.82295132, "memory(GiB)": 146.85, "step": 53390, "train_speed(iter/s)": 0.202427 }, { "acc": 0.76630669, "epoch": 1.245862436022689, "grad_norm": 5.15625, "learning_rate": 3.287394544295018e-06, "loss": 0.85046568, "memory(GiB)": 146.85, "step": 53400, "train_speed(iter/s)": 0.202447 }, { "acc": 0.77045002, "epoch": 1.246095743594978, "grad_norm": 8.0, "learning_rate": 3.2856198270359895e-06, "loss": 0.82994499, "memory(GiB)": 146.85, "step": 53410, "train_speed(iter/s)": 0.202466 }, { "acc": 0.79774637, "epoch": 1.2463290511672669, "grad_norm": 4.3125, "learning_rate": 3.2838453545042326e-06, "loss": 0.71843872, "memory(GiB)": 146.85, "step": 53420, "train_speed(iter/s)": 0.202485 }, { "acc": 0.78066521, "epoch": 1.2465623587395558, "grad_norm": 5.5625, "learning_rate": 3.2820711269530535e-06, "loss": 0.78684707, "memory(GiB)": 146.85, "step": 53430, "train_speed(iter/s)": 0.202504 }, { "acc": 0.77510123, "epoch": 1.2467956663118447, "grad_norm": 5.75, "learning_rate": 3.280297144635721e-06, "loss": 0.80519562, "memory(GiB)": 146.85, "step": 53440, "train_speed(iter/s)": 0.202524 }, { "acc": 0.7606636, "epoch": 1.2470289738841336, "grad_norm": 6.1875, "learning_rate": 3.278523407805474e-06, "loss": 0.87189341, "memory(GiB)": 146.85, "step": 53450, "train_speed(iter/s)": 0.202545 }, { "acc": 0.76305456, "epoch": 1.2472622814564225, "grad_norm": 5.5625, "learning_rate": 3.276749916715508e-06, "loss": 0.84630022, "memory(GiB)": 146.85, "step": 53460, "train_speed(iter/s)": 0.202566 }, { "acc": 0.77565093, "epoch": 1.2474955890287114, "grad_norm": 9.0, "learning_rate": 3.274976671618992e-06, "loss": 0.79410505, "memory(GiB)": 146.85, "step": 53470, "train_speed(iter/s)": 0.202586 }, { "acc": 0.77333083, "epoch": 1.2477288966010003, "grad_norm": 5.59375, "learning_rate": 3.2732036727690543e-06, "loss": 0.81930656, "memory(GiB)": 146.85, "step": 53480, "train_speed(iter/s)": 0.202606 }, { "acc": 0.78297315, "epoch": 1.2479622041732892, "grad_norm": 4.53125, "learning_rate": 3.2714309204187905e-06, "loss": 0.78778787, "memory(GiB)": 146.85, "step": 53490, "train_speed(iter/s)": 0.202625 }, { "acc": 0.76547847, "epoch": 1.248195511745578, "grad_norm": 5.375, "learning_rate": 3.2696584148212606e-06, "loss": 0.84722252, "memory(GiB)": 146.85, "step": 53500, "train_speed(iter/s)": 0.202645 }, { "epoch": 1.248195511745578, "eval_acc": 0.735144353482909, "eval_loss": 0.8341158628463745, "eval_runtime": 1264.3167, "eval_samples_per_second": 28.467, "eval_steps_per_second": 14.234, "step": 53500 }, { "acc": 0.78304901, "epoch": 1.248428819317867, "grad_norm": 6.78125, "learning_rate": 3.2678861562294916e-06, "loss": 0.78769054, "memory(GiB)": 146.85, "step": 53510, "train_speed(iter/s)": 0.201679 }, { "acc": 0.79847193, "epoch": 1.2486621268901559, "grad_norm": 6.46875, "learning_rate": 3.2661141448964688e-06, "loss": 0.73158264, "memory(GiB)": 146.85, "step": 53520, "train_speed(iter/s)": 0.201698 }, { "acc": 0.77289267, "epoch": 1.2488954344624448, "grad_norm": 4.71875, "learning_rate": 3.2643423810751497e-06, "loss": 0.81023407, "memory(GiB)": 146.85, "step": 53530, "train_speed(iter/s)": 0.20172 }, { "acc": 0.76297178, "epoch": 1.2491287420347337, "grad_norm": 5.65625, "learning_rate": 3.2625708650184496e-06, "loss": 0.87260904, "memory(GiB)": 146.85, "step": 53540, "train_speed(iter/s)": 0.20174 }, { "acc": 0.79476147, "epoch": 1.2493620496070226, "grad_norm": 5.8125, "learning_rate": 3.260799596979254e-06, "loss": 0.76445394, "memory(GiB)": 146.85, "step": 53550, "train_speed(iter/s)": 0.20176 }, { "acc": 0.78898287, "epoch": 1.2495953571793115, "grad_norm": 5.59375, "learning_rate": 3.25902857721041e-06, "loss": 0.7774684, "memory(GiB)": 146.85, "step": 53560, "train_speed(iter/s)": 0.201779 }, { "acc": 0.75639124, "epoch": 1.2498286647516004, "grad_norm": 6.65625, "learning_rate": 3.257257805964732e-06, "loss": 0.8745307, "memory(GiB)": 146.85, "step": 53570, "train_speed(iter/s)": 0.2018 }, { "acc": 0.76230025, "epoch": 1.2500619723238893, "grad_norm": 6.15625, "learning_rate": 3.255487283494995e-06, "loss": 0.86926765, "memory(GiB)": 146.85, "step": 53580, "train_speed(iter/s)": 0.201821 }, { "acc": 0.77916803, "epoch": 1.2502952798961782, "grad_norm": 6.46875, "learning_rate": 3.253717010053943e-06, "loss": 0.79105759, "memory(GiB)": 146.85, "step": 53590, "train_speed(iter/s)": 0.201839 }, { "acc": 0.74656134, "epoch": 1.250528587468467, "grad_norm": 5.15625, "learning_rate": 3.25194698589428e-06, "loss": 0.9319047, "memory(GiB)": 146.85, "step": 53600, "train_speed(iter/s)": 0.201857 }, { "acc": 0.75667834, "epoch": 1.250761895040756, "grad_norm": 4.96875, "learning_rate": 3.2501772112686757e-06, "loss": 0.88050251, "memory(GiB)": 146.85, "step": 53610, "train_speed(iter/s)": 0.201877 }, { "acc": 0.77794223, "epoch": 1.2509952026130449, "grad_norm": 5.5625, "learning_rate": 3.2484076864297687e-06, "loss": 0.79231749, "memory(GiB)": 146.85, "step": 53620, "train_speed(iter/s)": 0.201897 }, { "acc": 0.76518679, "epoch": 1.2512285101853338, "grad_norm": 6.09375, "learning_rate": 3.246638411630154e-06, "loss": 0.85035934, "memory(GiB)": 146.85, "step": 53630, "train_speed(iter/s)": 0.201917 }, { "acc": 0.76422338, "epoch": 1.2514618177576227, "grad_norm": 8.25, "learning_rate": 3.2448693871223968e-06, "loss": 0.82832785, "memory(GiB)": 146.85, "step": 53640, "train_speed(iter/s)": 0.201936 }, { "acc": 0.76787233, "epoch": 1.2516951253299116, "grad_norm": 6.28125, "learning_rate": 3.2431006131590244e-06, "loss": 0.83890743, "memory(GiB)": 146.85, "step": 53650, "train_speed(iter/s)": 0.201955 }, { "acc": 0.77475672, "epoch": 1.2519284329022005, "grad_norm": 5.3125, "learning_rate": 3.2413320899925287e-06, "loss": 0.80839062, "memory(GiB)": 146.85, "step": 53660, "train_speed(iter/s)": 0.201975 }, { "acc": 0.79345541, "epoch": 1.2521617404744894, "grad_norm": 6.25, "learning_rate": 3.2395638178753673e-06, "loss": 0.74076796, "memory(GiB)": 146.85, "step": 53670, "train_speed(iter/s)": 0.201995 }, { "acc": 0.77657228, "epoch": 1.252395048046778, "grad_norm": 5.5625, "learning_rate": 3.2377957970599594e-06, "loss": 0.81450081, "memory(GiB)": 146.85, "step": 53680, "train_speed(iter/s)": 0.202014 }, { "acc": 0.781001, "epoch": 1.2526283556190672, "grad_norm": 6.15625, "learning_rate": 3.2360280277986887e-06, "loss": 0.78348627, "memory(GiB)": 146.85, "step": 53690, "train_speed(iter/s)": 0.202034 }, { "acc": 0.76449132, "epoch": 1.2528616631913558, "grad_norm": 5.59375, "learning_rate": 3.234260510343905e-06, "loss": 0.87568741, "memory(GiB)": 146.85, "step": 53700, "train_speed(iter/s)": 0.202055 }, { "acc": 0.76867933, "epoch": 1.253094970763645, "grad_norm": 5.5625, "learning_rate": 3.23249324494792e-06, "loss": 0.83469486, "memory(GiB)": 146.85, "step": 53710, "train_speed(iter/s)": 0.202075 }, { "acc": 0.76899319, "epoch": 1.2533282783359336, "grad_norm": 5.65625, "learning_rate": 3.230726231863013e-06, "loss": 0.83415918, "memory(GiB)": 146.85, "step": 53720, "train_speed(iter/s)": 0.202094 }, { "acc": 0.76486211, "epoch": 1.2535615859082228, "grad_norm": 7.59375, "learning_rate": 3.2289594713414207e-06, "loss": 0.85925713, "memory(GiB)": 146.85, "step": 53730, "train_speed(iter/s)": 0.202113 }, { "acc": 0.77430754, "epoch": 1.2537948934805114, "grad_norm": 9.0625, "learning_rate": 3.2271929636353494e-06, "loss": 0.79187469, "memory(GiB)": 146.85, "step": 53740, "train_speed(iter/s)": 0.202133 }, { "acc": 0.79457598, "epoch": 1.2540282010528006, "grad_norm": 3.828125, "learning_rate": 3.2254267089969688e-06, "loss": 0.72847743, "memory(GiB)": 146.85, "step": 53750, "train_speed(iter/s)": 0.202153 }, { "acc": 0.78111839, "epoch": 1.2542615086250892, "grad_norm": 6.0, "learning_rate": 3.2236607076784086e-06, "loss": 0.77545052, "memory(GiB)": 146.85, "step": 53760, "train_speed(iter/s)": 0.202172 }, { "acc": 0.7626771, "epoch": 1.2544948161973781, "grad_norm": 6.5, "learning_rate": 3.2218949599317664e-06, "loss": 0.85685539, "memory(GiB)": 146.85, "step": 53770, "train_speed(iter/s)": 0.202193 }, { "acc": 0.77774634, "epoch": 1.254728123769667, "grad_norm": 5.53125, "learning_rate": 3.220129466009102e-06, "loss": 0.82173204, "memory(GiB)": 146.85, "step": 53780, "train_speed(iter/s)": 0.202212 }, { "acc": 0.78953171, "epoch": 1.254961431341956, "grad_norm": 5.8125, "learning_rate": 3.2183642261624393e-06, "loss": 0.73661456, "memory(GiB)": 146.85, "step": 53790, "train_speed(iter/s)": 0.202233 }, { "acc": 0.74630451, "epoch": 1.2551947389142448, "grad_norm": 6.53125, "learning_rate": 3.216599240643765e-06, "loss": 0.93164339, "memory(GiB)": 146.85, "step": 53800, "train_speed(iter/s)": 0.202252 }, { "acc": 0.77546234, "epoch": 1.2554280464865337, "grad_norm": 6.84375, "learning_rate": 3.2148345097050332e-06, "loss": 0.81313477, "memory(GiB)": 146.85, "step": 53810, "train_speed(iter/s)": 0.202271 }, { "acc": 0.78138142, "epoch": 1.2556613540588226, "grad_norm": 7.09375, "learning_rate": 3.213070033598155e-06, "loss": 0.80463047, "memory(GiB)": 146.85, "step": 53820, "train_speed(iter/s)": 0.202292 }, { "acc": 0.78350115, "epoch": 1.2558946616311115, "grad_norm": 5.28125, "learning_rate": 3.211305812575011e-06, "loss": 0.77275796, "memory(GiB)": 146.85, "step": 53830, "train_speed(iter/s)": 0.202312 }, { "acc": 0.77172871, "epoch": 1.2561279692034004, "grad_norm": 5.59375, "learning_rate": 3.209541846887442e-06, "loss": 0.83550243, "memory(GiB)": 146.85, "step": 53840, "train_speed(iter/s)": 0.20233 }, { "acc": 0.76745443, "epoch": 1.2563612767756893, "grad_norm": 6.1875, "learning_rate": 3.207778136787256e-06, "loss": 0.82812433, "memory(GiB)": 146.85, "step": 53850, "train_speed(iter/s)": 0.202349 }, { "acc": 0.79264522, "epoch": 1.2565945843479782, "grad_norm": 4.125, "learning_rate": 3.2060146825262196e-06, "loss": 0.75974803, "memory(GiB)": 146.85, "step": 53860, "train_speed(iter/s)": 0.202368 }, { "acc": 0.7780035, "epoch": 1.2568278919202671, "grad_norm": 4.96875, "learning_rate": 3.2042514843560644e-06, "loss": 0.78056231, "memory(GiB)": 146.85, "step": 53870, "train_speed(iter/s)": 0.202389 }, { "acc": 0.76643438, "epoch": 1.257061199492556, "grad_norm": 5.75, "learning_rate": 3.2024885425284893e-06, "loss": 0.84561243, "memory(GiB)": 146.85, "step": 53880, "train_speed(iter/s)": 0.202408 }, { "acc": 0.78000989, "epoch": 1.257294507064845, "grad_norm": 6.8125, "learning_rate": 3.200725857295153e-06, "loss": 0.77617388, "memory(GiB)": 146.85, "step": 53890, "train_speed(iter/s)": 0.202428 }, { "acc": 0.75451632, "epoch": 1.2575278146371338, "grad_norm": 6.25, "learning_rate": 3.1989634289076776e-06, "loss": 0.89144039, "memory(GiB)": 146.85, "step": 53900, "train_speed(iter/s)": 0.202447 }, { "acc": 0.77497234, "epoch": 1.2577611222094227, "grad_norm": 5.5625, "learning_rate": 3.197201257617649e-06, "loss": 0.80287123, "memory(GiB)": 146.85, "step": 53910, "train_speed(iter/s)": 0.202467 }, { "acc": 0.77349391, "epoch": 1.2579944297817116, "grad_norm": 10.5, "learning_rate": 3.195439343676617e-06, "loss": 0.82288685, "memory(GiB)": 146.85, "step": 53920, "train_speed(iter/s)": 0.202487 }, { "acc": 0.76874576, "epoch": 1.2582277373540005, "grad_norm": 5.4375, "learning_rate": 3.1936776873360947e-06, "loss": 0.84688416, "memory(GiB)": 146.85, "step": 53930, "train_speed(iter/s)": 0.202507 }, { "acc": 0.77152591, "epoch": 1.2584610449262894, "grad_norm": 4.9375, "learning_rate": 3.1919162888475586e-06, "loss": 0.81849756, "memory(GiB)": 146.85, "step": 53940, "train_speed(iter/s)": 0.202527 }, { "acc": 0.81220379, "epoch": 1.2586943524985783, "grad_norm": 5.25, "learning_rate": 3.190155148462446e-06, "loss": 0.65648842, "memory(GiB)": 146.85, "step": 53950, "train_speed(iter/s)": 0.202547 }, { "acc": 0.75633907, "epoch": 1.2589276600708672, "grad_norm": 5.5625, "learning_rate": 3.188394266432162e-06, "loss": 0.90841389, "memory(GiB)": 146.85, "step": 53960, "train_speed(iter/s)": 0.202567 }, { "acc": 0.75019326, "epoch": 1.259160967643156, "grad_norm": 7.09375, "learning_rate": 3.186633643008069e-06, "loss": 0.91238089, "memory(GiB)": 146.85, "step": 53970, "train_speed(iter/s)": 0.202587 }, { "acc": 0.75919843, "epoch": 1.259394275215445, "grad_norm": 4.15625, "learning_rate": 3.1848732784414965e-06, "loss": 0.86039982, "memory(GiB)": 146.85, "step": 53980, "train_speed(iter/s)": 0.202606 }, { "acc": 0.77906413, "epoch": 1.259627582787734, "grad_norm": 5.65625, "learning_rate": 3.183113172983736e-06, "loss": 0.80572853, "memory(GiB)": 146.85, "step": 53990, "train_speed(iter/s)": 0.202625 }, { "acc": 0.77344303, "epoch": 1.2598608903600228, "grad_norm": 6.09375, "learning_rate": 3.181353326886042e-06, "loss": 0.81496811, "memory(GiB)": 146.85, "step": 54000, "train_speed(iter/s)": 0.202644 }, { "epoch": 1.2598608903600228, "eval_acc": 0.7351517752558091, "eval_loss": 0.8340672850608826, "eval_runtime": 1264.4201, "eval_samples_per_second": 28.464, "eval_steps_per_second": 14.233, "step": 54000 }, { "acc": 0.79143858, "epoch": 1.2600941979323117, "grad_norm": 5.71875, "learning_rate": 3.1795937403996324e-06, "loss": 0.73304038, "memory(GiB)": 146.85, "step": 54010, "train_speed(iter/s)": 0.201687 }, { "acc": 0.77974954, "epoch": 1.2603275055046006, "grad_norm": 6.15625, "learning_rate": 3.1778344137756887e-06, "loss": 0.78602147, "memory(GiB)": 146.85, "step": 54020, "train_speed(iter/s)": 0.201706 }, { "acc": 0.77399836, "epoch": 1.2605608130768895, "grad_norm": 6.9375, "learning_rate": 3.176075347265352e-06, "loss": 0.8369606, "memory(GiB)": 146.85, "step": 54030, "train_speed(iter/s)": 0.201725 }, { "acc": 0.77964849, "epoch": 1.2607941206491784, "grad_norm": 7.125, "learning_rate": 3.17431654111973e-06, "loss": 0.78693557, "memory(GiB)": 146.85, "step": 54040, "train_speed(iter/s)": 0.201744 }, { "acc": 0.77223716, "epoch": 1.2610274282214673, "grad_norm": 5.6875, "learning_rate": 3.1725579955898904e-06, "loss": 0.82048016, "memory(GiB)": 146.85, "step": 54050, "train_speed(iter/s)": 0.201763 }, { "acc": 0.76624722, "epoch": 1.2612607357937562, "grad_norm": 6.59375, "learning_rate": 3.170799710926867e-06, "loss": 0.84760933, "memory(GiB)": 146.85, "step": 54060, "train_speed(iter/s)": 0.201783 }, { "acc": 0.75739374, "epoch": 1.2614940433660449, "grad_norm": 6.40625, "learning_rate": 3.1690416873816533e-06, "loss": 0.90084095, "memory(GiB)": 146.85, "step": 54070, "train_speed(iter/s)": 0.201802 }, { "acc": 0.77813425, "epoch": 1.261727350938334, "grad_norm": 4.84375, "learning_rate": 3.1672839252052083e-06, "loss": 0.7776588, "memory(GiB)": 146.85, "step": 54080, "train_speed(iter/s)": 0.201822 }, { "acc": 0.77102785, "epoch": 1.2619606585106227, "grad_norm": 5.90625, "learning_rate": 3.165526424648449e-06, "loss": 0.82027016, "memory(GiB)": 146.85, "step": 54090, "train_speed(iter/s)": 0.201841 }, { "acc": 0.76562605, "epoch": 1.2621939660829118, "grad_norm": 6.46875, "learning_rate": 3.1637691859622612e-06, "loss": 0.86024876, "memory(GiB)": 146.85, "step": 54100, "train_speed(iter/s)": 0.201861 }, { "acc": 0.76398263, "epoch": 1.2624272736552005, "grad_norm": 9.0625, "learning_rate": 3.1620122093974864e-06, "loss": 0.87755499, "memory(GiB)": 146.85, "step": 54110, "train_speed(iter/s)": 0.20188 }, { "acc": 0.76979823, "epoch": 1.2626605812274896, "grad_norm": 5.90625, "learning_rate": 3.160255495204936e-06, "loss": 0.82866116, "memory(GiB)": 146.85, "step": 54120, "train_speed(iter/s)": 0.201899 }, { "acc": 0.78603125, "epoch": 1.2628938887997783, "grad_norm": 7.34375, "learning_rate": 3.158499043635378e-06, "loss": 0.77072487, "memory(GiB)": 146.85, "step": 54130, "train_speed(iter/s)": 0.201919 }, { "acc": 0.773563, "epoch": 1.2631271963720674, "grad_norm": 7.96875, "learning_rate": 3.156742854939547e-06, "loss": 0.82741642, "memory(GiB)": 146.85, "step": 54140, "train_speed(iter/s)": 0.201939 }, { "acc": 0.77878332, "epoch": 1.263360503944356, "grad_norm": 6.5, "learning_rate": 3.1549869293681385e-06, "loss": 0.80497513, "memory(GiB)": 146.85, "step": 54150, "train_speed(iter/s)": 0.201959 }, { "acc": 0.79452295, "epoch": 1.263593811516645, "grad_norm": 8.1875, "learning_rate": 3.1532312671718102e-06, "loss": 0.72498398, "memory(GiB)": 146.85, "step": 54160, "train_speed(iter/s)": 0.201979 }, { "acc": 0.74489775, "epoch": 1.2638271190889339, "grad_norm": 4.96875, "learning_rate": 3.1514758686011816e-06, "loss": 0.94619284, "memory(GiB)": 146.85, "step": 54170, "train_speed(iter/s)": 0.201999 }, { "acc": 0.75445623, "epoch": 1.2640604266612228, "grad_norm": 7.59375, "learning_rate": 3.149720733906836e-06, "loss": 0.89108353, "memory(GiB)": 146.85, "step": 54180, "train_speed(iter/s)": 0.202017 }, { "acc": 0.75369987, "epoch": 1.2642937342335117, "grad_norm": 6.0, "learning_rate": 3.1479658633393194e-06, "loss": 0.93283691, "memory(GiB)": 146.85, "step": 54190, "train_speed(iter/s)": 0.202037 }, { "acc": 0.76705256, "epoch": 1.2645270418058006, "grad_norm": 5.9375, "learning_rate": 3.146211257149136e-06, "loss": 0.84018593, "memory(GiB)": 146.85, "step": 54200, "train_speed(iter/s)": 0.202056 }, { "acc": 0.77911558, "epoch": 1.2647603493780895, "grad_norm": 5.3125, "learning_rate": 3.1444569155867573e-06, "loss": 0.79894223, "memory(GiB)": 146.85, "step": 54210, "train_speed(iter/s)": 0.202074 }, { "acc": 0.76606884, "epoch": 1.2649936569503784, "grad_norm": 6.125, "learning_rate": 3.1427028389026147e-06, "loss": 0.86145458, "memory(GiB)": 146.85, "step": 54220, "train_speed(iter/s)": 0.202093 }, { "acc": 0.77536612, "epoch": 1.2652269645226673, "grad_norm": 6.59375, "learning_rate": 3.140949027347102e-06, "loss": 0.8037879, "memory(GiB)": 146.85, "step": 54230, "train_speed(iter/s)": 0.202112 }, { "acc": 0.75945458, "epoch": 1.2654602720949562, "grad_norm": 6.375, "learning_rate": 3.139195481170577e-06, "loss": 0.86415815, "memory(GiB)": 146.85, "step": 54240, "train_speed(iter/s)": 0.20213 }, { "acc": 0.77448816, "epoch": 1.265693579667245, "grad_norm": 6.5625, "learning_rate": 3.1374422006233553e-06, "loss": 0.81621819, "memory(GiB)": 146.85, "step": 54250, "train_speed(iter/s)": 0.20215 }, { "acc": 0.78662758, "epoch": 1.265926887239534, "grad_norm": 8.3125, "learning_rate": 3.1356891859557187e-06, "loss": 0.77362967, "memory(GiB)": 146.85, "step": 54260, "train_speed(iter/s)": 0.20217 }, { "acc": 0.79799013, "epoch": 1.2661601948118228, "grad_norm": 6.0, "learning_rate": 3.1339364374179092e-06, "loss": 0.71059871, "memory(GiB)": 146.85, "step": 54270, "train_speed(iter/s)": 0.202189 }, { "acc": 0.76176929, "epoch": 1.2663935023841117, "grad_norm": 6.1875, "learning_rate": 3.1321839552601308e-06, "loss": 0.8366643, "memory(GiB)": 146.85, "step": 54280, "train_speed(iter/s)": 0.202209 }, { "acc": 0.75749245, "epoch": 1.2666268099564006, "grad_norm": 5.09375, "learning_rate": 3.1304317397325503e-06, "loss": 0.87858019, "memory(GiB)": 146.85, "step": 54290, "train_speed(iter/s)": 0.202228 }, { "acc": 0.79012098, "epoch": 1.2668601175286895, "grad_norm": 7.625, "learning_rate": 3.128679791085297e-06, "loss": 0.74465704, "memory(GiB)": 146.85, "step": 54300, "train_speed(iter/s)": 0.202248 }, { "acc": 0.77378273, "epoch": 1.2670934251009784, "grad_norm": 5.3125, "learning_rate": 3.1269281095684594e-06, "loss": 0.81487122, "memory(GiB)": 146.85, "step": 54310, "train_speed(iter/s)": 0.202268 }, { "acc": 0.76809092, "epoch": 1.2673267326732673, "grad_norm": 6.25, "learning_rate": 3.1251766954320906e-06, "loss": 0.83304977, "memory(GiB)": 146.85, "step": 54320, "train_speed(iter/s)": 0.202286 }, { "acc": 0.79764776, "epoch": 1.2675600402455562, "grad_norm": 6.28125, "learning_rate": 3.123425548926203e-06, "loss": 0.72233815, "memory(GiB)": 146.85, "step": 54330, "train_speed(iter/s)": 0.202306 }, { "acc": 0.76879826, "epoch": 1.2677933478178451, "grad_norm": 5.96875, "learning_rate": 3.121674670300773e-06, "loss": 0.82453098, "memory(GiB)": 146.85, "step": 54340, "train_speed(iter/s)": 0.202325 }, { "acc": 0.77692509, "epoch": 1.268026655390134, "grad_norm": 4.9375, "learning_rate": 3.1199240598057377e-06, "loss": 0.81415186, "memory(GiB)": 146.85, "step": 54350, "train_speed(iter/s)": 0.202345 }, { "acc": 0.75582309, "epoch": 1.268259962962423, "grad_norm": 6.4375, "learning_rate": 3.1181737176909967e-06, "loss": 0.90352678, "memory(GiB)": 146.85, "step": 54360, "train_speed(iter/s)": 0.202365 }, { "acc": 0.76647706, "epoch": 1.2684932705347118, "grad_norm": 6.65625, "learning_rate": 3.116423644206411e-06, "loss": 0.83484497, "memory(GiB)": 146.85, "step": 54370, "train_speed(iter/s)": 0.202385 }, { "acc": 0.75725579, "epoch": 1.2687265781070007, "grad_norm": 6.84375, "learning_rate": 3.1146738396018043e-06, "loss": 0.87051907, "memory(GiB)": 146.85, "step": 54380, "train_speed(iter/s)": 0.202405 }, { "acc": 0.77186131, "epoch": 1.2689598856792896, "grad_norm": 6.09375, "learning_rate": 3.112924304126958e-06, "loss": 0.83053722, "memory(GiB)": 146.85, "step": 54390, "train_speed(iter/s)": 0.202425 }, { "acc": 0.79713173, "epoch": 1.2691931932515785, "grad_norm": 4.71875, "learning_rate": 3.111175038031619e-06, "loss": 0.7172987, "memory(GiB)": 146.85, "step": 54400, "train_speed(iter/s)": 0.202444 }, { "acc": 0.79799614, "epoch": 1.2694265008238674, "grad_norm": 7.0625, "learning_rate": 3.1094260415654955e-06, "loss": 0.72231941, "memory(GiB)": 146.85, "step": 54410, "train_speed(iter/s)": 0.202464 }, { "acc": 0.77209883, "epoch": 1.2696598083961563, "grad_norm": 4.59375, "learning_rate": 3.1076773149782557e-06, "loss": 0.84023371, "memory(GiB)": 146.85, "step": 54420, "train_speed(iter/s)": 0.202484 }, { "acc": 0.7716342, "epoch": 1.2698931159684452, "grad_norm": 5.84375, "learning_rate": 3.105928858519529e-06, "loss": 0.82643967, "memory(GiB)": 146.85, "step": 54430, "train_speed(iter/s)": 0.202503 }, { "acc": 0.76944399, "epoch": 1.270126423540734, "grad_norm": 6.71875, "learning_rate": 3.1041806724389067e-06, "loss": 0.82154684, "memory(GiB)": 146.85, "step": 54440, "train_speed(iter/s)": 0.202522 }, { "acc": 0.78264799, "epoch": 1.270359731113023, "grad_norm": 7.71875, "learning_rate": 3.1024327569859425e-06, "loss": 0.78317227, "memory(GiB)": 146.85, "step": 54450, "train_speed(iter/s)": 0.202541 }, { "acc": 0.76169548, "epoch": 1.2705930386853117, "grad_norm": 6.9375, "learning_rate": 3.1006851124101524e-06, "loss": 0.87836742, "memory(GiB)": 146.85, "step": 54460, "train_speed(iter/s)": 0.202561 }, { "acc": 0.75731239, "epoch": 1.2708263462576008, "grad_norm": 5.71875, "learning_rate": 3.0989377389610097e-06, "loss": 0.87447319, "memory(GiB)": 146.85, "step": 54470, "train_speed(iter/s)": 0.20258 }, { "acc": 0.76959095, "epoch": 1.2710596538298895, "grad_norm": 9.5625, "learning_rate": 3.0971906368879524e-06, "loss": 0.82562265, "memory(GiB)": 146.85, "step": 54480, "train_speed(iter/s)": 0.202599 }, { "acc": 0.79334412, "epoch": 1.2712929614021786, "grad_norm": 5.28125, "learning_rate": 3.095443806440379e-06, "loss": 0.74426484, "memory(GiB)": 146.85, "step": 54490, "train_speed(iter/s)": 0.202617 }, { "acc": 0.78153324, "epoch": 1.2715262689744673, "grad_norm": 4.96875, "learning_rate": 3.0936972478676493e-06, "loss": 0.78611021, "memory(GiB)": 146.85, "step": 54500, "train_speed(iter/s)": 0.202636 }, { "epoch": 1.2715262689744673, "eval_acc": 0.7351817850331882, "eval_loss": 0.834069013595581, "eval_runtime": 1263.4229, "eval_samples_per_second": 28.487, "eval_steps_per_second": 14.244, "step": 54500 }, { "acc": 0.77880702, "epoch": 1.2717595765467564, "grad_norm": 4.78125, "learning_rate": 3.0919509614190836e-06, "loss": 0.77733369, "memory(GiB)": 146.85, "step": 54510, "train_speed(iter/s)": 0.201689 }, { "acc": 0.75859618, "epoch": 1.271992884119045, "grad_norm": 5.125, "learning_rate": 3.0902049473439643e-06, "loss": 0.88021946, "memory(GiB)": 146.85, "step": 54520, "train_speed(iter/s)": 0.201708 }, { "acc": 0.76851807, "epoch": 1.2722261916913342, "grad_norm": 6.03125, "learning_rate": 3.0884592058915342e-06, "loss": 0.83956957, "memory(GiB)": 146.85, "step": 54530, "train_speed(iter/s)": 0.201727 }, { "acc": 0.75603561, "epoch": 1.272459499263623, "grad_norm": 5.59375, "learning_rate": 3.0867137373109972e-06, "loss": 0.89612799, "memory(GiB)": 146.85, "step": 54540, "train_speed(iter/s)": 0.201746 }, { "acc": 0.77847095, "epoch": 1.2726928068359118, "grad_norm": 4.8125, "learning_rate": 3.0849685418515174e-06, "loss": 0.81282024, "memory(GiB)": 146.85, "step": 54550, "train_speed(iter/s)": 0.201766 }, { "acc": 0.77102261, "epoch": 1.2729261144082007, "grad_norm": 6.28125, "learning_rate": 3.0832236197622223e-06, "loss": 0.84057312, "memory(GiB)": 146.85, "step": 54560, "train_speed(iter/s)": 0.201786 }, { "acc": 0.78804746, "epoch": 1.2731594219804896, "grad_norm": 5.53125, "learning_rate": 3.0814789712921977e-06, "loss": 0.74874516, "memory(GiB)": 146.85, "step": 54570, "train_speed(iter/s)": 0.201806 }, { "acc": 0.78272896, "epoch": 1.2733927295527785, "grad_norm": 6.0625, "learning_rate": 3.0797345966904933e-06, "loss": 0.76815882, "memory(GiB)": 146.85, "step": 54580, "train_speed(iter/s)": 0.201825 }, { "acc": 0.7783288, "epoch": 1.2736260371250674, "grad_norm": 6.125, "learning_rate": 3.0779904962061173e-06, "loss": 0.79085927, "memory(GiB)": 146.85, "step": 54590, "train_speed(iter/s)": 0.201843 }, { "acc": 0.77046165, "epoch": 1.2738593446973563, "grad_norm": 5.40625, "learning_rate": 3.076246670088041e-06, "loss": 0.83913689, "memory(GiB)": 146.85, "step": 54600, "train_speed(iter/s)": 0.201861 }, { "acc": 0.79034243, "epoch": 1.2740926522696452, "grad_norm": 4.625, "learning_rate": 3.074503118585192e-06, "loss": 0.77294607, "memory(GiB)": 146.85, "step": 54610, "train_speed(iter/s)": 0.201881 }, { "acc": 0.75485811, "epoch": 1.274325959841934, "grad_norm": 5.0, "learning_rate": 3.072759841946464e-06, "loss": 0.90005989, "memory(GiB)": 146.85, "step": 54620, "train_speed(iter/s)": 0.201901 }, { "acc": 0.76026249, "epoch": 1.274559267414223, "grad_norm": 6.625, "learning_rate": 3.0710168404207086e-06, "loss": 0.85896692, "memory(GiB)": 146.85, "step": 54630, "train_speed(iter/s)": 0.20192 }, { "acc": 0.75860019, "epoch": 1.2747925749865119, "grad_norm": 5.59375, "learning_rate": 3.0692741142567385e-06, "loss": 0.86254444, "memory(GiB)": 146.85, "step": 54640, "train_speed(iter/s)": 0.201941 }, { "acc": 0.78105164, "epoch": 1.2750258825588008, "grad_norm": 6.4375, "learning_rate": 3.0675316637033296e-06, "loss": 0.78772635, "memory(GiB)": 146.85, "step": 54650, "train_speed(iter/s)": 0.20196 }, { "acc": 0.75686822, "epoch": 1.2752591901310897, "grad_norm": 3.71875, "learning_rate": 3.0657894890092134e-06, "loss": 0.8890522, "memory(GiB)": 146.85, "step": 54660, "train_speed(iter/s)": 0.201977 }, { "acc": 0.78002443, "epoch": 1.2754924977033786, "grad_norm": 5.6875, "learning_rate": 3.0640475904230848e-06, "loss": 0.79422836, "memory(GiB)": 146.85, "step": 54670, "train_speed(iter/s)": 0.201996 }, { "acc": 0.79444942, "epoch": 1.2757258052756675, "grad_norm": 5.21875, "learning_rate": 3.062305968193601e-06, "loss": 0.71587052, "memory(GiB)": 146.85, "step": 54680, "train_speed(iter/s)": 0.202016 }, { "acc": 0.77519064, "epoch": 1.2759591128479564, "grad_norm": 5.03125, "learning_rate": 3.060564622569377e-06, "loss": 0.81102362, "memory(GiB)": 146.85, "step": 54690, "train_speed(iter/s)": 0.202035 }, { "acc": 0.78082023, "epoch": 1.2761924204202453, "grad_norm": 7.09375, "learning_rate": 3.0588235537989897e-06, "loss": 0.78142843, "memory(GiB)": 146.85, "step": 54700, "train_speed(iter/s)": 0.202054 }, { "acc": 0.76986046, "epoch": 1.2764257279925342, "grad_norm": 6.3125, "learning_rate": 3.057082762130976e-06, "loss": 0.84283991, "memory(GiB)": 146.85, "step": 54710, "train_speed(iter/s)": 0.202073 }, { "acc": 0.76929588, "epoch": 1.276659035564823, "grad_norm": 6.4375, "learning_rate": 3.0553422478138333e-06, "loss": 0.81583824, "memory(GiB)": 146.85, "step": 54720, "train_speed(iter/s)": 0.202093 }, { "acc": 0.75166655, "epoch": 1.276892343137112, "grad_norm": 6.6875, "learning_rate": 3.0536020110960214e-06, "loss": 0.89222164, "memory(GiB)": 146.85, "step": 54730, "train_speed(iter/s)": 0.202113 }, { "acc": 0.75660124, "epoch": 1.2771256507094009, "grad_norm": 6.625, "learning_rate": 3.0518620522259557e-06, "loss": 0.88784628, "memory(GiB)": 146.85, "step": 54740, "train_speed(iter/s)": 0.202133 }, { "acc": 0.78362656, "epoch": 1.2773589582816898, "grad_norm": 4.75, "learning_rate": 3.0501223714520155e-06, "loss": 0.76338062, "memory(GiB)": 146.85, "step": 54750, "train_speed(iter/s)": 0.202152 }, { "acc": 0.77826853, "epoch": 1.2775922658539787, "grad_norm": 7.09375, "learning_rate": 3.048382969022543e-06, "loss": 0.80953951, "memory(GiB)": 146.85, "step": 54760, "train_speed(iter/s)": 0.202172 }, { "acc": 0.76995864, "epoch": 1.2778255734262676, "grad_norm": 8.3125, "learning_rate": 3.0466438451858326e-06, "loss": 0.81588993, "memory(GiB)": 146.85, "step": 54770, "train_speed(iter/s)": 0.20219 }, { "acc": 0.77879124, "epoch": 1.2780588809985565, "grad_norm": 22.875, "learning_rate": 3.044905000190146e-06, "loss": 0.82273722, "memory(GiB)": 146.85, "step": 54780, "train_speed(iter/s)": 0.202208 }, { "acc": 0.77997179, "epoch": 1.2782921885708454, "grad_norm": 4.90625, "learning_rate": 3.043166434283703e-06, "loss": 0.79682922, "memory(GiB)": 146.85, "step": 54790, "train_speed(iter/s)": 0.202227 }, { "acc": 0.76462798, "epoch": 1.2785254961431343, "grad_norm": 5.8125, "learning_rate": 3.0414281477146823e-06, "loss": 0.84449654, "memory(GiB)": 146.85, "step": 54800, "train_speed(iter/s)": 0.202246 }, { "acc": 0.75749121, "epoch": 1.2787588037154232, "grad_norm": 4.3125, "learning_rate": 3.0396901407312263e-06, "loss": 0.89200506, "memory(GiB)": 146.85, "step": 54810, "train_speed(iter/s)": 0.202266 }, { "acc": 0.7766088, "epoch": 1.278992111287712, "grad_norm": 5.3125, "learning_rate": 3.037952413581431e-06, "loss": 0.79202533, "memory(GiB)": 146.85, "step": 54820, "train_speed(iter/s)": 0.202285 }, { "acc": 0.75067492, "epoch": 1.2792254188600007, "grad_norm": 7.125, "learning_rate": 3.03621496651336e-06, "loss": 0.90050964, "memory(GiB)": 146.85, "step": 54830, "train_speed(iter/s)": 0.202304 }, { "acc": 0.76099567, "epoch": 1.2794587264322899, "grad_norm": 6.0, "learning_rate": 3.0344777997750313e-06, "loss": 0.8924017, "memory(GiB)": 146.85, "step": 54840, "train_speed(iter/s)": 0.202323 }, { "acc": 0.78929629, "epoch": 1.2796920340045785, "grad_norm": 5.0625, "learning_rate": 3.0327409136144257e-06, "loss": 0.76173468, "memory(GiB)": 146.85, "step": 54850, "train_speed(iter/s)": 0.202342 }, { "acc": 0.78347902, "epoch": 1.2799253415768677, "grad_norm": 5.8125, "learning_rate": 3.031004308279484e-06, "loss": 0.77233677, "memory(GiB)": 146.85, "step": 54860, "train_speed(iter/s)": 0.202363 }, { "acc": 0.78302922, "epoch": 1.2801586491491563, "grad_norm": 4.9375, "learning_rate": 3.0292679840181048e-06, "loss": 0.78275986, "memory(GiB)": 146.85, "step": 54870, "train_speed(iter/s)": 0.202381 }, { "acc": 0.77879047, "epoch": 1.2803919567214455, "grad_norm": 6.5625, "learning_rate": 3.02753194107815e-06, "loss": 0.8101738, "memory(GiB)": 146.85, "step": 54880, "train_speed(iter/s)": 0.202399 }, { "acc": 0.78698907, "epoch": 1.2806252642937341, "grad_norm": 5.34375, "learning_rate": 3.0257961797074353e-06, "loss": 0.77618685, "memory(GiB)": 146.85, "step": 54890, "train_speed(iter/s)": 0.202418 }, { "acc": 0.7760643, "epoch": 1.2808585718660233, "grad_norm": 6.78125, "learning_rate": 3.0240607001537442e-06, "loss": 0.80198336, "memory(GiB)": 146.85, "step": 54900, "train_speed(iter/s)": 0.202437 }, { "acc": 0.76896276, "epoch": 1.281091879438312, "grad_norm": 4.84375, "learning_rate": 3.022325502664813e-06, "loss": 0.8247366, "memory(GiB)": 146.85, "step": 54910, "train_speed(iter/s)": 0.202455 }, { "acc": 0.79066982, "epoch": 1.2813251870106008, "grad_norm": 7.28125, "learning_rate": 3.020590587488342e-06, "loss": 0.74745421, "memory(GiB)": 146.85, "step": 54920, "train_speed(iter/s)": 0.202474 }, { "acc": 0.77408657, "epoch": 1.2815584945828897, "grad_norm": 6.375, "learning_rate": 3.0188559548719888e-06, "loss": 0.80157671, "memory(GiB)": 146.85, "step": 54930, "train_speed(iter/s)": 0.202493 }, { "acc": 0.77967882, "epoch": 1.2817918021551786, "grad_norm": 6.09375, "learning_rate": 3.0171216050633735e-06, "loss": 0.79814773, "memory(GiB)": 146.85, "step": 54940, "train_speed(iter/s)": 0.202511 }, { "acc": 0.78352828, "epoch": 1.2820251097274675, "grad_norm": 7.21875, "learning_rate": 3.0153875383100732e-06, "loss": 0.78877459, "memory(GiB)": 146.85, "step": 54950, "train_speed(iter/s)": 0.202529 }, { "acc": 0.75806317, "epoch": 1.2822584172997564, "grad_norm": 5.75, "learning_rate": 3.0136537548596247e-06, "loss": 0.8857502, "memory(GiB)": 146.85, "step": 54960, "train_speed(iter/s)": 0.202547 }, { "acc": 0.76597805, "epoch": 1.2824917248720453, "grad_norm": 4.9375, "learning_rate": 3.011920254959526e-06, "loss": 0.86267967, "memory(GiB)": 146.85, "step": 54970, "train_speed(iter/s)": 0.202566 }, { "acc": 0.76898518, "epoch": 1.2827250324443342, "grad_norm": 5.34375, "learning_rate": 3.010187038857233e-06, "loss": 0.8319149, "memory(GiB)": 146.85, "step": 54980, "train_speed(iter/s)": 0.202585 }, { "acc": 0.78459983, "epoch": 1.2829583400166231, "grad_norm": 6.0, "learning_rate": 3.008454106800164e-06, "loss": 0.774512, "memory(GiB)": 146.85, "step": 54990, "train_speed(iter/s)": 0.202605 }, { "acc": 0.76388683, "epoch": 1.283191647588912, "grad_norm": 5.21875, "learning_rate": 3.006721459035691e-06, "loss": 0.84073772, "memory(GiB)": 146.85, "step": 55000, "train_speed(iter/s)": 0.202624 }, { "epoch": 1.283191647588912, "eval_acc": 0.7351417719966827, "eval_loss": 0.8340476155281067, "eval_runtime": 1262.4888, "eval_samples_per_second": 28.508, "eval_steps_per_second": 14.254, "step": 55000 }, { "acc": 0.75723362, "epoch": 1.283424955161201, "grad_norm": 4.8125, "learning_rate": 3.0049890958111505e-06, "loss": 0.88368769, "memory(GiB)": 146.85, "step": 55010, "train_speed(iter/s)": 0.201686 }, { "acc": 0.7768208, "epoch": 1.2836582627334898, "grad_norm": 6.28125, "learning_rate": 3.0032570173738367e-06, "loss": 0.793185, "memory(GiB)": 146.85, "step": 55020, "train_speed(iter/s)": 0.201706 }, { "acc": 0.77232447, "epoch": 1.2838915703057787, "grad_norm": 4.03125, "learning_rate": 3.0015252239710052e-06, "loss": 0.82753296, "memory(GiB)": 146.85, "step": 55030, "train_speed(iter/s)": 0.201724 }, { "acc": 0.78555937, "epoch": 1.2841248778780676, "grad_norm": 8.0625, "learning_rate": 2.9997937158498657e-06, "loss": 0.76087875, "memory(GiB)": 146.85, "step": 55040, "train_speed(iter/s)": 0.201744 }, { "acc": 0.74537559, "epoch": 1.2843581854503565, "grad_norm": 6.6875, "learning_rate": 2.998062493257593e-06, "loss": 0.92483654, "memory(GiB)": 146.85, "step": 55050, "train_speed(iter/s)": 0.201764 }, { "acc": 0.77452621, "epoch": 1.2845914930226454, "grad_norm": 4.3125, "learning_rate": 2.9963315564413174e-06, "loss": 0.80334816, "memory(GiB)": 146.85, "step": 55060, "train_speed(iter/s)": 0.201783 }, { "acc": 0.7708427, "epoch": 1.2848248005949343, "grad_norm": 6.625, "learning_rate": 2.994600905648131e-06, "loss": 0.83620405, "memory(GiB)": 146.85, "step": 55070, "train_speed(iter/s)": 0.201802 }, { "acc": 0.76769772, "epoch": 1.2850581081672232, "grad_norm": 6.40625, "learning_rate": 2.9928705411250813e-06, "loss": 0.84449024, "memory(GiB)": 146.85, "step": 55080, "train_speed(iter/s)": 0.201821 }, { "acc": 0.75353417, "epoch": 1.285291415739512, "grad_norm": 5.28125, "learning_rate": 2.9911404631191796e-06, "loss": 0.90458603, "memory(GiB)": 146.85, "step": 55090, "train_speed(iter/s)": 0.201842 }, { "acc": 0.7747015, "epoch": 1.285524723311801, "grad_norm": 5.03125, "learning_rate": 2.9894106718773936e-06, "loss": 0.82320576, "memory(GiB)": 146.85, "step": 55100, "train_speed(iter/s)": 0.201861 }, { "acc": 0.78630581, "epoch": 1.28575803088409, "grad_norm": 7.0625, "learning_rate": 2.987681167646652e-06, "loss": 0.75665874, "memory(GiB)": 146.85, "step": 55110, "train_speed(iter/s)": 0.201879 }, { "acc": 0.77090979, "epoch": 1.2859913384563788, "grad_norm": 6.75, "learning_rate": 2.985951950673836e-06, "loss": 0.82826881, "memory(GiB)": 146.85, "step": 55120, "train_speed(iter/s)": 0.201898 }, { "acc": 0.73967228, "epoch": 1.2862246460286677, "grad_norm": 6.875, "learning_rate": 2.984223021205795e-06, "loss": 0.94461155, "memory(GiB)": 146.85, "step": 55130, "train_speed(iter/s)": 0.201917 }, { "acc": 0.75847893, "epoch": 1.2864579536009566, "grad_norm": 5.75, "learning_rate": 2.9824943794893312e-06, "loss": 0.87752533, "memory(GiB)": 146.85, "step": 55140, "train_speed(iter/s)": 0.201936 }, { "acc": 0.7866962, "epoch": 1.2866912611732455, "grad_norm": 4.78125, "learning_rate": 2.9807660257712097e-06, "loss": 0.77187262, "memory(GiB)": 146.85, "step": 55150, "train_speed(iter/s)": 0.201955 }, { "acc": 0.792488, "epoch": 1.2869245687455344, "grad_norm": 4.84375, "learning_rate": 2.9790379602981508e-06, "loss": 0.7246171, "memory(GiB)": 146.85, "step": 55160, "train_speed(iter/s)": 0.201974 }, { "acc": 0.76644764, "epoch": 1.2871578763178233, "grad_norm": 4.625, "learning_rate": 2.9773101833168374e-06, "loss": 0.84529572, "memory(GiB)": 146.85, "step": 55170, "train_speed(iter/s)": 0.201992 }, { "acc": 0.77031565, "epoch": 1.2873911838901122, "grad_norm": 6.0625, "learning_rate": 2.9755826950739057e-06, "loss": 0.81709976, "memory(GiB)": 146.85, "step": 55180, "train_speed(iter/s)": 0.202012 }, { "acc": 0.76625309, "epoch": 1.287624491462401, "grad_norm": 5.125, "learning_rate": 2.973855495815957e-06, "loss": 0.86203909, "memory(GiB)": 146.85, "step": 55190, "train_speed(iter/s)": 0.20203 }, { "acc": 0.76601248, "epoch": 1.28785779903469, "grad_norm": 5.25, "learning_rate": 2.9721285857895475e-06, "loss": 0.85477962, "memory(GiB)": 146.85, "step": 55200, "train_speed(iter/s)": 0.20205 }, { "acc": 0.77036572, "epoch": 1.288091106606979, "grad_norm": 6.0, "learning_rate": 2.9704019652411933e-06, "loss": 0.82353163, "memory(GiB)": 146.85, "step": 55210, "train_speed(iter/s)": 0.20207 }, { "acc": 0.7644721, "epoch": 1.2883244141792676, "grad_norm": 5.4375, "learning_rate": 2.9686756344173712e-06, "loss": 0.84911489, "memory(GiB)": 146.85, "step": 55220, "train_speed(iter/s)": 0.202088 }, { "acc": 0.76865749, "epoch": 1.2885577217515567, "grad_norm": 4.59375, "learning_rate": 2.96694959356451e-06, "loss": 0.83917942, "memory(GiB)": 146.85, "step": 55230, "train_speed(iter/s)": 0.202108 }, { "acc": 0.77254729, "epoch": 1.2887910293238454, "grad_norm": 6.8125, "learning_rate": 2.9652238429290036e-06, "loss": 0.81691341, "memory(GiB)": 146.85, "step": 55240, "train_speed(iter/s)": 0.202127 }, { "acc": 0.77159286, "epoch": 1.2890243368961345, "grad_norm": 6.46875, "learning_rate": 2.9634983827572038e-06, "loss": 0.81736946, "memory(GiB)": 146.85, "step": 55250, "train_speed(iter/s)": 0.202147 }, { "acc": 0.76879392, "epoch": 1.2892576444684232, "grad_norm": 4.9375, "learning_rate": 2.961773213295417e-06, "loss": 0.80871077, "memory(GiB)": 146.85, "step": 55260, "train_speed(iter/s)": 0.202166 }, { "acc": 0.78025637, "epoch": 1.2894909520407123, "grad_norm": 5.96875, "learning_rate": 2.960048334789912e-06, "loss": 0.7806201, "memory(GiB)": 146.85, "step": 55270, "train_speed(iter/s)": 0.202185 }, { "acc": 0.77724013, "epoch": 1.289724259613001, "grad_norm": 5.78125, "learning_rate": 2.9583237474869143e-06, "loss": 0.80622368, "memory(GiB)": 146.85, "step": 55280, "train_speed(iter/s)": 0.202202 }, { "acc": 0.7753057, "epoch": 1.28995756718529, "grad_norm": 5.65625, "learning_rate": 2.956599451632609e-06, "loss": 0.8075963, "memory(GiB)": 146.85, "step": 55290, "train_speed(iter/s)": 0.202222 }, { "acc": 0.75721011, "epoch": 1.2901908747575788, "grad_norm": 5.25, "learning_rate": 2.9548754474731376e-06, "loss": 0.87673397, "memory(GiB)": 146.85, "step": 55300, "train_speed(iter/s)": 0.202241 }, { "acc": 0.75709925, "epoch": 1.2904241823298677, "grad_norm": 6.71875, "learning_rate": 2.953151735254604e-06, "loss": 0.87996483, "memory(GiB)": 146.85, "step": 55310, "train_speed(iter/s)": 0.20226 }, { "acc": 0.76296034, "epoch": 1.2906574899021566, "grad_norm": 6.1875, "learning_rate": 2.9514283152230637e-06, "loss": 0.85264788, "memory(GiB)": 146.85, "step": 55320, "train_speed(iter/s)": 0.202279 }, { "acc": 0.76588426, "epoch": 1.2908907974744455, "grad_norm": 8.0625, "learning_rate": 2.949705187624539e-06, "loss": 0.84266176, "memory(GiB)": 146.85, "step": 55330, "train_speed(iter/s)": 0.202298 }, { "acc": 0.76511064, "epoch": 1.2911241050467344, "grad_norm": 5.34375, "learning_rate": 2.947982352705001e-06, "loss": 0.83056641, "memory(GiB)": 146.85, "step": 55340, "train_speed(iter/s)": 0.202318 }, { "acc": 0.77463951, "epoch": 1.2913574126190233, "grad_norm": 5.03125, "learning_rate": 2.9462598107103855e-06, "loss": 0.82625189, "memory(GiB)": 146.85, "step": 55350, "train_speed(iter/s)": 0.202336 }, { "acc": 0.76851792, "epoch": 1.2915907201913122, "grad_norm": 4.875, "learning_rate": 2.9445375618865857e-06, "loss": 0.83891983, "memory(GiB)": 146.85, "step": 55360, "train_speed(iter/s)": 0.202354 }, { "acc": 0.77663555, "epoch": 1.291824027763601, "grad_norm": 7.28125, "learning_rate": 2.942815606479452e-06, "loss": 0.80800438, "memory(GiB)": 146.85, "step": 55370, "train_speed(iter/s)": 0.202374 }, { "acc": 0.76209269, "epoch": 1.29205733533589, "grad_norm": 6.03125, "learning_rate": 2.941093944734793e-06, "loss": 0.86186161, "memory(GiB)": 146.85, "step": 55380, "train_speed(iter/s)": 0.202395 }, { "acc": 0.78104429, "epoch": 1.2922906429081789, "grad_norm": 5.0, "learning_rate": 2.939372576898376e-06, "loss": 0.78885765, "memory(GiB)": 146.85, "step": 55390, "train_speed(iter/s)": 0.202414 }, { "acc": 0.76509466, "epoch": 1.2925239504804678, "grad_norm": 4.84375, "learning_rate": 2.937651503215924e-06, "loss": 0.84805994, "memory(GiB)": 146.85, "step": 55400, "train_speed(iter/s)": 0.202433 }, { "acc": 0.7663384, "epoch": 1.2927572580527567, "grad_norm": 4.5, "learning_rate": 2.9359307239331214e-06, "loss": 0.85253468, "memory(GiB)": 146.85, "step": 55410, "train_speed(iter/s)": 0.202453 }, { "acc": 0.7565876, "epoch": 1.2929905656250456, "grad_norm": 5.40625, "learning_rate": 2.9342102392956075e-06, "loss": 0.88858776, "memory(GiB)": 146.85, "step": 55420, "train_speed(iter/s)": 0.202472 }, { "acc": 0.75060143, "epoch": 1.2932238731973345, "grad_norm": 7.15625, "learning_rate": 2.932490049548982e-06, "loss": 0.91898117, "memory(GiB)": 146.85, "step": 55430, "train_speed(iter/s)": 0.202491 }, { "acc": 0.77812719, "epoch": 1.2934571807696233, "grad_norm": 5.34375, "learning_rate": 2.9307701549388025e-06, "loss": 0.81079559, "memory(GiB)": 146.85, "step": 55440, "train_speed(iter/s)": 0.20251 }, { "acc": 0.76271892, "epoch": 1.2936904883419122, "grad_norm": 4.96875, "learning_rate": 2.929050555710582e-06, "loss": 0.84705162, "memory(GiB)": 146.85, "step": 55450, "train_speed(iter/s)": 0.202529 }, { "acc": 0.74692159, "epoch": 1.2939237959142011, "grad_norm": 8.9375, "learning_rate": 2.9273312521097926e-06, "loss": 0.90147419, "memory(GiB)": 146.85, "step": 55460, "train_speed(iter/s)": 0.202547 }, { "acc": 0.77728515, "epoch": 1.29415710348649, "grad_norm": 6.125, "learning_rate": 2.9256122443818657e-06, "loss": 0.80032711, "memory(GiB)": 146.85, "step": 55470, "train_speed(iter/s)": 0.202566 }, { "acc": 0.77344227, "epoch": 1.294390411058779, "grad_norm": 5.90625, "learning_rate": 2.923893532772187e-06, "loss": 0.8171032, "memory(GiB)": 146.85, "step": 55480, "train_speed(iter/s)": 0.202585 }, { "acc": 0.76193833, "epoch": 1.2946237186310678, "grad_norm": 6.53125, "learning_rate": 2.9221751175261036e-06, "loss": 0.8516552, "memory(GiB)": 146.85, "step": 55490, "train_speed(iter/s)": 0.202605 }, { "acc": 0.76796894, "epoch": 1.2948570262033567, "grad_norm": 6.96875, "learning_rate": 2.9204569988889186e-06, "loss": 0.84278011, "memory(GiB)": 146.85, "step": 55500, "train_speed(iter/s)": 0.202623 }, { "epoch": 1.2948570262033567, "eval_acc": 0.7351940470927625, "eval_loss": 0.834033727645874, "eval_runtime": 1264.8517, "eval_samples_per_second": 28.455, "eval_steps_per_second": 14.228, "step": 55500 }, { "acc": 0.77598104, "epoch": 1.2950903337756456, "grad_norm": 5.625, "learning_rate": 2.9187391771058938e-06, "loss": 0.81446905, "memory(GiB)": 146.85, "step": 55510, "train_speed(iter/s)": 0.201691 }, { "acc": 0.77573462, "epoch": 1.2953236413479345, "grad_norm": 5.46875, "learning_rate": 2.9170216524222446e-06, "loss": 0.79735956, "memory(GiB)": 146.85, "step": 55520, "train_speed(iter/s)": 0.201711 }, { "acc": 0.76764565, "epoch": 1.2955569489202234, "grad_norm": 5.21875, "learning_rate": 2.9153044250831512e-06, "loss": 0.83011265, "memory(GiB)": 146.85, "step": 55530, "train_speed(iter/s)": 0.201729 }, { "acc": 0.7692256, "epoch": 1.2957902564925123, "grad_norm": 4.71875, "learning_rate": 2.913587495333744e-06, "loss": 0.83097324, "memory(GiB)": 146.85, "step": 55540, "train_speed(iter/s)": 0.201748 }, { "acc": 0.76287341, "epoch": 1.2960235640648012, "grad_norm": 6.03125, "learning_rate": 2.9118708634191177e-06, "loss": 0.86045914, "memory(GiB)": 146.85, "step": 55550, "train_speed(iter/s)": 0.201767 }, { "acc": 0.77690525, "epoch": 1.2962568716370901, "grad_norm": 6.875, "learning_rate": 2.910154529584319e-06, "loss": 0.79706516, "memory(GiB)": 146.85, "step": 55560, "train_speed(iter/s)": 0.201787 }, { "acc": 0.77640495, "epoch": 1.296490179209379, "grad_norm": 6.875, "learning_rate": 2.9084384940743543e-06, "loss": 0.80197172, "memory(GiB)": 146.85, "step": 55570, "train_speed(iter/s)": 0.201807 }, { "acc": 0.77870636, "epoch": 1.296723486781668, "grad_norm": 5.1875, "learning_rate": 2.9067227571341873e-06, "loss": 0.82234964, "memory(GiB)": 146.85, "step": 55580, "train_speed(iter/s)": 0.201826 }, { "acc": 0.78134513, "epoch": 1.2969567943539568, "grad_norm": 4.6875, "learning_rate": 2.905007319008736e-06, "loss": 0.77744198, "memory(GiB)": 146.85, "step": 55590, "train_speed(iter/s)": 0.201845 }, { "acc": 0.7764287, "epoch": 1.2971901019262457, "grad_norm": 8.375, "learning_rate": 2.903292179942883e-06, "loss": 0.80407858, "memory(GiB)": 146.85, "step": 55600, "train_speed(iter/s)": 0.201864 }, { "acc": 0.77487926, "epoch": 1.2974234094985344, "grad_norm": 5.0, "learning_rate": 2.9015773401814606e-06, "loss": 0.81430473, "memory(GiB)": 146.85, "step": 55610, "train_speed(iter/s)": 0.201883 }, { "acc": 0.78575659, "epoch": 1.2976567170708235, "grad_norm": 6.03125, "learning_rate": 2.899862799969265e-06, "loss": 0.78336802, "memory(GiB)": 146.85, "step": 55620, "train_speed(iter/s)": 0.201902 }, { "acc": 0.75749941, "epoch": 1.2978900246431122, "grad_norm": 5.8125, "learning_rate": 2.898148559551045e-06, "loss": 0.85524845, "memory(GiB)": 146.85, "step": 55630, "train_speed(iter/s)": 0.201921 }, { "acc": 0.76639824, "epoch": 1.2981233322154013, "grad_norm": 5.90625, "learning_rate": 2.8964346191715058e-06, "loss": 0.8389986, "memory(GiB)": 146.85, "step": 55640, "train_speed(iter/s)": 0.20194 }, { "acc": 0.78312244, "epoch": 1.29835663978769, "grad_norm": 5.40625, "learning_rate": 2.894720979075315e-06, "loss": 0.78411741, "memory(GiB)": 146.85, "step": 55650, "train_speed(iter/s)": 0.20196 }, { "acc": 0.7829051, "epoch": 1.2985899473599791, "grad_norm": 5.625, "learning_rate": 2.8930076395070915e-06, "loss": 0.76372957, "memory(GiB)": 146.85, "step": 55660, "train_speed(iter/s)": 0.201978 }, { "acc": 0.76557775, "epoch": 1.2988232549322678, "grad_norm": 4.90625, "learning_rate": 2.8912946007114175e-06, "loss": 0.84680529, "memory(GiB)": 146.85, "step": 55670, "train_speed(iter/s)": 0.201998 }, { "acc": 0.78361769, "epoch": 1.299056562504557, "grad_norm": 5.5, "learning_rate": 2.8895818629328254e-06, "loss": 0.78018246, "memory(GiB)": 146.85, "step": 55680, "train_speed(iter/s)": 0.202017 }, { "acc": 0.76331415, "epoch": 1.2992898700768456, "grad_norm": 4.8125, "learning_rate": 2.8878694264158103e-06, "loss": 0.84916744, "memory(GiB)": 146.85, "step": 55690, "train_speed(iter/s)": 0.202036 }, { "acc": 0.78652229, "epoch": 1.2995231776491345, "grad_norm": 5.90625, "learning_rate": 2.8861572914048184e-06, "loss": 0.74926195, "memory(GiB)": 146.85, "step": 55700, "train_speed(iter/s)": 0.202057 }, { "acc": 0.77238827, "epoch": 1.2997564852214234, "grad_norm": 5.1875, "learning_rate": 2.8844454581442614e-06, "loss": 0.82394695, "memory(GiB)": 146.85, "step": 55710, "train_speed(iter/s)": 0.202076 }, { "acc": 0.78171301, "epoch": 1.2999897927937123, "grad_norm": 6.15625, "learning_rate": 2.8827339268785015e-06, "loss": 0.79469113, "memory(GiB)": 146.85, "step": 55720, "train_speed(iter/s)": 0.202094 }, { "acc": 0.76400671, "epoch": 1.3002231003660012, "grad_norm": 6.0, "learning_rate": 2.881022697851855e-06, "loss": 0.84748287, "memory(GiB)": 146.85, "step": 55730, "train_speed(iter/s)": 0.202114 }, { "acc": 0.76635323, "epoch": 1.30045640793829, "grad_norm": 8.0625, "learning_rate": 2.879311771308606e-06, "loss": 0.84386253, "memory(GiB)": 146.85, "step": 55740, "train_speed(iter/s)": 0.202133 }, { "acc": 0.78211951, "epoch": 1.300689715510579, "grad_norm": 4.5625, "learning_rate": 2.877601147492983e-06, "loss": 0.78392878, "memory(GiB)": 146.85, "step": 55750, "train_speed(iter/s)": 0.202152 }, { "acc": 0.76850414, "epoch": 1.300923023082868, "grad_norm": 5.21875, "learning_rate": 2.8758908266491815e-06, "loss": 0.84832048, "memory(GiB)": 146.85, "step": 55760, "train_speed(iter/s)": 0.202169 }, { "acc": 0.7781147, "epoch": 1.3011563306551568, "grad_norm": 5.84375, "learning_rate": 2.874180809021348e-06, "loss": 0.78585663, "memory(GiB)": 146.85, "step": 55770, "train_speed(iter/s)": 0.202186 }, { "acc": 0.76526294, "epoch": 1.3013896382274457, "grad_norm": 5.625, "learning_rate": 2.872471094853584e-06, "loss": 0.83896046, "memory(GiB)": 146.85, "step": 55780, "train_speed(iter/s)": 0.202205 }, { "acc": 0.77648573, "epoch": 1.3016229457997346, "grad_norm": 5.1875, "learning_rate": 2.8707616843899554e-06, "loss": 0.80965137, "memory(GiB)": 146.85, "step": 55790, "train_speed(iter/s)": 0.202225 }, { "acc": 0.76868334, "epoch": 1.3018562533720235, "grad_norm": 7.21875, "learning_rate": 2.8690525778744777e-06, "loss": 0.86346951, "memory(GiB)": 146.85, "step": 55800, "train_speed(iter/s)": 0.202244 }, { "acc": 0.78997097, "epoch": 1.3020895609443124, "grad_norm": 6.40625, "learning_rate": 2.867343775551126e-06, "loss": 0.76011419, "memory(GiB)": 146.85, "step": 55810, "train_speed(iter/s)": 0.202264 }, { "acc": 0.75506325, "epoch": 1.3023228685166013, "grad_norm": 8.625, "learning_rate": 2.8656352776638274e-06, "loss": 0.88944521, "memory(GiB)": 146.85, "step": 55820, "train_speed(iter/s)": 0.202283 }, { "acc": 0.78290377, "epoch": 1.3025561760888902, "grad_norm": 4.75, "learning_rate": 2.863927084456476e-06, "loss": 0.78628983, "memory(GiB)": 146.85, "step": 55830, "train_speed(iter/s)": 0.202302 }, { "acc": 0.77155728, "epoch": 1.302789483661179, "grad_norm": 5.875, "learning_rate": 2.862219196172911e-06, "loss": 0.82785378, "memory(GiB)": 146.85, "step": 55840, "train_speed(iter/s)": 0.202322 }, { "acc": 0.78467484, "epoch": 1.303022791233468, "grad_norm": 5.96875, "learning_rate": 2.8605116130569355e-06, "loss": 0.78114939, "memory(GiB)": 146.85, "step": 55850, "train_speed(iter/s)": 0.202341 }, { "acc": 0.7646183, "epoch": 1.3032560988057569, "grad_norm": 5.9375, "learning_rate": 2.8588043353523066e-06, "loss": 0.84907398, "memory(GiB)": 146.85, "step": 55860, "train_speed(iter/s)": 0.202361 }, { "acc": 0.76037807, "epoch": 1.3034894063780458, "grad_norm": 6.03125, "learning_rate": 2.8570973633027342e-06, "loss": 0.86209183, "memory(GiB)": 146.85, "step": 55870, "train_speed(iter/s)": 0.202381 }, { "acc": 0.76059093, "epoch": 1.3037227139503347, "grad_norm": 6.46875, "learning_rate": 2.8553906971518936e-06, "loss": 0.85338221, "memory(GiB)": 146.85, "step": 55880, "train_speed(iter/s)": 0.202401 }, { "acc": 0.77114286, "epoch": 1.3039560215226236, "grad_norm": 10.1875, "learning_rate": 2.8536843371434054e-06, "loss": 0.81605854, "memory(GiB)": 146.85, "step": 55890, "train_speed(iter/s)": 0.20242 }, { "acc": 0.7828207, "epoch": 1.3041893290949125, "grad_norm": 5.4375, "learning_rate": 2.851978283520859e-06, "loss": 0.79497309, "memory(GiB)": 146.85, "step": 55900, "train_speed(iter/s)": 0.202438 }, { "acc": 0.76823692, "epoch": 1.3044226366672014, "grad_norm": 8.0625, "learning_rate": 2.850272536527784e-06, "loss": 0.83200693, "memory(GiB)": 146.85, "step": 55910, "train_speed(iter/s)": 0.202458 }, { "acc": 0.76935453, "epoch": 1.3046559442394903, "grad_norm": 7.40625, "learning_rate": 2.848567096407682e-06, "loss": 0.84299221, "memory(GiB)": 146.85, "step": 55920, "train_speed(iter/s)": 0.202476 }, { "acc": 0.77484741, "epoch": 1.3048892518117792, "grad_norm": 5.875, "learning_rate": 2.8468619634040017e-06, "loss": 0.81433144, "memory(GiB)": 146.85, "step": 55930, "train_speed(iter/s)": 0.202494 }, { "acc": 0.75655613, "epoch": 1.305122559384068, "grad_norm": 5.90625, "learning_rate": 2.8451571377601495e-06, "loss": 0.88792839, "memory(GiB)": 146.85, "step": 55940, "train_speed(iter/s)": 0.202513 }, { "acc": 0.78883553, "epoch": 1.305355866956357, "grad_norm": 4.75, "learning_rate": 2.8434526197194915e-06, "loss": 0.75276585, "memory(GiB)": 146.85, "step": 55950, "train_speed(iter/s)": 0.202531 }, { "acc": 0.77748146, "epoch": 1.3055891745286459, "grad_norm": 7.1875, "learning_rate": 2.8417484095253434e-06, "loss": 0.81048374, "memory(GiB)": 146.85, "step": 55960, "train_speed(iter/s)": 0.202551 }, { "acc": 0.77109718, "epoch": 1.3058224821009348, "grad_norm": 5.78125, "learning_rate": 2.8400445074209852e-06, "loss": 0.83219738, "memory(GiB)": 146.85, "step": 55970, "train_speed(iter/s)": 0.202568 }, { "acc": 0.75238581, "epoch": 1.3060557896732234, "grad_norm": 6.90625, "learning_rate": 2.8383409136496443e-06, "loss": 0.90591383, "memory(GiB)": 146.85, "step": 55980, "train_speed(iter/s)": 0.202587 }, { "acc": 0.78553209, "epoch": 1.3062890972455126, "grad_norm": 6.71875, "learning_rate": 2.8366376284545117e-06, "loss": 0.77759676, "memory(GiB)": 146.85, "step": 55990, "train_speed(iter/s)": 0.202605 }, { "acc": 0.76914377, "epoch": 1.3065224048178012, "grad_norm": 6.21875, "learning_rate": 2.8349346520787284e-06, "loss": 0.84901066, "memory(GiB)": 146.85, "step": 56000, "train_speed(iter/s)": 0.202622 }, { "epoch": 1.3065224048178012, "eval_acc": 0.7352035663232215, "eval_loss": 0.8340468406677246, "eval_runtime": 1262.9862, "eval_samples_per_second": 28.497, "eval_steps_per_second": 14.249, "step": 56000 }, { "acc": 0.75866451, "epoch": 1.3067557123900904, "grad_norm": 5.78125, "learning_rate": 2.833231984765393e-06, "loss": 0.87015581, "memory(GiB)": 146.85, "step": 56010, "train_speed(iter/s)": 0.201702 }, { "acc": 0.79182196, "epoch": 1.306989019962379, "grad_norm": 7.90625, "learning_rate": 2.8315296267575672e-06, "loss": 0.75225515, "memory(GiB)": 146.85, "step": 56020, "train_speed(iter/s)": 0.201721 }, { "acc": 0.76525688, "epoch": 1.3072223275346682, "grad_norm": 5.8125, "learning_rate": 2.8298275782982525e-06, "loss": 0.89069595, "memory(GiB)": 146.85, "step": 56030, "train_speed(iter/s)": 0.201738 }, { "acc": 0.76961288, "epoch": 1.3074556351069568, "grad_norm": 5.90625, "learning_rate": 2.8281258396304224e-06, "loss": 0.84866333, "memory(GiB)": 146.85, "step": 56040, "train_speed(iter/s)": 0.201757 }, { "acc": 0.78202744, "epoch": 1.307688942679246, "grad_norm": 5.21875, "learning_rate": 2.8264244109969963e-06, "loss": 0.794489, "memory(GiB)": 146.85, "step": 56050, "train_speed(iter/s)": 0.201777 }, { "acc": 0.78056703, "epoch": 1.3079222502515346, "grad_norm": 7.6875, "learning_rate": 2.824723292640856e-06, "loss": 0.7973052, "memory(GiB)": 146.85, "step": 56060, "train_speed(iter/s)": 0.201795 }, { "acc": 0.77469683, "epoch": 1.3081555578238238, "grad_norm": 4.5, "learning_rate": 2.823022484804834e-06, "loss": 0.79345198, "memory(GiB)": 146.85, "step": 56070, "train_speed(iter/s)": 0.201813 }, { "acc": 0.76816144, "epoch": 1.3083888653961124, "grad_norm": 7.25, "learning_rate": 2.8213219877317164e-06, "loss": 0.85416546, "memory(GiB)": 146.85, "step": 56080, "train_speed(iter/s)": 0.201832 }, { "acc": 0.76544499, "epoch": 1.3086221729684013, "grad_norm": 6.625, "learning_rate": 2.819621801664256e-06, "loss": 0.85448122, "memory(GiB)": 146.85, "step": 56090, "train_speed(iter/s)": 0.201851 }, { "acc": 0.78134265, "epoch": 1.3088554805406902, "grad_norm": 4.90625, "learning_rate": 2.817921926845147e-06, "loss": 0.80759029, "memory(GiB)": 146.85, "step": 56100, "train_speed(iter/s)": 0.20187 }, { "acc": 0.78094769, "epoch": 1.3090887881129791, "grad_norm": 6.4375, "learning_rate": 2.8162223635170515e-06, "loss": 0.78721571, "memory(GiB)": 146.85, "step": 56110, "train_speed(iter/s)": 0.201887 }, { "acc": 0.79755783, "epoch": 1.309322095685268, "grad_norm": 5.65625, "learning_rate": 2.814523111922577e-06, "loss": 0.71591768, "memory(GiB)": 146.85, "step": 56120, "train_speed(iter/s)": 0.201906 }, { "acc": 0.77897987, "epoch": 1.309555403257557, "grad_norm": 5.0625, "learning_rate": 2.812824172304297e-06, "loss": 0.81980705, "memory(GiB)": 146.85, "step": 56130, "train_speed(iter/s)": 0.201925 }, { "acc": 0.78349252, "epoch": 1.3097887108298458, "grad_norm": 7.09375, "learning_rate": 2.8111255449047277e-06, "loss": 0.77535572, "memory(GiB)": 146.85, "step": 56140, "train_speed(iter/s)": 0.201944 }, { "acc": 0.78470354, "epoch": 1.3100220184021347, "grad_norm": 4.5, "learning_rate": 2.809427229966353e-06, "loss": 0.7578114, "memory(GiB)": 146.85, "step": 56150, "train_speed(iter/s)": 0.201963 }, { "acc": 0.76731687, "epoch": 1.3102553259744236, "grad_norm": 7.59375, "learning_rate": 2.8077292277316036e-06, "loss": 0.83429298, "memory(GiB)": 146.85, "step": 56160, "train_speed(iter/s)": 0.201981 }, { "acc": 0.76218219, "epoch": 1.3104886335467125, "grad_norm": 5.03125, "learning_rate": 2.8060315384428692e-06, "loss": 0.87048531, "memory(GiB)": 146.85, "step": 56170, "train_speed(iter/s)": 0.202001 }, { "acc": 0.7770124, "epoch": 1.3107219411190014, "grad_norm": 9.25, "learning_rate": 2.8043341623424974e-06, "loss": 0.83079805, "memory(GiB)": 146.85, "step": 56180, "train_speed(iter/s)": 0.202019 }, { "acc": 0.78012552, "epoch": 1.3109552486912903, "grad_norm": 5.6875, "learning_rate": 2.8026370996727835e-06, "loss": 0.7911499, "memory(GiB)": 146.85, "step": 56190, "train_speed(iter/s)": 0.202038 }, { "acc": 0.7875946, "epoch": 1.3111885562635792, "grad_norm": 8.125, "learning_rate": 2.800940350675988e-06, "loss": 0.76266332, "memory(GiB)": 146.85, "step": 56200, "train_speed(iter/s)": 0.202058 }, { "acc": 0.781739, "epoch": 1.3114218638358681, "grad_norm": 6.5625, "learning_rate": 2.7992439155943185e-06, "loss": 0.78995237, "memory(GiB)": 146.85, "step": 56210, "train_speed(iter/s)": 0.202077 }, { "acc": 0.75933647, "epoch": 1.311655171408157, "grad_norm": 4.53125, "learning_rate": 2.797547794669938e-06, "loss": 0.86246414, "memory(GiB)": 146.85, "step": 56220, "train_speed(iter/s)": 0.202095 }, { "acc": 0.77485781, "epoch": 1.311888478980446, "grad_norm": 4.71875, "learning_rate": 2.7958519881449723e-06, "loss": 0.81103086, "memory(GiB)": 146.85, "step": 56230, "train_speed(iter/s)": 0.202114 }, { "acc": 0.77391744, "epoch": 1.3121217865527348, "grad_norm": 5.34375, "learning_rate": 2.794156496261493e-06, "loss": 0.77928791, "memory(GiB)": 146.85, "step": 56240, "train_speed(iter/s)": 0.202133 }, { "acc": 0.75226135, "epoch": 1.3123550941250237, "grad_norm": 6.75, "learning_rate": 2.792461319261538e-06, "loss": 0.90252361, "memory(GiB)": 146.85, "step": 56250, "train_speed(iter/s)": 0.202153 }, { "acc": 0.78322229, "epoch": 1.3125884016973126, "grad_norm": 4.75, "learning_rate": 2.790766457387083e-06, "loss": 0.79578757, "memory(GiB)": 146.85, "step": 56260, "train_speed(iter/s)": 0.20217 }, { "acc": 0.77192554, "epoch": 1.3128217092696015, "grad_norm": 6.03125, "learning_rate": 2.7890719108800766e-06, "loss": 0.81696529, "memory(GiB)": 146.85, "step": 56270, "train_speed(iter/s)": 0.202188 }, { "acc": 0.76889458, "epoch": 1.3130550168418904, "grad_norm": 6.375, "learning_rate": 2.7873776799824115e-06, "loss": 0.82898397, "memory(GiB)": 146.85, "step": 56280, "train_speed(iter/s)": 0.202207 }, { "acc": 0.77150846, "epoch": 1.3132883244141793, "grad_norm": 6.15625, "learning_rate": 2.7856837649359416e-06, "loss": 0.80232182, "memory(GiB)": 146.85, "step": 56290, "train_speed(iter/s)": 0.202225 }, { "acc": 0.7655344, "epoch": 1.3135216319864682, "grad_norm": 6.1875, "learning_rate": 2.7839901659824707e-06, "loss": 0.84427547, "memory(GiB)": 146.85, "step": 56300, "train_speed(iter/s)": 0.202244 }, { "acc": 0.77064381, "epoch": 1.313754939558757, "grad_norm": 6.03125, "learning_rate": 2.7822968833637577e-06, "loss": 0.82581348, "memory(GiB)": 146.85, "step": 56310, "train_speed(iter/s)": 0.202263 }, { "acc": 0.77015219, "epoch": 1.313988247131046, "grad_norm": 5.59375, "learning_rate": 2.7806039173215225e-06, "loss": 0.80838594, "memory(GiB)": 146.85, "step": 56320, "train_speed(iter/s)": 0.202282 }, { "acc": 0.76740808, "epoch": 1.314221554703335, "grad_norm": 7.625, "learning_rate": 2.7789112680974316e-06, "loss": 0.84647083, "memory(GiB)": 146.85, "step": 56330, "train_speed(iter/s)": 0.2023 }, { "acc": 0.7935811, "epoch": 1.3144548622756238, "grad_norm": 3.828125, "learning_rate": 2.7772189359331136e-06, "loss": 0.71832447, "memory(GiB)": 146.85, "step": 56340, "train_speed(iter/s)": 0.202319 }, { "acc": 0.79208622, "epoch": 1.3146881698479127, "grad_norm": 6.0, "learning_rate": 2.7755269210701475e-06, "loss": 0.76291838, "memory(GiB)": 146.85, "step": 56350, "train_speed(iter/s)": 0.202337 }, { "acc": 0.78336744, "epoch": 1.3149214774202016, "grad_norm": 5.0625, "learning_rate": 2.7738352237500667e-06, "loss": 0.78528042, "memory(GiB)": 146.85, "step": 56360, "train_speed(iter/s)": 0.202356 }, { "acc": 0.75361509, "epoch": 1.3151547849924903, "grad_norm": 5.875, "learning_rate": 2.7721438442143607e-06, "loss": 0.89121761, "memory(GiB)": 146.85, "step": 56370, "train_speed(iter/s)": 0.202374 }, { "acc": 0.76625342, "epoch": 1.3153880925647794, "grad_norm": 4.59375, "learning_rate": 2.7704527827044714e-06, "loss": 0.83009186, "memory(GiB)": 146.85, "step": 56380, "train_speed(iter/s)": 0.202393 }, { "acc": 0.7660604, "epoch": 1.315621400137068, "grad_norm": 5.03125, "learning_rate": 2.7687620394618025e-06, "loss": 0.85052071, "memory(GiB)": 146.85, "step": 56390, "train_speed(iter/s)": 0.202411 }, { "acc": 0.77648897, "epoch": 1.3158547077093572, "grad_norm": 6.59375, "learning_rate": 2.767071614727702e-06, "loss": 0.81099558, "memory(GiB)": 146.85, "step": 56400, "train_speed(iter/s)": 0.202429 }, { "acc": 0.76614866, "epoch": 1.3160880152816459, "grad_norm": 7.125, "learning_rate": 2.765381508743482e-06, "loss": 0.85021791, "memory(GiB)": 146.85, "step": 56410, "train_speed(iter/s)": 0.202447 }, { "acc": 0.76074743, "epoch": 1.316321322853935, "grad_norm": 5.5, "learning_rate": 2.7636917217504007e-06, "loss": 0.86594706, "memory(GiB)": 146.85, "step": 56420, "train_speed(iter/s)": 0.202466 }, { "acc": 0.77234387, "epoch": 1.3165546304262237, "grad_norm": 5.125, "learning_rate": 2.762002253989678e-06, "loss": 0.81988735, "memory(GiB)": 146.85, "step": 56430, "train_speed(iter/s)": 0.202484 }, { "acc": 0.76379228, "epoch": 1.3167879379985128, "grad_norm": 6.53125, "learning_rate": 2.7603131057024835e-06, "loss": 0.87251673, "memory(GiB)": 146.85, "step": 56440, "train_speed(iter/s)": 0.202503 }, { "acc": 0.77535024, "epoch": 1.3170212455708015, "grad_norm": 5.75, "learning_rate": 2.7586242771299404e-06, "loss": 0.80857143, "memory(GiB)": 146.85, "step": 56450, "train_speed(iter/s)": 0.202521 }, { "acc": 0.76321392, "epoch": 1.3172545531430904, "grad_norm": 5.28125, "learning_rate": 2.7569357685131325e-06, "loss": 0.85759602, "memory(GiB)": 146.85, "step": 56460, "train_speed(iter/s)": 0.202539 }, { "acc": 0.76619134, "epoch": 1.3174878607153793, "grad_norm": 6.78125, "learning_rate": 2.7552475800930907e-06, "loss": 0.85277462, "memory(GiB)": 146.85, "step": 56470, "train_speed(iter/s)": 0.202557 }, { "acc": 0.77769084, "epoch": 1.3177211682876682, "grad_norm": 5.65625, "learning_rate": 2.753559712110808e-06, "loss": 0.81264629, "memory(GiB)": 146.85, "step": 56480, "train_speed(iter/s)": 0.202575 }, { "acc": 0.78065329, "epoch": 1.317954475859957, "grad_norm": 6.15625, "learning_rate": 2.75187216480722e-06, "loss": 0.80991268, "memory(GiB)": 146.85, "step": 56490, "train_speed(iter/s)": 0.202594 }, { "acc": 0.76249199, "epoch": 1.318187783432246, "grad_norm": 5.625, "learning_rate": 2.75018493842323e-06, "loss": 0.84612541, "memory(GiB)": 146.85, "step": 56500, "train_speed(iter/s)": 0.202612 }, { "epoch": 1.318187783432246, "eval_acc": 0.7352839150820106, "eval_loss": 0.8340603709220886, "eval_runtime": 1262.7595, "eval_samples_per_second": 28.502, "eval_steps_per_second": 14.251, "step": 56500 }, { "acc": 0.79823904, "epoch": 1.3184210910045349, "grad_norm": 5.65625, "learning_rate": 2.748498033199686e-06, "loss": 0.70853934, "memory(GiB)": 146.85, "step": 56510, "train_speed(iter/s)": 0.201698 }, { "acc": 0.77275577, "epoch": 1.3186543985768238, "grad_norm": 5.625, "learning_rate": 2.7468114493773913e-06, "loss": 0.83758879, "memory(GiB)": 146.85, "step": 56520, "train_speed(iter/s)": 0.201718 }, { "acc": 0.75451307, "epoch": 1.3188877061491127, "grad_norm": 5.375, "learning_rate": 2.7451251871971103e-06, "loss": 0.89319267, "memory(GiB)": 146.85, "step": 56530, "train_speed(iter/s)": 0.201737 }, { "acc": 0.76059303, "epoch": 1.3191210137214016, "grad_norm": 5.21875, "learning_rate": 2.743439246899552e-06, "loss": 0.85532761, "memory(GiB)": 146.85, "step": 56540, "train_speed(iter/s)": 0.201756 }, { "acc": 0.75908251, "epoch": 1.3193543212936905, "grad_norm": 5.28125, "learning_rate": 2.7417536287253864e-06, "loss": 0.87564659, "memory(GiB)": 146.85, "step": 56550, "train_speed(iter/s)": 0.201775 }, { "acc": 0.773843, "epoch": 1.3195876288659794, "grad_norm": 5.75, "learning_rate": 2.7400683329152358e-06, "loss": 0.83258095, "memory(GiB)": 146.85, "step": 56560, "train_speed(iter/s)": 0.201794 }, { "acc": 0.77879524, "epoch": 1.3198209364382683, "grad_norm": 5.375, "learning_rate": 2.738383359709671e-06, "loss": 0.78695173, "memory(GiB)": 146.85, "step": 56570, "train_speed(iter/s)": 0.201813 }, { "acc": 0.76146154, "epoch": 1.3200542440105572, "grad_norm": 5.59375, "learning_rate": 2.736698709349227e-06, "loss": 0.85865736, "memory(GiB)": 146.85, "step": 56580, "train_speed(iter/s)": 0.201832 }, { "acc": 0.78706541, "epoch": 1.320287551582846, "grad_norm": 6.78125, "learning_rate": 2.7350143820743847e-06, "loss": 0.77142429, "memory(GiB)": 146.85, "step": 56590, "train_speed(iter/s)": 0.20185 }, { "acc": 0.75575294, "epoch": 1.320520859155135, "grad_norm": 5.1875, "learning_rate": 2.7333303781255816e-06, "loss": 0.88191833, "memory(GiB)": 146.85, "step": 56600, "train_speed(iter/s)": 0.201869 }, { "acc": 0.76945853, "epoch": 1.3207541667274239, "grad_norm": 5.0625, "learning_rate": 2.7316466977432067e-06, "loss": 0.83839703, "memory(GiB)": 146.85, "step": 56610, "train_speed(iter/s)": 0.201888 }, { "acc": 0.76779089, "epoch": 1.3209874742997127, "grad_norm": 4.8125, "learning_rate": 2.729963341167608e-06, "loss": 0.84511166, "memory(GiB)": 146.85, "step": 56620, "train_speed(iter/s)": 0.201907 }, { "acc": 0.77504158, "epoch": 1.3212207818720016, "grad_norm": 5.375, "learning_rate": 2.728280308639081e-06, "loss": 0.8223464, "memory(GiB)": 146.85, "step": 56630, "train_speed(iter/s)": 0.201927 }, { "acc": 0.75718417, "epoch": 1.3214540894442905, "grad_norm": 6.3125, "learning_rate": 2.7265976003978828e-06, "loss": 0.86357059, "memory(GiB)": 146.85, "step": 56640, "train_speed(iter/s)": 0.201945 }, { "acc": 0.76878195, "epoch": 1.3216873970165794, "grad_norm": 7.09375, "learning_rate": 2.7249152166842164e-06, "loss": 0.82916155, "memory(GiB)": 146.85, "step": 56650, "train_speed(iter/s)": 0.201965 }, { "acc": 0.76755066, "epoch": 1.3219207045888683, "grad_norm": 5.125, "learning_rate": 2.72323315773824e-06, "loss": 0.84954739, "memory(GiB)": 146.85, "step": 56660, "train_speed(iter/s)": 0.201984 }, { "acc": 0.76039038, "epoch": 1.3221540121611572, "grad_norm": 9.875, "learning_rate": 2.72155142380007e-06, "loss": 0.85981464, "memory(GiB)": 146.85, "step": 56670, "train_speed(iter/s)": 0.202004 }, { "acc": 0.76832247, "epoch": 1.3223873197334461, "grad_norm": 7.84375, "learning_rate": 2.7198700151097714e-06, "loss": 0.84699659, "memory(GiB)": 146.85, "step": 56680, "train_speed(iter/s)": 0.202023 }, { "acc": 0.75552359, "epoch": 1.322620627305735, "grad_norm": 6.40625, "learning_rate": 2.7181889319073674e-06, "loss": 0.89217014, "memory(GiB)": 146.85, "step": 56690, "train_speed(iter/s)": 0.202041 }, { "acc": 0.77681775, "epoch": 1.322853934878024, "grad_norm": 6.84375, "learning_rate": 2.7165081744328304e-06, "loss": 0.81347427, "memory(GiB)": 146.85, "step": 56700, "train_speed(iter/s)": 0.202059 }, { "acc": 0.77529945, "epoch": 1.3230872424503128, "grad_norm": 6.59375, "learning_rate": 2.714827742926088e-06, "loss": 0.81865578, "memory(GiB)": 146.85, "step": 56710, "train_speed(iter/s)": 0.202076 }, { "acc": 0.79201078, "epoch": 1.3233205500226017, "grad_norm": 6.53125, "learning_rate": 2.7131476376270215e-06, "loss": 0.73602557, "memory(GiB)": 146.85, "step": 56720, "train_speed(iter/s)": 0.202095 }, { "acc": 0.77600632, "epoch": 1.3235538575948906, "grad_norm": 5.1875, "learning_rate": 2.711467858775464e-06, "loss": 0.8077692, "memory(GiB)": 146.85, "step": 56730, "train_speed(iter/s)": 0.202111 }, { "acc": 0.77185798, "epoch": 1.3237871651671795, "grad_norm": 4.96875, "learning_rate": 2.7097884066112062e-06, "loss": 0.82124863, "memory(GiB)": 146.85, "step": 56740, "train_speed(iter/s)": 0.202129 }, { "acc": 0.76982031, "epoch": 1.3240204727394684, "grad_norm": 7.0, "learning_rate": 2.7081092813739863e-06, "loss": 0.8243866, "memory(GiB)": 146.85, "step": 56750, "train_speed(iter/s)": 0.202148 }, { "acc": 0.76952429, "epoch": 1.3242537803117571, "grad_norm": 6.03125, "learning_rate": 2.7064304833035027e-06, "loss": 0.83053398, "memory(GiB)": 146.85, "step": 56760, "train_speed(iter/s)": 0.202168 }, { "acc": 0.74489746, "epoch": 1.3244870878840462, "grad_norm": 9.5, "learning_rate": 2.704752012639399e-06, "loss": 0.93402958, "memory(GiB)": 146.85, "step": 56770, "train_speed(iter/s)": 0.202188 }, { "acc": 0.7829618, "epoch": 1.324720395456335, "grad_norm": 6.03125, "learning_rate": 2.703073869621281e-06, "loss": 0.77828069, "memory(GiB)": 146.85, "step": 56780, "train_speed(iter/s)": 0.202207 }, { "acc": 0.77976117, "epoch": 1.324953703028624, "grad_norm": 5.71875, "learning_rate": 2.7013960544887007e-06, "loss": 0.78992071, "memory(GiB)": 146.85, "step": 56790, "train_speed(iter/s)": 0.202226 }, { "acc": 0.7906065, "epoch": 1.3251870106009127, "grad_norm": 7.3125, "learning_rate": 2.699718567481164e-06, "loss": 0.74453802, "memory(GiB)": 146.85, "step": 56800, "train_speed(iter/s)": 0.202245 }, { "acc": 0.75609365, "epoch": 1.3254203181732018, "grad_norm": 5.8125, "learning_rate": 2.698041408838136e-06, "loss": 0.87776976, "memory(GiB)": 146.85, "step": 56810, "train_speed(iter/s)": 0.202264 }, { "acc": 0.79445724, "epoch": 1.3256536257454905, "grad_norm": 4.96875, "learning_rate": 2.696364578799028e-06, "loss": 0.71787348, "memory(GiB)": 146.85, "step": 56820, "train_speed(iter/s)": 0.202283 }, { "acc": 0.77634611, "epoch": 1.3258869333177796, "grad_norm": 5.65625, "learning_rate": 2.694688077603207e-06, "loss": 0.82097635, "memory(GiB)": 146.85, "step": 56830, "train_speed(iter/s)": 0.202301 }, { "acc": 0.78219113, "epoch": 1.3261202408900683, "grad_norm": 5.625, "learning_rate": 2.6930119054899905e-06, "loss": 0.79426031, "memory(GiB)": 146.85, "step": 56840, "train_speed(iter/s)": 0.202319 }, { "acc": 0.78868895, "epoch": 1.3263535484623572, "grad_norm": 6.71875, "learning_rate": 2.6913360626986575e-06, "loss": 0.76916485, "memory(GiB)": 146.85, "step": 56850, "train_speed(iter/s)": 0.202337 }, { "acc": 0.78204069, "epoch": 1.326586856034646, "grad_norm": 5.3125, "learning_rate": 2.68966054946843e-06, "loss": 0.76761818, "memory(GiB)": 146.85, "step": 56860, "train_speed(iter/s)": 0.202356 }, { "acc": 0.7840148, "epoch": 1.326820163606935, "grad_norm": 4.84375, "learning_rate": 2.687985366038486e-06, "loss": 0.76348834, "memory(GiB)": 146.85, "step": 56870, "train_speed(iter/s)": 0.202376 }, { "acc": 0.77319527, "epoch": 1.327053471179224, "grad_norm": 4.78125, "learning_rate": 2.6863105126479616e-06, "loss": 0.8076972, "memory(GiB)": 146.85, "step": 56880, "train_speed(iter/s)": 0.202395 }, { "acc": 0.76160183, "epoch": 1.3272867787515128, "grad_norm": 6.84375, "learning_rate": 2.6846359895359373e-06, "loss": 0.86586304, "memory(GiB)": 146.85, "step": 56890, "train_speed(iter/s)": 0.202414 }, { "acc": 0.78182874, "epoch": 1.3275200863238017, "grad_norm": 4.90625, "learning_rate": 2.682961796941456e-06, "loss": 0.78389187, "memory(GiB)": 146.85, "step": 56900, "train_speed(iter/s)": 0.202432 }, { "acc": 0.7813993, "epoch": 1.3277533938960906, "grad_norm": 4.0625, "learning_rate": 2.6812879351035015e-06, "loss": 0.76933894, "memory(GiB)": 146.85, "step": 56910, "train_speed(iter/s)": 0.202451 }, { "acc": 0.77860384, "epoch": 1.3279867014683795, "grad_norm": 5.15625, "learning_rate": 2.679614404261023e-06, "loss": 0.79533873, "memory(GiB)": 146.85, "step": 56920, "train_speed(iter/s)": 0.202469 }, { "acc": 0.79444194, "epoch": 1.3282200090406684, "grad_norm": 4.875, "learning_rate": 2.677941204652914e-06, "loss": 0.73389244, "memory(GiB)": 146.85, "step": 56930, "train_speed(iter/s)": 0.202487 }, { "acc": 0.76796999, "epoch": 1.3284533166129573, "grad_norm": 5.96875, "learning_rate": 2.676268336518024e-06, "loss": 0.83531208, "memory(GiB)": 146.85, "step": 56940, "train_speed(iter/s)": 0.202506 }, { "acc": 0.76294069, "epoch": 1.3286866241852462, "grad_norm": 3.9375, "learning_rate": 2.6745958000951546e-06, "loss": 0.8432888, "memory(GiB)": 146.85, "step": 56950, "train_speed(iter/s)": 0.202524 }, { "acc": 0.77080526, "epoch": 1.328919931757535, "grad_norm": 5.25, "learning_rate": 2.672923595623056e-06, "loss": 0.80147533, "memory(GiB)": 146.85, "step": 56960, "train_speed(iter/s)": 0.202543 }, { "acc": 0.78746467, "epoch": 1.329153239329824, "grad_norm": 4.4375, "learning_rate": 2.67125172334044e-06, "loss": 0.77326365, "memory(GiB)": 146.85, "step": 56970, "train_speed(iter/s)": 0.20256 }, { "acc": 0.77155142, "epoch": 1.3293865469021129, "grad_norm": 7.46875, "learning_rate": 2.669580183485963e-06, "loss": 0.83571501, "memory(GiB)": 146.85, "step": 56980, "train_speed(iter/s)": 0.202578 }, { "acc": 0.76695776, "epoch": 1.3296198544744018, "grad_norm": 5.0625, "learning_rate": 2.667908976298239e-06, "loss": 0.83726654, "memory(GiB)": 146.85, "step": 56990, "train_speed(iter/s)": 0.202597 }, { "acc": 0.76479712, "epoch": 1.3298531620466907, "grad_norm": 10.625, "learning_rate": 2.666238102015832e-06, "loss": 0.86180153, "memory(GiB)": 146.85, "step": 57000, "train_speed(iter/s)": 0.202615 }, { "epoch": 1.3298531620466907, "eval_acc": 0.7352080839241172, "eval_loss": 0.8340936303138733, "eval_runtime": 1264.2547, "eval_samples_per_second": 28.468, "eval_steps_per_second": 14.234, "step": 57000 }, { "acc": 0.78417044, "epoch": 1.3300864696189796, "grad_norm": 6.0625, "learning_rate": 2.6645675608772554e-06, "loss": 0.76926279, "memory(GiB)": 146.85, "step": 57010, "train_speed(iter/s)": 0.201708 }, { "acc": 0.7757329, "epoch": 1.3303197771912685, "grad_norm": 4.96875, "learning_rate": 2.662897353120983e-06, "loss": 0.80028658, "memory(GiB)": 146.85, "step": 57020, "train_speed(iter/s)": 0.201727 }, { "acc": 0.78579822, "epoch": 1.3305530847635574, "grad_norm": 9.3125, "learning_rate": 2.6612274789854326e-06, "loss": 0.7561882, "memory(GiB)": 146.85, "step": 57030, "train_speed(iter/s)": 0.201746 }, { "acc": 0.77751579, "epoch": 1.3307863923358463, "grad_norm": 4.34375, "learning_rate": 2.659557938708982e-06, "loss": 0.81424007, "memory(GiB)": 146.85, "step": 57040, "train_speed(iter/s)": 0.201764 }, { "acc": 0.78268909, "epoch": 1.3310196999081352, "grad_norm": 10.4375, "learning_rate": 2.657888732529956e-06, "loss": 0.78520036, "memory(GiB)": 146.85, "step": 57050, "train_speed(iter/s)": 0.201782 }, { "acc": 0.76730194, "epoch": 1.331253007480424, "grad_norm": 5.0, "learning_rate": 2.656219860686633e-06, "loss": 0.84456968, "memory(GiB)": 146.85, "step": 57060, "train_speed(iter/s)": 0.201801 }, { "acc": 0.77129574, "epoch": 1.331486315052713, "grad_norm": 8.5, "learning_rate": 2.6545513234172413e-06, "loss": 0.83837032, "memory(GiB)": 146.85, "step": 57070, "train_speed(iter/s)": 0.201819 }, { "acc": 0.7784894, "epoch": 1.3317196226250019, "grad_norm": 5.28125, "learning_rate": 2.65288312095997e-06, "loss": 0.802034, "memory(GiB)": 146.85, "step": 57080, "train_speed(iter/s)": 0.201838 }, { "acc": 0.77504759, "epoch": 1.3319529301972908, "grad_norm": 5.90625, "learning_rate": 2.651215253552951e-06, "loss": 0.81125851, "memory(GiB)": 146.85, "step": 57090, "train_speed(iter/s)": 0.201857 }, { "acc": 0.77672825, "epoch": 1.3321862377695797, "grad_norm": 7.03125, "learning_rate": 2.6495477214342704e-06, "loss": 0.7942831, "memory(GiB)": 146.85, "step": 57100, "train_speed(iter/s)": 0.201875 }, { "acc": 0.76061492, "epoch": 1.3324195453418686, "grad_norm": 5.625, "learning_rate": 2.647880524841971e-06, "loss": 0.86404123, "memory(GiB)": 146.85, "step": 57110, "train_speed(iter/s)": 0.201894 }, { "acc": 0.78190913, "epoch": 1.3326528529141575, "grad_norm": 5.09375, "learning_rate": 2.646213664014042e-06, "loss": 0.78220739, "memory(GiB)": 146.85, "step": 57120, "train_speed(iter/s)": 0.201913 }, { "acc": 0.76264815, "epoch": 1.3328861604864464, "grad_norm": 5.1875, "learning_rate": 2.6445471391884304e-06, "loss": 0.86811733, "memory(GiB)": 146.85, "step": 57130, "train_speed(iter/s)": 0.201931 }, { "acc": 0.76050758, "epoch": 1.3331194680587353, "grad_norm": 5.0, "learning_rate": 2.6428809506030306e-06, "loss": 0.86981554, "memory(GiB)": 146.85, "step": 57140, "train_speed(iter/s)": 0.201949 }, { "acc": 0.78228064, "epoch": 1.333352775631024, "grad_norm": 6.1875, "learning_rate": 2.641215098495688e-06, "loss": 0.78431435, "memory(GiB)": 146.85, "step": 57150, "train_speed(iter/s)": 0.201968 }, { "acc": 0.76421413, "epoch": 1.333586083203313, "grad_norm": 5.5625, "learning_rate": 2.639549583104209e-06, "loss": 0.84939976, "memory(GiB)": 146.85, "step": 57160, "train_speed(iter/s)": 0.201987 }, { "acc": 0.78023791, "epoch": 1.3338193907756017, "grad_norm": 6.3125, "learning_rate": 2.6378844046663375e-06, "loss": 0.81349955, "memory(GiB)": 146.85, "step": 57170, "train_speed(iter/s)": 0.202005 }, { "acc": 0.78771105, "epoch": 1.3340526983478909, "grad_norm": 5.6875, "learning_rate": 2.636219563419783e-06, "loss": 0.75435238, "memory(GiB)": 146.85, "step": 57180, "train_speed(iter/s)": 0.202024 }, { "acc": 0.79091015, "epoch": 1.3342860059201795, "grad_norm": 5.71875, "learning_rate": 2.6345550596021967e-06, "loss": 0.75712614, "memory(GiB)": 146.85, "step": 57190, "train_speed(iter/s)": 0.202043 }, { "acc": 0.79447069, "epoch": 1.3345193134924687, "grad_norm": 4.65625, "learning_rate": 2.632890893451191e-06, "loss": 0.74264135, "memory(GiB)": 146.85, "step": 57200, "train_speed(iter/s)": 0.202061 }, { "acc": 0.79602065, "epoch": 1.3347526210647573, "grad_norm": 4.65625, "learning_rate": 2.63122706520432e-06, "loss": 0.73450451, "memory(GiB)": 146.85, "step": 57210, "train_speed(iter/s)": 0.20208 }, { "acc": 0.76553984, "epoch": 1.3349859286370465, "grad_norm": 5.4375, "learning_rate": 2.6295635750990998e-06, "loss": 0.84088631, "memory(GiB)": 146.85, "step": 57220, "train_speed(iter/s)": 0.202099 }, { "acc": 0.7754045, "epoch": 1.3352192362093351, "grad_norm": 6.03125, "learning_rate": 2.627900423372991e-06, "loss": 0.80108719, "memory(GiB)": 146.85, "step": 57230, "train_speed(iter/s)": 0.202116 }, { "acc": 0.77270603, "epoch": 1.335452543781624, "grad_norm": 5.4375, "learning_rate": 2.626237610263406e-06, "loss": 0.80022488, "memory(GiB)": 146.85, "step": 57240, "train_speed(iter/s)": 0.202135 }, { "acc": 0.77814255, "epoch": 1.335685851353913, "grad_norm": 7.71875, "learning_rate": 2.6245751360077133e-06, "loss": 0.79491158, "memory(GiB)": 146.85, "step": 57250, "train_speed(iter/s)": 0.202153 }, { "acc": 0.75988216, "epoch": 1.3359191589262018, "grad_norm": 6.96875, "learning_rate": 2.622913000843228e-06, "loss": 0.8737299, "memory(GiB)": 146.85, "step": 57260, "train_speed(iter/s)": 0.202172 }, { "acc": 0.75729456, "epoch": 1.3361524664984907, "grad_norm": 5.6875, "learning_rate": 2.6212512050072236e-06, "loss": 0.87122145, "memory(GiB)": 146.85, "step": 57270, "train_speed(iter/s)": 0.20219 }, { "acc": 0.76784248, "epoch": 1.3363857740707796, "grad_norm": 6.15625, "learning_rate": 2.6195897487369195e-06, "loss": 0.81927471, "memory(GiB)": 146.85, "step": 57280, "train_speed(iter/s)": 0.202209 }, { "acc": 0.76948805, "epoch": 1.3366190816430685, "grad_norm": 5.46875, "learning_rate": 2.6179286322694866e-06, "loss": 0.83230247, "memory(GiB)": 146.85, "step": 57290, "train_speed(iter/s)": 0.202227 }, { "acc": 0.77213488, "epoch": 1.3368523892153574, "grad_norm": 6.3125, "learning_rate": 2.6162678558420484e-06, "loss": 0.82116394, "memory(GiB)": 146.85, "step": 57300, "train_speed(iter/s)": 0.202246 }, { "acc": 0.75976543, "epoch": 1.3370856967876463, "grad_norm": 4.5625, "learning_rate": 2.6146074196916806e-06, "loss": 0.8641489, "memory(GiB)": 146.85, "step": 57310, "train_speed(iter/s)": 0.202264 }, { "acc": 0.7844964, "epoch": 1.3373190043599352, "grad_norm": 5.15625, "learning_rate": 2.6129473240554126e-06, "loss": 0.76430759, "memory(GiB)": 146.85, "step": 57320, "train_speed(iter/s)": 0.202283 }, { "acc": 0.7573122, "epoch": 1.3375523119322241, "grad_norm": 14.9375, "learning_rate": 2.6112875691702176e-06, "loss": 0.8514308, "memory(GiB)": 146.85, "step": 57330, "train_speed(iter/s)": 0.202303 }, { "acc": 0.78351231, "epoch": 1.337785619504513, "grad_norm": 4.65625, "learning_rate": 2.609628155273032e-06, "loss": 0.79090672, "memory(GiB)": 146.85, "step": 57340, "train_speed(iter/s)": 0.202321 }, { "acc": 0.75743074, "epoch": 1.338018927076802, "grad_norm": 5.21875, "learning_rate": 2.6079690826007307e-06, "loss": 0.90133219, "memory(GiB)": 146.85, "step": 57350, "train_speed(iter/s)": 0.20234 }, { "acc": 0.77045588, "epoch": 1.3382522346490908, "grad_norm": 5.125, "learning_rate": 2.606310351390148e-06, "loss": 0.82225094, "memory(GiB)": 146.85, "step": 57360, "train_speed(iter/s)": 0.202359 }, { "acc": 0.77174387, "epoch": 1.3384855422213797, "grad_norm": 5.65625, "learning_rate": 2.6046519618780673e-06, "loss": 0.79365168, "memory(GiB)": 146.85, "step": 57370, "train_speed(iter/s)": 0.202378 }, { "acc": 0.77681479, "epoch": 1.3387188497936686, "grad_norm": 7.1875, "learning_rate": 2.6029939143012228e-06, "loss": 0.81753159, "memory(GiB)": 146.85, "step": 57380, "train_speed(iter/s)": 0.202396 }, { "acc": 0.78976879, "epoch": 1.3389521573659575, "grad_norm": 6.21875, "learning_rate": 2.601336208896304e-06, "loss": 0.75942373, "memory(GiB)": 146.85, "step": 57390, "train_speed(iter/s)": 0.202414 }, { "acc": 0.78169413, "epoch": 1.3391854649382464, "grad_norm": 5.34375, "learning_rate": 2.5996788458999404e-06, "loss": 0.77588615, "memory(GiB)": 146.85, "step": 57400, "train_speed(iter/s)": 0.202432 }, { "acc": 0.76722937, "epoch": 1.3394187725105353, "grad_norm": 5.125, "learning_rate": 2.598021825548727e-06, "loss": 0.83739452, "memory(GiB)": 146.85, "step": 57410, "train_speed(iter/s)": 0.202451 }, { "acc": 0.78708777, "epoch": 1.3396520800828242, "grad_norm": 5.375, "learning_rate": 2.596365148079197e-06, "loss": 0.74823141, "memory(GiB)": 146.85, "step": 57420, "train_speed(iter/s)": 0.202469 }, { "acc": 0.76486263, "epoch": 1.3398853876551131, "grad_norm": 5.625, "learning_rate": 2.594708813727847e-06, "loss": 0.84189825, "memory(GiB)": 146.85, "step": 57430, "train_speed(iter/s)": 0.202487 }, { "acc": 0.79259353, "epoch": 1.340118695227402, "grad_norm": 7.34375, "learning_rate": 2.5930528227311148e-06, "loss": 0.73081436, "memory(GiB)": 146.85, "step": 57440, "train_speed(iter/s)": 0.202504 }, { "acc": 0.77775936, "epoch": 1.340352002799691, "grad_norm": 5.40625, "learning_rate": 2.591397175325391e-06, "loss": 0.77562838, "memory(GiB)": 146.85, "step": 57450, "train_speed(iter/s)": 0.20252 }, { "acc": 0.77927217, "epoch": 1.3405853103719798, "grad_norm": 6.8125, "learning_rate": 2.5897418717470224e-06, "loss": 0.79521008, "memory(GiB)": 146.85, "step": 57460, "train_speed(iter/s)": 0.202538 }, { "acc": 0.76718936, "epoch": 1.3408186179442687, "grad_norm": 5.21875, "learning_rate": 2.5880869122322994e-06, "loss": 0.83275862, "memory(GiB)": 146.85, "step": 57470, "train_speed(iter/s)": 0.202556 }, { "acc": 0.75768814, "epoch": 1.3410519255165576, "grad_norm": 6.625, "learning_rate": 2.5864322970174714e-06, "loss": 0.88648567, "memory(GiB)": 146.85, "step": 57480, "train_speed(iter/s)": 0.202575 }, { "acc": 0.77651539, "epoch": 1.3412852330888465, "grad_norm": 4.375, "learning_rate": 2.5847780263387314e-06, "loss": 0.79773855, "memory(GiB)": 146.85, "step": 57490, "train_speed(iter/s)": 0.202594 }, { "acc": 0.76871133, "epoch": 1.3415185406611354, "grad_norm": 8.4375, "learning_rate": 2.583124100432227e-06, "loss": 0.83304138, "memory(GiB)": 146.85, "step": 57500, "train_speed(iter/s)": 0.202612 }, { "epoch": 1.3415185406611354, "eval_acc": 0.7351792035469621, "eval_loss": 0.8340417146682739, "eval_runtime": 1263.1724, "eval_samples_per_second": 28.493, "eval_steps_per_second": 14.247, "step": 57500 }, { "acc": 0.76075335, "epoch": 1.3417518482334243, "grad_norm": 6.21875, "learning_rate": 2.5814705195340527e-06, "loss": 0.86611719, "memory(GiB)": 146.85, "step": 57510, "train_speed(iter/s)": 0.201713 }, { "acc": 0.7715024, "epoch": 1.341985155805713, "grad_norm": 5.5625, "learning_rate": 2.5798172838802616e-06, "loss": 0.81158619, "memory(GiB)": 146.85, "step": 57520, "train_speed(iter/s)": 0.201732 }, { "acc": 0.77010374, "epoch": 1.342218463378002, "grad_norm": 6.625, "learning_rate": 2.5781643937068495e-06, "loss": 0.82078457, "memory(GiB)": 146.85, "step": 57530, "train_speed(iter/s)": 0.20175 }, { "acc": 0.77641397, "epoch": 1.3424517709502908, "grad_norm": 5.59375, "learning_rate": 2.5765118492497654e-06, "loss": 0.79472337, "memory(GiB)": 146.85, "step": 57540, "train_speed(iter/s)": 0.201768 }, { "acc": 0.77185879, "epoch": 1.34268507852258, "grad_norm": 5.46875, "learning_rate": 2.5748596507449118e-06, "loss": 0.82676964, "memory(GiB)": 146.85, "step": 57550, "train_speed(iter/s)": 0.201787 }, { "acc": 0.7630435, "epoch": 1.3429183860948686, "grad_norm": 6.0, "learning_rate": 2.5732077984281378e-06, "loss": 0.85849361, "memory(GiB)": 146.85, "step": 57560, "train_speed(iter/s)": 0.201806 }, { "acc": 0.78282437, "epoch": 1.3431516936671577, "grad_norm": 5.5, "learning_rate": 2.571556292535247e-06, "loss": 0.7852932, "memory(GiB)": 146.85, "step": 57570, "train_speed(iter/s)": 0.201824 }, { "acc": 0.77315283, "epoch": 1.3433850012394464, "grad_norm": 7.875, "learning_rate": 2.5699051333019897e-06, "loss": 0.83648777, "memory(GiB)": 146.85, "step": 57580, "train_speed(iter/s)": 0.201841 }, { "acc": 0.77092285, "epoch": 1.3436183088117355, "grad_norm": 5.75, "learning_rate": 2.568254320964067e-06, "loss": 0.825875, "memory(GiB)": 146.85, "step": 57590, "train_speed(iter/s)": 0.20186 }, { "acc": 0.77275171, "epoch": 1.3438516163840242, "grad_norm": 6.84375, "learning_rate": 2.5666038557571355e-06, "loss": 0.84104958, "memory(GiB)": 146.85, "step": 57600, "train_speed(iter/s)": 0.201877 }, { "acc": 0.76711826, "epoch": 1.3440849239563133, "grad_norm": 4.3125, "learning_rate": 2.5649537379167944e-06, "loss": 0.83654184, "memory(GiB)": 146.85, "step": 57610, "train_speed(iter/s)": 0.201894 }, { "acc": 0.76739502, "epoch": 1.344318231528602, "grad_norm": 5.59375, "learning_rate": 2.5633039676786044e-06, "loss": 0.82592249, "memory(GiB)": 146.85, "step": 57620, "train_speed(iter/s)": 0.201912 }, { "acc": 0.78554687, "epoch": 1.3445515391008909, "grad_norm": 6.0, "learning_rate": 2.5616545452780607e-06, "loss": 0.7641613, "memory(GiB)": 146.85, "step": 57630, "train_speed(iter/s)": 0.201931 }, { "acc": 0.77446098, "epoch": 1.3447848466731798, "grad_norm": 5.96875, "learning_rate": 2.5600054709506244e-06, "loss": 0.82534389, "memory(GiB)": 146.85, "step": 57640, "train_speed(iter/s)": 0.201948 }, { "acc": 0.77806883, "epoch": 1.3450181542454687, "grad_norm": 5.21875, "learning_rate": 2.5583567449316983e-06, "loss": 0.77685623, "memory(GiB)": 146.85, "step": 57650, "train_speed(iter/s)": 0.201967 }, { "acc": 0.76826429, "epoch": 1.3452514618177576, "grad_norm": 6.28125, "learning_rate": 2.5567083674566363e-06, "loss": 0.84580879, "memory(GiB)": 146.85, "step": 57660, "train_speed(iter/s)": 0.201986 }, { "acc": 0.78148241, "epoch": 1.3454847693900465, "grad_norm": 5.1875, "learning_rate": 2.555060338760746e-06, "loss": 0.77003431, "memory(GiB)": 146.85, "step": 57670, "train_speed(iter/s)": 0.202005 }, { "acc": 0.76498985, "epoch": 1.3457180769623354, "grad_norm": 10.3125, "learning_rate": 2.553412659079281e-06, "loss": 0.87571726, "memory(GiB)": 146.85, "step": 57680, "train_speed(iter/s)": 0.202023 }, { "acc": 0.77421026, "epoch": 1.3459513845346243, "grad_norm": 5.09375, "learning_rate": 2.5517653286474486e-06, "loss": 0.80097246, "memory(GiB)": 146.85, "step": 57690, "train_speed(iter/s)": 0.20204 }, { "acc": 0.75908651, "epoch": 1.3461846921069132, "grad_norm": 8.625, "learning_rate": 2.5501183477004036e-06, "loss": 0.89296494, "memory(GiB)": 146.85, "step": 57700, "train_speed(iter/s)": 0.202059 }, { "acc": 0.77634516, "epoch": 1.346417999679202, "grad_norm": 5.0, "learning_rate": 2.548471716473255e-06, "loss": 0.79046507, "memory(GiB)": 146.85, "step": 57710, "train_speed(iter/s)": 0.202077 }, { "acc": 0.76660461, "epoch": 1.346651307251491, "grad_norm": 5.59375, "learning_rate": 2.546825435201056e-06, "loss": 0.8458168, "memory(GiB)": 146.85, "step": 57720, "train_speed(iter/s)": 0.202094 }, { "acc": 0.76195168, "epoch": 1.3468846148237799, "grad_norm": 5.46875, "learning_rate": 2.5451795041188137e-06, "loss": 0.84371548, "memory(GiB)": 146.85, "step": 57730, "train_speed(iter/s)": 0.202113 }, { "acc": 0.80459127, "epoch": 1.3471179223960688, "grad_norm": 8.5, "learning_rate": 2.543533923461484e-06, "loss": 0.69416728, "memory(GiB)": 146.85, "step": 57740, "train_speed(iter/s)": 0.202131 }, { "acc": 0.77578316, "epoch": 1.3473512299683577, "grad_norm": 5.125, "learning_rate": 2.541888693463971e-06, "loss": 0.79683714, "memory(GiB)": 146.85, "step": 57750, "train_speed(iter/s)": 0.20215 }, { "acc": 0.77123785, "epoch": 1.3475845375406466, "grad_norm": 5.46875, "learning_rate": 2.540243814361135e-06, "loss": 0.80429707, "memory(GiB)": 146.85, "step": 57760, "train_speed(iter/s)": 0.202169 }, { "acc": 0.77327309, "epoch": 1.3478178451129355, "grad_norm": 7.3125, "learning_rate": 2.5385992863877783e-06, "loss": 0.81788502, "memory(GiB)": 146.85, "step": 57770, "train_speed(iter/s)": 0.202188 }, { "acc": 0.77107797, "epoch": 1.3480511526852244, "grad_norm": 8.0625, "learning_rate": 2.5369551097786606e-06, "loss": 0.80781374, "memory(GiB)": 146.85, "step": 57780, "train_speed(iter/s)": 0.202206 }, { "acc": 0.79030385, "epoch": 1.3482844602575133, "grad_norm": 4.875, "learning_rate": 2.5353112847684846e-06, "loss": 0.77795358, "memory(GiB)": 146.85, "step": 57790, "train_speed(iter/s)": 0.202223 }, { "acc": 0.76702485, "epoch": 1.3485177678298021, "grad_norm": 5.0, "learning_rate": 2.5336678115919056e-06, "loss": 0.83924885, "memory(GiB)": 146.85, "step": 57800, "train_speed(iter/s)": 0.202242 }, { "acc": 0.77809467, "epoch": 1.348751075402091, "grad_norm": 5.53125, "learning_rate": 2.532024690483531e-06, "loss": 0.77355185, "memory(GiB)": 146.85, "step": 57810, "train_speed(iter/s)": 0.202261 }, { "acc": 0.76766558, "epoch": 1.34898438297438, "grad_norm": 5.9375, "learning_rate": 2.5303819216779134e-06, "loss": 0.82976837, "memory(GiB)": 146.85, "step": 57820, "train_speed(iter/s)": 0.202281 }, { "acc": 0.77709665, "epoch": 1.3492176905466688, "grad_norm": 6.0625, "learning_rate": 2.528739505409561e-06, "loss": 0.82212677, "memory(GiB)": 146.85, "step": 57830, "train_speed(iter/s)": 0.2023 }, { "acc": 0.79226685, "epoch": 1.3494509981189577, "grad_norm": 4.34375, "learning_rate": 2.5270974419129248e-06, "loss": 0.7562665, "memory(GiB)": 146.85, "step": 57840, "train_speed(iter/s)": 0.202318 }, { "acc": 0.76442947, "epoch": 1.3496843056912466, "grad_norm": 6.0625, "learning_rate": 2.525455731422414e-06, "loss": 0.84800978, "memory(GiB)": 146.85, "step": 57850, "train_speed(iter/s)": 0.202337 }, { "acc": 0.75936422, "epoch": 1.3499176132635355, "grad_norm": 8.875, "learning_rate": 2.5238143741723743e-06, "loss": 0.87453794, "memory(GiB)": 146.85, "step": 57860, "train_speed(iter/s)": 0.202354 }, { "acc": 0.78516855, "epoch": 1.3501509208358244, "grad_norm": 6.5625, "learning_rate": 2.5221733703971165e-06, "loss": 0.77638044, "memory(GiB)": 146.85, "step": 57870, "train_speed(iter/s)": 0.202372 }, { "acc": 0.75529232, "epoch": 1.3503842284081133, "grad_norm": 5.21875, "learning_rate": 2.5205327203308887e-06, "loss": 0.85896921, "memory(GiB)": 146.85, "step": 57880, "train_speed(iter/s)": 0.202391 }, { "acc": 0.78393526, "epoch": 1.3506175359804022, "grad_norm": 6.21875, "learning_rate": 2.518892424207894e-06, "loss": 0.78194494, "memory(GiB)": 146.85, "step": 57890, "train_speed(iter/s)": 0.202409 }, { "acc": 0.75131269, "epoch": 1.3508508435526911, "grad_norm": 7.28125, "learning_rate": 2.517252482262286e-06, "loss": 0.91308918, "memory(GiB)": 146.85, "step": 57900, "train_speed(iter/s)": 0.202427 }, { "acc": 0.78062897, "epoch": 1.3510841511249798, "grad_norm": 5.0625, "learning_rate": 2.515612894728164e-06, "loss": 0.7826385, "memory(GiB)": 146.85, "step": 57910, "train_speed(iter/s)": 0.202445 }, { "acc": 0.77247114, "epoch": 1.351317458697269, "grad_norm": 9.0625, "learning_rate": 2.5139736618395804e-06, "loss": 0.82136927, "memory(GiB)": 146.85, "step": 57920, "train_speed(iter/s)": 0.202464 }, { "acc": 0.76308184, "epoch": 1.3515507662695576, "grad_norm": 7.25, "learning_rate": 2.5123347838305354e-06, "loss": 0.84461517, "memory(GiB)": 146.85, "step": 57930, "train_speed(iter/s)": 0.202482 }, { "acc": 0.76814613, "epoch": 1.3517840738418467, "grad_norm": 6.03125, "learning_rate": 2.510696260934975e-06, "loss": 0.82353039, "memory(GiB)": 146.85, "step": 57940, "train_speed(iter/s)": 0.2025 }, { "acc": 0.77356782, "epoch": 1.3520173814141354, "grad_norm": 7.40625, "learning_rate": 2.509058093386802e-06, "loss": 0.83272495, "memory(GiB)": 146.85, "step": 57950, "train_speed(iter/s)": 0.202518 }, { "acc": 0.73344245, "epoch": 1.3522506889864245, "grad_norm": 5.59375, "learning_rate": 2.507420281419862e-06, "loss": 0.97485561, "memory(GiB)": 146.85, "step": 57960, "train_speed(iter/s)": 0.202535 }, { "acc": 0.75813661, "epoch": 1.3524839965587132, "grad_norm": 6.8125, "learning_rate": 2.505782825267954e-06, "loss": 0.88210669, "memory(GiB)": 146.85, "step": 57970, "train_speed(iter/s)": 0.202553 }, { "acc": 0.7726418, "epoch": 1.3527173041310023, "grad_norm": 5.875, "learning_rate": 2.5041457251648204e-06, "loss": 0.82399445, "memory(GiB)": 146.85, "step": 57980, "train_speed(iter/s)": 0.202571 }, { "acc": 0.75660238, "epoch": 1.352950611703291, "grad_norm": 5.59375, "learning_rate": 2.502508981344162e-06, "loss": 0.88580618, "memory(GiB)": 146.85, "step": 57990, "train_speed(iter/s)": 0.20259 }, { "acc": 0.77243156, "epoch": 1.35318391927558, "grad_norm": 5.8125, "learning_rate": 2.5008725940396182e-06, "loss": 0.81888351, "memory(GiB)": 146.85, "step": 58000, "train_speed(iter/s)": 0.202608 }, { "epoch": 1.35318391927558, "eval_acc": 0.7352087292956737, "eval_loss": 0.8340564370155334, "eval_runtime": 1262.3106, "eval_samples_per_second": 28.512, "eval_steps_per_second": 14.256, "step": 58000 }, { "acc": 0.77138596, "epoch": 1.3534172268478688, "grad_norm": 5.46875, "learning_rate": 2.499236563484788e-06, "loss": 0.8291029, "memory(GiB)": 146.85, "step": 58010, "train_speed(iter/s)": 0.201718 }, { "acc": 0.79540548, "epoch": 1.3536505344201577, "grad_norm": 5.09375, "learning_rate": 2.4976008899132122e-06, "loss": 0.7281949, "memory(GiB)": 146.85, "step": 58020, "train_speed(iter/s)": 0.201736 }, { "acc": 0.7808846, "epoch": 1.3538838419924466, "grad_norm": 5.71875, "learning_rate": 2.49596557355838e-06, "loss": 0.78136415, "memory(GiB)": 146.85, "step": 58030, "train_speed(iter/s)": 0.201754 }, { "acc": 0.76097174, "epoch": 1.3541171495647355, "grad_norm": 7.125, "learning_rate": 2.4943306146537365e-06, "loss": 0.8631609, "memory(GiB)": 146.85, "step": 58040, "train_speed(iter/s)": 0.201773 }, { "acc": 0.75908422, "epoch": 1.3543504571370244, "grad_norm": 6.65625, "learning_rate": 2.4926960134326684e-06, "loss": 0.86481152, "memory(GiB)": 146.85, "step": 58050, "train_speed(iter/s)": 0.201792 }, { "acc": 0.76684632, "epoch": 1.3545837647093133, "grad_norm": 5.90625, "learning_rate": 2.491061770128518e-06, "loss": 0.83764725, "memory(GiB)": 146.85, "step": 58060, "train_speed(iter/s)": 0.20181 }, { "acc": 0.76290073, "epoch": 1.3548170722816022, "grad_norm": 7.625, "learning_rate": 2.4894278849745705e-06, "loss": 0.85553226, "memory(GiB)": 146.85, "step": 58070, "train_speed(iter/s)": 0.201828 }, { "acc": 0.76868467, "epoch": 1.355050379853891, "grad_norm": 5.84375, "learning_rate": 2.4877943582040636e-06, "loss": 0.83967438, "memory(GiB)": 146.85, "step": 58080, "train_speed(iter/s)": 0.201846 }, { "acc": 0.77324643, "epoch": 1.35528368742618, "grad_norm": 5.65625, "learning_rate": 2.486161190050182e-06, "loss": 0.81810341, "memory(GiB)": 146.85, "step": 58090, "train_speed(iter/s)": 0.201865 }, { "acc": 0.77702827, "epoch": 1.355516994998469, "grad_norm": 6.1875, "learning_rate": 2.4845283807460587e-06, "loss": 0.78744555, "memory(GiB)": 146.85, "step": 58100, "train_speed(iter/s)": 0.201884 }, { "acc": 0.77251873, "epoch": 1.3557503025707578, "grad_norm": 5.59375, "learning_rate": 2.4828959305247795e-06, "loss": 0.84302769, "memory(GiB)": 146.85, "step": 58110, "train_speed(iter/s)": 0.201903 }, { "acc": 0.78101106, "epoch": 1.3559836101430467, "grad_norm": 4.84375, "learning_rate": 2.4812638396193734e-06, "loss": 0.79228973, "memory(GiB)": 146.85, "step": 58120, "train_speed(iter/s)": 0.201921 }, { "acc": 0.77220306, "epoch": 1.3562169177153356, "grad_norm": 6.34375, "learning_rate": 2.479632108262825e-06, "loss": 0.80810776, "memory(GiB)": 146.85, "step": 58130, "train_speed(iter/s)": 0.201939 }, { "acc": 0.78334236, "epoch": 1.3564502252876245, "grad_norm": 5.46875, "learning_rate": 2.4780007366880584e-06, "loss": 0.77494097, "memory(GiB)": 146.85, "step": 58140, "train_speed(iter/s)": 0.201957 }, { "acc": 0.77392297, "epoch": 1.3566835328599134, "grad_norm": 5.90625, "learning_rate": 2.476369725127956e-06, "loss": 0.81856747, "memory(GiB)": 146.85, "step": 58150, "train_speed(iter/s)": 0.201975 }, { "acc": 0.76699877, "epoch": 1.3569168404322023, "grad_norm": 5.0, "learning_rate": 2.474739073815342e-06, "loss": 0.84811354, "memory(GiB)": 146.85, "step": 58160, "train_speed(iter/s)": 0.201993 }, { "acc": 0.75498753, "epoch": 1.3571501480044912, "grad_norm": 6.46875, "learning_rate": 2.47310878298299e-06, "loss": 0.90446119, "memory(GiB)": 146.85, "step": 58170, "train_speed(iter/s)": 0.202012 }, { "acc": 0.77441425, "epoch": 1.35738345557678, "grad_norm": 5.375, "learning_rate": 2.4714788528636275e-06, "loss": 0.80630226, "memory(GiB)": 146.85, "step": 58180, "train_speed(iter/s)": 0.202029 }, { "acc": 0.77943006, "epoch": 1.357616763149069, "grad_norm": 6.15625, "learning_rate": 2.4698492836899234e-06, "loss": 0.76328125, "memory(GiB)": 146.85, "step": 58190, "train_speed(iter/s)": 0.202047 }, { "acc": 0.76655579, "epoch": 1.3578500707213579, "grad_norm": 7.25, "learning_rate": 2.4682200756944997e-06, "loss": 0.85136986, "memory(GiB)": 146.85, "step": 58200, "train_speed(iter/s)": 0.202065 }, { "acc": 0.78485146, "epoch": 1.3580833782936468, "grad_norm": 9.3125, "learning_rate": 2.4665912291099225e-06, "loss": 0.74981208, "memory(GiB)": 146.85, "step": 58210, "train_speed(iter/s)": 0.202083 }, { "acc": 0.75859876, "epoch": 1.3583166858659357, "grad_norm": 5.8125, "learning_rate": 2.4649627441687134e-06, "loss": 0.88694801, "memory(GiB)": 146.85, "step": 58220, "train_speed(iter/s)": 0.2021 }, { "acc": 0.76297212, "epoch": 1.3585499934382246, "grad_norm": 7.59375, "learning_rate": 2.463334621103336e-06, "loss": 0.86032181, "memory(GiB)": 146.85, "step": 58230, "train_speed(iter/s)": 0.20212 }, { "acc": 0.75668125, "epoch": 1.3587833010105135, "grad_norm": 7.59375, "learning_rate": 2.461706860146203e-06, "loss": 0.86856327, "memory(GiB)": 146.85, "step": 58240, "train_speed(iter/s)": 0.202138 }, { "acc": 0.76667714, "epoch": 1.3590166085828024, "grad_norm": 5.875, "learning_rate": 2.4600794615296797e-06, "loss": 0.84297791, "memory(GiB)": 146.85, "step": 58250, "train_speed(iter/s)": 0.202156 }, { "acc": 0.76820774, "epoch": 1.3592499161550913, "grad_norm": 41.0, "learning_rate": 2.4584524254860736e-06, "loss": 0.86441174, "memory(GiB)": 146.85, "step": 58260, "train_speed(iter/s)": 0.202174 }, { "acc": 0.77475519, "epoch": 1.3594832237273802, "grad_norm": 5.21875, "learning_rate": 2.4568257522476476e-06, "loss": 0.82377558, "memory(GiB)": 146.85, "step": 58270, "train_speed(iter/s)": 0.202192 }, { "acc": 0.76116533, "epoch": 1.359716531299669, "grad_norm": 5.03125, "learning_rate": 2.455199442046607e-06, "loss": 0.86805935, "memory(GiB)": 146.85, "step": 58280, "train_speed(iter/s)": 0.20221 }, { "acc": 0.77617607, "epoch": 1.359949838871958, "grad_norm": 5.5625, "learning_rate": 2.453573495115104e-06, "loss": 0.78906813, "memory(GiB)": 146.85, "step": 58290, "train_speed(iter/s)": 0.202229 }, { "acc": 0.79638882, "epoch": 1.3601831464442466, "grad_norm": 6.3125, "learning_rate": 2.4519479116852476e-06, "loss": 0.72148094, "memory(GiB)": 146.85, "step": 58300, "train_speed(iter/s)": 0.202246 }, { "acc": 0.76910229, "epoch": 1.3604164540165358, "grad_norm": 5.375, "learning_rate": 2.450322691989086e-06, "loss": 0.8550087, "memory(GiB)": 146.85, "step": 58310, "train_speed(iter/s)": 0.202266 }, { "acc": 0.77437658, "epoch": 1.3606497615888244, "grad_norm": 7.28125, "learning_rate": 2.4486978362586196e-06, "loss": 0.8098999, "memory(GiB)": 146.85, "step": 58320, "train_speed(iter/s)": 0.202283 }, { "acc": 0.75899277, "epoch": 1.3608830691611136, "grad_norm": 5.5625, "learning_rate": 2.447073344725794e-06, "loss": 0.8773077, "memory(GiB)": 146.85, "step": 58330, "train_speed(iter/s)": 0.202301 }, { "acc": 0.7804595, "epoch": 1.3611163767334022, "grad_norm": 6.78125, "learning_rate": 2.4454492176225087e-06, "loss": 0.7956378, "memory(GiB)": 146.85, "step": 58340, "train_speed(iter/s)": 0.202319 }, { "acc": 0.77625275, "epoch": 1.3613496843056914, "grad_norm": 4.9375, "learning_rate": 2.4438254551806034e-06, "loss": 0.81270237, "memory(GiB)": 146.85, "step": 58350, "train_speed(iter/s)": 0.202336 }, { "acc": 0.77016234, "epoch": 1.36158299187798, "grad_norm": 5.21875, "learning_rate": 2.4422020576318737e-06, "loss": 0.83682156, "memory(GiB)": 146.85, "step": 58360, "train_speed(iter/s)": 0.202354 }, { "acc": 0.74679627, "epoch": 1.3618162994502692, "grad_norm": 5.78125, "learning_rate": 2.4405790252080576e-06, "loss": 0.93053207, "memory(GiB)": 146.85, "step": 58370, "train_speed(iter/s)": 0.202373 }, { "acc": 0.78679843, "epoch": 1.3620496070225578, "grad_norm": 4.46875, "learning_rate": 2.4389563581408397e-06, "loss": 0.76657686, "memory(GiB)": 146.85, "step": 58380, "train_speed(iter/s)": 0.20239 }, { "acc": 0.76557307, "epoch": 1.3622829145948467, "grad_norm": 7.625, "learning_rate": 2.4373340566618603e-06, "loss": 0.84852552, "memory(GiB)": 146.85, "step": 58390, "train_speed(iter/s)": 0.202408 }, { "acc": 0.77317982, "epoch": 1.3625162221671356, "grad_norm": 6.90625, "learning_rate": 2.435712121002698e-06, "loss": 0.83980141, "memory(GiB)": 146.85, "step": 58400, "train_speed(iter/s)": 0.202427 }, { "acc": 0.78961754, "epoch": 1.3627495297394245, "grad_norm": 9.625, "learning_rate": 2.4340905513948866e-06, "loss": 0.75868998, "memory(GiB)": 146.85, "step": 58410, "train_speed(iter/s)": 0.202445 }, { "acc": 0.78666725, "epoch": 1.3629828373117134, "grad_norm": 6.3125, "learning_rate": 2.432469348069904e-06, "loss": 0.76193657, "memory(GiB)": 146.85, "step": 58420, "train_speed(iter/s)": 0.202463 }, { "acc": 0.76860943, "epoch": 1.3632161448840023, "grad_norm": 5.5625, "learning_rate": 2.4308485112591764e-06, "loss": 0.83180285, "memory(GiB)": 146.85, "step": 58430, "train_speed(iter/s)": 0.202482 }, { "acc": 0.76599255, "epoch": 1.3634494524562912, "grad_norm": 6.15625, "learning_rate": 2.429228041194077e-06, "loss": 0.85792847, "memory(GiB)": 146.85, "step": 58440, "train_speed(iter/s)": 0.2025 }, { "acc": 0.74869199, "epoch": 1.3636827600285801, "grad_norm": 4.4375, "learning_rate": 2.4276079381059258e-06, "loss": 0.9255827, "memory(GiB)": 146.85, "step": 58450, "train_speed(iter/s)": 0.202519 }, { "acc": 0.75236444, "epoch": 1.363916067600869, "grad_norm": 7.09375, "learning_rate": 2.4259882022259968e-06, "loss": 0.91199055, "memory(GiB)": 146.85, "step": 58460, "train_speed(iter/s)": 0.202536 }, { "acc": 0.78696566, "epoch": 1.364149375173158, "grad_norm": 4.4375, "learning_rate": 2.424368833785502e-06, "loss": 0.76347742, "memory(GiB)": 146.85, "step": 58470, "train_speed(iter/s)": 0.202554 }, { "acc": 0.77617378, "epoch": 1.3643826827454468, "grad_norm": 5.46875, "learning_rate": 2.4227498330156095e-06, "loss": 0.81318655, "memory(GiB)": 146.85, "step": 58480, "train_speed(iter/s)": 0.202572 }, { "acc": 0.76788597, "epoch": 1.3646159903177357, "grad_norm": 5.75, "learning_rate": 2.421131200147428e-06, "loss": 0.83021622, "memory(GiB)": 146.85, "step": 58490, "train_speed(iter/s)": 0.20259 }, { "acc": 0.77245722, "epoch": 1.3648492978900246, "grad_norm": 4.65625, "learning_rate": 2.4195129354120204e-06, "loss": 0.82009945, "memory(GiB)": 146.85, "step": 58500, "train_speed(iter/s)": 0.202607 }, { "epoch": 1.3648492978900246, "eval_acc": 0.7352250249274763, "eval_loss": 0.8340473771095276, "eval_runtime": 1263.307, "eval_samples_per_second": 28.49, "eval_steps_per_second": 14.245, "step": 58500 }, { "acc": 0.77770028, "epoch": 1.3650826054623135, "grad_norm": 6.15625, "learning_rate": 2.4178950390403917e-06, "loss": 0.80343113, "memory(GiB)": 146.85, "step": 58510, "train_speed(iter/s)": 0.201724 }, { "acc": 0.760992, "epoch": 1.3653159130346024, "grad_norm": 8.0625, "learning_rate": 2.416277511263494e-06, "loss": 0.86614256, "memory(GiB)": 146.85, "step": 58520, "train_speed(iter/s)": 0.201741 }, { "acc": 0.76679487, "epoch": 1.3655492206068913, "grad_norm": 6.09375, "learning_rate": 2.4146603523122347e-06, "loss": 0.85150585, "memory(GiB)": 146.85, "step": 58530, "train_speed(iter/s)": 0.201758 }, { "acc": 0.78077612, "epoch": 1.3657825281791802, "grad_norm": 10.5625, "learning_rate": 2.413043562417456e-06, "loss": 0.80709887, "memory(GiB)": 146.85, "step": 58540, "train_speed(iter/s)": 0.201777 }, { "acc": 0.75831623, "epoch": 1.3660158357514691, "grad_norm": 6.84375, "learning_rate": 2.4114271418099583e-06, "loss": 0.89949121, "memory(GiB)": 146.85, "step": 58550, "train_speed(iter/s)": 0.201795 }, { "acc": 0.76411142, "epoch": 1.366249143323758, "grad_norm": 5.90625, "learning_rate": 2.4098110907204824e-06, "loss": 0.83706436, "memory(GiB)": 146.85, "step": 58560, "train_speed(iter/s)": 0.201813 }, { "acc": 0.75313578, "epoch": 1.366482450896047, "grad_norm": 5.21875, "learning_rate": 2.4081954093797234e-06, "loss": 0.91465969, "memory(GiB)": 146.85, "step": 58570, "train_speed(iter/s)": 0.201831 }, { "acc": 0.76545076, "epoch": 1.3667157584683358, "grad_norm": 5.9375, "learning_rate": 2.406580098018316e-06, "loss": 0.85291586, "memory(GiB)": 146.85, "step": 58580, "train_speed(iter/s)": 0.201849 }, { "acc": 0.77314634, "epoch": 1.3669490660406247, "grad_norm": 7.1875, "learning_rate": 2.4049651568668447e-06, "loss": 0.81438837, "memory(GiB)": 146.85, "step": 58590, "train_speed(iter/s)": 0.201867 }, { "acc": 0.76518364, "epoch": 1.3671823736129136, "grad_norm": 6.03125, "learning_rate": 2.403350586155845e-06, "loss": 0.85747757, "memory(GiB)": 146.85, "step": 58600, "train_speed(iter/s)": 0.201884 }, { "acc": 0.78190069, "epoch": 1.3674156811852025, "grad_norm": 6.625, "learning_rate": 2.4017363861157927e-06, "loss": 0.79305267, "memory(GiB)": 146.85, "step": 58610, "train_speed(iter/s)": 0.201903 }, { "acc": 0.77687893, "epoch": 1.3676489887574914, "grad_norm": 6.71875, "learning_rate": 2.400122556977119e-06, "loss": 0.80575132, "memory(GiB)": 146.85, "step": 58620, "train_speed(iter/s)": 0.201921 }, { "acc": 0.76041522, "epoch": 1.3678822963297803, "grad_norm": 5.9375, "learning_rate": 2.398509098970193e-06, "loss": 0.8743475, "memory(GiB)": 146.85, "step": 58630, "train_speed(iter/s)": 0.20194 }, { "acc": 0.78300838, "epoch": 1.3681156039020692, "grad_norm": 5.53125, "learning_rate": 2.3968960123253392e-06, "loss": 0.79275932, "memory(GiB)": 146.85, "step": 58640, "train_speed(iter/s)": 0.201956 }, { "acc": 0.76833982, "epoch": 1.368348911474358, "grad_norm": 5.8125, "learning_rate": 2.3952832972728234e-06, "loss": 0.8481823, "memory(GiB)": 146.85, "step": 58650, "train_speed(iter/s)": 0.201975 }, { "acc": 0.75686054, "epoch": 1.368582219046647, "grad_norm": 7.75, "learning_rate": 2.39367095404286e-06, "loss": 0.89792767, "memory(GiB)": 146.85, "step": 58660, "train_speed(iter/s)": 0.201993 }, { "acc": 0.7877861, "epoch": 1.368815526618936, "grad_norm": 11.5, "learning_rate": 2.392058982865611e-06, "loss": 0.75863361, "memory(GiB)": 146.85, "step": 58670, "train_speed(iter/s)": 0.202012 }, { "acc": 0.76908579, "epoch": 1.3690488341912248, "grad_norm": 6.25, "learning_rate": 2.3904473839711826e-06, "loss": 0.84558296, "memory(GiB)": 146.85, "step": 58680, "train_speed(iter/s)": 0.20203 }, { "acc": 0.78158913, "epoch": 1.3692821417635135, "grad_norm": 5.90625, "learning_rate": 2.388836157589634e-06, "loss": 0.7986536, "memory(GiB)": 146.85, "step": 58690, "train_speed(iter/s)": 0.202047 }, { "acc": 0.79226427, "epoch": 1.3695154493358026, "grad_norm": 5.6875, "learning_rate": 2.3872253039509637e-06, "loss": 0.75451908, "memory(GiB)": 146.85, "step": 58700, "train_speed(iter/s)": 0.202064 }, { "acc": 0.75562677, "epoch": 1.3697487569080913, "grad_norm": 6.09375, "learning_rate": 2.3856148232851237e-06, "loss": 0.91230831, "memory(GiB)": 146.85, "step": 58710, "train_speed(iter/s)": 0.202082 }, { "acc": 0.78570423, "epoch": 1.3699820644803804, "grad_norm": 4.53125, "learning_rate": 2.384004715822009e-06, "loss": 0.77315803, "memory(GiB)": 146.85, "step": 58720, "train_speed(iter/s)": 0.2021 }, { "acc": 0.77355404, "epoch": 1.370215372052669, "grad_norm": 8.0625, "learning_rate": 2.3823949817914584e-06, "loss": 0.83710241, "memory(GiB)": 146.85, "step": 58730, "train_speed(iter/s)": 0.202118 }, { "acc": 0.77772641, "epoch": 1.3704486796249582, "grad_norm": 9.375, "learning_rate": 2.380785621423266e-06, "loss": 0.80306368, "memory(GiB)": 146.85, "step": 58740, "train_speed(iter/s)": 0.202135 }, { "acc": 0.77650962, "epoch": 1.3706819871972469, "grad_norm": 6.3125, "learning_rate": 2.379176634947163e-06, "loss": 0.80954266, "memory(GiB)": 146.85, "step": 58750, "train_speed(iter/s)": 0.202153 }, { "acc": 0.77217689, "epoch": 1.370915294769536, "grad_norm": 5.75, "learning_rate": 2.377568022592838e-06, "loss": 0.80695143, "memory(GiB)": 146.85, "step": 58760, "train_speed(iter/s)": 0.20217 }, { "acc": 0.75097938, "epoch": 1.3711486023418247, "grad_norm": 4.65625, "learning_rate": 2.3759597845899123e-06, "loss": 0.90040474, "memory(GiB)": 146.85, "step": 58770, "train_speed(iter/s)": 0.202187 }, { "acc": 0.77406721, "epoch": 1.3713819099141136, "grad_norm": 6.0, "learning_rate": 2.374351921167967e-06, "loss": 0.79676123, "memory(GiB)": 146.85, "step": 58780, "train_speed(iter/s)": 0.202205 }, { "acc": 0.77163544, "epoch": 1.3716152174864025, "grad_norm": 5.875, "learning_rate": 2.37274443255652e-06, "loss": 0.83432159, "memory(GiB)": 146.85, "step": 58790, "train_speed(iter/s)": 0.202221 }, { "acc": 0.76376309, "epoch": 1.3718485250586914, "grad_norm": 6.78125, "learning_rate": 2.3711373189850444e-06, "loss": 0.85014362, "memory(GiB)": 146.85, "step": 58800, "train_speed(iter/s)": 0.202239 }, { "acc": 0.77654219, "epoch": 1.3720818326309803, "grad_norm": 5.375, "learning_rate": 2.369530580682953e-06, "loss": 0.8189436, "memory(GiB)": 146.85, "step": 58810, "train_speed(iter/s)": 0.202257 }, { "acc": 0.77265282, "epoch": 1.3723151402032692, "grad_norm": 5.4375, "learning_rate": 2.367924217879604e-06, "loss": 0.80827713, "memory(GiB)": 146.85, "step": 58820, "train_speed(iter/s)": 0.202275 }, { "acc": 0.77588992, "epoch": 1.372548447775558, "grad_norm": 4.5, "learning_rate": 2.3663182308043115e-06, "loss": 0.81154451, "memory(GiB)": 146.85, "step": 58830, "train_speed(iter/s)": 0.202291 }, { "acc": 0.78031845, "epoch": 1.372781755347847, "grad_norm": 5.71875, "learning_rate": 2.3647126196863234e-06, "loss": 0.79252024, "memory(GiB)": 146.85, "step": 58840, "train_speed(iter/s)": 0.202305 }, { "acc": 0.76751943, "epoch": 1.3730150629201359, "grad_norm": 8.875, "learning_rate": 2.3631073847548457e-06, "loss": 0.8450202, "memory(GiB)": 146.85, "step": 58850, "train_speed(iter/s)": 0.202323 }, { "acc": 0.75078082, "epoch": 1.3732483704924248, "grad_norm": 4.0, "learning_rate": 2.3615025262390228e-06, "loss": 0.92655869, "memory(GiB)": 146.85, "step": 58860, "train_speed(iter/s)": 0.20234 }, { "acc": 0.76502762, "epoch": 1.3734816780647137, "grad_norm": 5.0625, "learning_rate": 2.3598980443679483e-06, "loss": 0.85167446, "memory(GiB)": 146.85, "step": 58870, "train_speed(iter/s)": 0.202358 }, { "acc": 0.77820921, "epoch": 1.3737149856370026, "grad_norm": 4.8125, "learning_rate": 2.3582939393706604e-06, "loss": 0.80149345, "memory(GiB)": 146.85, "step": 58880, "train_speed(iter/s)": 0.202376 }, { "acc": 0.77712126, "epoch": 1.3739482932092915, "grad_norm": 4.46875, "learning_rate": 2.3566902114761435e-06, "loss": 0.80856466, "memory(GiB)": 146.85, "step": 58890, "train_speed(iter/s)": 0.202395 }, { "acc": 0.77778502, "epoch": 1.3741816007815804, "grad_norm": 9.5, "learning_rate": 2.3550868609133326e-06, "loss": 0.78273273, "memory(GiB)": 146.85, "step": 58900, "train_speed(iter/s)": 0.202413 }, { "acc": 0.7499433, "epoch": 1.3744149083538693, "grad_norm": 6.84375, "learning_rate": 2.3534838879111026e-06, "loss": 0.8994278, "memory(GiB)": 146.85, "step": 58910, "train_speed(iter/s)": 0.202431 }, { "acc": 0.75718918, "epoch": 1.3746482159261582, "grad_norm": 5.15625, "learning_rate": 2.35188129269828e-06, "loss": 0.89333773, "memory(GiB)": 146.85, "step": 58920, "train_speed(iter/s)": 0.202448 }, { "acc": 0.7739542, "epoch": 1.374881523498447, "grad_norm": 6.59375, "learning_rate": 2.3502790755036324e-06, "loss": 0.80519886, "memory(GiB)": 146.85, "step": 58930, "train_speed(iter/s)": 0.202465 }, { "acc": 0.77241807, "epoch": 1.375114831070736, "grad_norm": 6.53125, "learning_rate": 2.3486772365558786e-06, "loss": 0.85259933, "memory(GiB)": 146.85, "step": 58940, "train_speed(iter/s)": 0.202483 }, { "acc": 0.77130384, "epoch": 1.3753481386430249, "grad_norm": 10.3125, "learning_rate": 2.3470757760836794e-06, "loss": 0.83315659, "memory(GiB)": 146.85, "step": 58950, "train_speed(iter/s)": 0.202502 }, { "acc": 0.75972652, "epoch": 1.3755814462153138, "grad_norm": 5.0, "learning_rate": 2.34547469431564e-06, "loss": 0.88609476, "memory(GiB)": 146.85, "step": 58960, "train_speed(iter/s)": 0.202519 }, { "acc": 0.77900553, "epoch": 1.3758147537876027, "grad_norm": 5.53125, "learning_rate": 2.3438739914803193e-06, "loss": 0.80969296, "memory(GiB)": 146.85, "step": 58970, "train_speed(iter/s)": 0.202536 }, { "acc": 0.75939074, "epoch": 1.3760480613598916, "grad_norm": 4.78125, "learning_rate": 2.3422736678062126e-06, "loss": 0.87354145, "memory(GiB)": 146.85, "step": 58980, "train_speed(iter/s)": 0.202553 }, { "acc": 0.78326492, "epoch": 1.3762813689321804, "grad_norm": 7.9375, "learning_rate": 2.3406737235217714e-06, "loss": 0.7530592, "memory(GiB)": 146.85, "step": 58990, "train_speed(iter/s)": 0.202572 }, { "acc": 0.76805391, "epoch": 1.3765146765044693, "grad_norm": 6.875, "learning_rate": 2.33907415885538e-06, "loss": 0.82641869, "memory(GiB)": 146.85, "step": 59000, "train_speed(iter/s)": 0.202591 }, { "epoch": 1.3765146765044693, "eval_acc": 0.735150484512696, "eval_loss": 0.8340362310409546, "eval_runtime": 1264.2606, "eval_samples_per_second": 28.468, "eval_steps_per_second": 14.234, "step": 59000 }, { "acc": 0.78292685, "epoch": 1.3767479840767582, "grad_norm": 6.75, "learning_rate": 2.3374749740353815e-06, "loss": 0.79680209, "memory(GiB)": 146.85, "step": 59010, "train_speed(iter/s)": 0.201716 }, { "acc": 0.74934435, "epoch": 1.3769812916490471, "grad_norm": 5.96875, "learning_rate": 2.335876169290056e-06, "loss": 0.9243022, "memory(GiB)": 146.85, "step": 59020, "train_speed(iter/s)": 0.201733 }, { "acc": 0.7753406, "epoch": 1.377214599221336, "grad_norm": 5.09375, "learning_rate": 2.3342777448476326e-06, "loss": 0.78887496, "memory(GiB)": 146.85, "step": 59030, "train_speed(iter/s)": 0.201752 }, { "acc": 0.77368164, "epoch": 1.377447906793625, "grad_norm": 5.4375, "learning_rate": 2.3326797009362884e-06, "loss": 0.82738476, "memory(GiB)": 146.85, "step": 59040, "train_speed(iter/s)": 0.20177 }, { "acc": 0.76708121, "epoch": 1.3776812143659138, "grad_norm": 5.84375, "learning_rate": 2.33108203778414e-06, "loss": 0.83799486, "memory(GiB)": 146.85, "step": 59050, "train_speed(iter/s)": 0.201789 }, { "acc": 0.77026339, "epoch": 1.3779145219382025, "grad_norm": 4.5, "learning_rate": 2.3294847556192575e-06, "loss": 0.83560696, "memory(GiB)": 146.85, "step": 59060, "train_speed(iter/s)": 0.201806 }, { "acc": 0.76003656, "epoch": 1.3781478295104916, "grad_norm": 6.6875, "learning_rate": 2.32788785466965e-06, "loss": 0.84768524, "memory(GiB)": 146.85, "step": 59070, "train_speed(iter/s)": 0.201824 }, { "acc": 0.77295876, "epoch": 1.3783811370827803, "grad_norm": 6.34375, "learning_rate": 2.3262913351632725e-06, "loss": 0.80956669, "memory(GiB)": 146.85, "step": 59080, "train_speed(iter/s)": 0.201842 }, { "acc": 0.76658421, "epoch": 1.3786144446550694, "grad_norm": 5.34375, "learning_rate": 2.3246951973280328e-06, "loss": 0.83789768, "memory(GiB)": 146.85, "step": 59090, "train_speed(iter/s)": 0.201861 }, { "acc": 0.77064943, "epoch": 1.3788477522273581, "grad_norm": 5.1875, "learning_rate": 2.3230994413917767e-06, "loss": 0.82967348, "memory(GiB)": 146.85, "step": 59100, "train_speed(iter/s)": 0.201879 }, { "acc": 0.77498121, "epoch": 1.3790810597996472, "grad_norm": 6.21875, "learning_rate": 2.3215040675822976e-06, "loss": 0.80588236, "memory(GiB)": 146.85, "step": 59110, "train_speed(iter/s)": 0.201897 }, { "acc": 0.78048129, "epoch": 1.379314367371936, "grad_norm": 6.0625, "learning_rate": 2.319909076127333e-06, "loss": 0.80840921, "memory(GiB)": 146.85, "step": 59120, "train_speed(iter/s)": 0.201916 }, { "acc": 0.79808435, "epoch": 1.379547674944225, "grad_norm": 5.90625, "learning_rate": 2.3183144672545706e-06, "loss": 0.71542969, "memory(GiB)": 146.85, "step": 59130, "train_speed(iter/s)": 0.201934 }, { "acc": 0.77001643, "epoch": 1.3797809825165137, "grad_norm": 5.75, "learning_rate": 2.3167202411916372e-06, "loss": 0.80921669, "memory(GiB)": 146.85, "step": 59140, "train_speed(iter/s)": 0.201951 }, { "acc": 0.77669301, "epoch": 1.3800142900888028, "grad_norm": 7.71875, "learning_rate": 2.315126398166112e-06, "loss": 0.79920969, "memory(GiB)": 146.85, "step": 59150, "train_speed(iter/s)": 0.201968 }, { "acc": 0.77610588, "epoch": 1.3802475976610915, "grad_norm": 5.71875, "learning_rate": 2.3135329384055134e-06, "loss": 0.79539385, "memory(GiB)": 146.85, "step": 59160, "train_speed(iter/s)": 0.201985 }, { "acc": 0.76143999, "epoch": 1.3804809052333804, "grad_norm": 6.15625, "learning_rate": 2.3119398621373055e-06, "loss": 0.87689943, "memory(GiB)": 146.85, "step": 59170, "train_speed(iter/s)": 0.202003 }, { "acc": 0.76723595, "epoch": 1.3807142128056693, "grad_norm": 4.96875, "learning_rate": 2.3103471695889035e-06, "loss": 0.83367329, "memory(GiB)": 146.85, "step": 59180, "train_speed(iter/s)": 0.202022 }, { "acc": 0.77189379, "epoch": 1.3809475203779582, "grad_norm": 5.34375, "learning_rate": 2.308754860987659e-06, "loss": 0.80697832, "memory(GiB)": 146.85, "step": 59190, "train_speed(iter/s)": 0.202039 }, { "acc": 0.76604519, "epoch": 1.381180827950247, "grad_norm": 6.8125, "learning_rate": 2.3071629365608793e-06, "loss": 0.8536623, "memory(GiB)": 146.85, "step": 59200, "train_speed(iter/s)": 0.202058 }, { "acc": 0.75514612, "epoch": 1.381414135522536, "grad_norm": 7.9375, "learning_rate": 2.305571396535807e-06, "loss": 0.91051788, "memory(GiB)": 146.85, "step": 59210, "train_speed(iter/s)": 0.202075 }, { "acc": 0.77238445, "epoch": 1.381647443094825, "grad_norm": 6.25, "learning_rate": 2.303980241139636e-06, "loss": 0.81702957, "memory(GiB)": 146.85, "step": 59220, "train_speed(iter/s)": 0.202093 }, { "acc": 0.75543499, "epoch": 1.3818807506671138, "grad_norm": 9.25, "learning_rate": 2.3023894705995e-06, "loss": 0.8841445, "memory(GiB)": 146.85, "step": 59230, "train_speed(iter/s)": 0.20211 }, { "acc": 0.77305551, "epoch": 1.3821140582394027, "grad_norm": 7.78125, "learning_rate": 2.3007990851424862e-06, "loss": 0.84027233, "memory(GiB)": 146.85, "step": 59240, "train_speed(iter/s)": 0.202128 }, { "acc": 0.7788506, "epoch": 1.3823473658116916, "grad_norm": 4.71875, "learning_rate": 2.2992090849956176e-06, "loss": 0.80902624, "memory(GiB)": 146.85, "step": 59250, "train_speed(iter/s)": 0.202146 }, { "acc": 0.79038095, "epoch": 1.3825806733839805, "grad_norm": 7.40625, "learning_rate": 2.2976194703858666e-06, "loss": 0.75809922, "memory(GiB)": 146.85, "step": 59260, "train_speed(iter/s)": 0.202163 }, { "acc": 0.76701393, "epoch": 1.3828139809562694, "grad_norm": 5.25, "learning_rate": 2.2960302415401525e-06, "loss": 0.82084265, "memory(GiB)": 146.85, "step": 59270, "train_speed(iter/s)": 0.202182 }, { "acc": 0.78622408, "epoch": 1.3830472885285583, "grad_norm": 5.65625, "learning_rate": 2.2944413986853344e-06, "loss": 0.76544971, "memory(GiB)": 146.85, "step": 59280, "train_speed(iter/s)": 0.2022 }, { "acc": 0.75856724, "epoch": 1.3832805961008472, "grad_norm": 5.84375, "learning_rate": 2.292852942048222e-06, "loss": 0.8617877, "memory(GiB)": 146.85, "step": 59290, "train_speed(iter/s)": 0.202218 }, { "acc": 0.78225517, "epoch": 1.383513903673136, "grad_norm": 4.90625, "learning_rate": 2.2912648718555665e-06, "loss": 0.79882555, "memory(GiB)": 146.85, "step": 59300, "train_speed(iter/s)": 0.202236 }, { "acc": 0.75083685, "epoch": 1.383747211245425, "grad_norm": 5.84375, "learning_rate": 2.2896771883340614e-06, "loss": 0.90261307, "memory(GiB)": 146.85, "step": 59310, "train_speed(iter/s)": 0.202254 }, { "acc": 0.77638302, "epoch": 1.383980518817714, "grad_norm": 5.0625, "learning_rate": 2.2880898917103515e-06, "loss": 0.82227554, "memory(GiB)": 146.85, "step": 59320, "train_speed(iter/s)": 0.202269 }, { "acc": 0.78838329, "epoch": 1.3842138263900028, "grad_norm": 6.59375, "learning_rate": 2.2865029822110222e-06, "loss": 0.74590807, "memory(GiB)": 146.85, "step": 59330, "train_speed(iter/s)": 0.202287 }, { "acc": 0.77394552, "epoch": 1.3844471339622917, "grad_norm": 4.875, "learning_rate": 2.2849164600626045e-06, "loss": 0.82990847, "memory(GiB)": 146.85, "step": 59340, "train_speed(iter/s)": 0.202305 }, { "acc": 0.77396841, "epoch": 1.3846804415345806, "grad_norm": 4.34375, "learning_rate": 2.2833303254915713e-06, "loss": 0.79315882, "memory(GiB)": 146.85, "step": 59350, "train_speed(iter/s)": 0.202323 }, { "acc": 0.78223801, "epoch": 1.3849137491068695, "grad_norm": 5.5, "learning_rate": 2.2817445787243464e-06, "loss": 0.78998566, "memory(GiB)": 146.85, "step": 59360, "train_speed(iter/s)": 0.202342 }, { "acc": 0.76472607, "epoch": 1.3851470566791584, "grad_norm": 4.96875, "learning_rate": 2.280159219987293e-06, "loss": 0.84331369, "memory(GiB)": 146.85, "step": 59370, "train_speed(iter/s)": 0.20236 }, { "acc": 0.76141562, "epoch": 1.3853803642514473, "grad_norm": 4.96875, "learning_rate": 2.27857424950672e-06, "loss": 0.87422066, "memory(GiB)": 146.85, "step": 59380, "train_speed(iter/s)": 0.202379 }, { "acc": 0.77278194, "epoch": 1.3856136718237362, "grad_norm": 6.03125, "learning_rate": 2.2769896675088833e-06, "loss": 0.81296358, "memory(GiB)": 146.85, "step": 59390, "train_speed(iter/s)": 0.202396 }, { "acc": 0.76582651, "epoch": 1.385846979396025, "grad_norm": 5.28125, "learning_rate": 2.2754054742199787e-06, "loss": 0.86609449, "memory(GiB)": 146.85, "step": 59400, "train_speed(iter/s)": 0.202414 }, { "acc": 0.77138128, "epoch": 1.386080286968314, "grad_norm": 6.3125, "learning_rate": 2.273821669866153e-06, "loss": 0.80793285, "memory(GiB)": 146.85, "step": 59410, "train_speed(iter/s)": 0.202431 }, { "acc": 0.76822467, "epoch": 1.3863135945406029, "grad_norm": 6.3125, "learning_rate": 2.2722382546734904e-06, "loss": 0.83576221, "memory(GiB)": 146.85, "step": 59420, "train_speed(iter/s)": 0.202447 }, { "acc": 0.76599998, "epoch": 1.3865469021128918, "grad_norm": 4.875, "learning_rate": 2.270655228868026e-06, "loss": 0.8353178, "memory(GiB)": 146.85, "step": 59430, "train_speed(iter/s)": 0.202465 }, { "acc": 0.79232016, "epoch": 1.3867802096851807, "grad_norm": 5.25, "learning_rate": 2.2690725926757355e-06, "loss": 0.74691849, "memory(GiB)": 146.85, "step": 59440, "train_speed(iter/s)": 0.202482 }, { "acc": 0.75607486, "epoch": 1.3870135172574694, "grad_norm": 6.25, "learning_rate": 2.267490346322539e-06, "loss": 0.88647575, "memory(GiB)": 146.85, "step": 59450, "train_speed(iter/s)": 0.2025 }, { "acc": 0.78539848, "epoch": 1.3872468248297585, "grad_norm": 6.46875, "learning_rate": 2.265908490034301e-06, "loss": 0.79183149, "memory(GiB)": 146.85, "step": 59460, "train_speed(iter/s)": 0.202518 }, { "acc": 0.77283435, "epoch": 1.3874801324020472, "grad_norm": 5.09375, "learning_rate": 2.2643270240368305e-06, "loss": 0.82431507, "memory(GiB)": 146.85, "step": 59470, "train_speed(iter/s)": 0.202535 }, { "acc": 0.76766257, "epoch": 1.3877134399743363, "grad_norm": 6.0, "learning_rate": 2.2627459485558846e-06, "loss": 0.84664459, "memory(GiB)": 146.85, "step": 59480, "train_speed(iter/s)": 0.202554 }, { "acc": 0.76296902, "epoch": 1.387946747546625, "grad_norm": 7.375, "learning_rate": 2.2611652638171568e-06, "loss": 0.8558939, "memory(GiB)": 146.85, "step": 59490, "train_speed(iter/s)": 0.202571 }, { "acc": 0.75398407, "epoch": 1.388180055118914, "grad_norm": 5.71875, "learning_rate": 2.259584970046294e-06, "loss": 0.89067574, "memory(GiB)": 146.85, "step": 59500, "train_speed(iter/s)": 0.202589 }, { "epoch": 1.388180055118914, "eval_acc": 0.7351967899218778, "eval_loss": 0.8340466022491455, "eval_runtime": 1263.6685, "eval_samples_per_second": 28.481, "eval_steps_per_second": 14.241, "step": 59500 }, { "acc": 0.76907167, "epoch": 1.3884133626912027, "grad_norm": 6.03125, "learning_rate": 2.2580050674688815e-06, "loss": 0.83936901, "memory(GiB)": 146.85, "step": 59510, "train_speed(iter/s)": 0.201721 }, { "acc": 0.75504436, "epoch": 1.3886466702634919, "grad_norm": 4.4375, "learning_rate": 2.2564255563104465e-06, "loss": 0.89479084, "memory(GiB)": 146.85, "step": 59520, "train_speed(iter/s)": 0.201739 }, { "acc": 0.76324787, "epoch": 1.3888799778357805, "grad_norm": 4.53125, "learning_rate": 2.254846436796468e-06, "loss": 0.86641273, "memory(GiB)": 146.85, "step": 59530, "train_speed(iter/s)": 0.201758 }, { "acc": 0.77439384, "epoch": 1.3891132854080694, "grad_norm": 8.75, "learning_rate": 2.2532677091523615e-06, "loss": 0.81448975, "memory(GiB)": 146.85, "step": 59540, "train_speed(iter/s)": 0.201777 }, { "acc": 0.77819238, "epoch": 1.3893465929803583, "grad_norm": 5.46875, "learning_rate": 2.2516893736034935e-06, "loss": 0.79381266, "memory(GiB)": 146.85, "step": 59550, "train_speed(iter/s)": 0.201795 }, { "acc": 0.74956913, "epoch": 1.3895799005526472, "grad_norm": 7.46875, "learning_rate": 2.250111430375169e-06, "loss": 0.89626312, "memory(GiB)": 146.85, "step": 59560, "train_speed(iter/s)": 0.201812 }, { "acc": 0.76800661, "epoch": 1.3898132081249361, "grad_norm": 5.4375, "learning_rate": 2.248533879692639e-06, "loss": 0.81193819, "memory(GiB)": 146.85, "step": 59570, "train_speed(iter/s)": 0.201829 }, { "acc": 0.78386765, "epoch": 1.390046515697225, "grad_norm": 4.75, "learning_rate": 2.246956721781097e-06, "loss": 0.79206467, "memory(GiB)": 146.85, "step": 59580, "train_speed(iter/s)": 0.201846 }, { "acc": 0.78440213, "epoch": 1.390279823269514, "grad_norm": 6.21875, "learning_rate": 2.245379956865684e-06, "loss": 0.78892608, "memory(GiB)": 146.85, "step": 59590, "train_speed(iter/s)": 0.201865 }, { "acc": 0.74080734, "epoch": 1.3905131308418028, "grad_norm": 5.34375, "learning_rate": 2.243803585171483e-06, "loss": 0.93578568, "memory(GiB)": 146.85, "step": 59600, "train_speed(iter/s)": 0.201882 }, { "acc": 0.75313873, "epoch": 1.3907464384140917, "grad_norm": 6.40625, "learning_rate": 2.2422276069235174e-06, "loss": 0.88533459, "memory(GiB)": 146.85, "step": 59610, "train_speed(iter/s)": 0.2019 }, { "acc": 0.77745504, "epoch": 1.3909797459863806, "grad_norm": 4.34375, "learning_rate": 2.240652022346761e-06, "loss": 0.80005331, "memory(GiB)": 146.85, "step": 59620, "train_speed(iter/s)": 0.201918 }, { "acc": 0.76351037, "epoch": 1.3912130535586695, "grad_norm": 6.03125, "learning_rate": 2.2390768316661256e-06, "loss": 0.85887318, "memory(GiB)": 146.85, "step": 59630, "train_speed(iter/s)": 0.201936 }, { "acc": 0.79199028, "epoch": 1.3914463611309584, "grad_norm": 4.6875, "learning_rate": 2.237502035106472e-06, "loss": 0.74876347, "memory(GiB)": 146.85, "step": 59640, "train_speed(iter/s)": 0.201953 }, { "acc": 0.77106905, "epoch": 1.3916796687032473, "grad_norm": 15.4375, "learning_rate": 2.2359276328926007e-06, "loss": 0.83125706, "memory(GiB)": 146.85, "step": 59650, "train_speed(iter/s)": 0.201971 }, { "acc": 0.77963285, "epoch": 1.3919129762755362, "grad_norm": 5.71875, "learning_rate": 2.2343536252492542e-06, "loss": 0.78623161, "memory(GiB)": 146.85, "step": 59660, "train_speed(iter/s)": 0.201988 }, { "acc": 0.78513298, "epoch": 1.3921462838478251, "grad_norm": 4.46875, "learning_rate": 2.2327800124011285e-06, "loss": 0.78242245, "memory(GiB)": 146.85, "step": 59670, "train_speed(iter/s)": 0.202006 }, { "acc": 0.76695681, "epoch": 1.392379591420114, "grad_norm": 7.5625, "learning_rate": 2.231206794572848e-06, "loss": 0.83407421, "memory(GiB)": 146.85, "step": 59680, "train_speed(iter/s)": 0.202023 }, { "acc": 0.77610373, "epoch": 1.392612898992403, "grad_norm": 7.15625, "learning_rate": 2.229633971988996e-06, "loss": 0.80062428, "memory(GiB)": 146.85, "step": 59690, "train_speed(iter/s)": 0.202041 }, { "acc": 0.78498697, "epoch": 1.3928462065646918, "grad_norm": 7.28125, "learning_rate": 2.2280615448740873e-06, "loss": 0.77125726, "memory(GiB)": 146.85, "step": 59700, "train_speed(iter/s)": 0.202059 }, { "acc": 0.78751593, "epoch": 1.3930795141369807, "grad_norm": 9.625, "learning_rate": 2.2264895134525898e-06, "loss": 0.76506319, "memory(GiB)": 146.85, "step": 59710, "train_speed(iter/s)": 0.202075 }, { "acc": 0.76889629, "epoch": 1.3933128217092696, "grad_norm": 5.9375, "learning_rate": 2.2249178779489065e-06, "loss": 0.83388081, "memory(GiB)": 146.85, "step": 59720, "train_speed(iter/s)": 0.202093 }, { "acc": 0.77873287, "epoch": 1.3935461292815585, "grad_norm": 5.1875, "learning_rate": 2.223346638587392e-06, "loss": 0.80616932, "memory(GiB)": 146.85, "step": 59730, "train_speed(iter/s)": 0.20211 }, { "acc": 0.78770761, "epoch": 1.3937794368538474, "grad_norm": 6.84375, "learning_rate": 2.2217757955923386e-06, "loss": 0.7767107, "memory(GiB)": 146.85, "step": 59740, "train_speed(iter/s)": 0.202128 }, { "acc": 0.79523106, "epoch": 1.3940127444261363, "grad_norm": 6.4375, "learning_rate": 2.220205349187981e-06, "loss": 0.724403, "memory(GiB)": 146.85, "step": 59750, "train_speed(iter/s)": 0.202145 }, { "acc": 0.75776162, "epoch": 1.3942460519984252, "grad_norm": 6.125, "learning_rate": 2.218635299598504e-06, "loss": 0.8814106, "memory(GiB)": 146.85, "step": 59760, "train_speed(iter/s)": 0.202163 }, { "acc": 0.78653116, "epoch": 1.3944793595707141, "grad_norm": 5.59375, "learning_rate": 2.2170656470480284e-06, "loss": 0.77877202, "memory(GiB)": 146.85, "step": 59770, "train_speed(iter/s)": 0.202179 }, { "acc": 0.75317469, "epoch": 1.394712667143003, "grad_norm": 5.71875, "learning_rate": 2.215496391760625e-06, "loss": 0.88787746, "memory(GiB)": 146.85, "step": 59780, "train_speed(iter/s)": 0.202196 }, { "acc": 0.78019204, "epoch": 1.394945974715292, "grad_norm": 6.9375, "learning_rate": 2.2139275339603023e-06, "loss": 0.79625068, "memory(GiB)": 146.85, "step": 59790, "train_speed(iter/s)": 0.202214 }, { "acc": 0.79043655, "epoch": 1.3951792822875808, "grad_norm": 5.1875, "learning_rate": 2.2123590738710153e-06, "loss": 0.72975974, "memory(GiB)": 146.85, "step": 59800, "train_speed(iter/s)": 0.202232 }, { "acc": 0.78623919, "epoch": 1.3954125898598697, "grad_norm": 7.28125, "learning_rate": 2.2107910117166608e-06, "loss": 0.77559509, "memory(GiB)": 146.85, "step": 59810, "train_speed(iter/s)": 0.202249 }, { "acc": 0.78548403, "epoch": 1.3956458974321586, "grad_norm": 5.0, "learning_rate": 2.2092233477210767e-06, "loss": 0.75539036, "memory(GiB)": 146.85, "step": 59820, "train_speed(iter/s)": 0.202266 }, { "acc": 0.76330738, "epoch": 1.3958792050044475, "grad_norm": 4.96875, "learning_rate": 2.2076560821080515e-06, "loss": 0.86006365, "memory(GiB)": 146.85, "step": 59830, "train_speed(iter/s)": 0.202284 }, { "acc": 0.76916885, "epoch": 1.3961125125767362, "grad_norm": 5.40625, "learning_rate": 2.2060892151013067e-06, "loss": 0.83991299, "memory(GiB)": 146.85, "step": 59840, "train_speed(iter/s)": 0.202303 }, { "acc": 0.74093285, "epoch": 1.3963458201490253, "grad_norm": 5.8125, "learning_rate": 2.2045227469245178e-06, "loss": 0.91835146, "memory(GiB)": 146.85, "step": 59850, "train_speed(iter/s)": 0.202321 }, { "acc": 0.77408557, "epoch": 1.396579127721314, "grad_norm": 6.3125, "learning_rate": 2.202956677801292e-06, "loss": 0.80813684, "memory(GiB)": 146.85, "step": 59860, "train_speed(iter/s)": 0.202338 }, { "acc": 0.77122746, "epoch": 1.396812435293603, "grad_norm": 6.40625, "learning_rate": 2.2013910079551905e-06, "loss": 0.82681274, "memory(GiB)": 146.85, "step": 59870, "train_speed(iter/s)": 0.202355 }, { "acc": 0.76744962, "epoch": 1.3970457428658918, "grad_norm": 6.6875, "learning_rate": 2.199825737609709e-06, "loss": 0.84128485, "memory(GiB)": 146.85, "step": 59880, "train_speed(iter/s)": 0.202372 }, { "acc": 0.75331292, "epoch": 1.397279050438181, "grad_norm": 7.8125, "learning_rate": 2.198260866988288e-06, "loss": 0.88429832, "memory(GiB)": 146.85, "step": 59890, "train_speed(iter/s)": 0.20239 }, { "acc": 0.77507467, "epoch": 1.3975123580104696, "grad_norm": 5.375, "learning_rate": 2.1966963963143184e-06, "loss": 0.8145546, "memory(GiB)": 146.85, "step": 59900, "train_speed(iter/s)": 0.202406 }, { "acc": 0.7798018, "epoch": 1.3977456655827587, "grad_norm": 4.65625, "learning_rate": 2.1951323258111194e-06, "loss": 0.7922761, "memory(GiB)": 146.85, "step": 59910, "train_speed(iter/s)": 0.202423 }, { "acc": 0.77852564, "epoch": 1.3979789731550474, "grad_norm": 6.6875, "learning_rate": 2.193568655701969e-06, "loss": 0.77441111, "memory(GiB)": 146.85, "step": 59920, "train_speed(iter/s)": 0.20244 }, { "acc": 0.76237273, "epoch": 1.3982122807273363, "grad_norm": 4.0625, "learning_rate": 2.1920053862100754e-06, "loss": 0.8705162, "memory(GiB)": 146.85, "step": 59930, "train_speed(iter/s)": 0.202458 }, { "acc": 0.77003975, "epoch": 1.3984455882996252, "grad_norm": 17.125, "learning_rate": 2.190442517558599e-06, "loss": 0.81333017, "memory(GiB)": 146.85, "step": 59940, "train_speed(iter/s)": 0.202475 }, { "acc": 0.78742085, "epoch": 1.398678895871914, "grad_norm": 6.78125, "learning_rate": 2.188880049970637e-06, "loss": 0.76135855, "memory(GiB)": 146.85, "step": 59950, "train_speed(iter/s)": 0.202493 }, { "acc": 0.77852316, "epoch": 1.398912203444203, "grad_norm": 5.5, "learning_rate": 2.18731798366923e-06, "loss": 0.79437609, "memory(GiB)": 146.85, "step": 59960, "train_speed(iter/s)": 0.202511 }, { "acc": 0.77639918, "epoch": 1.3991455110164919, "grad_norm": 6.46875, "learning_rate": 2.1857563188773644e-06, "loss": 0.80768623, "memory(GiB)": 146.85, "step": 59970, "train_speed(iter/s)": 0.202528 }, { "acc": 0.76187201, "epoch": 1.3993788185887808, "grad_norm": 7.03125, "learning_rate": 2.184195055817966e-06, "loss": 0.85525513, "memory(GiB)": 146.85, "step": 59980, "train_speed(iter/s)": 0.202544 }, { "acc": 0.78287554, "epoch": 1.3996121261610697, "grad_norm": 5.78125, "learning_rate": 2.1826341947139067e-06, "loss": 0.76438308, "memory(GiB)": 146.85, "step": 59990, "train_speed(iter/s)": 0.202562 }, { "acc": 0.7627183, "epoch": 1.3998454337333586, "grad_norm": 3.984375, "learning_rate": 2.181073735787998e-06, "loss": 0.86684132, "memory(GiB)": 146.85, "step": 60000, "train_speed(iter/s)": 0.20258 }, { "epoch": 1.3998454337333586, "eval_acc": 0.7351529046560331, "eval_loss": 0.8340141177177429, "eval_runtime": 1263.4244, "eval_samples_per_second": 28.487, "eval_steps_per_second": 14.244, "step": 60000 }, { "acc": 0.78266072, "epoch": 1.4000787413056475, "grad_norm": 5.25, "learning_rate": 2.179513679262992e-06, "loss": 0.7818161, "memory(GiB)": 146.85, "step": 60010, "train_speed(iter/s)": 0.20172 }, { "acc": 0.77389059, "epoch": 1.4003120488779364, "grad_norm": 5.46875, "learning_rate": 2.1779540253615917e-06, "loss": 0.81616497, "memory(GiB)": 146.85, "step": 60020, "train_speed(iter/s)": 0.201738 }, { "acc": 0.78978982, "epoch": 1.4005453564502253, "grad_norm": 5.25, "learning_rate": 2.176394774306434e-06, "loss": 0.76432819, "memory(GiB)": 146.85, "step": 60030, "train_speed(iter/s)": 0.201754 }, { "acc": 0.79059734, "epoch": 1.4007786640225142, "grad_norm": 6.625, "learning_rate": 2.174835926320102e-06, "loss": 0.75097947, "memory(GiB)": 146.85, "step": 60040, "train_speed(iter/s)": 0.201772 }, { "acc": 0.75273104, "epoch": 1.401011971594803, "grad_norm": 6.59375, "learning_rate": 2.173277481625119e-06, "loss": 0.89239016, "memory(GiB)": 146.85, "step": 60050, "train_speed(iter/s)": 0.201788 }, { "acc": 0.76573877, "epoch": 1.401245279167092, "grad_norm": 5.78125, "learning_rate": 2.1717194404439563e-06, "loss": 0.83680134, "memory(GiB)": 146.85, "step": 60060, "train_speed(iter/s)": 0.201805 }, { "acc": 0.77691507, "epoch": 1.4014785867393809, "grad_norm": 5.5625, "learning_rate": 2.17016180299902e-06, "loss": 0.79188929, "memory(GiB)": 146.85, "step": 60070, "train_speed(iter/s)": 0.201822 }, { "acc": 0.76826024, "epoch": 1.4017118943116698, "grad_norm": 3.796875, "learning_rate": 2.168604569512666e-06, "loss": 0.83474865, "memory(GiB)": 146.85, "step": 60080, "train_speed(iter/s)": 0.20184 }, { "acc": 0.77514405, "epoch": 1.4019452018839587, "grad_norm": 4.40625, "learning_rate": 2.167047740207187e-06, "loss": 0.82190456, "memory(GiB)": 146.85, "step": 60090, "train_speed(iter/s)": 0.201857 }, { "acc": 0.77628975, "epoch": 1.4021785094562476, "grad_norm": 5.375, "learning_rate": 2.1654913153048186e-06, "loss": 0.82493811, "memory(GiB)": 146.85, "step": 60100, "train_speed(iter/s)": 0.201875 }, { "acc": 0.7695992, "epoch": 1.4024118170285365, "grad_norm": 9.875, "learning_rate": 2.1639352950277433e-06, "loss": 0.82572365, "memory(GiB)": 146.85, "step": 60110, "train_speed(iter/s)": 0.201892 }, { "acc": 0.78561907, "epoch": 1.4026451246008254, "grad_norm": 5.75, "learning_rate": 2.162379679598079e-06, "loss": 0.76238794, "memory(GiB)": 146.85, "step": 60120, "train_speed(iter/s)": 0.20191 }, { "acc": 0.77025385, "epoch": 1.4028784321731143, "grad_norm": 7.03125, "learning_rate": 2.1608244692378946e-06, "loss": 0.84390039, "memory(GiB)": 146.85, "step": 60130, "train_speed(iter/s)": 0.201927 }, { "acc": 0.7785563, "epoch": 1.4031117397454032, "grad_norm": 7.6875, "learning_rate": 2.1592696641691884e-06, "loss": 0.79282503, "memory(GiB)": 146.85, "step": 60140, "train_speed(iter/s)": 0.201946 }, { "acc": 0.7802371, "epoch": 1.403345047317692, "grad_norm": 6.8125, "learning_rate": 2.157715264613915e-06, "loss": 0.82513323, "memory(GiB)": 146.85, "step": 60150, "train_speed(iter/s)": 0.201964 }, { "acc": 0.7777338, "epoch": 1.403578354889981, "grad_norm": 7.0625, "learning_rate": 2.156161270793961e-06, "loss": 0.80008221, "memory(GiB)": 146.85, "step": 60160, "train_speed(iter/s)": 0.201981 }, { "acc": 0.77106862, "epoch": 1.4038116624622698, "grad_norm": 7.75, "learning_rate": 2.1546076829311584e-06, "loss": 0.81915474, "memory(GiB)": 146.85, "step": 60170, "train_speed(iter/s)": 0.201998 }, { "acc": 0.7849, "epoch": 1.4040449700345587, "grad_norm": 6.1875, "learning_rate": 2.153054501247284e-06, "loss": 0.7831151, "memory(GiB)": 146.85, "step": 60180, "train_speed(iter/s)": 0.202015 }, { "acc": 0.77284236, "epoch": 1.4042782776068476, "grad_norm": 5.59375, "learning_rate": 2.151501725964051e-06, "loss": 0.81775494, "memory(GiB)": 146.85, "step": 60190, "train_speed(iter/s)": 0.202033 }, { "acc": 0.7658514, "epoch": 1.4045115851791365, "grad_norm": 5.875, "learning_rate": 2.14994935730312e-06, "loss": 0.8448185, "memory(GiB)": 146.85, "step": 60200, "train_speed(iter/s)": 0.20205 }, { "acc": 0.77050796, "epoch": 1.4047448927514254, "grad_norm": 7.03125, "learning_rate": 2.1483973954860894e-06, "loss": 0.84863834, "memory(GiB)": 146.85, "step": 60210, "train_speed(iter/s)": 0.202067 }, { "acc": 0.78184023, "epoch": 1.4049782003237143, "grad_norm": 7.34375, "learning_rate": 2.146845840734504e-06, "loss": 0.75570774, "memory(GiB)": 146.85, "step": 60220, "train_speed(iter/s)": 0.202084 }, { "acc": 0.7848732, "epoch": 1.405211507896003, "grad_norm": 7.5, "learning_rate": 2.1452946932698454e-06, "loss": 0.76073866, "memory(GiB)": 146.85, "step": 60230, "train_speed(iter/s)": 0.202102 }, { "acc": 0.7583569, "epoch": 1.4054448154682921, "grad_norm": 4.875, "learning_rate": 2.1437439533135386e-06, "loss": 0.87292795, "memory(GiB)": 146.85, "step": 60240, "train_speed(iter/s)": 0.202119 }, { "acc": 0.76806889, "epoch": 1.4056781230405808, "grad_norm": 4.75, "learning_rate": 2.142193621086956e-06, "loss": 0.8178443, "memory(GiB)": 146.85, "step": 60250, "train_speed(iter/s)": 0.202137 }, { "acc": 0.78727951, "epoch": 1.40591143061287, "grad_norm": 4.4375, "learning_rate": 2.140643696811401e-06, "loss": 0.7941824, "memory(GiB)": 146.85, "step": 60260, "train_speed(iter/s)": 0.202154 }, { "acc": 0.77671413, "epoch": 1.4061447381851586, "grad_norm": 7.28125, "learning_rate": 2.1390941807081285e-06, "loss": 0.7997632, "memory(GiB)": 146.85, "step": 60270, "train_speed(iter/s)": 0.202172 }, { "acc": 0.80020456, "epoch": 1.4063780457574477, "grad_norm": 6.03125, "learning_rate": 2.1375450729983294e-06, "loss": 0.72378054, "memory(GiB)": 146.85, "step": 60280, "train_speed(iter/s)": 0.202189 }, { "acc": 0.79033799, "epoch": 1.4066113533297364, "grad_norm": 5.6875, "learning_rate": 2.1359963739031407e-06, "loss": 0.75780277, "memory(GiB)": 146.85, "step": 60290, "train_speed(iter/s)": 0.202207 }, { "acc": 0.77974992, "epoch": 1.4068446609020255, "grad_norm": 4.34375, "learning_rate": 2.134448083643638e-06, "loss": 0.79717865, "memory(GiB)": 146.85, "step": 60300, "train_speed(iter/s)": 0.202223 }, { "acc": 0.79221487, "epoch": 1.4070779684743142, "grad_norm": 5.71875, "learning_rate": 2.1329002024408375e-06, "loss": 0.75881348, "memory(GiB)": 146.85, "step": 60310, "train_speed(iter/s)": 0.202241 }, { "acc": 0.78013792, "epoch": 1.407311276046603, "grad_norm": 5.53125, "learning_rate": 2.1313527305157015e-06, "loss": 0.77911205, "memory(GiB)": 146.85, "step": 60320, "train_speed(iter/s)": 0.202259 }, { "acc": 0.77259474, "epoch": 1.407544583618892, "grad_norm": 5.53125, "learning_rate": 2.1298056680891288e-06, "loss": 0.82849855, "memory(GiB)": 146.85, "step": 60330, "train_speed(iter/s)": 0.202276 }, { "acc": 0.77697172, "epoch": 1.407777891191181, "grad_norm": 5.4375, "learning_rate": 2.1282590153819645e-06, "loss": 0.82214899, "memory(GiB)": 146.85, "step": 60340, "train_speed(iter/s)": 0.202292 }, { "acc": 0.78464966, "epoch": 1.4080111987634698, "grad_norm": 5.78125, "learning_rate": 2.1267127726149896e-06, "loss": 0.78157854, "memory(GiB)": 146.85, "step": 60350, "train_speed(iter/s)": 0.202309 }, { "acc": 0.75966749, "epoch": 1.4082445063357587, "grad_norm": 5.78125, "learning_rate": 2.1251669400089353e-06, "loss": 0.86236191, "memory(GiB)": 146.85, "step": 60360, "train_speed(iter/s)": 0.202327 }, { "acc": 0.77125721, "epoch": 1.4084778139080476, "grad_norm": 6.0625, "learning_rate": 2.1236215177844617e-06, "loss": 0.81693325, "memory(GiB)": 146.85, "step": 60370, "train_speed(iter/s)": 0.202345 }, { "acc": 0.79619923, "epoch": 1.4087111214803365, "grad_norm": 6.59375, "learning_rate": 2.1220765061621828e-06, "loss": 0.73295717, "memory(GiB)": 146.85, "step": 60380, "train_speed(iter/s)": 0.202361 }, { "acc": 0.75484495, "epoch": 1.4089444290526254, "grad_norm": 9.0, "learning_rate": 2.120531905362646e-06, "loss": 0.87022743, "memory(GiB)": 146.85, "step": 60390, "train_speed(iter/s)": 0.202378 }, { "acc": 0.78618793, "epoch": 1.4091777366249143, "grad_norm": 5.59375, "learning_rate": 2.118987715606342e-06, "loss": 0.78784199, "memory(GiB)": 146.85, "step": 60400, "train_speed(iter/s)": 0.202395 }, { "acc": 0.79771366, "epoch": 1.4094110441972032, "grad_norm": 6.15625, "learning_rate": 2.1174439371137064e-06, "loss": 0.72600913, "memory(GiB)": 146.85, "step": 60410, "train_speed(iter/s)": 0.202412 }, { "acc": 0.77672434, "epoch": 1.409644351769492, "grad_norm": 5.59375, "learning_rate": 2.1159005701051093e-06, "loss": 0.78969226, "memory(GiB)": 146.85, "step": 60420, "train_speed(iter/s)": 0.202429 }, { "acc": 0.78214827, "epoch": 1.409877659341781, "grad_norm": 7.375, "learning_rate": 2.11435761480087e-06, "loss": 0.78593874, "memory(GiB)": 146.85, "step": 60430, "train_speed(iter/s)": 0.202447 }, { "acc": 0.75093398, "epoch": 1.41011096691407, "grad_norm": 5.34375, "learning_rate": 2.112815071421243e-06, "loss": 0.91495056, "memory(GiB)": 146.85, "step": 60440, "train_speed(iter/s)": 0.202464 }, { "acc": 0.77732544, "epoch": 1.4103442744863588, "grad_norm": 4.53125, "learning_rate": 2.111272940186424e-06, "loss": 0.82368526, "memory(GiB)": 146.85, "step": 60450, "train_speed(iter/s)": 0.202481 }, { "acc": 0.76822453, "epoch": 1.4105775820586477, "grad_norm": 6.9375, "learning_rate": 2.109731221316555e-06, "loss": 0.82876139, "memory(GiB)": 146.85, "step": 60460, "train_speed(iter/s)": 0.202497 }, { "acc": 0.78303018, "epoch": 1.4108108896309366, "grad_norm": 5.4375, "learning_rate": 2.108189915031715e-06, "loss": 0.77431326, "memory(GiB)": 146.85, "step": 60470, "train_speed(iter/s)": 0.202515 }, { "acc": 0.78396807, "epoch": 1.4110441972032255, "grad_norm": 5.625, "learning_rate": 2.1066490215519243e-06, "loss": 0.7720665, "memory(GiB)": 146.85, "step": 60480, "train_speed(iter/s)": 0.202533 }, { "acc": 0.77685699, "epoch": 1.4112775047755144, "grad_norm": 6.03125, "learning_rate": 2.105108541097143e-06, "loss": 0.82929773, "memory(GiB)": 146.85, "step": 60490, "train_speed(iter/s)": 0.20255 }, { "acc": 0.76118245, "epoch": 1.4115108123478033, "grad_norm": 5.65625, "learning_rate": 2.1035684738872792e-06, "loss": 0.86014614, "memory(GiB)": 146.85, "step": 60500, "train_speed(iter/s)": 0.202568 }, { "epoch": 1.4115108123478033, "eval_acc": 0.7352329307290439, "eval_loss": 0.8340166211128235, "eval_runtime": 1263.6497, "eval_samples_per_second": 28.482, "eval_steps_per_second": 14.241, "step": 60500 }, { "acc": 0.77070298, "epoch": 1.4117441199200922, "grad_norm": 5.0625, "learning_rate": 2.1020288201421722e-06, "loss": 0.83735199, "memory(GiB)": 146.85, "step": 60510, "train_speed(iter/s)": 0.201714 }, { "acc": 0.76356554, "epoch": 1.411977427492381, "grad_norm": 5.59375, "learning_rate": 2.100489580081611e-06, "loss": 0.85794563, "memory(GiB)": 146.85, "step": 60520, "train_speed(iter/s)": 0.201733 }, { "acc": 0.77613034, "epoch": 1.41221073506467, "grad_norm": 5.03125, "learning_rate": 2.09895075392532e-06, "loss": 0.79294472, "memory(GiB)": 146.85, "step": 60530, "train_speed(iter/s)": 0.201751 }, { "acc": 0.77628775, "epoch": 1.4124440426369589, "grad_norm": 5.46875, "learning_rate": 2.0974123418929644e-06, "loss": 0.79325213, "memory(GiB)": 146.85, "step": 60540, "train_speed(iter/s)": 0.201768 }, { "acc": 0.7806272, "epoch": 1.4126773502092478, "grad_norm": 5.8125, "learning_rate": 2.095874344204155e-06, "loss": 0.78656573, "memory(GiB)": 146.85, "step": 60550, "train_speed(iter/s)": 0.201786 }, { "acc": 0.77638073, "epoch": 1.4129106577815367, "grad_norm": 7.75, "learning_rate": 2.094336761078438e-06, "loss": 0.8130518, "memory(GiB)": 146.85, "step": 60560, "train_speed(iter/s)": 0.201802 }, { "acc": 0.75781012, "epoch": 1.4131439653538256, "grad_norm": 6.34375, "learning_rate": 2.0927995927353062e-06, "loss": 0.88795872, "memory(GiB)": 146.85, "step": 60570, "train_speed(iter/s)": 0.201819 }, { "acc": 0.79199963, "epoch": 1.4133772729261145, "grad_norm": 5.84375, "learning_rate": 2.091262839394188e-06, "loss": 0.74814415, "memory(GiB)": 146.85, "step": 60580, "train_speed(iter/s)": 0.201835 }, { "acc": 0.78787756, "epoch": 1.4136105804984034, "grad_norm": 5.65625, "learning_rate": 2.0897265012744543e-06, "loss": 0.76457281, "memory(GiB)": 146.85, "step": 60590, "train_speed(iter/s)": 0.201852 }, { "acc": 0.77757902, "epoch": 1.413843888070692, "grad_norm": 4.40625, "learning_rate": 2.0881905785954172e-06, "loss": 0.78647766, "memory(GiB)": 146.85, "step": 60600, "train_speed(iter/s)": 0.20187 }, { "acc": 0.75902424, "epoch": 1.4140771956429812, "grad_norm": 6.78125, "learning_rate": 2.086655071576327e-06, "loss": 0.8783926, "memory(GiB)": 146.85, "step": 60610, "train_speed(iter/s)": 0.201887 }, { "acc": 0.76471004, "epoch": 1.4143105032152699, "grad_norm": 5.5625, "learning_rate": 2.085119980436381e-06, "loss": 0.82620697, "memory(GiB)": 146.85, "step": 60620, "train_speed(iter/s)": 0.201905 }, { "acc": 0.79224195, "epoch": 1.414543810787559, "grad_norm": 4.15625, "learning_rate": 2.083585305394709e-06, "loss": 0.73669653, "memory(GiB)": 146.85, "step": 60630, "train_speed(iter/s)": 0.201922 }, { "acc": 0.76733141, "epoch": 1.4147771183598477, "grad_norm": 4.78125, "learning_rate": 2.0820510466703898e-06, "loss": 0.83159256, "memory(GiB)": 146.85, "step": 60640, "train_speed(iter/s)": 0.20194 }, { "acc": 0.79418545, "epoch": 1.4150104259321368, "grad_norm": 4.75, "learning_rate": 2.080517204482434e-06, "loss": 0.72847357, "memory(GiB)": 146.85, "step": 60650, "train_speed(iter/s)": 0.201957 }, { "acc": 0.7890811, "epoch": 1.4152437335044254, "grad_norm": 5.75, "learning_rate": 2.078983779049801e-06, "loss": 0.75519052, "memory(GiB)": 146.85, "step": 60660, "train_speed(iter/s)": 0.201974 }, { "acc": 0.79352837, "epoch": 1.4154770410767146, "grad_norm": 4.71875, "learning_rate": 2.0774507705913844e-06, "loss": 0.72599897, "memory(GiB)": 146.85, "step": 60670, "train_speed(iter/s)": 0.201993 }, { "acc": 0.77132745, "epoch": 1.4157103486490032, "grad_norm": 6.46875, "learning_rate": 2.07591817932602e-06, "loss": 0.81976728, "memory(GiB)": 146.85, "step": 60680, "train_speed(iter/s)": 0.202009 }, { "acc": 0.75539966, "epoch": 1.4159436562212924, "grad_norm": 7.34375, "learning_rate": 2.074386005472488e-06, "loss": 0.92003078, "memory(GiB)": 146.85, "step": 60690, "train_speed(iter/s)": 0.202027 }, { "acc": 0.76474051, "epoch": 1.416176963793581, "grad_norm": 5.09375, "learning_rate": 2.072854249249503e-06, "loss": 0.83549356, "memory(GiB)": 146.85, "step": 60700, "train_speed(iter/s)": 0.202045 }, { "acc": 0.77060413, "epoch": 1.41641027136587, "grad_norm": 5.21875, "learning_rate": 2.0713229108757244e-06, "loss": 0.83267384, "memory(GiB)": 146.85, "step": 60710, "train_speed(iter/s)": 0.202061 }, { "acc": 0.76525946, "epoch": 1.4166435789381588, "grad_norm": 6.75, "learning_rate": 2.0697919905697474e-06, "loss": 0.83688564, "memory(GiB)": 146.85, "step": 60720, "train_speed(iter/s)": 0.202077 }, { "acc": 0.76810493, "epoch": 1.4168768865104477, "grad_norm": 4.0625, "learning_rate": 2.0682614885501147e-06, "loss": 0.84390602, "memory(GiB)": 146.85, "step": 60730, "train_speed(iter/s)": 0.202092 }, { "acc": 0.77044024, "epoch": 1.4171101940827366, "grad_norm": 6.5625, "learning_rate": 2.066731405035302e-06, "loss": 0.84097509, "memory(GiB)": 146.85, "step": 60740, "train_speed(iter/s)": 0.20211 }, { "acc": 0.78769035, "epoch": 1.4173435016550255, "grad_norm": 6.46875, "learning_rate": 2.065201740243728e-06, "loss": 0.75696125, "memory(GiB)": 146.85, "step": 60750, "train_speed(iter/s)": 0.202128 }, { "acc": 0.76435766, "epoch": 1.4175768092273144, "grad_norm": 16.625, "learning_rate": 2.063672494393755e-06, "loss": 0.84791384, "memory(GiB)": 146.85, "step": 60760, "train_speed(iter/s)": 0.202147 }, { "acc": 0.76575642, "epoch": 1.4178101167996033, "grad_norm": 7.125, "learning_rate": 2.0621436677036775e-06, "loss": 0.84616432, "memory(GiB)": 146.85, "step": 60770, "train_speed(iter/s)": 0.202165 }, { "acc": 0.76658087, "epoch": 1.4180434243718922, "grad_norm": 5.3125, "learning_rate": 2.0606152603917406e-06, "loss": 0.86496725, "memory(GiB)": 146.85, "step": 60780, "train_speed(iter/s)": 0.202182 }, { "acc": 0.77459831, "epoch": 1.4182767319441811, "grad_norm": 7.1875, "learning_rate": 2.0590872726761215e-06, "loss": 0.81580353, "memory(GiB)": 146.85, "step": 60790, "train_speed(iter/s)": 0.2022 }, { "acc": 0.77789049, "epoch": 1.41851003951647, "grad_norm": 5.6875, "learning_rate": 2.057559704774938e-06, "loss": 0.79745007, "memory(GiB)": 146.85, "step": 60800, "train_speed(iter/s)": 0.202217 }, { "acc": 0.78160791, "epoch": 1.418743347088759, "grad_norm": 4.15625, "learning_rate": 2.0560325569062535e-06, "loss": 0.7460207, "memory(GiB)": 146.85, "step": 60810, "train_speed(iter/s)": 0.202234 }, { "acc": 0.77785726, "epoch": 1.4189766546610478, "grad_norm": 5.125, "learning_rate": 2.054505829288066e-06, "loss": 0.81106777, "memory(GiB)": 146.85, "step": 60820, "train_speed(iter/s)": 0.202251 }, { "acc": 0.77366734, "epoch": 1.4192099622333367, "grad_norm": 6.03125, "learning_rate": 2.0529795221383164e-06, "loss": 0.79457269, "memory(GiB)": 146.85, "step": 60830, "train_speed(iter/s)": 0.202268 }, { "acc": 0.75964737, "epoch": 1.4194432698056256, "grad_norm": 6.84375, "learning_rate": 2.0514536356748814e-06, "loss": 0.89869108, "memory(GiB)": 146.85, "step": 60840, "train_speed(iter/s)": 0.202287 }, { "acc": 0.75969257, "epoch": 1.4196765773779145, "grad_norm": 5.28125, "learning_rate": 2.0499281701155852e-06, "loss": 0.8560627, "memory(GiB)": 146.85, "step": 60850, "train_speed(iter/s)": 0.202304 }, { "acc": 0.7622015, "epoch": 1.4199098849502034, "grad_norm": 6.25, "learning_rate": 2.0484031256781845e-06, "loss": 0.864855, "memory(GiB)": 146.85, "step": 60860, "train_speed(iter/s)": 0.20232 }, { "acc": 0.75008383, "epoch": 1.4201431925224923, "grad_norm": 5.96875, "learning_rate": 2.046878502580382e-06, "loss": 0.89802256, "memory(GiB)": 146.85, "step": 60870, "train_speed(iter/s)": 0.202338 }, { "acc": 0.77599082, "epoch": 1.4203765000947812, "grad_norm": 4.9375, "learning_rate": 2.045354301039815e-06, "loss": 0.8020689, "memory(GiB)": 146.85, "step": 60880, "train_speed(iter/s)": 0.202355 }, { "acc": 0.78191833, "epoch": 1.4206098076670701, "grad_norm": 9.375, "learning_rate": 2.043830521274061e-06, "loss": 0.79015169, "memory(GiB)": 146.85, "step": 60890, "train_speed(iter/s)": 0.202372 }, { "acc": 0.75735769, "epoch": 1.420843115239359, "grad_norm": 6.3125, "learning_rate": 2.0423071635006436e-06, "loss": 0.8887413, "memory(GiB)": 146.85, "step": 60900, "train_speed(iter/s)": 0.20239 }, { "acc": 0.78609734, "epoch": 1.421076422811648, "grad_norm": 5.125, "learning_rate": 2.0407842279370176e-06, "loss": 0.76196818, "memory(GiB)": 146.85, "step": 60910, "train_speed(iter/s)": 0.202408 }, { "acc": 0.78789635, "epoch": 1.4213097303839368, "grad_norm": 5.90625, "learning_rate": 2.039261714800585e-06, "loss": 0.76891527, "memory(GiB)": 146.85, "step": 60920, "train_speed(iter/s)": 0.202425 }, { "acc": 0.77300138, "epoch": 1.4215430379562257, "grad_norm": 5.6875, "learning_rate": 2.0377396243086827e-06, "loss": 0.81639757, "memory(GiB)": 146.85, "step": 60930, "train_speed(iter/s)": 0.202442 }, { "acc": 0.76106358, "epoch": 1.4217763455285146, "grad_norm": 23.5, "learning_rate": 2.036217956678588e-06, "loss": 0.85756836, "memory(GiB)": 146.85, "step": 60940, "train_speed(iter/s)": 0.202459 }, { "acc": 0.76120129, "epoch": 1.4220096531008035, "grad_norm": 5.34375, "learning_rate": 2.034696712127518e-06, "loss": 0.83640184, "memory(GiB)": 146.85, "step": 60950, "train_speed(iter/s)": 0.202476 }, { "acc": 0.78267384, "epoch": 1.4222429606730924, "grad_norm": 5.71875, "learning_rate": 2.0331758908726323e-06, "loss": 0.77584577, "memory(GiB)": 146.85, "step": 60960, "train_speed(iter/s)": 0.202493 }, { "acc": 0.77863054, "epoch": 1.4224762682453813, "grad_norm": 5.75, "learning_rate": 2.031655493131026e-06, "loss": 0.79091077, "memory(GiB)": 146.85, "step": 60970, "train_speed(iter/s)": 0.202511 }, { "acc": 0.76731071, "epoch": 1.4227095758176702, "grad_norm": 6.25, "learning_rate": 2.030135519119735e-06, "loss": 0.84546137, "memory(GiB)": 146.85, "step": 60980, "train_speed(iter/s)": 0.202527 }, { "acc": 0.77159147, "epoch": 1.422942883389959, "grad_norm": 5.53125, "learning_rate": 2.0286159690557366e-06, "loss": 0.81171541, "memory(GiB)": 146.85, "step": 60990, "train_speed(iter/s)": 0.202544 }, { "acc": 0.78361883, "epoch": 1.423176190962248, "grad_norm": 6.125, "learning_rate": 2.027096843155944e-06, "loss": 0.78934622, "memory(GiB)": 146.85, "step": 61000, "train_speed(iter/s)": 0.202562 }, { "epoch": 1.423176190962248, "eval_acc": 0.7352074385525607, "eval_loss": 0.8340109586715698, "eval_runtime": 1264.2784, "eval_samples_per_second": 28.468, "eval_steps_per_second": 14.234, "step": 61000 }, { "acc": 0.76394815, "epoch": 1.4234094985345367, "grad_norm": 6.34375, "learning_rate": 2.025578141637215e-06, "loss": 0.86800327, "memory(GiB)": 146.85, "step": 61010, "train_speed(iter/s)": 0.201715 }, { "acc": 0.77176876, "epoch": 1.4236428061068258, "grad_norm": 5.0625, "learning_rate": 2.024059864716343e-06, "loss": 0.82335215, "memory(GiB)": 146.85, "step": 61020, "train_speed(iter/s)": 0.201733 }, { "acc": 0.76870179, "epoch": 1.4238761136791145, "grad_norm": 6.71875, "learning_rate": 2.022542012610058e-06, "loss": 0.83218803, "memory(GiB)": 146.85, "step": 61030, "train_speed(iter/s)": 0.201749 }, { "acc": 0.79633179, "epoch": 1.4241094212514036, "grad_norm": 3.9375, "learning_rate": 2.0210245855350397e-06, "loss": 0.73330946, "memory(GiB)": 146.85, "step": 61040, "train_speed(iter/s)": 0.201768 }, { "acc": 0.75603285, "epoch": 1.4243427288236923, "grad_norm": 5.96875, "learning_rate": 2.019507583707893e-06, "loss": 0.86169004, "memory(GiB)": 146.85, "step": 61050, "train_speed(iter/s)": 0.201785 }, { "acc": 0.78065815, "epoch": 1.4245760363959814, "grad_norm": 5.65625, "learning_rate": 2.017991007345175e-06, "loss": 0.78074398, "memory(GiB)": 146.85, "step": 61060, "train_speed(iter/s)": 0.201802 }, { "acc": 0.78365116, "epoch": 1.42480934396827, "grad_norm": 4.03125, "learning_rate": 2.016474856663372e-06, "loss": 0.77421212, "memory(GiB)": 146.85, "step": 61070, "train_speed(iter/s)": 0.201819 }, { "acc": 0.78929405, "epoch": 1.425042651540559, "grad_norm": 4.9375, "learning_rate": 2.014959131878918e-06, "loss": 0.75659308, "memory(GiB)": 146.85, "step": 61080, "train_speed(iter/s)": 0.201836 }, { "acc": 0.77524128, "epoch": 1.4252759591128479, "grad_norm": 5.28125, "learning_rate": 2.0134438332081814e-06, "loss": 0.80929489, "memory(GiB)": 146.85, "step": 61090, "train_speed(iter/s)": 0.201854 }, { "acc": 0.78275108, "epoch": 1.4255092666851368, "grad_norm": 8.25, "learning_rate": 2.0119289608674682e-06, "loss": 0.79001713, "memory(GiB)": 146.85, "step": 61100, "train_speed(iter/s)": 0.201871 }, { "acc": 0.78717175, "epoch": 1.4257425742574257, "grad_norm": 4.96875, "learning_rate": 2.010414515073029e-06, "loss": 0.76965699, "memory(GiB)": 146.85, "step": 61110, "train_speed(iter/s)": 0.201887 }, { "acc": 0.78417177, "epoch": 1.4259758818297146, "grad_norm": 5.21875, "learning_rate": 2.0089004960410485e-06, "loss": 0.77389641, "memory(GiB)": 146.85, "step": 61120, "train_speed(iter/s)": 0.201903 }, { "acc": 0.77700033, "epoch": 1.4262091894020035, "grad_norm": 5.3125, "learning_rate": 2.007386903987654e-06, "loss": 0.80914698, "memory(GiB)": 146.85, "step": 61130, "train_speed(iter/s)": 0.201921 }, { "acc": 0.7946197, "epoch": 1.4264424969742924, "grad_norm": 7.3125, "learning_rate": 2.0058737391289085e-06, "loss": 0.7080307, "memory(GiB)": 146.85, "step": 61140, "train_speed(iter/s)": 0.201938 }, { "acc": 0.79753141, "epoch": 1.4266758045465813, "grad_norm": 9.875, "learning_rate": 2.0043610016808185e-06, "loss": 0.72653165, "memory(GiB)": 146.85, "step": 61150, "train_speed(iter/s)": 0.201955 }, { "acc": 0.77105703, "epoch": 1.4269091121188702, "grad_norm": 5.75, "learning_rate": 2.0028486918593253e-06, "loss": 0.82401428, "memory(GiB)": 146.85, "step": 61160, "train_speed(iter/s)": 0.201972 }, { "acc": 0.7731617, "epoch": 1.427142419691159, "grad_norm": 7.875, "learning_rate": 2.001336809880311e-06, "loss": 0.81915169, "memory(GiB)": 146.85, "step": 61170, "train_speed(iter/s)": 0.201989 }, { "acc": 0.76208935, "epoch": 1.427375727263448, "grad_norm": 7.1875, "learning_rate": 1.9998253559595952e-06, "loss": 0.86610479, "memory(GiB)": 146.85, "step": 61180, "train_speed(iter/s)": 0.202008 }, { "acc": 0.77387676, "epoch": 1.4276090348357369, "grad_norm": 8.1875, "learning_rate": 1.9983143303129373e-06, "loss": 0.83140697, "memory(GiB)": 146.85, "step": 61190, "train_speed(iter/s)": 0.202026 }, { "acc": 0.79076657, "epoch": 1.4278423424080258, "grad_norm": 4.9375, "learning_rate": 1.996803733156038e-06, "loss": 0.74633417, "memory(GiB)": 146.85, "step": 61200, "train_speed(iter/s)": 0.202042 }, { "acc": 0.76867423, "epoch": 1.4280756499803147, "grad_norm": 5.375, "learning_rate": 1.9952935647045317e-06, "loss": 0.84919815, "memory(GiB)": 146.85, "step": 61210, "train_speed(iter/s)": 0.202059 }, { "acc": 0.78731565, "epoch": 1.4283089575526036, "grad_norm": 5.53125, "learning_rate": 1.9937838251739983e-06, "loss": 0.75333834, "memory(GiB)": 146.85, "step": 61220, "train_speed(iter/s)": 0.202075 }, { "acc": 0.78247747, "epoch": 1.4285422651248925, "grad_norm": 5.25, "learning_rate": 1.9922745147799505e-06, "loss": 0.79818349, "memory(GiB)": 146.85, "step": 61230, "train_speed(iter/s)": 0.202093 }, { "acc": 0.76262865, "epoch": 1.4287755726971814, "grad_norm": 4.78125, "learning_rate": 1.9907656337378396e-06, "loss": 0.8482851, "memory(GiB)": 146.85, "step": 61240, "train_speed(iter/s)": 0.202109 }, { "acc": 0.77479882, "epoch": 1.4290088802694703, "grad_norm": 4.5625, "learning_rate": 1.9892571822630622e-06, "loss": 0.83433895, "memory(GiB)": 146.85, "step": 61250, "train_speed(iter/s)": 0.202126 }, { "acc": 0.77029953, "epoch": 1.4292421878417592, "grad_norm": 5.40625, "learning_rate": 1.987749160570946e-06, "loss": 0.84342432, "memory(GiB)": 146.85, "step": 61260, "train_speed(iter/s)": 0.202143 }, { "acc": 0.76776905, "epoch": 1.429475495414048, "grad_norm": 6.59375, "learning_rate": 1.9862415688767657e-06, "loss": 0.82494812, "memory(GiB)": 146.85, "step": 61270, "train_speed(iter/s)": 0.20216 }, { "acc": 0.77763958, "epoch": 1.429708802986337, "grad_norm": 4.5625, "learning_rate": 1.984734407395722e-06, "loss": 0.81214905, "memory(GiB)": 146.85, "step": 61280, "train_speed(iter/s)": 0.202177 }, { "acc": 0.75931859, "epoch": 1.4299421105586259, "grad_norm": 6.5625, "learning_rate": 1.9832276763429674e-06, "loss": 0.88490305, "memory(GiB)": 146.85, "step": 61290, "train_speed(iter/s)": 0.202195 }, { "acc": 0.7697731, "epoch": 1.4301754181309148, "grad_norm": 4.71875, "learning_rate": 1.9817213759335846e-06, "loss": 0.83693504, "memory(GiB)": 146.85, "step": 61300, "train_speed(iter/s)": 0.202211 }, { "acc": 0.75851851, "epoch": 1.4304087257032037, "grad_norm": 5.75, "learning_rate": 1.9802155063825995e-06, "loss": 0.85352383, "memory(GiB)": 146.85, "step": 61310, "train_speed(iter/s)": 0.202229 }, { "acc": 0.76167836, "epoch": 1.4306420332754926, "grad_norm": 4.59375, "learning_rate": 1.9787100679049742e-06, "loss": 0.89028397, "memory(GiB)": 146.85, "step": 61320, "train_speed(iter/s)": 0.202246 }, { "acc": 0.7718998, "epoch": 1.4308753408477815, "grad_norm": 6.03125, "learning_rate": 1.977205060715607e-06, "loss": 0.80673962, "memory(GiB)": 146.85, "step": 61330, "train_speed(iter/s)": 0.202263 }, { "acc": 0.77931194, "epoch": 1.4311086484200704, "grad_norm": 4.34375, "learning_rate": 1.975700485029341e-06, "loss": 0.79734402, "memory(GiB)": 146.85, "step": 61340, "train_speed(iter/s)": 0.202279 }, { "acc": 0.76717157, "epoch": 1.4313419559923592, "grad_norm": 5.65625, "learning_rate": 1.9741963410609506e-06, "loss": 0.84912539, "memory(GiB)": 146.85, "step": 61350, "train_speed(iter/s)": 0.202296 }, { "acc": 0.78405433, "epoch": 1.4315752635646481, "grad_norm": 4.59375, "learning_rate": 1.9726926290251548e-06, "loss": 0.78345914, "memory(GiB)": 146.85, "step": 61360, "train_speed(iter/s)": 0.202312 }, { "acc": 0.77370048, "epoch": 1.431808571136937, "grad_norm": 5.6875, "learning_rate": 1.971189349136607e-06, "loss": 0.81963797, "memory(GiB)": 146.85, "step": 61370, "train_speed(iter/s)": 0.20233 }, { "acc": 0.76112661, "epoch": 1.4320418787092257, "grad_norm": 5.59375, "learning_rate": 1.969686501609898e-06, "loss": 0.88126993, "memory(GiB)": 146.85, "step": 61380, "train_speed(iter/s)": 0.202347 }, { "acc": 0.77387271, "epoch": 1.4322751862815148, "grad_norm": 12.9375, "learning_rate": 1.9681840866595644e-06, "loss": 0.80414839, "memory(GiB)": 146.85, "step": 61390, "train_speed(iter/s)": 0.202365 }, { "acc": 0.76838694, "epoch": 1.4325084938538035, "grad_norm": 5.34375, "learning_rate": 1.966682104500068e-06, "loss": 0.82558556, "memory(GiB)": 146.85, "step": 61400, "train_speed(iter/s)": 0.202381 }, { "acc": 0.76489892, "epoch": 1.4327418014260926, "grad_norm": 7.375, "learning_rate": 1.9651805553458212e-06, "loss": 0.81913509, "memory(GiB)": 146.85, "step": 61410, "train_speed(iter/s)": 0.202398 }, { "acc": 0.75875778, "epoch": 1.4329751089983813, "grad_norm": 4.75, "learning_rate": 1.9636794394111676e-06, "loss": 0.88710766, "memory(GiB)": 146.85, "step": 61420, "train_speed(iter/s)": 0.202415 }, { "acc": 0.78243999, "epoch": 1.4332084165706704, "grad_norm": 5.5625, "learning_rate": 1.962178756910393e-06, "loss": 0.79697332, "memory(GiB)": 146.85, "step": 61430, "train_speed(iter/s)": 0.202433 }, { "acc": 0.77684679, "epoch": 1.4334417241429591, "grad_norm": 5.96875, "learning_rate": 1.9606785080577173e-06, "loss": 0.79884291, "memory(GiB)": 146.85, "step": 61440, "train_speed(iter/s)": 0.202449 }, { "acc": 0.76758809, "epoch": 1.4336750317152482, "grad_norm": 6.375, "learning_rate": 1.959178693067303e-06, "loss": 0.84553556, "memory(GiB)": 146.85, "step": 61450, "train_speed(iter/s)": 0.202466 }, { "acc": 0.76643057, "epoch": 1.433908339287537, "grad_norm": 5.6875, "learning_rate": 1.9576793121532467e-06, "loss": 0.83572159, "memory(GiB)": 146.85, "step": 61460, "train_speed(iter/s)": 0.202483 }, { "acc": 0.76694722, "epoch": 1.4341416468598258, "grad_norm": 6.03125, "learning_rate": 1.9561803655295835e-06, "loss": 0.85933466, "memory(GiB)": 146.85, "step": 61470, "train_speed(iter/s)": 0.2025 }, { "acc": 0.7561018, "epoch": 1.4343749544321147, "grad_norm": 5.03125, "learning_rate": 1.9546818534102903e-06, "loss": 0.88065548, "memory(GiB)": 146.85, "step": 61480, "train_speed(iter/s)": 0.202517 }, { "acc": 0.77612877, "epoch": 1.4346082620044036, "grad_norm": 4.53125, "learning_rate": 1.9531837760092765e-06, "loss": 0.81001148, "memory(GiB)": 146.85, "step": 61490, "train_speed(iter/s)": 0.202535 }, { "acc": 0.7741889, "epoch": 1.4348415695766925, "grad_norm": 5.15625, "learning_rate": 1.9516861335403963e-06, "loss": 0.80939293, "memory(GiB)": 146.85, "step": 61500, "train_speed(iter/s)": 0.202551 }, { "epoch": 1.4348415695766925, "eval_acc": 0.7351488710838048, "eval_loss": 0.8340046405792236, "eval_runtime": 1263.5259, "eval_samples_per_second": 28.485, "eval_steps_per_second": 14.243, "step": 61500 }, { "acc": 0.7601593, "epoch": 1.4350748771489814, "grad_norm": 5.28125, "learning_rate": 1.9501889262174323e-06, "loss": 0.8702282, "memory(GiB)": 146.85, "step": 61510, "train_speed(iter/s)": 0.201712 }, { "acc": 0.77572622, "epoch": 1.4353081847212703, "grad_norm": 6.4375, "learning_rate": 1.9486921542541147e-06, "loss": 0.81945267, "memory(GiB)": 146.85, "step": 61520, "train_speed(iter/s)": 0.20173 }, { "acc": 0.78631964, "epoch": 1.4355414922935592, "grad_norm": 6.84375, "learning_rate": 1.9471958178641055e-06, "loss": 0.75901384, "memory(GiB)": 146.85, "step": 61530, "train_speed(iter/s)": 0.201746 }, { "acc": 0.77964578, "epoch": 1.435774799865848, "grad_norm": 5.6875, "learning_rate": 1.9456999172610046e-06, "loss": 0.79179192, "memory(GiB)": 146.85, "step": 61540, "train_speed(iter/s)": 0.201764 }, { "acc": 0.76152744, "epoch": 1.436008107438137, "grad_norm": 6.25, "learning_rate": 1.9442044526583555e-06, "loss": 0.87397413, "memory(GiB)": 146.85, "step": 61550, "train_speed(iter/s)": 0.201782 }, { "acc": 0.76473722, "epoch": 1.436241415010426, "grad_norm": 7.28125, "learning_rate": 1.9427094242696304e-06, "loss": 0.86833725, "memory(GiB)": 146.85, "step": 61560, "train_speed(iter/s)": 0.201799 }, { "acc": 0.76968746, "epoch": 1.4364747225827148, "grad_norm": 6.625, "learning_rate": 1.941214832308249e-06, "loss": 0.84866934, "memory(GiB)": 146.85, "step": 61570, "train_speed(iter/s)": 0.201816 }, { "acc": 0.76424227, "epoch": 1.4367080301550037, "grad_norm": 5.09375, "learning_rate": 1.9397206769875602e-06, "loss": 0.85958958, "memory(GiB)": 146.85, "step": 61580, "train_speed(iter/s)": 0.201833 }, { "acc": 0.78531189, "epoch": 1.4369413377272926, "grad_norm": 4.96875, "learning_rate": 1.9382269585208576e-06, "loss": 0.77159629, "memory(GiB)": 146.85, "step": 61590, "train_speed(iter/s)": 0.20185 }, { "acc": 0.74351959, "epoch": 1.4371746452995815, "grad_norm": 4.90625, "learning_rate": 1.936733677121367e-06, "loss": 0.92281227, "memory(GiB)": 146.85, "step": 61600, "train_speed(iter/s)": 0.201868 }, { "acc": 0.76486731, "epoch": 1.4374079528718704, "grad_norm": 5.21875, "learning_rate": 1.935240833002252e-06, "loss": 0.86211243, "memory(GiB)": 146.85, "step": 61610, "train_speed(iter/s)": 0.201885 }, { "acc": 0.7999568, "epoch": 1.4376412604441593, "grad_norm": 6.0, "learning_rate": 1.933748426376622e-06, "loss": 0.71405048, "memory(GiB)": 146.85, "step": 61620, "train_speed(iter/s)": 0.201902 }, { "acc": 0.77287016, "epoch": 1.4378745680164482, "grad_norm": 5.34375, "learning_rate": 1.932256457457509e-06, "loss": 0.79683418, "memory(GiB)": 146.85, "step": 61630, "train_speed(iter/s)": 0.201919 }, { "acc": 0.76876669, "epoch": 1.438107875588737, "grad_norm": 5.09375, "learning_rate": 1.9307649264578982e-06, "loss": 0.83242865, "memory(GiB)": 146.85, "step": 61640, "train_speed(iter/s)": 0.201938 }, { "acc": 0.78682575, "epoch": 1.438341183161026, "grad_norm": 7.1875, "learning_rate": 1.9292738335907e-06, "loss": 0.79037728, "memory(GiB)": 146.85, "step": 61650, "train_speed(iter/s)": 0.201954 }, { "acc": 0.77234774, "epoch": 1.438574490733315, "grad_norm": 6.15625, "learning_rate": 1.9277831790687724e-06, "loss": 0.82007427, "memory(GiB)": 146.85, "step": 61660, "train_speed(iter/s)": 0.201968 }, { "acc": 0.77166824, "epoch": 1.4388077983056038, "grad_norm": 6.0, "learning_rate": 1.9262929631049034e-06, "loss": 0.84178753, "memory(GiB)": 146.85, "step": 61670, "train_speed(iter/s)": 0.201986 }, { "acc": 0.78034716, "epoch": 1.4390411058778927, "grad_norm": 6.0, "learning_rate": 1.924803185911819e-06, "loss": 0.8002039, "memory(GiB)": 146.85, "step": 61680, "train_speed(iter/s)": 0.202004 }, { "acc": 0.77952948, "epoch": 1.4392744134501816, "grad_norm": 5.84375, "learning_rate": 1.923313847702188e-06, "loss": 0.76908851, "memory(GiB)": 146.85, "step": 61690, "train_speed(iter/s)": 0.20202 }, { "acc": 0.78125777, "epoch": 1.4395077210224705, "grad_norm": 6.25, "learning_rate": 1.9218249486886097e-06, "loss": 0.79903679, "memory(GiB)": 146.85, "step": 61700, "train_speed(iter/s)": 0.202037 }, { "acc": 0.78226881, "epoch": 1.4397410285947594, "grad_norm": 6.90625, "learning_rate": 1.9203364890836277e-06, "loss": 0.79076481, "memory(GiB)": 146.85, "step": 61710, "train_speed(iter/s)": 0.202054 }, { "acc": 0.75256119, "epoch": 1.4399743361670483, "grad_norm": 7.90625, "learning_rate": 1.918848469099718e-06, "loss": 0.87783909, "memory(GiB)": 146.85, "step": 61720, "train_speed(iter/s)": 0.202071 }, { "acc": 0.77678947, "epoch": 1.4402076437393372, "grad_norm": 5.53125, "learning_rate": 1.9173608889492936e-06, "loss": 0.79443483, "memory(GiB)": 146.85, "step": 61730, "train_speed(iter/s)": 0.202088 }, { "acc": 0.76191797, "epoch": 1.440440951311626, "grad_norm": 8.75, "learning_rate": 1.915873748844705e-06, "loss": 0.87472811, "memory(GiB)": 146.85, "step": 61740, "train_speed(iter/s)": 0.202104 }, { "acc": 0.76966171, "epoch": 1.440674258883915, "grad_norm": 6.96875, "learning_rate": 1.9143870489982443e-06, "loss": 0.81911449, "memory(GiB)": 146.85, "step": 61750, "train_speed(iter/s)": 0.202122 }, { "acc": 0.76400599, "epoch": 1.4409075664562039, "grad_norm": 6.25, "learning_rate": 1.9129007896221365e-06, "loss": 0.87433109, "memory(GiB)": 146.85, "step": 61760, "train_speed(iter/s)": 0.20214 }, { "acc": 0.77622566, "epoch": 1.4411408740284926, "grad_norm": 4.5625, "learning_rate": 1.9114149709285416e-06, "loss": 0.81944313, "memory(GiB)": 146.85, "step": 61770, "train_speed(iter/s)": 0.202158 }, { "acc": 0.78112793, "epoch": 1.4413741816007817, "grad_norm": 4.71875, "learning_rate": 1.909929593129565e-06, "loss": 0.77993879, "memory(GiB)": 146.85, "step": 61780, "train_speed(iter/s)": 0.202175 }, { "acc": 0.7681108, "epoch": 1.4416074891730704, "grad_norm": 4.96875, "learning_rate": 1.9084446564372393e-06, "loss": 0.87350216, "memory(GiB)": 146.85, "step": 61790, "train_speed(iter/s)": 0.202192 }, { "acc": 0.79452271, "epoch": 1.4418407967453595, "grad_norm": 5.84375, "learning_rate": 1.9069601610635424e-06, "loss": 0.7385972, "memory(GiB)": 146.85, "step": 61800, "train_speed(iter/s)": 0.20221 }, { "acc": 0.76389832, "epoch": 1.4420741043176482, "grad_norm": 7.03125, "learning_rate": 1.9054761072203843e-06, "loss": 0.85603237, "memory(GiB)": 146.85, "step": 61810, "train_speed(iter/s)": 0.202227 }, { "acc": 0.76004868, "epoch": 1.4423074118899373, "grad_norm": 5.34375, "learning_rate": 1.9039924951196109e-06, "loss": 0.86778812, "memory(GiB)": 146.85, "step": 61820, "train_speed(iter/s)": 0.202245 }, { "acc": 0.77584124, "epoch": 1.442540719462226, "grad_norm": 6.0, "learning_rate": 1.9025093249730108e-06, "loss": 0.82667561, "memory(GiB)": 146.85, "step": 61830, "train_speed(iter/s)": 0.202262 }, { "acc": 0.77854047, "epoch": 1.442774027034515, "grad_norm": 4.3125, "learning_rate": 1.9010265969923052e-06, "loss": 0.82153997, "memory(GiB)": 146.85, "step": 61840, "train_speed(iter/s)": 0.202278 }, { "acc": 0.77929511, "epoch": 1.4430073346068037, "grad_norm": 9.1875, "learning_rate": 1.8995443113891527e-06, "loss": 0.80170155, "memory(GiB)": 146.85, "step": 61850, "train_speed(iter/s)": 0.202295 }, { "acc": 0.76870165, "epoch": 1.4432406421790926, "grad_norm": 5.78125, "learning_rate": 1.898062468375147e-06, "loss": 0.8165102, "memory(GiB)": 146.85, "step": 61860, "train_speed(iter/s)": 0.202312 }, { "acc": 0.77676249, "epoch": 1.4434739497513815, "grad_norm": 7.90625, "learning_rate": 1.8965810681618251e-06, "loss": 0.82823668, "memory(GiB)": 146.85, "step": 61870, "train_speed(iter/s)": 0.20233 }, { "acc": 0.75105686, "epoch": 1.4437072573236704, "grad_norm": 6.5, "learning_rate": 1.8951001109606538e-06, "loss": 0.91603165, "memory(GiB)": 146.85, "step": 61880, "train_speed(iter/s)": 0.202348 }, { "acc": 0.77240505, "epoch": 1.4439405648959593, "grad_norm": 4.9375, "learning_rate": 1.893619596983038e-06, "loss": 0.82409697, "memory(GiB)": 146.85, "step": 61890, "train_speed(iter/s)": 0.202365 }, { "acc": 0.77215023, "epoch": 1.4441738724682482, "grad_norm": 6.9375, "learning_rate": 1.8921395264403236e-06, "loss": 0.85067101, "memory(GiB)": 146.85, "step": 61900, "train_speed(iter/s)": 0.202382 }, { "acc": 0.77304473, "epoch": 1.4444071800405371, "grad_norm": 5.71875, "learning_rate": 1.890659899543788e-06, "loss": 0.8315938, "memory(GiB)": 146.85, "step": 61910, "train_speed(iter/s)": 0.202398 }, { "acc": 0.76576014, "epoch": 1.444640487612826, "grad_norm": 5.625, "learning_rate": 1.88918071650465e-06, "loss": 0.82371197, "memory(GiB)": 146.85, "step": 61920, "train_speed(iter/s)": 0.202415 }, { "acc": 0.77403898, "epoch": 1.444873795185115, "grad_norm": 4.96875, "learning_rate": 1.8877019775340587e-06, "loss": 0.82768993, "memory(GiB)": 146.85, "step": 61930, "train_speed(iter/s)": 0.202433 }, { "acc": 0.78100457, "epoch": 1.4451071027574038, "grad_norm": 10.5, "learning_rate": 1.8862236828431086e-06, "loss": 0.80514021, "memory(GiB)": 146.85, "step": 61940, "train_speed(iter/s)": 0.20245 }, { "acc": 0.76982374, "epoch": 1.4453404103296927, "grad_norm": 6.625, "learning_rate": 1.8847458326428226e-06, "loss": 0.82300043, "memory(GiB)": 146.85, "step": 61950, "train_speed(iter/s)": 0.202466 }, { "acc": 0.77465081, "epoch": 1.4455737179019816, "grad_norm": 6.8125, "learning_rate": 1.8832684271441643e-06, "loss": 0.81136131, "memory(GiB)": 146.85, "step": 61960, "train_speed(iter/s)": 0.202483 }, { "acc": 0.78776941, "epoch": 1.4458070254742705, "grad_norm": 4.5625, "learning_rate": 1.8817914665580322e-06, "loss": 0.75386982, "memory(GiB)": 146.85, "step": 61970, "train_speed(iter/s)": 0.2025 }, { "acc": 0.78930807, "epoch": 1.4460403330465594, "grad_norm": 8.25, "learning_rate": 1.8803149510952613e-06, "loss": 0.77693882, "memory(GiB)": 146.85, "step": 61980, "train_speed(iter/s)": 0.202517 }, { "acc": 0.76916995, "epoch": 1.4462736406188483, "grad_norm": 5.71875, "learning_rate": 1.8788388809666259e-06, "loss": 0.84623604, "memory(GiB)": 146.85, "step": 61990, "train_speed(iter/s)": 0.202533 }, { "acc": 0.76752343, "epoch": 1.4465069481911372, "grad_norm": 7.375, "learning_rate": 1.877363256382832e-06, "loss": 0.83409472, "memory(GiB)": 146.85, "step": 62000, "train_speed(iter/s)": 0.20255 }, { "epoch": 1.4465069481911372, "eval_acc": 0.7351988873794365, "eval_loss": 0.8340214490890503, "eval_runtime": 1262.8127, "eval_samples_per_second": 28.501, "eval_steps_per_second": 14.251, "step": 62000 }, { "acc": 0.77661781, "epoch": 1.4467402557634261, "grad_norm": 6.5, "learning_rate": 1.8758880775545279e-06, "loss": 0.80348854, "memory(GiB)": 146.85, "step": 62010, "train_speed(iter/s)": 0.201717 }, { "acc": 0.77355595, "epoch": 1.446973563335715, "grad_norm": 5.21875, "learning_rate": 1.8744133446922935e-06, "loss": 0.80828123, "memory(GiB)": 146.85, "step": 62020, "train_speed(iter/s)": 0.201733 }, { "acc": 0.76104746, "epoch": 1.447206870908004, "grad_norm": 6.0625, "learning_rate": 1.8729390580066442e-06, "loss": 0.88371286, "memory(GiB)": 146.85, "step": 62030, "train_speed(iter/s)": 0.201749 }, { "acc": 0.77932091, "epoch": 1.4474401784802928, "grad_norm": 7.0625, "learning_rate": 1.8714652177080377e-06, "loss": 0.77972527, "memory(GiB)": 146.85, "step": 62040, "train_speed(iter/s)": 0.201765 }, { "acc": 0.77196579, "epoch": 1.4476734860525817, "grad_norm": 6.40625, "learning_rate": 1.869991824006861e-06, "loss": 0.82009602, "memory(GiB)": 146.85, "step": 62050, "train_speed(iter/s)": 0.201782 }, { "acc": 0.77428732, "epoch": 1.4479067936248706, "grad_norm": 4.75, "learning_rate": 1.8685188771134433e-06, "loss": 0.81713133, "memory(GiB)": 146.85, "step": 62060, "train_speed(iter/s)": 0.201799 }, { "acc": 0.75682344, "epoch": 1.4481401011971595, "grad_norm": 5.78125, "learning_rate": 1.8670463772380464e-06, "loss": 0.86784801, "memory(GiB)": 146.85, "step": 62070, "train_speed(iter/s)": 0.201817 }, { "acc": 0.75543623, "epoch": 1.4483734087694484, "grad_norm": 6.84375, "learning_rate": 1.8655743245908692e-06, "loss": 0.88399734, "memory(GiB)": 146.85, "step": 62080, "train_speed(iter/s)": 0.201834 }, { "acc": 0.77106638, "epoch": 1.4486067163417373, "grad_norm": 8.3125, "learning_rate": 1.864102719382045e-06, "loss": 0.8204628, "memory(GiB)": 146.85, "step": 62090, "train_speed(iter/s)": 0.20185 }, { "acc": 0.77186975, "epoch": 1.4488400239140262, "grad_norm": 5.28125, "learning_rate": 1.8626315618216484e-06, "loss": 0.82876949, "memory(GiB)": 146.85, "step": 62100, "train_speed(iter/s)": 0.201868 }, { "acc": 0.77113218, "epoch": 1.4490733314863151, "grad_norm": 6.5625, "learning_rate": 1.8611608521196844e-06, "loss": 0.83038139, "memory(GiB)": 146.85, "step": 62110, "train_speed(iter/s)": 0.201886 }, { "acc": 0.77811947, "epoch": 1.449306639058604, "grad_norm": 7.0, "learning_rate": 1.8596905904860956e-06, "loss": 0.78366961, "memory(GiB)": 146.85, "step": 62120, "train_speed(iter/s)": 0.201902 }, { "acc": 0.78505011, "epoch": 1.449539946630893, "grad_norm": 5.8125, "learning_rate": 1.8582207771307647e-06, "loss": 0.77748108, "memory(GiB)": 146.85, "step": 62130, "train_speed(iter/s)": 0.201918 }, { "acc": 0.77463489, "epoch": 1.4497732542031816, "grad_norm": 6.5, "learning_rate": 1.8567514122635027e-06, "loss": 0.812253, "memory(GiB)": 146.85, "step": 62140, "train_speed(iter/s)": 0.201935 }, { "acc": 0.7877852, "epoch": 1.4500065617754707, "grad_norm": 6.96875, "learning_rate": 1.8552824960940658e-06, "loss": 0.7527566, "memory(GiB)": 146.85, "step": 62150, "train_speed(iter/s)": 0.201952 }, { "acc": 0.77017794, "epoch": 1.4502398693477594, "grad_norm": 7.5, "learning_rate": 1.8538140288321387e-06, "loss": 0.82361717, "memory(GiB)": 146.85, "step": 62160, "train_speed(iter/s)": 0.201969 }, { "acc": 0.78415132, "epoch": 1.4504731769200485, "grad_norm": 5.90625, "learning_rate": 1.8523460106873436e-06, "loss": 0.78301206, "memory(GiB)": 146.85, "step": 62170, "train_speed(iter/s)": 0.201985 }, { "acc": 0.75943809, "epoch": 1.4507064844923372, "grad_norm": 5.21875, "learning_rate": 1.8508784418692428e-06, "loss": 0.86683769, "memory(GiB)": 146.85, "step": 62180, "train_speed(iter/s)": 0.202002 }, { "acc": 0.77470703, "epoch": 1.4509397920646263, "grad_norm": 4.21875, "learning_rate": 1.8494113225873295e-06, "loss": 0.81645279, "memory(GiB)": 146.85, "step": 62190, "train_speed(iter/s)": 0.202019 }, { "acc": 0.78653297, "epoch": 1.451173099636915, "grad_norm": 9.5625, "learning_rate": 1.8479446530510348e-06, "loss": 0.76089735, "memory(GiB)": 146.85, "step": 62200, "train_speed(iter/s)": 0.202036 }, { "acc": 0.77291431, "epoch": 1.451406407209204, "grad_norm": 5.75, "learning_rate": 1.8464784334697234e-06, "loss": 0.84289913, "memory(GiB)": 146.85, "step": 62210, "train_speed(iter/s)": 0.202053 }, { "acc": 0.77877512, "epoch": 1.4516397147814928, "grad_norm": 5.40625, "learning_rate": 1.845012664052701e-06, "loss": 0.79849577, "memory(GiB)": 146.85, "step": 62220, "train_speed(iter/s)": 0.202069 }, { "acc": 0.78128085, "epoch": 1.451873022353782, "grad_norm": 6.75, "learning_rate": 1.843547345009203e-06, "loss": 0.77348356, "memory(GiB)": 146.85, "step": 62230, "train_speed(iter/s)": 0.202086 }, { "acc": 0.76756916, "epoch": 1.4521063299260706, "grad_norm": 8.375, "learning_rate": 1.8420824765484058e-06, "loss": 0.82725849, "memory(GiB)": 146.85, "step": 62240, "train_speed(iter/s)": 0.202104 }, { "acc": 0.76276703, "epoch": 1.4523396374983595, "grad_norm": 5.65625, "learning_rate": 1.8406180588794176e-06, "loss": 0.87531528, "memory(GiB)": 146.85, "step": 62250, "train_speed(iter/s)": 0.202121 }, { "acc": 0.76925707, "epoch": 1.4525729450706484, "grad_norm": 5.34375, "learning_rate": 1.8391540922112822e-06, "loss": 0.81807537, "memory(GiB)": 146.85, "step": 62260, "train_speed(iter/s)": 0.202139 }, { "acc": 0.79033146, "epoch": 1.4528062526429373, "grad_norm": 6.71875, "learning_rate": 1.8376905767529834e-06, "loss": 0.75113759, "memory(GiB)": 146.85, "step": 62270, "train_speed(iter/s)": 0.202156 }, { "acc": 0.78664637, "epoch": 1.4530395602152262, "grad_norm": 5.9375, "learning_rate": 1.8362275127134348e-06, "loss": 0.75387707, "memory(GiB)": 146.85, "step": 62280, "train_speed(iter/s)": 0.202174 }, { "acc": 0.76653481, "epoch": 1.453272867787515, "grad_norm": 5.5625, "learning_rate": 1.8347649003014911e-06, "loss": 0.83646507, "memory(GiB)": 146.85, "step": 62290, "train_speed(iter/s)": 0.20219 }, { "acc": 0.7709341, "epoch": 1.453506175359804, "grad_norm": 7.40625, "learning_rate": 1.833302739725939e-06, "loss": 0.83708019, "memory(GiB)": 146.85, "step": 62300, "train_speed(iter/s)": 0.202206 }, { "acc": 0.7614996, "epoch": 1.4537394829320929, "grad_norm": 7.625, "learning_rate": 1.8318410311955003e-06, "loss": 0.86620865, "memory(GiB)": 146.85, "step": 62310, "train_speed(iter/s)": 0.202223 }, { "acc": 0.77791805, "epoch": 1.4539727905043818, "grad_norm": 6.625, "learning_rate": 1.830379774918834e-06, "loss": 0.78655949, "memory(GiB)": 146.85, "step": 62320, "train_speed(iter/s)": 0.20224 }, { "acc": 0.7894206, "epoch": 1.4542060980766707, "grad_norm": 5.53125, "learning_rate": 1.8289189711045324e-06, "loss": 0.76816015, "memory(GiB)": 146.85, "step": 62330, "train_speed(iter/s)": 0.202257 }, { "acc": 0.79108629, "epoch": 1.4544394056489596, "grad_norm": 5.46875, "learning_rate": 1.8274586199611283e-06, "loss": 0.73965645, "memory(GiB)": 146.85, "step": 62340, "train_speed(iter/s)": 0.202273 }, { "acc": 0.78406382, "epoch": 1.4546727132212485, "grad_norm": 5.53125, "learning_rate": 1.8259987216970826e-06, "loss": 0.78000059, "memory(GiB)": 146.85, "step": 62350, "train_speed(iter/s)": 0.20229 }, { "acc": 0.76954699, "epoch": 1.4549060207935374, "grad_norm": 7.0625, "learning_rate": 1.8245392765207993e-06, "loss": 0.84312305, "memory(GiB)": 146.85, "step": 62360, "train_speed(iter/s)": 0.202307 }, { "acc": 0.77452173, "epoch": 1.4551393283658263, "grad_norm": 5.625, "learning_rate": 1.8230802846406104e-06, "loss": 0.81340036, "memory(GiB)": 146.85, "step": 62370, "train_speed(iter/s)": 0.202324 }, { "acc": 0.76221371, "epoch": 1.4553726359381152, "grad_norm": 5.53125, "learning_rate": 1.821621746264789e-06, "loss": 0.85954475, "memory(GiB)": 146.85, "step": 62380, "train_speed(iter/s)": 0.202341 }, { "acc": 0.78060417, "epoch": 1.455605943510404, "grad_norm": 10.25, "learning_rate": 1.8201636616015405e-06, "loss": 0.78786178, "memory(GiB)": 146.85, "step": 62390, "train_speed(iter/s)": 0.202357 }, { "acc": 0.76814198, "epoch": 1.455839251082693, "grad_norm": 5.5, "learning_rate": 1.8187060308590038e-06, "loss": 0.82037249, "memory(GiB)": 146.85, "step": 62400, "train_speed(iter/s)": 0.202374 }, { "acc": 0.77293792, "epoch": 1.4560725586549819, "grad_norm": 5.8125, "learning_rate": 1.8172488542452583e-06, "loss": 0.81262083, "memory(GiB)": 146.85, "step": 62410, "train_speed(iter/s)": 0.202392 }, { "acc": 0.76755505, "epoch": 1.4563058662272708, "grad_norm": 6.1875, "learning_rate": 1.8157921319683147e-06, "loss": 0.84030437, "memory(GiB)": 146.85, "step": 62420, "train_speed(iter/s)": 0.202409 }, { "acc": 0.76367292, "epoch": 1.4565391737995597, "grad_norm": 6.09375, "learning_rate": 1.8143358642361191e-06, "loss": 0.85973577, "memory(GiB)": 146.85, "step": 62430, "train_speed(iter/s)": 0.202425 }, { "acc": 0.76229897, "epoch": 1.4567724813718486, "grad_norm": 5.90625, "learning_rate": 1.8128800512565514e-06, "loss": 0.84309568, "memory(GiB)": 146.85, "step": 62440, "train_speed(iter/s)": 0.202442 }, { "acc": 0.77139025, "epoch": 1.4570057889441375, "grad_norm": 6.6875, "learning_rate": 1.811424693237433e-06, "loss": 0.822299, "memory(GiB)": 146.85, "step": 62450, "train_speed(iter/s)": 0.20246 }, { "acc": 0.78427472, "epoch": 1.4572390965164264, "grad_norm": 5.03125, "learning_rate": 1.8099697903865127e-06, "loss": 0.77293959, "memory(GiB)": 146.85, "step": 62460, "train_speed(iter/s)": 0.202478 }, { "acc": 0.76397247, "epoch": 1.4574724040887153, "grad_norm": 4.8125, "learning_rate": 1.8085153429114766e-06, "loss": 0.86567802, "memory(GiB)": 146.85, "step": 62470, "train_speed(iter/s)": 0.202495 }, { "acc": 0.7873579, "epoch": 1.4577057116610042, "grad_norm": 5.0625, "learning_rate": 1.8070613510199497e-06, "loss": 0.78247485, "memory(GiB)": 146.85, "step": 62480, "train_speed(iter/s)": 0.202511 }, { "acc": 0.76866002, "epoch": 1.457939019233293, "grad_norm": 7.5, "learning_rate": 1.8056078149194861e-06, "loss": 0.84009953, "memory(GiB)": 146.85, "step": 62490, "train_speed(iter/s)": 0.202528 }, { "acc": 0.76605015, "epoch": 1.458172326805582, "grad_norm": 5.03125, "learning_rate": 1.8041547348175803e-06, "loss": 0.86240749, "memory(GiB)": 146.85, "step": 62500, "train_speed(iter/s)": 0.202546 }, { "epoch": 1.458172326805582, "eval_acc": 0.7352032436374432, "eval_loss": 0.8339763879776001, "eval_runtime": 1263.2284, "eval_samples_per_second": 28.491, "eval_steps_per_second": 14.246, "step": 62500 }, { "acc": 0.77711134, "epoch": 1.4584056343778709, "grad_norm": 5.3125, "learning_rate": 1.802702110921658e-06, "loss": 0.7954567, "memory(GiB)": 146.85, "step": 62510, "train_speed(iter/s)": 0.20172 }, { "acc": 0.7551158, "epoch": 1.4586389419501598, "grad_norm": 6.78125, "learning_rate": 1.8012499434390784e-06, "loss": 0.89037056, "memory(GiB)": 146.85, "step": 62520, "train_speed(iter/s)": 0.201736 }, { "acc": 0.74227877, "epoch": 1.4588722495224484, "grad_norm": 7.4375, "learning_rate": 1.7997982325771425e-06, "loss": 0.9449192, "memory(GiB)": 146.85, "step": 62530, "train_speed(iter/s)": 0.201752 }, { "acc": 0.78595009, "epoch": 1.4591055570947375, "grad_norm": 7.3125, "learning_rate": 1.7983469785430785e-06, "loss": 0.78605423, "memory(GiB)": 146.85, "step": 62540, "train_speed(iter/s)": 0.201769 }, { "acc": 0.77731781, "epoch": 1.4593388646670262, "grad_norm": 7.8125, "learning_rate": 1.7968961815440534e-06, "loss": 0.82090874, "memory(GiB)": 146.85, "step": 62550, "train_speed(iter/s)": 0.201786 }, { "acc": 0.76807566, "epoch": 1.4595721722393153, "grad_norm": 7.0, "learning_rate": 1.7954458417871667e-06, "loss": 0.83290205, "memory(GiB)": 146.85, "step": 62560, "train_speed(iter/s)": 0.201804 }, { "acc": 0.7816071, "epoch": 1.459805479811604, "grad_norm": 6.28125, "learning_rate": 1.7939959594794564e-06, "loss": 0.76198306, "memory(GiB)": 146.85, "step": 62570, "train_speed(iter/s)": 0.20182 }, { "acc": 0.78467169, "epoch": 1.4600387873838931, "grad_norm": 6.21875, "learning_rate": 1.7925465348278898e-06, "loss": 0.75322022, "memory(GiB)": 146.85, "step": 62580, "train_speed(iter/s)": 0.201837 }, { "acc": 0.76520414, "epoch": 1.4602720949561818, "grad_norm": 8.1875, "learning_rate": 1.7910975680393756e-06, "loss": 0.84796677, "memory(GiB)": 146.85, "step": 62590, "train_speed(iter/s)": 0.201855 }, { "acc": 0.76479769, "epoch": 1.460505402528471, "grad_norm": 5.21875, "learning_rate": 1.789649059320751e-06, "loss": 0.83268471, "memory(GiB)": 146.85, "step": 62600, "train_speed(iter/s)": 0.201871 }, { "acc": 0.77815552, "epoch": 1.4607387101007596, "grad_norm": 7.1875, "learning_rate": 1.7882010088787888e-06, "loss": 0.79866714, "memory(GiB)": 146.85, "step": 62610, "train_speed(iter/s)": 0.201889 }, { "acc": 0.76383791, "epoch": 1.4609720176730487, "grad_norm": 5.6875, "learning_rate": 1.7867534169202018e-06, "loss": 0.8511898, "memory(GiB)": 146.85, "step": 62620, "train_speed(iter/s)": 0.201906 }, { "acc": 0.76677113, "epoch": 1.4612053252453374, "grad_norm": 5.5625, "learning_rate": 1.785306283651629e-06, "loss": 0.83073864, "memory(GiB)": 146.85, "step": 62630, "train_speed(iter/s)": 0.201922 }, { "acc": 0.770574, "epoch": 1.4614386328176263, "grad_norm": 5.8125, "learning_rate": 1.783859609279654e-06, "loss": 0.834865, "memory(GiB)": 146.85, "step": 62640, "train_speed(iter/s)": 0.20194 }, { "acc": 0.76440907, "epoch": 1.4616719403899152, "grad_norm": 6.0625, "learning_rate": 1.7824133940107818e-06, "loss": 0.87554054, "memory(GiB)": 146.85, "step": 62650, "train_speed(iter/s)": 0.201955 }, { "acc": 0.74777708, "epoch": 1.4619052479622041, "grad_norm": 5.125, "learning_rate": 1.7809676380514646e-06, "loss": 0.90903263, "memory(GiB)": 146.85, "step": 62660, "train_speed(iter/s)": 0.201972 }, { "acc": 0.74929657, "epoch": 1.462138555534493, "grad_norm": 5.75, "learning_rate": 1.7795223416080804e-06, "loss": 0.91680431, "memory(GiB)": 146.85, "step": 62670, "train_speed(iter/s)": 0.201988 }, { "acc": 0.77972546, "epoch": 1.462371863106782, "grad_norm": 6.65625, "learning_rate": 1.778077504886948e-06, "loss": 0.79405794, "memory(GiB)": 146.85, "step": 62680, "train_speed(iter/s)": 0.202005 }, { "acc": 0.78348293, "epoch": 1.4626051706790708, "grad_norm": 4.84375, "learning_rate": 1.7766331280943156e-06, "loss": 0.78161135, "memory(GiB)": 146.85, "step": 62690, "train_speed(iter/s)": 0.202022 }, { "acc": 0.78187895, "epoch": 1.4628384782513597, "grad_norm": 5.125, "learning_rate": 1.775189211436366e-06, "loss": 0.78948855, "memory(GiB)": 146.85, "step": 62700, "train_speed(iter/s)": 0.202038 }, { "acc": 0.77887449, "epoch": 1.4630717858236486, "grad_norm": 5.09375, "learning_rate": 1.7737457551192221e-06, "loss": 0.79203162, "memory(GiB)": 146.85, "step": 62710, "train_speed(iter/s)": 0.202054 }, { "acc": 0.75884852, "epoch": 1.4633050933959375, "grad_norm": 8.5625, "learning_rate": 1.7723027593489322e-06, "loss": 0.86778708, "memory(GiB)": 146.85, "step": 62720, "train_speed(iter/s)": 0.202072 }, { "acc": 0.74979787, "epoch": 1.4635384009682264, "grad_norm": 6.9375, "learning_rate": 1.7708602243314876e-06, "loss": 0.90491686, "memory(GiB)": 146.85, "step": 62730, "train_speed(iter/s)": 0.20209 }, { "acc": 0.78235898, "epoch": 1.4637717085405153, "grad_norm": 7.09375, "learning_rate": 1.7694181502728074e-06, "loss": 0.78068047, "memory(GiB)": 146.85, "step": 62740, "train_speed(iter/s)": 0.202108 }, { "acc": 0.78238082, "epoch": 1.4640050161128042, "grad_norm": 5.875, "learning_rate": 1.7679765373787467e-06, "loss": 0.78747325, "memory(GiB)": 146.85, "step": 62750, "train_speed(iter/s)": 0.202125 }, { "acc": 0.7729969, "epoch": 1.464238323685093, "grad_norm": 7.15625, "learning_rate": 1.7665353858550993e-06, "loss": 0.82173557, "memory(GiB)": 146.85, "step": 62760, "train_speed(iter/s)": 0.202142 }, { "acc": 0.77222919, "epoch": 1.464471631257382, "grad_norm": 8.875, "learning_rate": 1.7650946959075833e-06, "loss": 0.82946415, "memory(GiB)": 146.85, "step": 62770, "train_speed(iter/s)": 0.202159 }, { "acc": 0.76508884, "epoch": 1.464704938829671, "grad_norm": 5.78125, "learning_rate": 1.763654467741861e-06, "loss": 0.84112129, "memory(GiB)": 146.85, "step": 62780, "train_speed(iter/s)": 0.202176 }, { "acc": 0.7803607, "epoch": 1.4649382464019598, "grad_norm": 7.125, "learning_rate": 1.7622147015635222e-06, "loss": 0.79158916, "memory(GiB)": 146.85, "step": 62790, "train_speed(iter/s)": 0.202193 }, { "acc": 0.76752863, "epoch": 1.4651715539742487, "grad_norm": 6.46875, "learning_rate": 1.760775397578095e-06, "loss": 0.84686928, "memory(GiB)": 146.85, "step": 62800, "train_speed(iter/s)": 0.20221 }, { "acc": 0.77685394, "epoch": 1.4654048615465376, "grad_norm": 6.21875, "learning_rate": 1.7593365559910397e-06, "loss": 0.8101984, "memory(GiB)": 146.85, "step": 62810, "train_speed(iter/s)": 0.202227 }, { "acc": 0.77305136, "epoch": 1.4656381691188265, "grad_norm": 6.0, "learning_rate": 1.7578981770077474e-06, "loss": 0.8211195, "memory(GiB)": 146.85, "step": 62820, "train_speed(iter/s)": 0.202245 }, { "acc": 0.77585459, "epoch": 1.4658714766911154, "grad_norm": 8.1875, "learning_rate": 1.7564602608335502e-06, "loss": 0.79657784, "memory(GiB)": 146.85, "step": 62830, "train_speed(iter/s)": 0.202261 }, { "acc": 0.77420845, "epoch": 1.4661047842634043, "grad_norm": 6.3125, "learning_rate": 1.7550228076737069e-06, "loss": 0.83573093, "memory(GiB)": 146.85, "step": 62840, "train_speed(iter/s)": 0.202278 }, { "acc": 0.75865011, "epoch": 1.4663380918356932, "grad_norm": 6.4375, "learning_rate": 1.7535858177334163e-06, "loss": 0.86194725, "memory(GiB)": 146.85, "step": 62850, "train_speed(iter/s)": 0.202294 }, { "acc": 0.77832227, "epoch": 1.466571399407982, "grad_norm": 5.4375, "learning_rate": 1.7521492912178062e-06, "loss": 0.79447174, "memory(GiB)": 146.85, "step": 62860, "train_speed(iter/s)": 0.20231 }, { "acc": 0.78025656, "epoch": 1.466804706980271, "grad_norm": 5.09375, "learning_rate": 1.7507132283319445e-06, "loss": 0.78183222, "memory(GiB)": 146.85, "step": 62870, "train_speed(iter/s)": 0.202327 }, { "acc": 0.77599545, "epoch": 1.4670380145525599, "grad_norm": 4.40625, "learning_rate": 1.7492776292808217e-06, "loss": 0.79424443, "memory(GiB)": 146.85, "step": 62880, "train_speed(iter/s)": 0.202343 }, { "acc": 0.76801276, "epoch": 1.4672713221248488, "grad_norm": 4.5, "learning_rate": 1.7478424942693751e-06, "loss": 0.83760633, "memory(GiB)": 146.85, "step": 62890, "train_speed(iter/s)": 0.202359 }, { "acc": 0.77222991, "epoch": 1.4675046296971377, "grad_norm": 6.6875, "learning_rate": 1.7464078235024678e-06, "loss": 0.82752876, "memory(GiB)": 146.85, "step": 62900, "train_speed(iter/s)": 0.202376 }, { "acc": 0.7731473, "epoch": 1.4677379372694266, "grad_norm": 7.0625, "learning_rate": 1.7449736171848964e-06, "loss": 0.83903513, "memory(GiB)": 146.85, "step": 62910, "train_speed(iter/s)": 0.202392 }, { "acc": 0.77867937, "epoch": 1.4679712448417153, "grad_norm": 9.8125, "learning_rate": 1.7435398755213977e-06, "loss": 0.80373707, "memory(GiB)": 146.85, "step": 62920, "train_speed(iter/s)": 0.202408 }, { "acc": 0.77646093, "epoch": 1.4682045524140044, "grad_norm": 5.78125, "learning_rate": 1.7421065987166335e-06, "loss": 0.81115074, "memory(GiB)": 146.85, "step": 62930, "train_speed(iter/s)": 0.202424 }, { "acc": 0.79385719, "epoch": 1.468437859986293, "grad_norm": 5.71875, "learning_rate": 1.7406737869752082e-06, "loss": 0.74772425, "memory(GiB)": 146.85, "step": 62940, "train_speed(iter/s)": 0.202441 }, { "acc": 0.78522625, "epoch": 1.4686711675585822, "grad_norm": 4.75, "learning_rate": 1.7392414405016527e-06, "loss": 0.7786211, "memory(GiB)": 146.85, "step": 62950, "train_speed(iter/s)": 0.202458 }, { "acc": 0.77125359, "epoch": 1.4689044751308709, "grad_norm": 6.40625, "learning_rate": 1.7378095595004323e-06, "loss": 0.83653011, "memory(GiB)": 146.85, "step": 62960, "train_speed(iter/s)": 0.202474 }, { "acc": 0.7637352, "epoch": 1.46913778270316, "grad_norm": 5.375, "learning_rate": 1.736378144175952e-06, "loss": 0.85679493, "memory(GiB)": 146.85, "step": 62970, "train_speed(iter/s)": 0.202492 }, { "acc": 0.7894978, "epoch": 1.4693710902754487, "grad_norm": 5.46875, "learning_rate": 1.7349471947325414e-06, "loss": 0.7547039, "memory(GiB)": 146.85, "step": 62980, "train_speed(iter/s)": 0.202508 }, { "acc": 0.77331572, "epoch": 1.4696043978477378, "grad_norm": 5.875, "learning_rate": 1.7335167113744732e-06, "loss": 0.8069025, "memory(GiB)": 146.85, "step": 62990, "train_speed(iter/s)": 0.202525 }, { "acc": 0.7904407, "epoch": 1.4698377054200265, "grad_norm": 4.5, "learning_rate": 1.7320866943059427e-06, "loss": 0.76850529, "memory(GiB)": 146.85, "step": 63000, "train_speed(iter/s)": 0.202542 }, { "epoch": 1.4698377054200265, "eval_acc": 0.7352372869870506, "eval_loss": 0.8339859247207642, "eval_runtime": 1263.4658, "eval_samples_per_second": 28.486, "eval_steps_per_second": 14.243, "step": 63000 }, { "acc": 0.78109941, "epoch": 1.4700710129923154, "grad_norm": 6.40625, "learning_rate": 1.7306571437310893e-06, "loss": 0.78466196, "memory(GiB)": 146.85, "step": 63010, "train_speed(iter/s)": 0.201724 }, { "acc": 0.76268854, "epoch": 1.4703043205646043, "grad_norm": 6.09375, "learning_rate": 1.7292280598539769e-06, "loss": 0.85921154, "memory(GiB)": 146.85, "step": 63020, "train_speed(iter/s)": 0.20174 }, { "acc": 0.78069115, "epoch": 1.4705376281368931, "grad_norm": 5.15625, "learning_rate": 1.72779944287861e-06, "loss": 0.78000555, "memory(GiB)": 146.85, "step": 63030, "train_speed(iter/s)": 0.201757 }, { "acc": 0.77319756, "epoch": 1.470770935709182, "grad_norm": 4.6875, "learning_rate": 1.7263712930089227e-06, "loss": 0.81422663, "memory(GiB)": 146.85, "step": 63040, "train_speed(iter/s)": 0.201773 }, { "acc": 0.78080668, "epoch": 1.471004243281471, "grad_norm": 6.6875, "learning_rate": 1.7249436104487805e-06, "loss": 0.77703905, "memory(GiB)": 146.85, "step": 63050, "train_speed(iter/s)": 0.201789 }, { "acc": 0.76801629, "epoch": 1.4712375508537598, "grad_norm": 6.125, "learning_rate": 1.7235163954019878e-06, "loss": 0.8619873, "memory(GiB)": 146.85, "step": 63060, "train_speed(iter/s)": 0.201805 }, { "acc": 0.7768136, "epoch": 1.4714708584260487, "grad_norm": 5.125, "learning_rate": 1.7220896480722766e-06, "loss": 0.79348917, "memory(GiB)": 146.85, "step": 63070, "train_speed(iter/s)": 0.201822 }, { "acc": 0.77665596, "epoch": 1.4717041659983376, "grad_norm": 5.125, "learning_rate": 1.7206633686633172e-06, "loss": 0.80527277, "memory(GiB)": 146.85, "step": 63080, "train_speed(iter/s)": 0.201838 }, { "acc": 0.76219082, "epoch": 1.4719374735706265, "grad_norm": 4.375, "learning_rate": 1.719237557378709e-06, "loss": 0.86197309, "memory(GiB)": 146.85, "step": 63090, "train_speed(iter/s)": 0.201856 }, { "acc": 0.76205807, "epoch": 1.4721707811429154, "grad_norm": 5.875, "learning_rate": 1.7178122144219873e-06, "loss": 0.84997673, "memory(GiB)": 146.85, "step": 63100, "train_speed(iter/s)": 0.201872 }, { "acc": 0.76927204, "epoch": 1.4724040887152043, "grad_norm": 5.8125, "learning_rate": 1.716387339996618e-06, "loss": 0.81726961, "memory(GiB)": 146.85, "step": 63110, "train_speed(iter/s)": 0.201889 }, { "acc": 0.76634278, "epoch": 1.4726373962874932, "grad_norm": 5.78125, "learning_rate": 1.7149629343060003e-06, "loss": 0.857726, "memory(GiB)": 146.85, "step": 63120, "train_speed(iter/s)": 0.201906 }, { "acc": 0.7955286, "epoch": 1.4728707038597821, "grad_norm": 6.71875, "learning_rate": 1.7135389975534711e-06, "loss": 0.7324399, "memory(GiB)": 146.85, "step": 63130, "train_speed(iter/s)": 0.201921 }, { "acc": 0.78500004, "epoch": 1.473104011432071, "grad_norm": 4.84375, "learning_rate": 1.7121155299422936e-06, "loss": 0.76613989, "memory(GiB)": 146.85, "step": 63140, "train_speed(iter/s)": 0.201937 }, { "acc": 0.77809544, "epoch": 1.47333731900436, "grad_norm": 5.3125, "learning_rate": 1.710692531675671e-06, "loss": 0.82732487, "memory(GiB)": 146.85, "step": 63150, "train_speed(iter/s)": 0.201955 }, { "acc": 0.78612041, "epoch": 1.4735706265766488, "grad_norm": 5.6875, "learning_rate": 1.709270002956732e-06, "loss": 0.7675333, "memory(GiB)": 146.85, "step": 63160, "train_speed(iter/s)": 0.201971 }, { "acc": 0.75789089, "epoch": 1.4738039341489377, "grad_norm": 4.6875, "learning_rate": 1.7078479439885458e-06, "loss": 0.87306595, "memory(GiB)": 146.85, "step": 63170, "train_speed(iter/s)": 0.201987 }, { "acc": 0.78230381, "epoch": 1.4740372417212266, "grad_norm": 7.0625, "learning_rate": 1.7064263549741095e-06, "loss": 0.78271317, "memory(GiB)": 146.85, "step": 63180, "train_speed(iter/s)": 0.202003 }, { "acc": 0.78653083, "epoch": 1.4742705492935155, "grad_norm": 4.75, "learning_rate": 1.7050052361163522e-06, "loss": 0.77051716, "memory(GiB)": 146.85, "step": 63190, "train_speed(iter/s)": 0.202021 }, { "acc": 0.7886548, "epoch": 1.4745038568658044, "grad_norm": 6.21875, "learning_rate": 1.7035845876181422e-06, "loss": 0.77971883, "memory(GiB)": 146.85, "step": 63200, "train_speed(iter/s)": 0.202037 }, { "acc": 0.76395879, "epoch": 1.4747371644380933, "grad_norm": 7.625, "learning_rate": 1.7021644096822748e-06, "loss": 0.85244608, "memory(GiB)": 146.85, "step": 63210, "train_speed(iter/s)": 0.202054 }, { "acc": 0.75231409, "epoch": 1.4749704720103822, "grad_norm": 5.78125, "learning_rate": 1.7007447025114798e-06, "loss": 0.90288219, "memory(GiB)": 146.85, "step": 63220, "train_speed(iter/s)": 0.202071 }, { "acc": 0.78022585, "epoch": 1.4752037795826711, "grad_norm": 6.78125, "learning_rate": 1.699325466308418e-06, "loss": 0.7945828, "memory(GiB)": 146.85, "step": 63230, "train_speed(iter/s)": 0.202088 }, { "acc": 0.78901749, "epoch": 1.47543708715496, "grad_norm": 5.1875, "learning_rate": 1.6979067012756888e-06, "loss": 0.75863619, "memory(GiB)": 146.85, "step": 63240, "train_speed(iter/s)": 0.202104 }, { "acc": 0.78325253, "epoch": 1.475670394727249, "grad_norm": 4.9375, "learning_rate": 1.6964884076158194e-06, "loss": 0.77717352, "memory(GiB)": 146.85, "step": 63250, "train_speed(iter/s)": 0.202121 }, { "acc": 0.76974802, "epoch": 1.4759037022995378, "grad_norm": 5.46875, "learning_rate": 1.6950705855312677e-06, "loss": 0.83988485, "memory(GiB)": 146.85, "step": 63260, "train_speed(iter/s)": 0.202137 }, { "acc": 0.77803946, "epoch": 1.4761370098718267, "grad_norm": 13.375, "learning_rate": 1.6936532352244316e-06, "loss": 0.80825634, "memory(GiB)": 146.85, "step": 63270, "train_speed(iter/s)": 0.202154 }, { "acc": 0.78252077, "epoch": 1.4763703174441156, "grad_norm": 7.1875, "learning_rate": 1.6922363568976347e-06, "loss": 0.78785315, "memory(GiB)": 146.85, "step": 63280, "train_speed(iter/s)": 0.202171 }, { "acc": 0.76579466, "epoch": 1.4766036250164045, "grad_norm": 7.9375, "learning_rate": 1.690819950753138e-06, "loss": 0.85404301, "memory(GiB)": 146.85, "step": 63290, "train_speed(iter/s)": 0.202188 }, { "acc": 0.75891976, "epoch": 1.4768369325886934, "grad_norm": 5.0625, "learning_rate": 1.6894040169931303e-06, "loss": 0.88561277, "memory(GiB)": 146.85, "step": 63300, "train_speed(iter/s)": 0.202206 }, { "acc": 0.76500769, "epoch": 1.477070240160982, "grad_norm": 5.0625, "learning_rate": 1.6879885558197395e-06, "loss": 0.84217539, "memory(GiB)": 146.85, "step": 63310, "train_speed(iter/s)": 0.202224 }, { "acc": 0.76148233, "epoch": 1.4773035477332712, "grad_norm": 4.71875, "learning_rate": 1.6865735674350198e-06, "loss": 0.87538776, "memory(GiB)": 146.85, "step": 63320, "train_speed(iter/s)": 0.20224 }, { "acc": 0.7847271, "epoch": 1.47753685530556, "grad_norm": 4.6875, "learning_rate": 1.6851590520409611e-06, "loss": 0.77367678, "memory(GiB)": 146.85, "step": 63330, "train_speed(iter/s)": 0.202256 }, { "acc": 0.7639297, "epoch": 1.477770162877849, "grad_norm": 6.03125, "learning_rate": 1.6837450098394848e-06, "loss": 0.85417938, "memory(GiB)": 146.85, "step": 63340, "train_speed(iter/s)": 0.202273 }, { "acc": 0.76329355, "epoch": 1.4780034704501377, "grad_norm": 6.65625, "learning_rate": 1.6823314410324426e-06, "loss": 0.85795908, "memory(GiB)": 146.85, "step": 63350, "train_speed(iter/s)": 0.20229 }, { "acc": 0.78540697, "epoch": 1.4782367780224268, "grad_norm": 8.25, "learning_rate": 1.680918345821626e-06, "loss": 0.77931137, "memory(GiB)": 146.85, "step": 63360, "train_speed(iter/s)": 0.202306 }, { "acc": 0.78103862, "epoch": 1.4784700855947155, "grad_norm": 6.21875, "learning_rate": 1.6795057244087493e-06, "loss": 0.77955418, "memory(GiB)": 146.85, "step": 63370, "train_speed(iter/s)": 0.202321 }, { "acc": 0.77654333, "epoch": 1.4787033931670046, "grad_norm": 6.6875, "learning_rate": 1.678093576995467e-06, "loss": 0.79634442, "memory(GiB)": 146.85, "step": 63380, "train_speed(iter/s)": 0.202338 }, { "acc": 0.76148272, "epoch": 1.4789367007392933, "grad_norm": 5.34375, "learning_rate": 1.676681903783362e-06, "loss": 0.87586746, "memory(GiB)": 146.85, "step": 63390, "train_speed(iter/s)": 0.202354 }, { "acc": 0.77707605, "epoch": 1.4791700083115822, "grad_norm": 6.5, "learning_rate": 1.6752707049739487e-06, "loss": 0.79334922, "memory(GiB)": 146.85, "step": 63400, "train_speed(iter/s)": 0.202371 }, { "acc": 0.77834702, "epoch": 1.479403315883871, "grad_norm": 6.625, "learning_rate": 1.6738599807686774e-06, "loss": 0.81129217, "memory(GiB)": 146.85, "step": 63410, "train_speed(iter/s)": 0.202389 }, { "acc": 0.78924384, "epoch": 1.47963662345616, "grad_norm": 4.9375, "learning_rate": 1.6724497313689258e-06, "loss": 0.75368204, "memory(GiB)": 146.85, "step": 63420, "train_speed(iter/s)": 0.202404 }, { "acc": 0.77353086, "epoch": 1.4798699310284489, "grad_norm": 6.34375, "learning_rate": 1.6710399569760105e-06, "loss": 0.81027775, "memory(GiB)": 146.85, "step": 63430, "train_speed(iter/s)": 0.202421 }, { "acc": 0.76555176, "epoch": 1.4801032386007378, "grad_norm": 5.375, "learning_rate": 1.669630657791174e-06, "loss": 0.85182285, "memory(GiB)": 146.85, "step": 63440, "train_speed(iter/s)": 0.202439 }, { "acc": 0.76562967, "epoch": 1.4803365461730267, "grad_norm": 6.0, "learning_rate": 1.6682218340155936e-06, "loss": 0.85666771, "memory(GiB)": 146.85, "step": 63450, "train_speed(iter/s)": 0.202456 }, { "acc": 0.76523681, "epoch": 1.4805698537453156, "grad_norm": 6.0625, "learning_rate": 1.666813485850377e-06, "loss": 0.84424353, "memory(GiB)": 146.85, "step": 63460, "train_speed(iter/s)": 0.202472 }, { "acc": 0.77291327, "epoch": 1.4808031613176045, "grad_norm": 4.78125, "learning_rate": 1.665405613496569e-06, "loss": 0.80558119, "memory(GiB)": 146.85, "step": 63470, "train_speed(iter/s)": 0.202489 }, { "acc": 0.7612071, "epoch": 1.4810364688898934, "grad_norm": 3.796875, "learning_rate": 1.6639982171551405e-06, "loss": 0.87131529, "memory(GiB)": 146.85, "step": 63480, "train_speed(iter/s)": 0.202506 }, { "acc": 0.7827323, "epoch": 1.4812697764621823, "grad_norm": 4.5, "learning_rate": 1.6625912970269958e-06, "loss": 0.79220152, "memory(GiB)": 146.85, "step": 63490, "train_speed(iter/s)": 0.202523 }, { "acc": 0.7682673, "epoch": 1.4815030840344712, "grad_norm": 6.28125, "learning_rate": 1.6611848533129754e-06, "loss": 0.82767334, "memory(GiB)": 146.85, "step": 63500, "train_speed(iter/s)": 0.202541 }, { "epoch": 1.4815030840344712, "eval_acc": 0.735209051981452, "eval_loss": 0.8339876532554626, "eval_runtime": 1263.0309, "eval_samples_per_second": 28.496, "eval_steps_per_second": 14.248, "step": 63500 }, { "acc": 0.7632556, "epoch": 1.48173639160676, "grad_norm": 4.59375, "learning_rate": 1.6597788862138458e-06, "loss": 0.84822674, "memory(GiB)": 146.85, "step": 63510, "train_speed(iter/s)": 0.20173 }, { "acc": 0.77074308, "epoch": 1.481969699179049, "grad_norm": 4.96875, "learning_rate": 1.6583733959303116e-06, "loss": 0.81717138, "memory(GiB)": 146.85, "step": 63520, "train_speed(iter/s)": 0.201746 }, { "acc": 0.76860552, "epoch": 1.4822030067513379, "grad_norm": 5.03125, "learning_rate": 1.6569683826630045e-06, "loss": 0.84257917, "memory(GiB)": 146.85, "step": 63530, "train_speed(iter/s)": 0.201764 }, { "acc": 0.7672965, "epoch": 1.4824363143236268, "grad_norm": 5.5625, "learning_rate": 1.6555638466124878e-06, "loss": 0.82119865, "memory(GiB)": 146.85, "step": 63540, "train_speed(iter/s)": 0.20178 }, { "acc": 0.75648699, "epoch": 1.4826696218959157, "grad_norm": 6.46875, "learning_rate": 1.654159787979262e-06, "loss": 0.86511555, "memory(GiB)": 146.85, "step": 63550, "train_speed(iter/s)": 0.201797 }, { "acc": 0.77979188, "epoch": 1.4829029294682046, "grad_norm": 4.75, "learning_rate": 1.6527562069637543e-06, "loss": 0.79360991, "memory(GiB)": 146.85, "step": 63560, "train_speed(iter/s)": 0.201815 }, { "acc": 0.74967966, "epoch": 1.4831362370404935, "grad_norm": 6.53125, "learning_rate": 1.6513531037663262e-06, "loss": 0.91493645, "memory(GiB)": 146.85, "step": 63570, "train_speed(iter/s)": 0.201832 }, { "acc": 0.74924822, "epoch": 1.4833695446127824, "grad_norm": 5.0, "learning_rate": 1.6499504785872679e-06, "loss": 0.94347801, "memory(GiB)": 146.85, "step": 63580, "train_speed(iter/s)": 0.201848 }, { "acc": 0.80020618, "epoch": 1.4836028521850713, "grad_norm": 3.875, "learning_rate": 1.648548331626807e-06, "loss": 0.72465868, "memory(GiB)": 146.85, "step": 63590, "train_speed(iter/s)": 0.201864 }, { "acc": 0.75843887, "epoch": 1.4838361597573602, "grad_norm": 8.375, "learning_rate": 1.6471466630850985e-06, "loss": 0.88056602, "memory(GiB)": 146.85, "step": 63600, "train_speed(iter/s)": 0.201881 }, { "acc": 0.76666451, "epoch": 1.484069467329649, "grad_norm": 4.84375, "learning_rate": 1.645745473162228e-06, "loss": 0.83379135, "memory(GiB)": 146.85, "step": 63610, "train_speed(iter/s)": 0.201899 }, { "acc": 0.77585135, "epoch": 1.484302774901938, "grad_norm": 5.46875, "learning_rate": 1.644344762058218e-06, "loss": 0.81626301, "memory(GiB)": 146.85, "step": 63620, "train_speed(iter/s)": 0.201915 }, { "acc": 0.77931986, "epoch": 1.4845360824742269, "grad_norm": 9.5, "learning_rate": 1.6429445299730173e-06, "loss": 0.8049263, "memory(GiB)": 146.85, "step": 63630, "train_speed(iter/s)": 0.201932 }, { "acc": 0.75424385, "epoch": 1.4847693900465158, "grad_norm": 5.0625, "learning_rate": 1.6415447771065112e-06, "loss": 0.92045116, "memory(GiB)": 146.85, "step": 63640, "train_speed(iter/s)": 0.201949 }, { "acc": 0.76476727, "epoch": 1.4850026976188047, "grad_norm": 6.6875, "learning_rate": 1.6401455036585111e-06, "loss": 0.84765396, "memory(GiB)": 146.85, "step": 63650, "train_speed(iter/s)": 0.201965 }, { "acc": 0.77746792, "epoch": 1.4852360051910936, "grad_norm": 5.21875, "learning_rate": 1.6387467098287656e-06, "loss": 0.79747562, "memory(GiB)": 146.85, "step": 63660, "train_speed(iter/s)": 0.201982 }, { "acc": 0.7834322, "epoch": 1.4854693127633825, "grad_norm": 5.8125, "learning_rate": 1.637348395816951e-06, "loss": 0.7817646, "memory(GiB)": 146.85, "step": 63670, "train_speed(iter/s)": 0.201998 }, { "acc": 0.78283987, "epoch": 1.4857026203356711, "grad_norm": 5.65625, "learning_rate": 1.635950561822676e-06, "loss": 0.7769474, "memory(GiB)": 146.85, "step": 63680, "train_speed(iter/s)": 0.202014 }, { "acc": 0.77115312, "epoch": 1.4859359279079603, "grad_norm": 5.625, "learning_rate": 1.6345532080454813e-06, "loss": 0.83292103, "memory(GiB)": 146.85, "step": 63690, "train_speed(iter/s)": 0.202031 }, { "acc": 0.7782733, "epoch": 1.486169235480249, "grad_norm": 7.96875, "learning_rate": 1.6331563346848366e-06, "loss": 0.80389719, "memory(GiB)": 146.85, "step": 63700, "train_speed(iter/s)": 0.202047 }, { "acc": 0.76014318, "epoch": 1.486402543052538, "grad_norm": 5.28125, "learning_rate": 1.6317599419401486e-06, "loss": 0.87375813, "memory(GiB)": 146.85, "step": 63710, "train_speed(iter/s)": 0.202064 }, { "acc": 0.77432609, "epoch": 1.4866358506248267, "grad_norm": 8.0625, "learning_rate": 1.6303640300107493e-06, "loss": 0.81092281, "memory(GiB)": 146.85, "step": 63720, "train_speed(iter/s)": 0.202079 }, { "acc": 0.76987815, "epoch": 1.4868691581971158, "grad_norm": 8.0625, "learning_rate": 1.628968599095907e-06, "loss": 0.83615608, "memory(GiB)": 146.85, "step": 63730, "train_speed(iter/s)": 0.202095 }, { "acc": 0.76316013, "epoch": 1.4871024657694045, "grad_norm": 6.40625, "learning_rate": 1.6275736493948174e-06, "loss": 0.86635122, "memory(GiB)": 146.85, "step": 63740, "train_speed(iter/s)": 0.202112 }, { "acc": 0.78692942, "epoch": 1.4873357733416936, "grad_norm": 5.40625, "learning_rate": 1.626179181106609e-06, "loss": 0.77403541, "memory(GiB)": 146.85, "step": 63750, "train_speed(iter/s)": 0.202129 }, { "acc": 0.76408587, "epoch": 1.4875690809139823, "grad_norm": 8.4375, "learning_rate": 1.6247851944303433e-06, "loss": 0.84547863, "memory(GiB)": 146.85, "step": 63760, "train_speed(iter/s)": 0.202145 }, { "acc": 0.76955252, "epoch": 1.4878023884862714, "grad_norm": 5.6875, "learning_rate": 1.6233916895650093e-06, "loss": 0.82860451, "memory(GiB)": 146.85, "step": 63770, "train_speed(iter/s)": 0.202162 }, { "acc": 0.76302261, "epoch": 1.4880356960585601, "grad_norm": 4.8125, "learning_rate": 1.6219986667095323e-06, "loss": 0.86653633, "memory(GiB)": 146.85, "step": 63780, "train_speed(iter/s)": 0.202179 }, { "acc": 0.7618638, "epoch": 1.488269003630849, "grad_norm": 6.625, "learning_rate": 1.6206061260627643e-06, "loss": 0.86314049, "memory(GiB)": 146.85, "step": 63790, "train_speed(iter/s)": 0.202195 }, { "acc": 0.76492176, "epoch": 1.488502311203138, "grad_norm": 6.65625, "learning_rate": 1.6192140678234903e-06, "loss": 0.85781822, "memory(GiB)": 146.85, "step": 63800, "train_speed(iter/s)": 0.202211 }, { "acc": 0.78153601, "epoch": 1.4887356187754268, "grad_norm": 7.75, "learning_rate": 1.617822492190424e-06, "loss": 0.80529213, "memory(GiB)": 146.85, "step": 63810, "train_speed(iter/s)": 0.202228 }, { "acc": 0.78079987, "epoch": 1.4889689263477157, "grad_norm": 4.875, "learning_rate": 1.616431399362216e-06, "loss": 0.78248596, "memory(GiB)": 146.85, "step": 63820, "train_speed(iter/s)": 0.202245 }, { "acc": 0.79952269, "epoch": 1.4892022339200046, "grad_norm": 5.6875, "learning_rate": 1.615040789537443e-06, "loss": 0.72157931, "memory(GiB)": 146.85, "step": 63830, "train_speed(iter/s)": 0.202261 }, { "acc": 0.78501797, "epoch": 1.4894355414922935, "grad_norm": 5.78125, "learning_rate": 1.6136506629146125e-06, "loss": 0.75992107, "memory(GiB)": 146.85, "step": 63840, "train_speed(iter/s)": 0.202278 }, { "acc": 0.76219339, "epoch": 1.4896688490645824, "grad_norm": 5.71875, "learning_rate": 1.6122610196921673e-06, "loss": 0.86164646, "memory(GiB)": 146.85, "step": 63850, "train_speed(iter/s)": 0.202295 }, { "acc": 0.76470518, "epoch": 1.4899021566368713, "grad_norm": 6.625, "learning_rate": 1.6108718600684764e-06, "loss": 0.88414869, "memory(GiB)": 146.85, "step": 63860, "train_speed(iter/s)": 0.202313 }, { "acc": 0.76777096, "epoch": 1.4901354642091602, "grad_norm": 4.5, "learning_rate": 1.609483184241844e-06, "loss": 0.83669415, "memory(GiB)": 146.85, "step": 63870, "train_speed(iter/s)": 0.20233 }, { "acc": 0.7887198, "epoch": 1.490368771781449, "grad_norm": 6.21875, "learning_rate": 1.6080949924105022e-06, "loss": 0.75622315, "memory(GiB)": 146.85, "step": 63880, "train_speed(iter/s)": 0.202346 }, { "acc": 0.77019277, "epoch": 1.490602079353738, "grad_norm": 9.0, "learning_rate": 1.6067072847726134e-06, "loss": 0.83693628, "memory(GiB)": 146.85, "step": 63890, "train_speed(iter/s)": 0.202363 }, { "acc": 0.73736768, "epoch": 1.490835386926027, "grad_norm": 5.875, "learning_rate": 1.605320061526277e-06, "loss": 0.9855732, "memory(GiB)": 146.85, "step": 63900, "train_speed(iter/s)": 0.202379 }, { "acc": 0.79099979, "epoch": 1.4910686944983158, "grad_norm": 5.71875, "learning_rate": 1.6039333228695132e-06, "loss": 0.74043255, "memory(GiB)": 146.85, "step": 63910, "train_speed(iter/s)": 0.202396 }, { "acc": 0.77146544, "epoch": 1.4913020020706047, "grad_norm": 8.375, "learning_rate": 1.6025470690002815e-06, "loss": 0.80076418, "memory(GiB)": 146.85, "step": 63920, "train_speed(iter/s)": 0.202413 }, { "acc": 0.76680593, "epoch": 1.4915353096428936, "grad_norm": 4.1875, "learning_rate": 1.6011613001164677e-06, "loss": 0.84349232, "memory(GiB)": 146.85, "step": 63930, "train_speed(iter/s)": 0.202429 }, { "acc": 0.7727354, "epoch": 1.4917686172151825, "grad_norm": 7.96875, "learning_rate": 1.5997760164158927e-06, "loss": 0.83502922, "memory(GiB)": 146.85, "step": 63940, "train_speed(iter/s)": 0.202446 }, { "acc": 0.77877502, "epoch": 1.4920019247874714, "grad_norm": 5.6875, "learning_rate": 1.5983912180963012e-06, "loss": 0.79055519, "memory(GiB)": 146.85, "step": 63950, "train_speed(iter/s)": 0.202461 }, { "acc": 0.7697485, "epoch": 1.4922352323597603, "grad_norm": 6.75, "learning_rate": 1.5970069053553776e-06, "loss": 0.83996658, "memory(GiB)": 146.85, "step": 63960, "train_speed(iter/s)": 0.202478 }, { "acc": 0.7879981, "epoch": 1.4924685399320492, "grad_norm": 6.1875, "learning_rate": 1.5956230783907294e-06, "loss": 0.76869512, "memory(GiB)": 146.85, "step": 63970, "train_speed(iter/s)": 0.202495 }, { "acc": 0.77174101, "epoch": 1.492701847504338, "grad_norm": 6.15625, "learning_rate": 1.5942397373998959e-06, "loss": 0.82211628, "memory(GiB)": 146.85, "step": 63980, "train_speed(iter/s)": 0.202511 }, { "acc": 0.75362663, "epoch": 1.492935155076627, "grad_norm": 5.46875, "learning_rate": 1.5928568825803526e-06, "loss": 0.89932766, "memory(GiB)": 146.85, "step": 63990, "train_speed(iter/s)": 0.202528 }, { "acc": 0.76843786, "epoch": 1.493168462648916, "grad_norm": 8.8125, "learning_rate": 1.5914745141294974e-06, "loss": 0.82768126, "memory(GiB)": 146.85, "step": 64000, "train_speed(iter/s)": 0.202544 }, { "epoch": 1.493168462648916, "eval_acc": 0.7352742345086625, "eval_loss": 0.83402419090271, "eval_runtime": 1262.6756, "eval_samples_per_second": 28.504, "eval_steps_per_second": 14.252, "step": 64000 }, { "acc": 0.78563328, "epoch": 1.4934017702212048, "grad_norm": 3.875, "learning_rate": 1.5900926322446686e-06, "loss": 0.78299522, "memory(GiB)": 146.85, "step": 64010, "train_speed(iter/s)": 0.201738 }, { "acc": 0.77448444, "epoch": 1.4936350777934937, "grad_norm": 6.8125, "learning_rate": 1.5887112371231227e-06, "loss": 0.80708008, "memory(GiB)": 146.85, "step": 64020, "train_speed(iter/s)": 0.201755 }, { "acc": 0.77385225, "epoch": 1.4938683853657826, "grad_norm": 6.0, "learning_rate": 1.5873303289620585e-06, "loss": 0.8213541, "memory(GiB)": 146.85, "step": 64030, "train_speed(iter/s)": 0.201769 }, { "acc": 0.78269091, "epoch": 1.4941016929380715, "grad_norm": 6.125, "learning_rate": 1.5859499079585982e-06, "loss": 0.79319429, "memory(GiB)": 146.85, "step": 64040, "train_speed(iter/s)": 0.201785 }, { "acc": 0.77811399, "epoch": 1.4943350005103604, "grad_norm": 5.96875, "learning_rate": 1.5845699743097953e-06, "loss": 0.82251625, "memory(GiB)": 146.85, "step": 64050, "train_speed(iter/s)": 0.201801 }, { "acc": 0.76595001, "epoch": 1.4945683080826493, "grad_norm": 5.3125, "learning_rate": 1.583190528212638e-06, "loss": 0.86660347, "memory(GiB)": 146.85, "step": 64060, "train_speed(iter/s)": 0.201818 }, { "acc": 0.78854771, "epoch": 1.494801615654938, "grad_norm": 5.53125, "learning_rate": 1.5818115698640386e-06, "loss": 0.75993962, "memory(GiB)": 146.85, "step": 64070, "train_speed(iter/s)": 0.201834 }, { "acc": 0.77039881, "epoch": 1.495034923227227, "grad_norm": 5.5625, "learning_rate": 1.5804330994608463e-06, "loss": 0.83244047, "memory(GiB)": 146.85, "step": 64080, "train_speed(iter/s)": 0.20185 }, { "acc": 0.77743597, "epoch": 1.4952682307995158, "grad_norm": 4.625, "learning_rate": 1.5790551171998337e-06, "loss": 0.81538343, "memory(GiB)": 146.85, "step": 64090, "train_speed(iter/s)": 0.201866 }, { "acc": 0.77585135, "epoch": 1.4955015383718049, "grad_norm": 6.625, "learning_rate": 1.5776776232777114e-06, "loss": 0.81453381, "memory(GiB)": 146.85, "step": 64100, "train_speed(iter/s)": 0.201883 }, { "acc": 0.75708809, "epoch": 1.4957348459440936, "grad_norm": 7.65625, "learning_rate": 1.5763006178911139e-06, "loss": 0.89973774, "memory(GiB)": 146.85, "step": 64110, "train_speed(iter/s)": 0.201898 }, { "acc": 0.78552504, "epoch": 1.4959681535163827, "grad_norm": 4.875, "learning_rate": 1.5749241012366068e-06, "loss": 0.76663103, "memory(GiB)": 146.85, "step": 64120, "train_speed(iter/s)": 0.201913 }, { "acc": 0.76537275, "epoch": 1.4962014610886714, "grad_norm": 5.0625, "learning_rate": 1.5735480735106927e-06, "loss": 0.84830551, "memory(GiB)": 146.85, "step": 64130, "train_speed(iter/s)": 0.201929 }, { "acc": 0.78224592, "epoch": 1.4964347686609605, "grad_norm": 5.03125, "learning_rate": 1.5721725349097926e-06, "loss": 0.77421541, "memory(GiB)": 146.85, "step": 64140, "train_speed(iter/s)": 0.201945 }, { "acc": 0.77992144, "epoch": 1.4966680762332492, "grad_norm": 5.34375, "learning_rate": 1.570797485630269e-06, "loss": 0.80311108, "memory(GiB)": 146.85, "step": 64150, "train_speed(iter/s)": 0.201961 }, { "acc": 0.76923122, "epoch": 1.4969013838055383, "grad_norm": 5.9375, "learning_rate": 1.5694229258684063e-06, "loss": 0.85034323, "memory(GiB)": 146.85, "step": 64160, "train_speed(iter/s)": 0.201978 }, { "acc": 0.76899424, "epoch": 1.497134691377827, "grad_norm": 4.53125, "learning_rate": 1.5680488558204259e-06, "loss": 0.84487867, "memory(GiB)": 146.85, "step": 64170, "train_speed(iter/s)": 0.201994 }, { "acc": 0.75771923, "epoch": 1.4973679989501159, "grad_norm": 4.71875, "learning_rate": 1.566675275682475e-06, "loss": 0.88353882, "memory(GiB)": 146.85, "step": 64180, "train_speed(iter/s)": 0.20201 }, { "acc": 0.77886171, "epoch": 1.4976013065224048, "grad_norm": 6.40625, "learning_rate": 1.565302185650629e-06, "loss": 0.80532122, "memory(GiB)": 146.85, "step": 64190, "train_speed(iter/s)": 0.202027 }, { "acc": 0.79620085, "epoch": 1.4978346140946937, "grad_norm": 6.5, "learning_rate": 1.5639295859208998e-06, "loss": 0.73219047, "memory(GiB)": 146.85, "step": 64200, "train_speed(iter/s)": 0.202044 }, { "acc": 0.77239566, "epoch": 1.4980679216669825, "grad_norm": 5.71875, "learning_rate": 1.562557476689222e-06, "loss": 0.81025066, "memory(GiB)": 146.85, "step": 64210, "train_speed(iter/s)": 0.202059 }, { "acc": 0.77292371, "epoch": 1.4983012292392714, "grad_norm": 4.875, "learning_rate": 1.5611858581514683e-06, "loss": 0.81362419, "memory(GiB)": 146.85, "step": 64220, "train_speed(iter/s)": 0.202075 }, { "acc": 0.75554028, "epoch": 1.4985345368115603, "grad_norm": 5.71875, "learning_rate": 1.559814730503434e-06, "loss": 0.89956398, "memory(GiB)": 146.85, "step": 64230, "train_speed(iter/s)": 0.202092 }, { "acc": 0.77208309, "epoch": 1.4987678443838492, "grad_norm": 5.71875, "learning_rate": 1.5584440939408473e-06, "loss": 0.83837833, "memory(GiB)": 146.85, "step": 64240, "train_speed(iter/s)": 0.202106 }, { "acc": 0.75363207, "epoch": 1.4990011519561381, "grad_norm": 6.5625, "learning_rate": 1.557073948659365e-06, "loss": 0.88769569, "memory(GiB)": 146.85, "step": 64250, "train_speed(iter/s)": 0.202122 }, { "acc": 0.78892121, "epoch": 1.499234459528427, "grad_norm": 5.65625, "learning_rate": 1.555704294854578e-06, "loss": 0.75720453, "memory(GiB)": 146.85, "step": 64260, "train_speed(iter/s)": 0.202137 }, { "acc": 0.77572351, "epoch": 1.499467767100716, "grad_norm": 5.65625, "learning_rate": 1.5543351327220025e-06, "loss": 0.81458969, "memory(GiB)": 146.85, "step": 64270, "train_speed(iter/s)": 0.202153 }, { "acc": 0.77248616, "epoch": 1.4997010746730048, "grad_norm": 7.1875, "learning_rate": 1.5529664624570839e-06, "loss": 0.79407568, "memory(GiB)": 146.85, "step": 64280, "train_speed(iter/s)": 0.20217 }, { "acc": 0.77457733, "epoch": 1.4999343822452937, "grad_norm": 6.6875, "learning_rate": 1.551598284255203e-06, "loss": 0.81264305, "memory(GiB)": 146.85, "step": 64290, "train_speed(iter/s)": 0.202185 }, { "acc": 0.76415148, "epoch": 1.5001676898175826, "grad_norm": 6.15625, "learning_rate": 1.550230598311664e-06, "loss": 0.87184963, "memory(GiB)": 146.85, "step": 64300, "train_speed(iter/s)": 0.202202 }, { "acc": 0.79061308, "epoch": 1.5004009973898715, "grad_norm": 4.28125, "learning_rate": 1.548863404821706e-06, "loss": 0.75729294, "memory(GiB)": 146.85, "step": 64310, "train_speed(iter/s)": 0.202217 }, { "acc": 0.7921566, "epoch": 1.5006343049621604, "grad_norm": 5.84375, "learning_rate": 1.547496703980495e-06, "loss": 0.74758081, "memory(GiB)": 146.85, "step": 64320, "train_speed(iter/s)": 0.202234 }, { "acc": 0.76889038, "epoch": 1.5008676125344493, "grad_norm": 6.3125, "learning_rate": 1.5461304959831248e-06, "loss": 0.83570042, "memory(GiB)": 146.85, "step": 64330, "train_speed(iter/s)": 0.202249 }, { "acc": 0.79086123, "epoch": 1.5011009201067382, "grad_norm": 6.625, "learning_rate": 1.5447647810246241e-06, "loss": 0.75380144, "memory(GiB)": 146.85, "step": 64340, "train_speed(iter/s)": 0.202266 }, { "acc": 0.75684276, "epoch": 1.5013342276790271, "grad_norm": 5.4375, "learning_rate": 1.5433995592999457e-06, "loss": 0.86945305, "memory(GiB)": 146.85, "step": 64350, "train_speed(iter/s)": 0.202282 }, { "acc": 0.78495922, "epoch": 1.501567535251316, "grad_norm": 5.625, "learning_rate": 1.5420348310039796e-06, "loss": 0.76797829, "memory(GiB)": 146.85, "step": 64360, "train_speed(iter/s)": 0.202299 }, { "acc": 0.78491497, "epoch": 1.501800842823605, "grad_norm": 5.125, "learning_rate": 1.5406705963315333e-06, "loss": 0.75228271, "memory(GiB)": 146.85, "step": 64370, "train_speed(iter/s)": 0.202315 }, { "acc": 0.77982593, "epoch": 1.5020341503958938, "grad_norm": 6.0625, "learning_rate": 1.539306855477356e-06, "loss": 0.79252281, "memory(GiB)": 146.85, "step": 64380, "train_speed(iter/s)": 0.202332 }, { "acc": 0.78916636, "epoch": 1.5022674579681827, "grad_norm": 6.25, "learning_rate": 1.5379436086361187e-06, "loss": 0.75730314, "memory(GiB)": 146.85, "step": 64390, "train_speed(iter/s)": 0.202348 }, { "acc": 0.77468452, "epoch": 1.5025007655404716, "grad_norm": 5.125, "learning_rate": 1.5365808560024264e-06, "loss": 0.79208574, "memory(GiB)": 146.85, "step": 64400, "train_speed(iter/s)": 0.202364 }, { "acc": 0.76520944, "epoch": 1.5027340731127605, "grad_norm": 4.84375, "learning_rate": 1.5352185977708112e-06, "loss": 0.85008087, "memory(GiB)": 146.85, "step": 64410, "train_speed(iter/s)": 0.202381 }, { "acc": 0.79571657, "epoch": 1.5029673806850492, "grad_norm": 5.46875, "learning_rate": 1.533856834135733e-06, "loss": 0.72607107, "memory(GiB)": 146.85, "step": 64420, "train_speed(iter/s)": 0.202398 }, { "acc": 0.77127099, "epoch": 1.5032006882573383, "grad_norm": 6.5, "learning_rate": 1.532495565291587e-06, "loss": 0.83961296, "memory(GiB)": 146.85, "step": 64430, "train_speed(iter/s)": 0.202414 }, { "acc": 0.77603154, "epoch": 1.503433995829627, "grad_norm": 5.9375, "learning_rate": 1.5311347914326891e-06, "loss": 0.80167017, "memory(GiB)": 146.85, "step": 64440, "train_speed(iter/s)": 0.20243 }, { "acc": 0.75209274, "epoch": 1.5036673034019161, "grad_norm": 5.375, "learning_rate": 1.5297745127532942e-06, "loss": 0.88315544, "memory(GiB)": 146.85, "step": 64450, "train_speed(iter/s)": 0.202446 }, { "acc": 0.77707043, "epoch": 1.5039006109742048, "grad_norm": 6.09375, "learning_rate": 1.5284147294475792e-06, "loss": 0.80328369, "memory(GiB)": 146.85, "step": 64460, "train_speed(iter/s)": 0.202461 }, { "acc": 0.79849033, "epoch": 1.504133918546494, "grad_norm": 4.3125, "learning_rate": 1.5270554417096533e-06, "loss": 0.72555056, "memory(GiB)": 146.85, "step": 64470, "train_speed(iter/s)": 0.202476 }, { "acc": 0.76891756, "epoch": 1.5043672261187826, "grad_norm": 7.0, "learning_rate": 1.5256966497335541e-06, "loss": 0.83750248, "memory(GiB)": 146.85, "step": 64480, "train_speed(iter/s)": 0.202493 }, { "acc": 0.75874524, "epoch": 1.5046005336910717, "grad_norm": 5.25, "learning_rate": 1.5243383537132473e-06, "loss": 0.87245302, "memory(GiB)": 146.85, "step": 64490, "train_speed(iter/s)": 0.202509 }, { "acc": 0.79219031, "epoch": 1.5048338412633604, "grad_norm": 6.40625, "learning_rate": 1.5229805538426323e-06, "loss": 0.74336853, "memory(GiB)": 146.85, "step": 64500, "train_speed(iter/s)": 0.202525 }, { "epoch": 1.5048338412633604, "eval_acc": 0.7352045343805562, "eval_loss": 0.8339295387268066, "eval_runtime": 1262.0621, "eval_samples_per_second": 28.518, "eval_steps_per_second": 14.259, "step": 64500 } ], "logging_steps": 10, "max_steps": 85722, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 1.5218312146782781e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }